Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance improvements #267

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 122 additions & 11 deletions vcf/cparse.pyx
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
from model import _Call

cdef _map(func, iterable, bad=['.', '']):
# Integer codes for VCF field types.  The parsers below compare these
# against the ``type_code`` of header definitions, so the values must stay
# identical to the INTEGER/STRING/FLOAT/FLAG constants in vcf/parser.py.
cdef int INTEGER = 0
cdef int STRING = 1
cdef int FLOAT = 2
cdef int FLAG = 3

cdef list _map(func, iterable, bad=['.', '']):
    '''Apply ``func`` to every item of ``iterable`` and return a list.

    Items that appear in ``bad`` (by default the VCF missing markers
    ``.`` and the empty string) are not converted; ``None`` is stored in
    their place.
    '''
    cdef list converted = []
    for item in iterable:
        if item in bad:
            converted.append(None)
        else:
            converted.append(func(item))
    return converted

INTEGER = 'Integer'
FLOAT = 'Float'
NUMERIC = 'Numeric'

def _parse_filter(filt_str):
cdef _parse_filter(str filt_str):
'''Parse the FILTER field of a VCF entry into a Python list

NOTE: this method has a python equivalent and care must be taken
Expand All @@ -26,10 +27,12 @@ def parse_samples(
list names, list samples, samp_fmt,
list samp_fmt_types, list samp_fmt_nums, site):

cdef char *name, *fmt, *entry_type, *sample
cdef char *name
cdef char *fmt
cdef char *sample
cdef int entry_type
cdef int i, j
cdef list samp_data = []
cdef dict sampdict
cdef list sampvals
n_samples = len(samples)
n_formats = len(samp_fmt._fields)
Expand Down Expand Up @@ -71,7 +74,7 @@ def parse_samples(
sampdat[j] = int(vals)
except ValueError:
sampdat[j] = float(vals)
elif entry_type == FLOAT or entry_type == NUMERIC:
elif entry_type == FLOAT:
sampdat[j] = float(vals)
else:
sampdat[j] = vals
Expand All @@ -82,8 +85,8 @@ def parse_samples(
try:
sampdat[j] = _map(int, vals)
except ValueError:
sampdat[j] = map(float, vals)
elif entry_type == FLOAT or entry_type == NUMERIC:
sampdat[j] = _map(float, vals)
elif entry_type == FLOAT:
sampdat[j] = _map(float, vals)
else:
sampdat[j] = vals
Expand All @@ -93,3 +96,111 @@ def parse_samples(
samp_data.append(call)

return samp_data

def parse_info(str info_str, infos, dict reserved_info_codes):
    '''Parse the INFO field of a VCF entry into a dictionary of Python
    types.

    ``info_str`` is the raw INFO column: ``;``-separated entries, each
    either ``KEY=v1,v2,...`` or a bare ``KEY``.  A lone ``.`` yields an
    empty dict.

    ``infos`` maps keys to header definitions exposing ``type_code`` and
    ``num``.  ``reserved_info_codes`` maps keys to type codes and is
    consulted when a key is missing from ``infos``.  Keys found in neither
    are treated as String when written with ``=`` and as Flag otherwise.

    Values are lists (missing items become ``None``) unless the header
    declares ``num == 1``, in which case the single item is unwrapped.
    Flags are stored as ``True``.

    NOTE: this function has a pure-Python equivalent (``_parse_info`` in
    vcf/parser.py); the two must classify entries the same way.
    '''
    cdef list entries
    cdef list vals
    cdef dict retdict = {}
    cdef int entry_type
    cdef bint has_value
    cdef str entry
    cdef str entry_key
    cdef str sep
    cdef str entry_val

    if info_str == '.':
        return {}

    entries = info_str.split(';')
    for entry in entries:
        entry_key, sep, entry_val = entry.partition('=')
        # Presence of '=' (not truthiness of the value) decides whether the
        # entry carries a value: 'KEY=' has one, a bare 'KEY' does not.
        has_value = sep != ''

        try:
            entry_type = infos[entry_key].type_code
        except KeyError:
            try:
                entry_type = reserved_info_codes[entry_key]
            except KeyError:
                entry_type = STRING if has_value else FLAG

        if entry_type == STRING and not has_value:
            # A String key written without '=' is handled as a flag, as the
            # pure-Python parser does.
            entry_type = FLAG

        if entry_type == INTEGER:
            vals = entry_val.split(',')
            try:
                retdict[entry_key] = _map(int, vals)
            # Allow specified integers to be flexibly parsed as floats.
            # Handles cases with incorrectly specified header types.
            except ValueError:
                retdict[entry_key] = _map(float, vals)
        elif entry_type == FLOAT:
            vals = entry_val.split(',')
            retdict[entry_key] = _map(float, vals)
        elif entry_type == FLAG:
            retdict[entry_key] = True
        elif entry_type == STRING:
            # commas are reserved characters indicating multiple values
            vals = entry_val.split(',')
            retdict[entry_key] = _map(str, vals)

        # Unwrap single-valued fields.  KeyError covers both a key with no
        # header definition and a key for which no value was stored above.
        try:
            if infos[entry_key].num == 1 and entry_type != FLAG:
                retdict[entry_key] = retdict[entry_key][0]
        except KeyError:
            pass

    return retdict

def format_info(dict info, info_order):
    '''Render an INFO dictionary as the text of a VCF INFO column.

    An empty ``info`` renders as ``.``.  Otherwise each field is formatted
    with ``_stringify_pair`` and the pieces are joined with ``;``, sorted
    by ``info_order[field]`` first and by field name second.
    '''
    if not info:
        return '.'

    def header_then_name(str field):
        # Order by header definition first, alphabetically second.
        return info_order[field], field

    ordered_fields = sorted(info, key=header_then_name)
    return ';'.join([_stringify_pair(name, info[name])
                     for name in ordered_fields])

def format_sample(str fmt, sample):
    '''Render one sample's call data as a ``:``-separated VCF sample column.

    The genotype comes from ``sample.data.GT`` when that attribute exists;
    otherwise ``./.`` is used if ``GT`` occurs in ``fmt`` and nothing if it
    does not.  Every other field of ``sample.data`` follows in ``_fields``
    order; ``FT`` goes through ``_format_filter`` and the rest through
    ``_stringify``.
    '''
    cdef str gt
    cdef list parts = []

    if hasattr(sample.data, 'GT'):
        gt = sample.data.GT
    elif 'GT' in fmt:
        gt = './.'
    else:
        gt = ''

    # Following the VCF spec, GT is always the first item whenever it is present.
    if gt:
        parts.append(gt)

    for name in sample.data._fields:
        if name == 'GT':
            continue
        value = getattr(sample.data, name)
        if name == 'FT':
            parts.append(_format_filter(value))
        else:
            parts.append(_stringify(value))
    return ':'.join(parts)

cdef str _format_filter(flt):
    '''Render a filter value: an empty list is ``PASS``; anything else is
    passed to ``_stringify`` with ``;`` as delimiter and ``.`` for None.
    '''
    if flt != []:
        return _stringify(flt, none='.', delim=';')
    return 'PASS'

cdef str _stringify(x, none='.', delim=','):
    '''Convert a value to its text form.

    ``None`` becomes ``none``.  A plain list is joined with ``delim``,
    with each ``None`` item replaced by ``none``.  Any other value goes
    through ``str``.
    '''
    if x is None:
        return none
    if type(x) is list:
        pieces = _write_map(str, x, none)
        return delim.join(pieces)
    return str(x)

cdef str _stringify_pair(x, y, none='.', delim=','):
    '''Format one key/value pair.

    A boolean value is treated as a flag: the bare key when true, the
    empty string when false.  Any other value yields ``key=value`` with
    the value rendered by ``_stringify``.
    '''
    if not isinstance(y, bool):
        rendered = _stringify(y, none=none, delim=delim)
        return "%s=%s" % (str(x), rendered)
    if y:
        return str(x)
    return ""

cdef list _write_map(func, iterable, none='.'):
    '''Apply ``func`` to every item of ``iterable`` and return a list,
    storing ``none`` in place of any item that is ``None``.
    '''
    cdef list rendered = []
    for item in iterable:
        if item is None:
            rendered.append(none)
        else:
            rendered.append(func(item))
    return rendered
75 changes: 52 additions & 23 deletions vcf/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,24 @@
'HAP': 'Integer', 'AHAP': 'Integer'
}

INTEGER = 0
STRING = 1
FLOAT = 2
FLAG = 3

def _encode_type(field_type):
return {
'Integer': INTEGER,
'String': STRING,
'Character': STRING,
'Float': FLOAT,
'Numeric': FLOAT,
'Flag': FLAG,
}[field_type]

# The reserved INFO/FORMAT tables re-keyed to integer type codes.  The
# parsers fall back to these when a key has no definition in the header.
RESERVED_INFO_CODES = { k: _encode_type(v) for k, v in RESERVED_INFO.items() }
RESERVED_FORMAT_CODES = { k: _encode_type(v) for k, v in RESERVED_FORMAT.items() }

# Spec is a bit weak on which metadata lines are singular, like fileformat
# and which can have repeats, like contig
SINGULAR_METADATA = ['fileformat', 'fileDate', 'reference']
Expand All @@ -69,10 +87,10 @@
}


_Info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc', 'source', 'version'])
_Info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc', 'source', 'version', 'type_code'])
_Filter = collections.namedtuple('Filter', ['id', 'desc'])
_Alt = collections.namedtuple('Alt', ['id', 'desc'])
_Format = collections.namedtuple('Format', ['id', 'num', 'type', 'desc'])
_Format = collections.namedtuple('Format', ['id', 'num', 'type', 'desc', 'type_code'])
_SampleInfo = collections.namedtuple('SampleInfo', ['samples', 'gt_bases', 'gt_types', 'gt_phases'])
_Contig = collections.namedtuple('Contig', ['id', 'length'])

Expand Down Expand Up @@ -131,7 +149,8 @@ def read_info(self, info_string):

info = _Info(match.group('id'), num,
match.group('type'), match.group('desc'),
match.group('source'), match.group('version'))
match.group('source'), match.group('version'),
_encode_type(match.group('type')))

return (match.group('id'), info)

Expand Down Expand Up @@ -167,7 +186,8 @@ def read_format(self, format_string):
num = self.vcf_field_count(match.group('number'))

form = _Format(match.group('id'), num,
match.group('type'), match.group('desc'))
match.group('type'), match.group('desc'),
_encode_type(match.group('type')))

return (match.group('id'), form)

Expand Down Expand Up @@ -387,39 +407,39 @@ def _parse_info(self, info_str):
entry = entry.split('=', 1)
ID = entry[0]
try:
entry_type = self.infos[ID].type
entry_type = self.infos[ID].type_code
except KeyError:
try:
entry_type = RESERVED_INFO[ID]
entry_type = RESERVED_INFO_CODES[ID]
except KeyError:
if entry[1:]:
entry_type = 'String'
entry_type = STRING
else:
entry_type = 'Flag'
entry_type = FLAG

if entry_type == 'Integer':
if entry_type == INTEGER:
vals = entry[1].split(',')
try:
val = self._map(int, vals)
# Allow specified integers to be flexibly parsed as floats.
# Handles cases with incorrectly specified header types.
except ValueError:
val = self._map(float, vals)
elif entry_type == 'Float':
elif entry_type == FLOAT:
vals = entry[1].split(',')
val = self._map(float, vals)
elif entry_type == 'Flag':
elif entry_type == FLAG:
val = True
elif entry_type in ('String', 'Character'):
elif entry_type == STRING:
try:
vals = entry[1].split(',') # commas are reserved characters indicating multiple values
val = self._map(str, vals)
except IndexError:
entry_type = 'Flag'
entry_type = FLAG
val = True

try:
if self.infos[ID].num == 1 and entry_type not in ( 'Flag', ):
if self.infos[ID].num == 1 and entry_type != FLAG:
val = val[0]
except KeyError:
pass
Expand All @@ -434,14 +454,14 @@ def _parse_sample_format(self, samp_fmt):

for fmt in samp_fmt._fields:
try:
entry_type = self.formats[fmt].type
entry_type = self.formats[fmt].type_code
entry_num = self.formats[fmt].num
except KeyError:
entry_num = None
try:
entry_type = RESERVED_FORMAT[fmt]
entry_type = RESERVED_FORMAT_CODES[fmt]
except KeyError:
entry_type = 'String'
entry_type = STRING
samp_fmt._types.append(entry_type)
samp_fmt._nums.append(entry_num)
return samp_fmt
Expand Down Expand Up @@ -492,24 +512,24 @@ def _parse_samples(self, samples, samp_fmt, site):

# we don't need to split single entries
if entry_num == 1:
if entry_type == 'Integer':
if entry_type == INTEGER:
try:
sampdat[i] = int(vals)
except ValueError:
sampdat[i] = float(vals)
elif entry_type == 'Float' or entry_type == 'Numeric':
elif entry_type == FLOAT:
sampdat[i] = float(vals)
else:
sampdat[i] = vals
continue

vals = vals.split(',')
if entry_type == 'Integer':
if entry_type == INTEGER:
try:
sampdat[i] = _map(int, vals)
except ValueError:
sampdat[i] = _map(float, vals)
elif entry_type == 'Float' or entry_type == 'Numeric':
elif entry_type == FLOAT:
sampdat[i] = _map(float, vals)
else:
sampdat[i] = vals
Expand Down Expand Up @@ -574,7 +594,10 @@ def next(self):
qual = None

filt = self._parse_filter(row[6])
info = self._parse_info(row[7])
if cparse is not None:
info = cparse.parse_info(row[7], self.infos, RESERVED_INFO_CODES)
else:
info = self._parse_info(row[7])

try:
fmt = row[8]
Expand Down Expand Up @@ -732,6 +755,9 @@ def _format_filter(self, flt):
return self._stringify(flt, none='.', delim=';')

def _format_info(self, info):
if cparse:
return cparse.format_info(info, self.info_order)

if not info:
return '.'
def order_key(field):
Expand All @@ -741,6 +767,9 @@ def order_key(field):
sorted(info, key=order_key))

def _format_sample(self, fmt, sample):
if cparse:
return cparse.format_sample(fmt, sample)

if hasattr(sample.data, 'GT'):
gt = sample.data.GT
else:
Expand All @@ -749,7 +778,7 @@ def _format_sample(self, fmt, sample):
result = [gt] if gt else []
# Following the VCF spec, GT is always the first item whenever it is present.
for field in sample.data._fields:
value = getattr(sample.data,field)
value = getattr(sample.data, field)
if field == 'GT':
continue
if field == 'FT':
Expand Down
10 changes: 8 additions & 2 deletions vcf/test/prof.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@
import timeit
import pstats
import sys
import os

def parse_1kg():
for line in vcf.Reader(filename='vcf/test/1kg.vcf.gz'):
pass
in_vcf = vcf.Reader(filename='vcf/test/1kg.vcf.gz')
with open(os.devnull, "w") as fh:
out_vcf = vcf.Writer(fh, template=in_vcf)
for line in in_vcf:
out_vcf.write_record(line)

if len(sys.argv) == 1:
sys.argv.append(None)
Expand All @@ -29,5 +33,7 @@ def parse_1kg():
finally:
statprof.stop()
statprof.display()
elif sys.argv[1] == 'run':
parse_1kg()
else:
print 'prof.py profile/time'