From dc41a55d0025d0562b737a77d7d873c140818301 Mon Sep 17 00:00:00 2001 From: Sam Brightman Date: Fri, 27 Jan 2017 14:07:56 +0100 Subject: [PATCH 1/5] Remove INFO string comparisons --- vcf/parser.py | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index c3c3d08..b5bc4fb 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -45,6 +45,23 @@ 'CICN': 'Integer', 'CICNADJ': 'Integer' } +INTEGER = 0 +STRING = 1 +FLOAT = 2 +FLAG = 3 + +def _encode_type(field_type): + return { + 'Integer': INTEGER, + 'String': STRING, + 'Character': STRING, + 'Float': FLOAT, + 'Numeric': FLOAT, + 'Flag': FLAG, + }[field_type] + +RESERVED_INFO_CODES = { k: _encode_type(v) for k, v in RESERVED_INFO.items() } + RESERVED_FORMAT = { 'GT': 'String', 'DP': 'Integer', 'FT': 'String', 'GL': 'Float', 'GLE': 'String', 'PL': 'Integer', 'GP': 'Float', 'GQ': 'Integer', @@ -69,7 +86,7 @@ } -_Info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc', 'source', 'version']) +_Info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc', 'source', 'version', 'type_code']) _Filter = collections.namedtuple('Filter', ['id', 'desc']) _Alt = collections.namedtuple('Alt', ['id', 'desc']) _Format = collections.namedtuple('Format', ['id', 'num', 'type', 'desc']) @@ -131,7 +148,8 @@ def read_info(self, info_string): info = _Info(match.group('id'), num, match.group('type'), match.group('desc'), - match.group('source'), match.group('version')) + match.group('source'), match.group('version'), + _encode_type(match.group('type'))) return (match.group('id'), info) @@ -387,17 +405,17 @@ def _parse_info(self, info_str): entry = entry.split('=', 1) ID = entry[0] try: - entry_type = self.infos[ID].type + entry_type = self.infos[ID].type_code except KeyError: try: - entry_type = RESERVED_INFO[ID] + entry_type = RESERVED_INFO_CODES[ID] except KeyError: if entry[1:]: - entry_type = 'String' + entry_type = STRING else: - entry_type = 'Flag' + entry_type = FLAG - if entry_type == 'Integer': + if entry_type == INTEGER: vals = entry[1].split(',') try: val = self._map(int, vals) @@ -405,21 +423,21 @@ def _parse_info(self, info_str): # Handles cases with incorrectly specified header types. except ValueError: val = self._map(float, vals) - elif entry_type == 'Float': + elif entry_type == FLOAT: vals = entry[1].split(',') val = self._map(float, vals) - elif entry_type == 'Flag': + elif entry_type == FLAG: val = True - elif entry_type in ('String', 'Character'): + elif entry_type == STRING: try: vals = entry[1].split(',') # commas are reserved characters indicating multiple values val = self._map(str, vals) except IndexError: - entry_type = 'Flag' + entry_type = FLAG val = True try: - if self.infos[ID].num == 1 and entry_type not in ( 'Flag', ): + if self.infos[ID].num == 1 and entry_type != FLAG: val = val[0] except KeyError: pass From 7f8b675097ded3d6d7e4a9e3b6290875891a8058 Mon Sep 17 00:00:00 2001 From: Sam Brightman Date: Fri, 27 Jan 2017 15:04:50 +0100 Subject: [PATCH 2/5] Remove FORMAT string comparisons --- vcf/cparse.pyx | 25 ++++++++++++++----------- vcf/parser.py | 42 ++++++++++++++++++++++-------------------- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/vcf/cparse.pyx b/vcf/cparse.pyx index 334542a..b4cab7f 100644 --- a/vcf/cparse.pyx +++ b/vcf/cparse.pyx @@ -1,15 +1,16 @@ from model import _Call -cdef _map(func, iterable, bad=['.', '']): +cdef int INTEGER = 0 +cdef int STRING = 1 +cdef int FLOAT = 2 +cdef int FLAG = 3 + +cdef list _map(func, iterable, bad=['.', '']): '''``map``, but make bad values None.''' return [func(x) if x not in bad else None for x in iterable] -INTEGER = 'Integer' -FLOAT = 'Float' -NUMERIC = 'Numeric' - -def _parse_filter(filt_str): +cdef _parse_filter(str filt_str): '''Parse the FILTER field of a VCF entry into a Python list NOTE: this method has a python equivalent and care must be taken @@ -26,10 +27,12 @@ def parse_samples( list names, list samples, samp_fmt, list samp_fmt_types, list samp_fmt_nums, site): - cdef char *name, *fmt, *entry_type, *sample + cdef char *name + cdef char *fmt + cdef char *sample + cdef int entry_type cdef int i, j cdef list samp_data = [] - cdef dict sampdict cdef list sampvals n_samples = len(samples) n_formats = len(samp_fmt._fields) @@ -71,7 +74,7 @@ def parse_samples( sampdat[j] = int(vals) except ValueError: sampdat[j] = float(vals) - elif entry_type == FLOAT or entry_type == NUMERIC: + elif entry_type == FLOAT: sampdat[j] = float(vals) else: sampdat[j] = vals @@ -82,8 +85,8 @@ def parse_samples( try: sampdat[j] = _map(int, vals) except ValueError: - sampdat[j] = map(float, vals) - elif entry_type == FLOAT or entry_type == NUMERIC: + sampdat[j] = _map(float, vals) + elif entry_type == FLOAT: sampdat[j] = _map(float, vals) else: sampdat[j] = vals diff --git a/vcf/parser.py b/vcf/parser.py index b5bc4fb..bfb119b 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -45,6 +45,17 @@ 'CICN': 'Integer', 'CICNADJ': 'Integer' } +RESERVED_FORMAT = { + 'GT': 'String', 'DP': 'Integer', 'FT': 'String', 'GL': 'Float', + 'GLE': 'String', 'PL': 'Integer', 'GP': 'Float', 'GQ': 'Integer', + 'HQ': 'Integer', 'PS': 'Integer', 'PQ': 'Integer', 'EC': 'Integer', + 'MQ': 'Integer', + + # Keys used for structural variants + 'CN': 'Integer', 'CNQ': 'Float', 'CNL': 'Float', 'NQ': 'Integer', + 'HAP': 'Integer', 'AHAP': 'Integer' +} + INTEGER = 0 STRING = 1 FLOAT = 2 @@ -61,17 +72,7 @@ def _encode_type(field_type): }[field_type] RESERVED_INFO_CODES = { k: _encode_type(v) for k, v in RESERVED_INFO.items() } - -RESERVED_FORMAT = { - 'GT': 'String', 'DP': 'Integer', 'FT': 'String', 'GL': 'Float', - 'GLE': 'String', 'PL': 'Integer', 'GP': 'Float', 'GQ': 'Integer', - 'HQ': 'Integer', 'PS': 'Integer', 'PQ': 'Integer', 'EC': 'Integer', - 'MQ': 'Integer', - - # Keys used for structural variants - 'CN': 'Integer', 'CNQ': 'Float', 'CNL': 'Float', 'NQ': 'Integer', - 'HAP': 'Integer', 'AHAP': 'Integer' -} +RESERVED_FORMAT_CODES = { k: _encode_type(v) for k, v in RESERVED_FORMAT.items() } # Spec is a bit weak on which metadata lines are singular, like fileformat # and which can have repeats, like contig @@ -89,7 +90,7 @@ def _encode_type(field_type): _Info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc', 'source', 'version', 'type_code']) _Filter = collections.namedtuple('Filter', ['id', 'desc']) _Alt = collections.namedtuple('Alt', ['id', 'desc']) -_Format = collections.namedtuple('Format', ['id', 'num', 'type', 'desc']) +_Format = collections.namedtuple('Format', ['id', 'num', 'type', 'desc', 'type_code']) _SampleInfo = collections.namedtuple('SampleInfo', ['samples', 'gt_bases', 'gt_types', 'gt_phases']) _Contig = collections.namedtuple('Contig', ['id', 'length']) @@ -185,7 +186,8 @@ def read_format(self, format_string): num = self.vcf_field_count(match.group('number')) form = _Format(match.group('id'), num, - match.group('type'), match.group('desc')) + match.group('type'), match.group('desc'), + _encode_type(match.group('type'))) return (match.group('id'), form) @@ -452,14 +454,14 @@ def _parse_sample_format(self, samp_fmt): for fmt in samp_fmt._fields: try: - entry_type = self.formats[fmt].type + entry_type = self.formats[fmt].type_code entry_num = self.formats[fmt].num except KeyError: entry_num = None try: - entry_type = RESERVED_FORMAT[fmt] + entry_type = RESERVED_FORMAT_CODES[fmt] except KeyError: - entry_type = 'String' + entry_type = STRING samp_fmt._types.append(entry_type) samp_fmt._nums.append(entry_num) return samp_fmt @@ -510,24 +512,24 @@ def _parse_samples(self, samples, samp_fmt, site): # we don't need to split single entries if entry_num == 1: - if entry_type == 'Integer': + if entry_type == INTEGER: try: sampdat[i] = int(vals) except ValueError: sampdat[i] = float(vals) - elif entry_type == 'Float' or entry_type == 'Numeric': + elif entry_type == FLOAT: sampdat[i] = float(vals) else: sampdat[i] = vals continue vals = vals.split(',') - if entry_type == 'Integer': + if entry_type == INTEGER: try: sampdat[i] = _map(int, vals) except ValueError: sampdat[i] = _map(float, vals) - elif entry_type == 'Float' or entry_type == 'Numeric': + elif entry_type == FLOAT: sampdat[i] = _map(float, vals) else: sampdat[i] = vals From 1e56cd5c05a0ba2ce54a11f5b54dc665b900ff84 Mon Sep 17 00:00:00 2001 From: Sam Brightman Date: Tue, 31 Jan 2017 09:54:42 +0100 Subject: [PATCH 3/5] Add naive Cython implementation for INFO parsing --- vcf/cparse.pyx | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ vcf/parser.py | 5 ++++- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/vcf/cparse.pyx b/vcf/cparse.pyx index b4cab7f..7add792 100644 --- a/vcf/cparse.pyx +++ b/vcf/cparse.pyx @@ -96,3 +96,61 @@ def parse_samples( samp_data.append(call) return samp_data + +def parse_info(str info_str, infos, dict reserved_info_codes): + '''Parse the INFO field of a VCF entry into a dictionary of Python + types. + + ''' + if info_str == '.': + return {} + + cdef list entries = info_str.split(';') + cdef list vals + cdef dict retdict = {} + cdef int entry_type + cdef str entry + cdef str entry_key + cdef str entry_val + + for entry in entries: + entry_key, _, entry_val = entry.partition('=') + try: + entry_type = infos[entry_key].type_code + except KeyError: + try: + entry_type = reserved_info_codes[entry_key] + except KeyError: + if entry_val: + entry_type = STRING + else: + entry_type = FLAG + + if entry_type == INTEGER: + vals = entry_val.split(',') + try: + retdict[entry_key] = _map(int, vals) + # Allow specified integers to be flexibly parsed as floats. + # Handles cases with incorrectly specified header types. + except ValueError: + retdict[entry_key] = _map(float, vals) + elif entry_type == FLOAT: + vals = entry_val.split(',') + retdict[entry_key] = _map(float, vals) + elif entry_type == FLAG: + retdict[entry_key] = True + elif entry_type == STRING: + try: + vals = entry_val.split(',') # commas are reserved characters indicating multiple values + retdict[entry_key] = _map(str, vals) + except AttributeError: + entry_type = FLAG + retdict[entry_key] = True + + try: + if infos[entry_key].num == 1 and entry_type != FLAG: + retdict[entry_key] = retdict[entry_key][0] + except KeyError: + pass + + return retdict diff --git a/vcf/parser.py b/vcf/parser.py index bfb119b..5f019d4 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -594,7 +594,10 @@ def next(self): qual = None filt = self._parse_filter(row[6]) - info = self._parse_info(row[7]) + if cparse is not None: + info = cparse.parse_info(row[7], self.infos, RESERVED_INFO_CODES) + else: + info = self._parse_info(row[7]) try: fmt = row[8] From 539cfd93f99bed1214c39621f40b1afcdffaddc3 Mon Sep 17 00:00:00 2001 From: Sam Brightman Date: Tue, 31 Jan 2017 17:03:45 +0100 Subject: [PATCH 4/5] Add writing to performance testing, allow running directly with external profiler --- vcf/test/prof.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vcf/test/prof.py b/vcf/test/prof.py index 953d169..51a367c 100755 --- a/vcf/test/prof.py +++ b/vcf/test/prof.py @@ -3,10 +3,14 @@ import timeit import pstats import sys +import os def parse_1kg(): - for line in vcf.Reader(filename='vcf/test/1kg.vcf.gz'): - pass + in_vcf = vcf.Reader(filename='vcf/test/1kg.vcf.gz') + with open(os.devnull, "w") as fh: + out_vcf = vcf.Writer(fh, template=in_vcf) + for line in in_vcf: + out_vcf.write_record(line) if len(sys.argv) == 1: sys.argv.append(None) @@ -29,5 +33,7 @@ def parse_1kg(): finally: statprof.stop() statprof.display() +elif sys.argv[1] == 'run': + parse_1kg() else: print 'prof.py profile/time' From c7305b9db6b0521919d4202e123b5e52ea94fa20 Mon Sep 17 00:00:00 2001 From: Sam Brightman Date: Tue, 31 Jan 2017 18:42:10 +0100 Subject: [PATCH 5/5] Add naive Cython implementation of output formatting --- vcf/cparse.pyx | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ vcf/parser.py | 8 +++++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/vcf/cparse.pyx b/vcf/cparse.pyx index 7add792..66860ab 100644 --- a/vcf/cparse.pyx +++ b/vcf/cparse.pyx @@ -154,3 +154,53 @@ def parse_info(str info_str, infos, dict reserved_info_codes): pass return retdict + +def format_info(dict info, info_order): + if not info: + return '.' + def order_key(str field): + # Order by header definition first, alphabetically second. + return info_order[field], field + return ';'.join(_stringify_pair(f, info[f]) for f in + sorted(info, key=order_key)) + +def format_sample(str fmt, sample): + cdef str gt + cdef list result + + if hasattr(sample.data, 'GT'): + gt = sample.data.GT + else: + gt = './.' if 'GT' in fmt else '' + + result = [gt] if gt else [] + # Following the VCF spec, GT is always the first item whenever it is present. + for field in sample.data._fields: + value = getattr(sample.data, field) + if field == 'GT': + continue + if field == 'FT': + result.append(_format_filter(value)) + else: + result.append(_stringify(value)) + return ':'.join(result) + +cdef str _format_filter(flt): + if flt == []: + return 'PASS' + return _stringify(flt, none='.', delim=';') + +cdef str _stringify(x, none='.', delim=','): + if type(x) == type([]): + return delim.join(_write_map(str, x, none)) + return str(x) if x is not None else none + +cdef str _stringify_pair(x, y, none='.', delim=','): + if isinstance(y, bool): + return str(x) if y else "" + return "%s=%s" % (str(x), _stringify(y, none=none, delim=delim)) + +cdef list _write_map(func, iterable, none='.'): + '''``map``, but make None values none.''' + return [func(x) if x is not None else none + for x in iterable] diff --git a/vcf/parser.py b/vcf/parser.py index 5f019d4..ed9af3c 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -755,6 +755,9 @@ def _format_filter(self, flt): return self._stringify(flt, none='.', delim=';') def _format_info(self, info): + if cparse: + return cparse.format_info(info, self.info_order) + if not info: return '.' def order_key(field): @@ -764,6 +767,9 @@ def order_key(field): sorted(info, key=order_key)) def _format_sample(self, fmt, sample): + if cparse: + return cparse.format_sample(fmt, sample) + if hasattr(sample.data, 'GT'): gt = sample.data.GT else: @@ -772,7 +778,7 @@ def _format_sample(self, fmt, sample): result = [gt] if gt else [] # Following the VCF spec, GT is always the first item whenever it is present. for field in sample.data._fields: - value = getattr(sample.data,field) + value = getattr(sample.data, field) if field == 'GT': continue if field == 'FT':