From dc41a55d0025d0562b737a77d7d873c140818301 Mon Sep 17 00:00:00 2001
From: Sam Brightman <sam.brightman@gmail.com>
Date: Fri, 27 Jan 2017 14:07:56 +0100
Subject: [PATCH 1/5] Remove INFO string comparisons

---
 vcf/parser.py | 42 ++++++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/vcf/parser.py b/vcf/parser.py
index c3c3d08..b5bc4fb 100644
--- a/vcf/parser.py
+++ b/vcf/parser.py
@@ -45,6 +45,23 @@
     'CICN': 'Integer', 'CICNADJ': 'Integer'
 }
 
+INTEGER = 0
+STRING = 1
+FLOAT = 2
+FLAG = 3
+
+def _encode_type(field_type):
+    return {
+        'Integer': INTEGER,
+        'String': STRING,
+        'Character': STRING,
+        'Float': FLOAT,
+        'Numeric': FLOAT,
+        'Flag': FLAG,
+    }[field_type]
+
+RESERVED_INFO_CODES = { k: _encode_type(v) for k, v in RESERVED_INFO.items() }
+
 RESERVED_FORMAT = {
     'GT': 'String', 'DP': 'Integer', 'FT': 'String', 'GL': 'Float',
     'GLE': 'String', 'PL': 'Integer', 'GP': 'Float', 'GQ': 'Integer',
@@ -69,7 +86,7 @@
 }
 
 
-_Info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc', 'source', 'version'])
+_Info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc', 'source', 'version', 'type_code'])
 _Filter = collections.namedtuple('Filter', ['id', 'desc'])
 _Alt = collections.namedtuple('Alt', ['id', 'desc'])
 _Format = collections.namedtuple('Format', ['id', 'num', 'type', 'desc'])
@@ -131,7 +148,8 @@ def read_info(self, info_string):
 
         info = _Info(match.group('id'), num,
                      match.group('type'), match.group('desc'),
-                     match.group('source'), match.group('version'))
+                     match.group('source'), match.group('version'),
+                     _encode_type(match.group('type')))
 
         return (match.group('id'), info)
 
@@ -387,17 +405,17 @@ def _parse_info(self, info_str):
             entry = entry.split('=', 1)
             ID = entry[0]
             try:
-                entry_type = self.infos[ID].type
+                entry_type = self.infos[ID].type_code
             except KeyError:
                 try:
-                    entry_type = RESERVED_INFO[ID]
+                    entry_type = RESERVED_INFO_CODES[ID]
                 except KeyError:
                     if entry[1:]:
-                        entry_type = 'String'
+                        entry_type = STRING
                     else:
-                        entry_type = 'Flag'
+                        entry_type = FLAG
 
-            if entry_type == 'Integer':
+            if entry_type == INTEGER:
                 vals = entry[1].split(',')
                 try:
                     val = self._map(int, vals)
@@ -405,21 +423,21 @@ def _parse_info(self, info_str):
                 # Handles cases with incorrectly specified header types.
                 except ValueError:
                     val = self._map(float, vals)
-            elif entry_type == 'Float':
+            elif entry_type == FLOAT:
                 vals = entry[1].split(',')
                 val = self._map(float, vals)
-            elif entry_type == 'Flag':
+            elif entry_type == FLAG:
                 val = True
-            elif entry_type in ('String', 'Character'):
+            elif entry_type == STRING:
                 try:
                     vals = entry[1].split(',') # commas are reserved characters indicating multiple values
                     val = self._map(str, vals)
                 except IndexError:
-                    entry_type = 'Flag'
+                    entry_type = FLAG
                     val = True
 
             try:
-                if self.infos[ID].num == 1 and entry_type not in ( 'Flag', ):
+                if self.infos[ID].num == 1 and entry_type != FLAG:
                     val = val[0]
             except KeyError:
                 pass

From 7f8b675097ded3d6d7e4a9e3b6290875891a8058 Mon Sep 17 00:00:00 2001
From: Sam Brightman <sam.brightman@gmail.com>
Date: Fri, 27 Jan 2017 15:04:50 +0100
Subject: [PATCH 2/5] Remove FORMAT string comparisons

---
 vcf/cparse.pyx | 25 ++++++++++++++-----------
 vcf/parser.py  | 42 ++++++++++++++++++++++--------------------
 2 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/vcf/cparse.pyx b/vcf/cparse.pyx
index 334542a..b4cab7f 100644
--- a/vcf/cparse.pyx
+++ b/vcf/cparse.pyx
@@ -1,15 +1,16 @@
 from model import _Call
 
-cdef _map(func, iterable, bad=['.', '']):
+cdef int INTEGER = 0
+cdef int STRING = 1
+cdef int FLOAT = 2
+cdef int FLAG = 3
+
+cdef list _map(func, iterable, bad=['.', '']):
     '''``map``, but make bad values None.'''
     return [func(x) if x not in bad else None
             for x in iterable]
 
-INTEGER = 'Integer'
-FLOAT = 'Float'
-NUMERIC = 'Numeric'
-
-def _parse_filter(filt_str):
+cdef _parse_filter(str filt_str):
     '''Parse the FILTER field of a VCF entry into a Python list
 
     NOTE: this method has a python equivalent and care must be taken
@@ -26,10 +27,12 @@ def parse_samples(
         list names, list samples, samp_fmt,
         list samp_fmt_types, list samp_fmt_nums, site):
 
-    cdef char *name, *fmt, *entry_type, *sample
+    cdef char *name
+    cdef char *fmt
+    cdef char *sample
+    cdef int entry_type
     cdef int i, j
     cdef list samp_data = []
-    cdef dict sampdict
     cdef list sampvals
     n_samples = len(samples)
     n_formats = len(samp_fmt._fields)
@@ -71,7 +74,7 @@ def parse_samples(
                         sampdat[j] = int(vals)
                     except ValueError:
                         sampdat[j] = float(vals)
-                elif entry_type == FLOAT or entry_type == NUMERIC:
+                elif entry_type == FLOAT:
                     sampdat[j] = float(vals)
                 else:
                     sampdat[j] = vals
@@ -82,8 +85,8 @@ def parse_samples(
                 try:
                     sampdat[j] = _map(int, vals)
                 except ValueError:
-                    sampdat[j] = map(float, vals)
-            elif entry_type == FLOAT or entry_type == NUMERIC:
+                    sampdat[j] = _map(float, vals)
+            elif entry_type == FLOAT:
                 sampdat[j] = _map(float, vals)
             else:
                 sampdat[j] = vals
diff --git a/vcf/parser.py b/vcf/parser.py
index b5bc4fb..bfb119b 100644
--- a/vcf/parser.py
+++ b/vcf/parser.py
@@ -45,6 +45,17 @@
     'CICN': 'Integer', 'CICNADJ': 'Integer'
 }
 
+RESERVED_FORMAT = {
+    'GT': 'String', 'DP': 'Integer', 'FT': 'String', 'GL': 'Float',
+    'GLE': 'String', 'PL': 'Integer', 'GP': 'Float', 'GQ': 'Integer',
+    'HQ': 'Integer', 'PS': 'Integer', 'PQ': 'Integer', 'EC': 'Integer',
+    'MQ': 'Integer',
+
+    # Keys used for structural variants
+    'CN': 'Integer', 'CNQ': 'Float', 'CNL': 'Float', 'NQ': 'Integer',
+    'HAP': 'Integer', 'AHAP': 'Integer'
+}
+
 INTEGER = 0
 STRING = 1
 FLOAT = 2
@@ -61,17 +72,7 @@ def _encode_type(field_type):
     }[field_type]
 
 RESERVED_INFO_CODES = { k: _encode_type(v) for k, v in RESERVED_INFO.items() }
-
-RESERVED_FORMAT = {
-    'GT': 'String', 'DP': 'Integer', 'FT': 'String', 'GL': 'Float',
-    'GLE': 'String', 'PL': 'Integer', 'GP': 'Float', 'GQ': 'Integer',
-    'HQ': 'Integer', 'PS': 'Integer', 'PQ': 'Integer', 'EC': 'Integer',
-    'MQ': 'Integer',
-
-    # Keys used for structural variants
-    'CN': 'Integer', 'CNQ': 'Float', 'CNL': 'Float', 'NQ': 'Integer',
-    'HAP': 'Integer', 'AHAP': 'Integer'
-}
+RESERVED_FORMAT_CODES = { k: _encode_type(v) for k, v in RESERVED_FORMAT.items() }
 
 # Spec is a bit weak on which metadata lines are singular, like fileformat
 # and which can have repeats, like contig
@@ -89,7 +90,7 @@ def _encode_type(field_type):
 _Info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc', 'source', 'version', 'type_code'])
 _Filter = collections.namedtuple('Filter', ['id', 'desc'])
 _Alt = collections.namedtuple('Alt', ['id', 'desc'])
-_Format = collections.namedtuple('Format', ['id', 'num', 'type', 'desc'])
+_Format = collections.namedtuple('Format', ['id', 'num', 'type', 'desc', 'type_code'])
 _SampleInfo = collections.namedtuple('SampleInfo', ['samples', 'gt_bases', 'gt_types', 'gt_phases'])
 _Contig = collections.namedtuple('Contig', ['id', 'length'])
 
@@ -185,7 +186,8 @@ def read_format(self, format_string):
         num = self.vcf_field_count(match.group('number'))
 
         form = _Format(match.group('id'), num,
-                       match.group('type'), match.group('desc'))
+                       match.group('type'), match.group('desc'),
+                       _encode_type(match.group('type')))
 
         return (match.group('id'), form)
 
@@ -452,14 +454,14 @@ def _parse_sample_format(self, samp_fmt):
 
         for fmt in samp_fmt._fields:
             try:
-                entry_type = self.formats[fmt].type
+                entry_type = self.formats[fmt].type_code
                 entry_num = self.formats[fmt].num
             except KeyError:
                 entry_num = None
                 try:
-                    entry_type = RESERVED_FORMAT[fmt]
+                    entry_type = RESERVED_FORMAT_CODES[fmt]
                 except KeyError:
-                    entry_type = 'String'
+                    entry_type = STRING
             samp_fmt._types.append(entry_type)
             samp_fmt._nums.append(entry_num)
         return samp_fmt
@@ -510,24 +512,24 @@ def _parse_samples(self, samples, samp_fmt, site):
 
                 # we don't need to split single entries
                 if entry_num == 1:
-                    if entry_type == 'Integer':
+                    if entry_type == INTEGER:
                         try:
                             sampdat[i] = int(vals)
                         except ValueError:
                             sampdat[i] = float(vals)
-                    elif entry_type == 'Float' or entry_type == 'Numeric':
+                    elif entry_type == FLOAT:
                         sampdat[i] = float(vals)
                     else:
                         sampdat[i] = vals
                     continue
 
                 vals = vals.split(',')
-                if entry_type == 'Integer':
+                if entry_type == INTEGER:
                     try:
                         sampdat[i] = _map(int, vals)
                     except ValueError:
                         sampdat[i] = _map(float, vals)
-                elif entry_type == 'Float' or entry_type == 'Numeric':
+                elif entry_type == FLOAT:
                     sampdat[i] = _map(float, vals)
                 else:
                     sampdat[i] = vals

From 1e56cd5c05a0ba2ce54a11f5b54dc665b900ff84 Mon Sep 17 00:00:00 2001
From: Sam Brightman <sam.brightman@gmail.com>
Date: Tue, 31 Jan 2017 09:54:42 +0100
Subject: [PATCH 3/5] Add naive Cython implementation for INFO parsing

---
 vcf/cparse.pyx | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++
 vcf/parser.py  |  5 ++++-
 2 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/vcf/cparse.pyx b/vcf/cparse.pyx
index b4cab7f..7add792 100644
--- a/vcf/cparse.pyx
+++ b/vcf/cparse.pyx
@@ -96,3 +96,61 @@ def parse_samples(
         samp_data.append(call)
 
     return samp_data
+
+def parse_info(str info_str, infos, dict reserved_info_codes):
+    '''Parse the INFO field of a VCF entry into a dictionary of Python
+    types.
+
+    '''
+    if info_str == '.':
+        return {}
+
+    cdef list entries = info_str.split(';')
+    cdef list vals
+    cdef dict retdict = {}
+    cdef int entry_type
+    cdef str entry
+    cdef str entry_key
+    cdef str entry_val
+
+    for entry in entries:
+        entry_key, _, entry_val = entry.partition('=')
+        try:
+            entry_type = infos[entry_key].type_code
+        except KeyError:
+            try:
+                entry_type = reserved_info_codes[entry_key]
+            except KeyError:
+                if entry_val:
+                    entry_type = STRING
+                else:
+                    entry_type = FLAG
+
+        if entry_type == INTEGER:
+            vals = entry_val.split(',')
+            try:
+                retdict[entry_key] = _map(int, vals)
+            # Allow specified integers to be flexibly parsed as floats.
+            # Handles cases with incorrectly specified header types.
+            except ValueError:
+                retdict[entry_key] = _map(float, vals)
+        elif entry_type == FLOAT:
+            vals = entry_val.split(',')
+            retdict[entry_key] = _map(float, vals)
+        elif entry_type == FLAG:
+            retdict[entry_key] = True
+        elif entry_type == STRING:
+            try:
+                vals = entry_val.split(',') # commas are reserved characters indicating multiple values
+                retdict[entry_key] = _map(str, vals)
+            except AttributeError:
+                entry_type = FLAG
+                retdict[entry_key] = True
+
+        try:
+            if infos[entry_key].num == 1 and entry_type != FLAG:
+                retdict[entry_key] = retdict[entry_key][0]
+        except KeyError:
+            pass
+
+    return retdict
diff --git a/vcf/parser.py b/vcf/parser.py
index bfb119b..5f019d4 100644
--- a/vcf/parser.py
+++ b/vcf/parser.py
@@ -594,7 +594,10 @@ def next(self):
                 qual = None
 
         filt = self._parse_filter(row[6])
-        info = self._parse_info(row[7])
+        if cparse is not None:
+            info = cparse.parse_info(row[7], self.infos, RESERVED_INFO_CODES)
+        else:
+            info = self._parse_info(row[7])
 
         try:
             fmt = row[8]

From 539cfd93f99bed1214c39621f40b1afcdffaddc3 Mon Sep 17 00:00:00 2001
From: Sam Brightman <sam.brightman@gmail.com>
Date: Tue, 31 Jan 2017 17:03:45 +0100
Subject: [PATCH 4/5] Add writing to performance testing, allow running
 directly with external profiler

---
 vcf/test/prof.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/vcf/test/prof.py b/vcf/test/prof.py
index 953d169..51a367c 100755
--- a/vcf/test/prof.py
+++ b/vcf/test/prof.py
@@ -3,10 +3,14 @@
 import timeit
 import pstats
 import sys
+import os
 
 def parse_1kg():
-    for line in vcf.Reader(filename='vcf/test/1kg.vcf.gz'):
-        pass
+    in_vcf = vcf.Reader(filename='vcf/test/1kg.vcf.gz')
+    with open(os.devnull, "w") as fh:
+        out_vcf = vcf.Writer(fh, template=in_vcf)
+        for line in in_vcf:
+            out_vcf.write_record(line)
 
 if len(sys.argv) == 1:
     sys.argv.append(None)
@@ -29,5 +33,7 @@ def parse_1kg():
     finally:
         statprof.stop()
         statprof.display()
+elif sys.argv[1] == 'run':
+    parse_1kg()
 else:
     print 'prof.py profile/time'

From c7305b9db6b0521919d4202e123b5e52ea94fa20 Mon Sep 17 00:00:00 2001
From: Sam Brightman <sam.brightman@gmail.com>
Date: Tue, 31 Jan 2017 18:42:10 +0100
Subject: [PATCH 5/5] Add naive Cython implementation of output formatting

---
 vcf/cparse.pyx | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 vcf/parser.py  |  8 +++++++-
 2 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/vcf/cparse.pyx b/vcf/cparse.pyx
index 7add792..66860ab 100644
--- a/vcf/cparse.pyx
+++ b/vcf/cparse.pyx
@@ -154,3 +154,53 @@ def parse_info(str info_str, infos, dict reserved_info_codes):
             pass
 
     return retdict
+
+def format_info(dict info, info_order):
+    if not info:
+        return '.'
+    def order_key(str field):
+        # Order by header definition first, alphabetically second.
+        return info_order[field], field
+    return ';'.join(_stringify_pair(f, info[f]) for f in
+                    sorted(info, key=order_key))
+
+def format_sample(str fmt, sample):
+    cdef str gt
+    cdef list result
+
+    if hasattr(sample.data, 'GT'):
+        gt = sample.data.GT
+    else:
+        gt = './.' if 'GT' in fmt else ''
+
+    result = [gt] if gt else []
+    # Following the VCF spec, GT is always the first item whenever it is present.
+    for field in sample.data._fields:
+        value = getattr(sample.data, field)
+        if field == 'GT':
+            continue
+        if field == 'FT':
+            result.append(_format_filter(value))
+        else:
+            result.append(_stringify(value))
+    return ':'.join(result)
+
+cdef str _format_filter(flt):
+    if flt == []:
+        return 'PASS'
+    return _stringify(flt, none='.', delim=';')
+
+cdef str _stringify(x, none='.', delim=','):
+    if type(x) == type([]):
+        return delim.join(_write_map(str, x, none))
+    return str(x) if x is not None else none
+
+cdef str _stringify_pair(x, y, none='.', delim=','):
+    if isinstance(y, bool):
+        return str(x) if y else ""
+    return "%s=%s" % (str(x), _stringify(y, none=none, delim=delim))
+
+cdef list _write_map(func, iterable, none='.'):
+    '''``map``, but make None values none.'''
+    return [func(x) if x is not None else none
+            for x in iterable]
diff --git a/vcf/parser.py b/vcf/parser.py
index 5f019d4..ed9af3c 100644
--- a/vcf/parser.py
+++ b/vcf/parser.py
@@ -755,6 +755,9 @@ def _format_filter(self, flt):
         return self._stringify(flt, none='.', delim=';')
 
     def _format_info(self, info):
+        if cparse:
+            return cparse.format_info(info, self.info_order)
+
         if not info:
             return '.'
         def order_key(field):
@@ -764,6 +767,9 @@ def order_key(field):
                         sorted(info, key=order_key))
 
     def _format_sample(self, fmt, sample):
+        if cparse:
+            return cparse.format_sample(fmt, sample)
+
         if hasattr(sample.data, 'GT'):
             gt = sample.data.GT
         else:
@@ -772,7 +778,7 @@ def _format_sample(self, fmt, sample):
         result = [gt] if gt else []
         # Following the VCF spec, GT is always the first item whenever it is present.
         for field in sample.data._fields:
-            value = getattr(sample.data,field)
+            value = getattr(sample.data, field)
             if field == 'GT':
                 continue
             if field == 'FT':