Add cluster_batch_format.py; clean up MCNV formatting

Add reformatting to GenotypeBatch Expose reformat_script Start ripping stuff out Finish rewriting wdl and template Add TODO and delete unused task Don't assign genotypes to CNVs in add_genotypes.py Set mixed_breakend_window to 1mb Fixes Start implementing ContextAwareClustering Fix wdl Update wdl and gatk docker Comment Set context overlap parameters Update docker Fix size in ReformatGenotypedVcf Update to reformat_genotyped_vcf.py Update legacy reformatter, fix GenotypeBatchMetrics wdl Remove reformat step from GenotypeBatch Use RD_CN if CN is unavailable for mCNVs in svtk vcf2bed Add additional_args to svcluster and groupedsvcluster Add join step Update runtime attr Filter legacy records with invalid coords (needs testing) Fix record dropping; add --fix-end to wdl call Representative breakpoint summary strategy Update gatk docker Integerate SR flags into VCF Update dockers Parse last column in SR flags lists Gatk to svtk formatting Fix CNV strands and overlap breakpoint filter bothside pass file parsing Breakpoint overlap filter now sorts by BOTHSIDES_SUPPORT status rather than fraction Set empty FILTER statuses to PASS Use safer get() methods instead of brackets for accessing FORMAT fields Delete unused VcfClusterSingleChromsome.wdl Remove other unused wdls Do not require CN or RD_CN to be defined for all samples for CNVs in get_called_samples() Fix multi-allelic ALTs and genotype parsing Fix multi-allelic formatting in cleanvcf5 Clean vcf 5 script override Add SR1POS and SR2POS to gatk format to recover INS END coordinate Reset dockers to main Fix mCNV alts again Update gatk docker context to track
broadinstitute · Dec 2, 2024 · 9b729de · 9b729de
1 parent 4b2470f
commit 9b729de
Show file tree

Hide file tree

Showing 30 changed files with 766 additions and 1,370 deletions.
diff --git a/inputs/templates/test/MakeCohortVcf/CombineBatches.json.tmpl b/inputs/templates/test/MakeCohortVcf/CombineBatches.json.tmpl
@@ -4,11 +4,17 @@
   "CombineBatches.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }},
   "CombineBatches.empty_file" : {{ reference_resources.empty_file | tojson }},
 
+  "CombineBatches.reference_fasta": {{ reference_resources.reference_fasta | tojson }},
+  "CombineBatches.reference_dict": {{ reference_resources.reference_dict | tojson }},
+  "CombineBatches.reference_fasta_fai": {{ reference_resources.reference_index | tojson }},
+
   "CombineBatches.min_sr_background_fail_batches": 0.5,
+  "CombineBatches.gatk_docker": {{ dockers.gatk_docker | tojson }},
   "CombineBatches.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }},
   "CombineBatches.sv_base_mini_docker":{{ dockers.sv_base_mini_docker | tojson }},
 
   "CombineBatches.cohort_name": {{ test_batch.name | tojson }},
+  "CombineBatches.ped_file": {{ test_batch.ped_file | tojson }},
   "CombineBatches.batches": [
     {{ test_batch.name | tojson }}
   ],

diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json
@@ -2,7 +2,7 @@
   "name": "dockers",
   "cnmops_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/cnmops:2024-11-08-v1.0-62adb329",
   "condense_counts_docker": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432",
-  "gatk_docker": "us.gcr.io/broad-dsde-methods/eph/gatk:2024-07-02-4.6.0.0-1-g4af2b49e9-NIGHTLY-SNAPSHOT",
+  "gatk_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-het-comparator-c228351",
   "gatk_docker_pesr_override": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432",
   "genomes_in_the_cloud_docker": "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135",
   "linux_docker": "marketplace.gcr.io/google/ubuntu1804",

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/add_genotypes.py b/src/sv-pipeline/04_variant_resolution/scripts/add_genotypes.py
@@ -12,25 +12,6 @@
 import pysam
 
 
-def make_multiallelic_alts(record, max_CN, is_bca=False):
-    """
-    Add alts for CN states up to half of max observed total CN
-    """
-
-    max_haplo_CN = int(np.ceil(max_CN / 2))
-
-    if is_bca:
-        alts = tuple(['<CN1>'] +
-                     ['<CN{0}>'.format(i) for i in range(2, max_haplo_CN + 1)])
-    else:
-        alts = tuple(['<CN0>'] +
-                     ['<CN{0}>'.format(i) for i in range(2, max_haplo_CN + 1)])
-
-    stop = record.stop
-    record.alts = alts
-    record.stop = stop
-
-
 def make_evidence_int(ev):
     ev = ev.split(',')
 
@@ -59,16 +40,14 @@ def add_genotypes(record, genotypes, varGQ):
             del record.format[fmt]
 
     max_GT = genotypes['GT'].max()
-    is_bca = record.info['SVTYPE'] not in 'DEL DUP'.split()
-
-    if is_bca:
-        max_CN = max_GT
-    else:
-        max_CN = max_GT + 2
 
     if max_GT > 2:
-        record.info['MULTIALLELIC'] = True
-        make_multiallelic_alts(record, max_CN, is_bca)
+        record.alts = ('<CNV>',)
+        if record.info['SVTYPE'] != 'DUP':
+            msg = 'Invalid SVTYPE {0} for multiallelic record {1}'
+            msg = msg.format(record.info['SVTYPE'], record.id)
+            raise Exception(msg)
+        record.info['SVTYPE'] = 'CNV'
 
     cols = 'name sample GT GQ RD_CN RD_GQ PE_GT PE_GQ SR_GT SR_GQ EV'.split()
     gt_matrix = genotypes.reset_index()[cols].to_numpy()
@@ -85,27 +64,7 @@ def add_genotypes(record, genotypes, varGQ):
             raise Exception(msg)
 
         if max_GT > 2:
-            if record.info['SVTYPE'] == 'DEL':
-                msg = 'Invalid SVTYPE {0} for multiallelic genotype in record {1}'
-                msg = msg.format(record.info['SVTYPE'], record.id)
-                raise Exception(msg)
-
-            if is_bca:
-                idx1 = int(np.floor(data[2] / 2))
-                idx2 = int(np.ceil(data[2] / 2))
-            else:
-                # split copy state roughly evenly between haplotypes
-                idx1 = int(np.floor((data[2] + 2) / 2))
-                idx2 = int(np.ceil((data[2] + 2) / 2))
-
-                # if copy state is 1, assign reference genotype
-                if idx1 == 1:
-                    idx1 = 0
-                if idx2 == 1:
-                    idx2 = 0
-
-            record.samples[sample]['GT'] = (idx1, idx2)
-
+            record.samples[sample]['GT'] = (None, None)
         elif data[2] == 0:
             record.samples[sample]['GT'] = (0, 0)
         elif data[2] == 1:

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py
@@ -34,7 +34,7 @@ def __init__(self):
         self.sample_list = []
 
     def _update_rd_cn(self, variant, sample_indices):
-        self.rd_cn[variant.id] = {s: variant.samples[s][VariantFormatTypes.RD_CN] for s in sample_indices}
+        self.rd_cn[variant.id] = {s: variant.samples[s].get(VariantFormatTypes.RD_CN) for s in sample_indices}
 
     @staticmethod
     def get_wider(f):

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py
@@ -43,13 +43,13 @@ def modify_variants(dict_file_gz, vcf, multi_cnvs):
                     for sample_id in geno_normal_revise_dict[variant.id]:
                         o = variant.samples[sample_id]
                         o.update({"GT": (0, 1)})
-                        o.update({"GQ": o["RD_GQ"]})
+                        o.update({"GQ": o.get("RD_GQ")})
 
                 if variant.stop - variant.start >= 1000:
                     if variant.info[SVTYPE] in [SVType.DEL, SVType.DUP]:
                         is_del = variant.info[SVTYPE] == SVType.DEL
                         for k, v in variant.samples.items():
-                            rd_cn = v[VariantFormatTypes.RD_CN]
+                            rd_cn = v.get(VariantFormatTypes.RD_CN)
                             if rd_cn is None:
                                 continue
                             if (is_del and rd_cn > 3) or \

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py
@@ -93,30 +93,30 @@ def main():
                     record = revised_lines_by_id[record.id]
                 if record.info.get('SVTYPE', None) == 'DEL':
                     if abs(record.stop - record.pos) >= 1000:
-                        sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples}
+                        sample_cn_map = {s: record.samples[s].get('RD_CN') for s in non_outlier_samples}
                         if len([s for s in sample_cn_map if (sample_cn_map[s] is not None and sample_cn_map[s] > 3)]) > vf_1:
                             multi_del = True
-                    gts = [record.samples[s]['GT'] for s in non_outlier_samples]
+                    gts = [record.samples[s].get('GT') for s in non_outlier_samples]
                     if any(gt not in biallelic_gts for gt in gts):
                         gt5kb_del = True
-                    if abs(record.stop - record.pos) >= 5000:
+                    if abs(record.stop - record.pos) >= 5000 or '<CN0>' in record.alts:
                         if not multi_del:
                             gt5kb_del = True
 
                 if record.info.get('SVTYPE', None) == 'DUP':
                     if abs(record.stop - record.pos) >= 1000:
-                        sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples}
+                        sample_cn_map = {s: record.samples[s].get('RD_CN') for s in non_outlier_samples}
                         if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and sample_cn_map[s] > 4) > vf_1:
                             multi_dup = True
                         if sum(1 for x in Counter(sample_cn_map.values()) if x is not None and (x < 1 or x > 4)) > 4:
                             gt4_copystate = True
                         if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and
                                 (sample_cn_map[s] < 1 or sample_cn_map[s] > 4) and gt4_copystate) > vf_1:
                             multi_dup = True
-                    gts = [record.samples[s]['GT'] for s in non_outlier_samples]
+                    gts = [record.samples[s].get('GT') for s in non_outlier_samples]
                     if any(gt not in biallelic_gts for gt in gts):
                         gt5kb_dup = True
-                    if abs(record.stop - record.pos) >= 5000:
+                    if abs(record.stop - record.pos) >= 5000 or '<CN0>' in record.alts:
                         if not multi_dup:
                             gt5kb_dup = True
 
@@ -125,51 +125,54 @@ def main():
                         # Leave no-calls
                         if sample_obj['GT'] == (None, None):
                             continue
-                        if not sample_obj['GQ'] is None and \
-                                (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] >= 2):
+                        if not sample_obj.get('GQ') is None and \
+                                (sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') >= 2):
                             sample_obj['GT'] = (0, 0)
-                        elif not sample_obj['GQ'] is None and \
-                                (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] == 1):
+                        elif not sample_obj.get('GQ') is None and \
+                                (sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') == 1):
                             sample_obj['GT'] = (0, 1)
-                        elif not sample_obj['GQ'] is None:
+                        elif not sample_obj.get('GQ') is None:
                             sample_obj['GT'] = (1, 1)  # RD_CN 0 DEL
 
                 if gt5kb_dup:
+                    # Convert to bi-allelic
+                    if '<CN0>' in record.alts:
+                        record.alts = ('<DUP>',)
                     for sample_obj in record.samples.itervalues():
                         # Leave no-calls - also causes bug that skips multiallelic genotypes for a biallelic variant
                         if sample_obj['GT'] == (None, None):
                             continue
-                        if not sample_obj['GQ'] is None and \
-                                (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] <= 2):
+                        if not sample_obj.get('GQ') is None and \
+                                (sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') <= 2):
                             sample_obj['GT'] = (0, 0)
-                        elif not sample_obj['GQ'] is None and \
-                                (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] == 3):
+                        elif not sample_obj.get('GQ') is None and \
+                                (sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') == 3):
                             sample_obj['GT'] = (0, 1)
-                        elif not sample_obj['GQ'] is None:
+                        elif not sample_obj.get('GQ') is None:
                             sample_obj['GT'] = (1, 1)  # RD_CN > 3 DUP
 
                 if record.id in multi_geno_ids:
                     record.info['PESR_GT_OVERDISPERSION'] = True
 
                 if multi_del or multi_dup:
                     record.filter.add('MULTIALLELIC')
-                    for j, sample in enumerate(record.samples):
+                    for sample in record.samples:
                         record.samples[sample]['GT'] = None
                         record.samples[sample]['GQ'] = None
-                        record.samples[sample]['CN'] = record.samples[sample]['RD_CN']
-                        record.samples[sample]['CNQ'] = record.samples[sample]['RD_GQ']
+                        record.samples[sample]['CN'] = record.samples[sample].get('RD_CN')
+                        record.samples[sample]['CNQ'] = record.samples[sample].get('RD_GQ')
 
                 if len(record.filter) > 1 and 'PASS' in record.filter:
                     del record.filter['PASS']
 
-                if 'MULTIALLELIC' in record.filter and ('<DUP>' in record.alts or '<DEL>' in record.alts):
+                if 'MULTIALLELIC' in record.filter and ('<DUP>' in record.alts or '<DEL>' in record.alts or '<CN0>' in record.alts):
                     record.alts = ('<CNV>',)
                     record.info['SVTYPE'] = 'CNV'
 
                 if record.id in sexchr_revise:
                     for sample in record.samples:
                         if sample in male_samples:
-                            cn = record.samples[sample]['RD_CN']
+                            cn = record.samples[sample].get('RD_CN')
                             if cn is not None and int(cn) > 0:
                                 cn = int(cn)
                                 record.samples[sample]['RD_CN'] = cn - 1

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py b/src/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py
@@ -53,7 +53,7 @@ def __init__(self, record):
         else:
             raise ValueError("Uninterpretable evidence: {}".format(ev))
         if record.id in bothside_pass:
-            self.both_end_support = bothside_pass[record.id]
+            self.both_end_support = 1
         else:
             self.both_end_support = 0
         self.sr_fail = record.id in background_fail
@@ -112,7 +112,7 @@ def __str__(self):
 # Read bothside-pass/background-fail records
 sys.stderr.write("Reading SR files...\n")
 with open(BOTHSIDE_PASS_PATH) as f:
-    bothside_pass = {line.strip().split('\t')[-1]: float(line.strip().split('\t')[0]) for line in f}
+    bothside_pass = set([line.strip().split('\t')[-1] for line in f])
 
 with open(BACKGROUND_FAIL_PATH) as f:
     background_fail = set([line.strip().split('\t')[-1] for line in f])