Skip to content

Commit

Permalink
Add cluster_batch_format.py; clean up MCNV formatting
Browse files Browse the repository at this point in the history
Add reformatting to GenotypeBatch

Expose reformat_script

Start ripping stuff out

Finish rewriting wdl and template

Add TODO and delete unused task

Don't assign genotypes to CNVs in add_genotypes.py

Set mixed_breakend_window to 1mb

Fixes

Start implementing ContextAwareClustering

Fix wdl

Update wdl and gatk docker

Comment

Set context overlap parameters

Update docker

Fix size in ReformatGenotypedVcf

Update to reformat_genotyped_vcf.py

Update legacy reformatter, fix GenotypeBatchMetrics wdl

Remove reformat step from GenotypeBatch

Use RD_CN if CN is unavailable for mCNVs in svtk vcf2bed

Add additional_args to svcluster and groupedsvcluster

Add join step

Update runtime attr

Filter legacy records with invalid coords (needs testing)

Fix record dropping; add --fix-end to wdl call

Representative breakpoint summary strategy

Update gatk docker

Integerate SR flags into VCF

Update dockers

Parse last column in SR flags lists

Gatk to svtk formatting

Fix CNV strands and overlap breakpoint filter bothside pass file parsing

Breakpoint overlap filter now sorts by BOTHSIDES_SUPPORT status rather than fraction

Set empty FILTER statuses to PASS

Use safer get() methods instead of brackets for accessing FORMAT fields

Delete unused VcfClusterSingleChromsome.wdl

Remove other unused wdls

Do not require CN or RD_CN to be defined for all samples for CNVs in get_called_samples()

Fix multi-allelic ALTs and genotype parsing

Fix multi-allelic formatting in cleanvcf5

Clean vcf 5 script override

Add SR1POS and SR2POS to gatk format to recover INS END coordinate

Reset dockers to main

Fix mCNV alts again

Update gatk docker

context to track
  • Loading branch information
mwalker174 committed Dec 2, 2024
1 parent 4b2470f commit 9b729de
Show file tree
Hide file tree
Showing 30 changed files with 766 additions and 1,370 deletions.
6 changes: 6 additions & 0 deletions inputs/templates/test/MakeCohortVcf/CombineBatches.json.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,17 @@
"CombineBatches.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }},
"CombineBatches.empty_file" : {{ reference_resources.empty_file | tojson }},

"CombineBatches.reference_fasta": {{ reference_resources.reference_fasta | tojson }},
"CombineBatches.reference_dict": {{ reference_resources.reference_dict | tojson }},
"CombineBatches.reference_fasta_fai": {{ reference_resources.reference_index | tojson }},

"CombineBatches.min_sr_background_fail_batches": 0.5,
"CombineBatches.gatk_docker": {{ dockers.gatk_docker | tojson }},
"CombineBatches.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }},
"CombineBatches.sv_base_mini_docker":{{ dockers.sv_base_mini_docker | tojson }},

"CombineBatches.cohort_name": {{ test_batch.name | tojson }},
"CombineBatches.ped_file": {{ test_batch.ped_file | tojson }},
"CombineBatches.batches": [
{{ test_batch.name | tojson }}
],
Expand Down
2 changes: 1 addition & 1 deletion inputs/values/dockers.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"name": "dockers",
"cnmops_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/cnmops:2024-11-08-v1.0-62adb329",
"condense_counts_docker": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432",
"gatk_docker": "us.gcr.io/broad-dsde-methods/eph/gatk:2024-07-02-4.6.0.0-1-g4af2b49e9-NIGHTLY-SNAPSHOT",
"gatk_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-het-comparator-c228351",
"gatk_docker_pesr_override": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432",
"genomes_in_the_cloud_docker": "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135",
"linux_docker": "marketplace.gcr.io/google/ubuntu1804",
Expand Down
55 changes: 7 additions & 48 deletions src/sv-pipeline/04_variant_resolution/scripts/add_genotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,6 @@
import pysam


def make_multiallelic_alts(record, max_CN, is_bca=False):
"""
Add alts for CN states up to half of max observed total CN
"""

max_haplo_CN = int(np.ceil(max_CN / 2))

if is_bca:
alts = tuple(['<CN1>'] +
['<CN{0}>'.format(i) for i in range(2, max_haplo_CN + 1)])
else:
alts = tuple(['<CN0>'] +
['<CN{0}>'.format(i) for i in range(2, max_haplo_CN + 1)])

stop = record.stop
record.alts = alts
record.stop = stop


def make_evidence_int(ev):
ev = ev.split(',')

Expand Down Expand Up @@ -59,16 +40,14 @@ def add_genotypes(record, genotypes, varGQ):
del record.format[fmt]

max_GT = genotypes['GT'].max()
is_bca = record.info['SVTYPE'] not in 'DEL DUP'.split()

if is_bca:
max_CN = max_GT
else:
max_CN = max_GT + 2

if max_GT > 2:
record.info['MULTIALLELIC'] = True
make_multiallelic_alts(record, max_CN, is_bca)
record.alts = ('<CNV>',)
if record.info['SVTYPE'] != 'DUP':
msg = 'Invalid SVTYPE {0} for multiallelic record {1}'
msg = msg.format(record.info['SVTYPE'], record.id)
raise Exception(msg)
record.info['SVTYPE'] = 'CNV'

cols = 'name sample GT GQ RD_CN RD_GQ PE_GT PE_GQ SR_GT SR_GQ EV'.split()
gt_matrix = genotypes.reset_index()[cols].to_numpy()
Expand All @@ -85,27 +64,7 @@ def add_genotypes(record, genotypes, varGQ):
raise Exception(msg)

if max_GT > 2:
if record.info['SVTYPE'] == 'DEL':
msg = 'Invalid SVTYPE {0} for multiallelic genotype in record {1}'
msg = msg.format(record.info['SVTYPE'], record.id)
raise Exception(msg)

if is_bca:
idx1 = int(np.floor(data[2] / 2))
idx2 = int(np.ceil(data[2] / 2))
else:
# split copy state roughly evenly between haplotypes
idx1 = int(np.floor((data[2] + 2) / 2))
idx2 = int(np.ceil((data[2] + 2) / 2))

# if copy state is 1, assign reference genotype
if idx1 == 1:
idx1 = 0
if idx2 == 1:
idx2 = 0

record.samples[sample]['GT'] = (idx1, idx2)

record.samples[sample]['GT'] = (None, None)
elif data[2] == 0:
record.samples[sample]['GT'] = (0, 0)
elif data[2] == 1:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __init__(self):
self.sample_list = []

def _update_rd_cn(self, variant, sample_indices):
self.rd_cn[variant.id] = {s: variant.samples[s][VariantFormatTypes.RD_CN] for s in sample_indices}
self.rd_cn[variant.id] = {s: variant.samples[s].get(VariantFormatTypes.RD_CN) for s in sample_indices}

@staticmethod
def get_wider(f):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@ def modify_variants(dict_file_gz, vcf, multi_cnvs):
for sample_id in geno_normal_revise_dict[variant.id]:
o = variant.samples[sample_id]
o.update({"GT": (0, 1)})
o.update({"GQ": o["RD_GQ"]})
o.update({"GQ": o.get("RD_GQ")})

if variant.stop - variant.start >= 1000:
if variant.info[SVTYPE] in [SVType.DEL, SVType.DUP]:
is_del = variant.info[SVTYPE] == SVType.DEL
for k, v in variant.samples.items():
rd_cn = v[VariantFormatTypes.RD_CN]
rd_cn = v.get(VariantFormatTypes.RD_CN)
if rd_cn is None:
continue
if (is_del and rd_cn > 3) or \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,30 +93,30 @@ def main():
record = revised_lines_by_id[record.id]
if record.info.get('SVTYPE', None) == 'DEL':
if abs(record.stop - record.pos) >= 1000:
sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples}
sample_cn_map = {s: record.samples[s].get('RD_CN') for s in non_outlier_samples}
if len([s for s in sample_cn_map if (sample_cn_map[s] is not None and sample_cn_map[s] > 3)]) > vf_1:
multi_del = True
gts = [record.samples[s]['GT'] for s in non_outlier_samples]
gts = [record.samples[s].get('GT') for s in non_outlier_samples]
if any(gt not in biallelic_gts for gt in gts):
gt5kb_del = True
if abs(record.stop - record.pos) >= 5000:
if abs(record.stop - record.pos) >= 5000 or '<CN0>' in record.alts:
if not multi_del:
gt5kb_del = True

if record.info.get('SVTYPE', None) == 'DUP':
if abs(record.stop - record.pos) >= 1000:
sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples}
sample_cn_map = {s: record.samples[s].get('RD_CN') for s in non_outlier_samples}
if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and sample_cn_map[s] > 4) > vf_1:
multi_dup = True
if sum(1 for x in Counter(sample_cn_map.values()) if x is not None and (x < 1 or x > 4)) > 4:
gt4_copystate = True
if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and
(sample_cn_map[s] < 1 or sample_cn_map[s] > 4) and gt4_copystate) > vf_1:
multi_dup = True
gts = [record.samples[s]['GT'] for s in non_outlier_samples]
gts = [record.samples[s].get('GT') for s in non_outlier_samples]
if any(gt not in biallelic_gts for gt in gts):
gt5kb_dup = True
if abs(record.stop - record.pos) >= 5000:
if abs(record.stop - record.pos) >= 5000 or '<CN0>' in record.alts:
if not multi_dup:
gt5kb_dup = True

Expand All @@ -125,51 +125,54 @@ def main():
# Leave no-calls
if sample_obj['GT'] == (None, None):
continue
if not sample_obj['GQ'] is None and \
(sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] >= 2):
if not sample_obj.get('GQ') is None and \
(sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') >= 2):
sample_obj['GT'] = (0, 0)
elif not sample_obj['GQ'] is None and \
(sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] == 1):
elif not sample_obj.get('GQ') is None and \
(sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') == 1):
sample_obj['GT'] = (0, 1)
elif not sample_obj['GQ'] is None:
elif not sample_obj.get('GQ') is None:
sample_obj['GT'] = (1, 1) # RD_CN 0 DEL

if gt5kb_dup:
# Convert to bi-allelic
if '<CN0>' in record.alts:
record.alts = ('<DUP>',)
for sample_obj in record.samples.itervalues():
# Leave no-calls - also causes bug that skips multiallelic genotypes for a biallelic variant
if sample_obj['GT'] == (None, None):
continue
if not sample_obj['GQ'] is None and \
(sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] <= 2):
if not sample_obj.get('GQ') is None and \
(sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') <= 2):
sample_obj['GT'] = (0, 0)
elif not sample_obj['GQ'] is None and \
(sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] == 3):
elif not sample_obj.get('GQ') is None and \
(sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') == 3):
sample_obj['GT'] = (0, 1)
elif not sample_obj['GQ'] is None:
elif not sample_obj.get('GQ') is None:
sample_obj['GT'] = (1, 1) # RD_CN > 3 DUP

if record.id in multi_geno_ids:
record.info['PESR_GT_OVERDISPERSION'] = True

if multi_del or multi_dup:
record.filter.add('MULTIALLELIC')
for j, sample in enumerate(record.samples):
for sample in record.samples:
record.samples[sample]['GT'] = None
record.samples[sample]['GQ'] = None
record.samples[sample]['CN'] = record.samples[sample]['RD_CN']
record.samples[sample]['CNQ'] = record.samples[sample]['RD_GQ']
record.samples[sample]['CN'] = record.samples[sample].get('RD_CN')
record.samples[sample]['CNQ'] = record.samples[sample].get('RD_GQ')

if len(record.filter) > 1 and 'PASS' in record.filter:
del record.filter['PASS']

if 'MULTIALLELIC' in record.filter and ('<DUP>' in record.alts or '<DEL>' in record.alts):
if 'MULTIALLELIC' in record.filter and ('<DUP>' in record.alts or '<DEL>' in record.alts or '<CN0>' in record.alts):
record.alts = ('<CNV>',)
record.info['SVTYPE'] = 'CNV'

if record.id in sexchr_revise:
for sample in record.samples:
if sample in male_samples:
cn = record.samples[sample]['RD_CN']
cn = record.samples[sample].get('RD_CN')
if cn is not None and int(cn) > 0:
cn = int(cn)
record.samples[sample]['RD_CN'] = cn - 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __init__(self, record):
else:
raise ValueError("Uninterpretable evidence: {}".format(ev))
if record.id in bothside_pass:
self.both_end_support = bothside_pass[record.id]
self.both_end_support = 1
else:
self.both_end_support = 0
self.sr_fail = record.id in background_fail
Expand Down Expand Up @@ -112,7 +112,7 @@ def __str__(self):
# Read bothside-pass/background-fail records
sys.stderr.write("Reading SR files...\n")
with open(BOTHSIDE_PASS_PATH) as f:
bothside_pass = {line.strip().split('\t')[-1]: float(line.strip().split('\t')[0]) for line in f}
bothside_pass = set([line.strip().split('\t')[-1] for line in f])

with open(BACKGROUND_FAIL_PATH) as f:
background_fail = set([line.strip().split('\t')[-1] for line in f])
Expand Down
Loading

0 comments on commit 9b729de

Please sign in to comment.