From 680779d7c6471d66276945fb2289e4b9cb0546ad Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 27 Nov 2024 16:32:38 -0500 Subject: [PATCH] Handle duplicates --- v03_pipeline/lib/reference_datasets/dbnsfp.py | 6 ++++-- v03_pipeline/lib/reference_datasets/eigen.py | 3 ++- v03_pipeline/lib/reference_datasets/hmtvar.py | 3 ++- v03_pipeline/lib/reference_datasets/mitimpact.py | 3 ++- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/v03_pipeline/lib/reference_datasets/dbnsfp.py b/v03_pipeline/lib/reference_datasets/dbnsfp.py index 3b59d75c5..79954da58 100644 --- a/v03_pipeline/lib/reference_datasets/dbnsfp.py +++ b/v03_pipeline/lib/reference_datasets/dbnsfp.py @@ -77,8 +77,10 @@ def get_ht(path: str, reference_genome: ReferenceGenome) -> hl.Table: **{k: predictor_parse(ht[k]) for k in PREDICTOR_FIELDS}, ) ht = ht.rename(rename) - - return key_by_locus_alleles(ht, reference_genome) + ht = key_by_locus_alleles(ht, reference_genome) + return ht.group_by(*ht.key).aggregate( + **{f: hl.agg.max(ht[f]) for f in ht.row_value}, + ) def select(_: ReferenceGenome, dataset_type: DatasetType, ht: hl.Table) -> hl.Table: diff --git a/v03_pipeline/lib/reference_datasets/eigen.py b/v03_pipeline/lib/reference_datasets/eigen.py index 5e56cfdca..d4197299b 100644 --- a/v03_pipeline/lib/reference_datasets/eigen.py +++ b/v03_pipeline/lib/reference_datasets/eigen.py @@ -3,4 +3,5 @@ def get_ht(path: str, *_) -> hl.Table: ht = hl.read_table(path) - return ht.select(Eigen_phred=ht.info['Eigen-phred']) + ht = ht.select(Eigen_phred=ht.info['Eigen-phred']) + return ht.group_by(*ht.key).aggregate(Eigen_phred=hl.agg.max(ht.Eigen_phred)) diff --git a/v03_pipeline/lib/reference_datasets/hmtvar.py b/v03_pipeline/lib/reference_datasets/hmtvar.py index 0fdcdecd8..bd9e963ab 100644 --- a/v03_pipeline/lib/reference_datasets/hmtvar.py +++ b/v03_pipeline/lib/reference_datasets/hmtvar.py @@ -20,4 +20,5 @@ def get_ht( alleles=hl.array([ht.ref_rCRS, ht.alt]), score=ht.disease_score, ) - 
return ht.key_by('locus', 'alleles') + ht = ht.key_by('locus', 'alleles') + return ht.group_by(*ht.key).aggregate(score=hl.agg.max(ht.score)) diff --git a/v03_pipeline/lib/reference_datasets/mitimpact.py b/v03_pipeline/lib/reference_datasets/mitimpact.py index 4fb7c6d4d..e3f11a4ff 100644 --- a/v03_pipeline/lib/reference_datasets/mitimpact.py +++ b/v03_pipeline/lib/reference_datasets/mitimpact.py @@ -23,4 +23,5 @@ def get_ht( alleles=[ht.Ref, ht.Alt], score=hl.parse_float32(ht.APOGEE2_score), ) - return ht.key_by('locus', 'alleles') + ht = ht.key_by('locus', 'alleles') + return ht.group_by(*ht.key).aggregate(score=hl.agg.max(ht.score))