Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/kbdev 1227 signature support #42

Merged
merged 18 commits into from
Feb 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
a2b53bd
Add IprSignatureVariant to types
mathieulemieux Nov 15, 2024
e43565e
Add cosmicSignatures in content.spec.json
mathieulemieux Nov 15, 2024
5e24418
Add signatures vocab terms in ipr/constants.py
mathieulemieux Nov 16, 2024
eaf868e
Add preprocess_cosmic() & preprocess_hla() to ipr/inputs.py
mathieulemieux Nov 16, 2024
fa02240
Add preprocess_signature_variants() to ipr/inputs.py
mathieulemieux Nov 16, 2024
40ebc41
Add support for Signature reference in graphkb/match.py::match_catego…
mathieulemieux Nov 16, 2024
c186189
Add annotate_signature_variants() to ipr/annotate.py
mathieulemieux Nov 16, 2024
9102265
Add support for signature category variants in ipr/main.py::ipr_report()
mathieulemieux Nov 16, 2024
19ec14a
Add tests for ipr/inputs.py
mathieulemieux Nov 16, 2024
51de7b5
Merge branch 'develop' into feature/KBDEV-1227-signature-support
mathieulemieux Nov 18, 2024
375f3cd
Add test data files for test_ipr/test_inputs.py
mathieulemieux Nov 18, 2024
cf7af4d
Remove DMMR_SIGNATURE_NAME from ipr/constants.py; moved to gsc_report
mathieulemieux Nov 19, 2024
03b7507
Merge branch 'develop' into feature/KBDEV-1227-signature-support
mathieulemieux Feb 14, 2025
ac17af3
match_category_variant with equivalent signatures from get_equivalent…
mathieulemieux Feb 18, 2025
eecc1fd
add cosmicSignatures and hlaTypes data to civic upload tests
mathieulemieux Feb 18, 2025
0e25ad9
black linting
mathieulemieux Feb 18, 2025
2d462b8
Fix issues with signatureVariants integration tests
mathieulemieux Feb 20, 2025
210347b
Merge branch 'develop' into feature/KBDEV-1227-signature-support
mathieulemieux Feb 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 31 additions & 19 deletions pori_python/graphkb/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,50 +129,62 @@ def cache_missing_features(conn: GraphKBConnection) -> None:

def match_category_variant(
conn: GraphKBConnection,
gene_name: str,
reference_name: str,
category: str,
root_exclude_term: str = "",
gene_source: str = "",
gene_is_source_id: bool = False,
ignore_cache: bool = False,
reference_class: str = 'Feature',
) -> List[Variant]:
"""
Returns a list of variants matching the input variant

Args:
conn (GraphKBConnection): the graphkb connection object
gene_name (str): the name of the gene the variant is in reference to
reference_name (str): the name of the Feature(gene)/Signature the variant is in reference to
category (str): the variant category (ex. copy loss)
gene_source: The source database the gene is defined by (ex. ensembl)
gene_is_source_id: Indicates the gene name(s) input should be treated as sourceIds not names
reference_class (str): Class name of the variant reference. Default to 'Feature'
Raises:
FeatureNotFoundError: The gene could not be found in GraphKB

Returns:
Array.<dict>: List of variant records from GraphKB which match the input
"""
# disambiguate the gene to find all equivalent representations
features = convert_to_rid_list(
get_equivalent_features(
conn,
gene_name,
source=gene_source,
is_source_id=gene_is_source_id,
ignore_cache=ignore_cache,
# disambiguate the reference to find all equivalent representations
references: List[str] = []
if reference_class == 'Feature':
references = convert_to_rid_list(
get_equivalent_features(
conn,
reference_name,
source=gene_source,
is_source_id=gene_is_source_id,
ignore_cache=ignore_cache,
)
)
)

if not features:
raise FeatureNotFoundError(
f"unable to find the gene ({gene_name}) or any equivalent representations"
if not references:
raise FeatureNotFoundError(
f"unable to find the gene ({reference_name}) or any equivalent representations"
)
if reference_class == 'Signature':
references = convert_to_rid_list(
get_equivalent_terms(
conn,
reference_name.lower(),
ontology_class='Signature',
ignore_cache=ignore_cache,
)
)

# get the list of terms that we should match
terms = convert_to_rid_list(
types = convert_to_rid_list(
get_term_tree(conn, category, root_exclude_term, ignore_cache=ignore_cache)
)

if not terms:
if not types:
raise ValueError(f"unable to find the term/category ({category}) or any equivalent")

# find the variant list
Expand All @@ -183,8 +195,8 @@ def match_category_variant(
"target": {
"target": "CategoryVariant",
"filters": [
{"reference1": features, "operator": "IN"},
{"type": terms, "operator": "IN"},
{"reference1": references, "operator": "IN"},
{"type": types, "operator": "IN"},
],
},
"queryType": "similarTo",
Expand Down
53 changes: 53 additions & 0 deletions pori_python/ipr/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
Hashabledict,
IprCopyVariant,
IprExprVariant,
IprSignatureVariant,
IprStructuralVariant,
KbMatch,
Statement,
Expand Down Expand Up @@ -380,3 +381,55 @@ def annotate_tmb(
ipr_row["variantType"] = "tmb"
gkb_matches.append(ipr_row)
return gkb_matches


def annotate_signature_variants(
    graphkb_conn: GraphKBConnection,
    variants: List[IprSignatureVariant] = [],
    disease_name: str = "cancer",
    show_progress: bool = False,
) -> List[KbMatch]:
    """Annotate Signature variants with GraphKB in the IPR alterations format.

    Each signature variant is matched to the corresponding GraphKB Variants
    (its "signatureName" is used as the reference name and its
    "variantTypeName" as the category), then to the linked GraphKB Statements.

    Args:
        graphkb_conn: the graphkb api connection object
        variants: list of signature variants; each one is read through its
            "signatureName", "variantTypeName" and "key" entries
        disease_name: oncotree disease name for graphkb matching
        show_progress: progressbar displayed for long runs; default to False

    Returns:
        list of kbMatches records for IPR, duplicates removed, in the order
        in which they were first matched
    """
    # NOTE: the default list of `variants` is only read here, never mutated.
    alterations: List[Hashabledict] = []

    iterfunc = tqdm if show_progress else iter
    for variant in iterfunc(variants):
        try:
            # Matching signature variant to GKB Variants
            matched_variants: List[Variant] = gkb_match.match_category_variant(
                graphkb_conn,
                variant["signatureName"],
                variant["variantTypeName"],
                reference_class="Signature",
            )
            # Matching GKB Variants to GKB Statements
            for ipr_row in get_ipr_statements_from_variants(
                graphkb_conn, matched_variants, disease_name
            ):
                # link the annotation back to the input variant
                ipr_row["variant"] = variant["key"]
                ipr_row["variantType"] = "sigv"
                alterations.append(Hashabledict(ipr_row))

        except ValueError as err:
            # a variant that cannot be matched is reported and skipped,
            # the remaining variants are still annotated
            logger.error(f"failed to match signature category variant '{variant}': {err}")

    # drop duplicates; dict.fromkeys keeps the first-seen order, whereas going
    # through a set would return the rows in an arbitrary order
    alterations = list(dict.fromkeys(alterations))

    logger.info(
        f"matched {len(variants)} signature category variants to {len(alterations)} graphkb annotations"
    )

    return alterations
3 changes: 3 additions & 0 deletions pori_python/ipr/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,8 @@
# all possible values for review status are: ['pending', 'not required', 'passed', 'failed', 'initial']
FAILED_REVIEW_STATUS = "failed"

# Signatures
# variant type (category) names given to signature variants
COSMIC_SIGNATURE_VARIANT_TYPE = "high signature"
HLA_SIGNATURE_VARIANT_TYPE = "signature present"
# Tumour mutation burden (TMB)
TMB_HIGH = 10.0 # genomic mutations per mb - https://www.bcgsc.ca/jira/browse/GERO-296
TMB_HIGH_CATEGORY = "high mutation burden"
10 changes: 10 additions & 0 deletions pori_python/ipr/content.spec.json
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,16 @@
},
"type": "array"
},
"cosmicSignatures": {
"default": [],
"description": "List of observed (above threshold) COSMIC signatures (DBS, SBS and ID) & dMMR",
"items": {
"description": "Signature name",
"example": "DBS1",
"type": "string"
},
"type": "array"
},
"expressionVariants": {
"default": [],
"items": {
Expand Down
72 changes: 71 additions & 1 deletion pori_python/ipr/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@
IprCopyVariant,
IprExprVariant,
IprFusionVariant,
IprSignatureVariant,
IprSmallMutationVariant,
IprVariant,
)

from .constants import DEFAULT_URL
from .constants import COSMIC_SIGNATURE_VARIANT_TYPE, DEFAULT_URL, HLA_SIGNATURE_VARIANT_TYPE
from .util import hash_key, logger, pandas_falsy

protein_letters_3to1.setdefault("Ter", "*")
Expand Down Expand Up @@ -152,6 +153,12 @@
"mavis_product_id",
]

# Signature variants (sigv) columns
# columns every signature variant row must provide
SIGV_REQ = ["signatureName", "variantTypeName"]
# columns a cosmic input row must contain to be processed
SIGV_COSMIC = ["signature"] # 1st element used as signatureName key
# hla input columns whose values are converted into signature names
SIGV_HLA = ["a1", "a2", "b1", "b2", "c1", "c2"]
# columns accepted on a signature variant row but not required
SIGV_OPTIONAL = ["displayName"]
# columns whose values make up the row key (a copy of the required columns)
SIGV_KEY = SIGV_REQ[:]


def validate_variant_rows(
rows: Iterable[Dict], required: List[str], optional: List[str], row_to_key: Callable
Expand Down Expand Up @@ -388,6 +395,69 @@ def row_key(row: Dict) -> Tuple[str, ...]:
return result


def preprocess_signature_variants(rows: Iterable[Dict]) -> List[IprSignatureVariant]:
    """
    Check that the signature variant rows carry the required columns and
    complete them with the properties derived from the row itself.

    Every returned record gets "variant" set to its "displayName" value and
    "variantType" set to "sigv".
    """

    def row_key(row: Dict) -> Tuple[str, ...]:
        # "sigv" prefix followed by the value of each key column
        return ("sigv", *(row[field] for field in SIGV_KEY))

    validated = validate_variant_rows(rows, SIGV_REQ, SIGV_OPTIONAL, row_key)

    signature_variants: List[IprSignatureVariant] = []
    for record in validated:
        sig_variant = cast(IprSignatureVariant, record)
        # Adding additional required properties
        sig_variant["variant"] = sig_variant["displayName"]
        sig_variant["variantType"] = "sigv"
        signature_variants.append(sig_variant)

    return signature_variants


def preprocess_cosmic(rows: Iterable[Dict]) -> Iterable[Dict]:
    """
    Process cosmic inputs into preformatted signature inputs
    Note: Cosmic and dMMR already evaluated against thresholds in gsc_report

    Rows lacking any of the SIGV_COSMIC columns are ignored. Each distinct
    signature name yields exactly one record, and records are returned in the
    order in which the names were first seen in the input.

    Args:
        rows: cosmic input rows; the signature name is read from the first
            SIGV_COSMIC column

    Returns:
        list of dicts with the "displayName", "signatureName" and
        "variantTypeName" keys
    """
    required_columns = set(SIGV_COSMIC)
    name_column = SIGV_COSMIC[0]

    # dict keys used as an insertion-ordered set, so that the output order
    # does not depend on string hashing
    signatures: Dict[str, None] = {}
    for row in rows:
        if required_columns.issubset(row.keys()):
            signatures[row[name_column]] = None

    return [
        {
            "displayName": f"{signature} {COSMIC_SIGNATURE_VARIANT_TYPE}",
            "signatureName": signature,
            "variantTypeName": COSMIC_SIGNATURE_VARIANT_TYPE,
        }
        for signature in signatures
    ]


def preprocess_hla(rows: Iterable[Dict]) -> Iterable[Dict]:
    """
    Process hla inputs into preformatted signature inputs

    Only the SIGV_HLA columns of each row are read. Every usable allele value
    produces two signature names: the value itself and the value truncated at
    its first ':' (e.g. 'HLA-A*02:01' and 'HLA-A*02'). Missing, blank or
    non-string values are skipped. Each distinct name yields exactly one
    record, in the order in which the names were first seen.

    Args:
        rows: hla input rows

    Returns:
        list of dicts with the "displayName", "signatureName" and
        "variantTypeName" keys
    """
    # dict keys used as an insertion-ordered set, so that the output order
    # does not depend on string hashing
    hla: Dict[str, None] = {}
    for row in rows:  # 1 row per sample; should be 3
        for column, allele in row.items():
            if column not in SIGV_HLA:
                continue
            if not isinstance(allele, str) or not allele.strip():
                # no allele reported in this column (None, NaN, blank);
                # nothing to turn into a signature name
                continue
            hla[f"HLA-{allele}"] = None  # 2nd level, e.g. 'HLA-A*02:01'
            hla[f"HLA-{allele.split(':')[0]}"] = None  # 1st level, e.g. 'HLA-A*02'

    return [
        {
            "displayName": f"{signature} {HLA_SIGNATURE_VARIANT_TYPE}",
            "signatureName": signature,
            "variantTypeName": HLA_SIGNATURE_VARIANT_TYPE,
        }
        for signature in hla
    ]


def check_variant_links(
small_mutations: List[IprSmallMutationVariant],
expression_variants: List[IprExprVariant],
Expand Down
Loading