Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 21: Use downloadable cBioPortal data #24

Merged
merged 5 commits into from
Mar 28, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ exclude =
source
outputs
evidence/version.py
build/*
inline-quotes = "
import-order-style = pep8
application-import-names =
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,4 @@ pyproject.toml

# data
evidence/data/*
evidence/dev/etl/data/*
187 changes: 122 additions & 65 deletions evidence/data_sources/cbioportal.py
Original file line number Diff line number Diff line change
@@ -1,86 +1,143 @@
"""Module for accessing python client for cBioPortal."""
from typing import Dict, Optional, Union
from typing import Optional
from pathlib import Path
import shutil
from os import remove
import csv

import requests
from bravado.client import SwaggerClient
from bravado.exception import HTTPNotFound
import boto3

from evidence import DATA_DIR_PATH, logger
from evidence.schemas import SourceMeta, Response, Sources


class CBioPortal:
"""cBioPortal class."""

def __init__(self, study_id: str = "msk_impact_2017",
api_docs_url: str = "https://www.cbioportal.org/api/api-docs") -> None:
"""Initialize cBioPortal class
def __init__(
self, data_url: str = "https://cbioportal-datahub.s3.amazonaws.com/msk_impact_2017.tar.gz", # noqa: E501
src_dir_path: Path = DATA_DIR_PATH / "cbioportal",
transformed_mutations_data_path: Optional[Path] = None,
transformed_case_lists_data_path: Optional[Path] = None,
ignore_transformed_data: bool = False
) -> None:
"""Initialize cbioportal class

:param str study_id: The id for the study to retrieve mutation data from
:param str api_docs_url: The url to api docs
:param str data_url: URL to data file
:param Path src_dir_path: Path to cbioportal data directory
:param Optional[Path] transformed_mutations_data_path: Path to transformed
cbioportal mutations file
:param Optional[Path] transformed_case_lists_data_path: Path to transformed
cbioportal case_lists file
:param bool ignore_transformed_data: `True` if only bare init is needed. This
is intended for developers when using the CLI to transform cbioportal data.
Ignores path set in `transformed_mutations_data_path` and
`transformed_case_lists_data_path`. `False` will load transformed
data from s3 and load transformed excel sheet data.
"""
self.api_docs_url = api_docs_url
self.cbioportal = SwaggerClient.from_url(
self.api_docs_url,
config={
"validate_requests": False,
"validate_responses": False,
"validate_swagger_spec": False
}
self.data_url = data_url
self.src_dir_path = src_dir_path
self.src_dir_path.mkdir(exist_ok=True, parents=True)
self.source_meta = SourceMeta(
label=Sources.CBIOPORTAL,
version="msk_impact_2017"
)
self.cbioportal_dir = dir(self.cbioportal)
for a in self.cbioportal_dir:
self.cbioportal.__setattr__(a.replace(" ", "_").lower(),
self.cbioportal.__getattr__(a))
self.study_id = study_id
self.source_meta = self.source_meta()

def source_meta(self) -> Optional[SourceMeta]:
"""Return source meta for cBioPortal"""
r = requests.get(self.api_docs_url)
if r.status_code == 200:
resp = r.json()
version = resp["info"]["version"]
return SourceMeta(
label=Sources.CBIOPORTAL,
version=version[:version.index(".", 2)]
)

def cancer_types_summary(self, gene_id: int) -> Response:

if not ignore_transformed_data:
if transformed_mutations_data_path:
if transformed_mutations_data_path.exists():
self.transformed_mutations_data_path = \
transformed_mutations_data_path
else:
logger.error(f"The supplied path at `transformed_mutations_data_"
f"path`, {transformed_mutations_data_path}, for "
f"cBioPortal does not exist.")
else:
self.get_transformed_data_path(is_mutations=True)

if not self.transformed_mutations_data_path:
raise FileNotFoundError(
"Unable to retrieve path for transformed cBioPortal mutations data")

if transformed_case_lists_data_path:
if transformed_case_lists_data_path.exists():
self.transformed_case_lists_data_path = \
transformed_case_lists_data_path
else:
logger.error(f"The supplied path at `transformed_case_lists_data_"
f"path`, {transformed_case_lists_data_path}, for "
f"cBioPortal does not exist.")
else:
self.get_transformed_data_path(is_mutations=False)

if not self.transformed_case_lists_data_path:
raise FileNotFoundError(
"Unable to retrieve path for transformed cBioPortal case_lists"
" data")

def get_transformed_data_path(self, is_mutations: bool = True) -> None:
    """Download MSK Impact 2017 mutations or case_lists data from the public s3
    bucket if it does not already exist in the data directory and set the
    corresponding data path attribute.

    :param bool is_mutations: `True` if getting the mutations data path.
        `False` if getting the case_lists data path.
    """
    data_type = "mutations" if is_mutations else "case_lists"
    zip_fn = "msk_impact_2017_mutations.csv.zip" if is_mutations else "msk_impact_2017_case_lists.csv.zip"  # noqa: E501
    # Unpacked file keeps the archive name minus the trailing ".zip"
    data_path = self.src_dir_path / zip_fn[:-4]
    # Only hit s3 when the transformed file is not already present locally,
    # matching the documented skip-if-exists contract
    if not data_path.exists():
        logger.info(f"Retrieving transformed {data_type} data from s3 bucket...")
        s3 = boto3.client("s3")
        zip_path = self.src_dir_path / zip_fn
        with open(zip_path, "wb") as f:
            s3.download_fileobj("vicc-normalizers",
                                f"evidence_normalization/cbioportal/{zip_fn}", f)
        shutil.unpack_archive(zip_path, self.src_dir_path)
        # The archive is no longer needed once the CSV has been extracted
        remove(zip_path)
        logger.info(f"Successfully downloaded transformed cBioPortal {data_type} data")
    if is_mutations:
        self.transformed_mutations_data_path = data_path
    else:
        self.transformed_case_lists_data_path = data_path

def cancer_types_summary(self, hgnc_symbol: str) -> Response:
"""Get cancer types with gene mutations data

:param int gene_id: Entrez ID for gene
:param str hgnc_symbol: HGNC symbol
:return: Cancer types summary for gene
"""
try:
self.cbioportal.genes.getGeneUsingGET(geneId=gene_id).result()
except HTTPNotFound:
hgnc_symbol = hgnc_symbol.upper()

mutation_sample_ids = set()
with open(self.transformed_mutations_data_path) as f:
data = csv.reader(f)
headers = next(data)
for row in data:
if row[headers.index("Hugo_Symbol")] == hgnc_symbol:
sample_id = row[headers.index("Tumor_Sample_Barcode")]
mutation_sample_ids.add(sample_id)

if not mutation_sample_ids:
return Response(data=dict(), source_meta_=self.source_meta)

mutations = self.cbioportal.mutations.getMutationsInMolecularProfileBySampleListIdUsingGET( # noqa: E501
molecularProfileId=f"{self.study_id}_mutations",
sampleListId=f"{self.study_id}_all",
entrezGeneId=gene_id,
projection="DETAILED"
).result()
mutations_sample_ids = {m.sampleId for m in mutations}

samples_lists = self.cbioportal.Sample_Lists.getAllSampleListsInStudyUsingGET(
studyId=self.study_id,
projection="DETAILED"
).result()

tumor_type_totals: Dict[str, Dict[str, Union[int, float]]] = dict()
for sample_list in samples_lists:
if ":" in sample_list.name:
tumor_type = sample_list.name.split(": ")[-1]
tumor_type_totals[tumor_type] = {
"count": 0,
"total": len(sample_list.sampleIds)
}
for sample_id in sample_list.sampleIds:
if sample_id in mutations_sample_ids:
tumor_type_totals[tumor_type]["count"] += 1
tumor_type_totals[tumor_type]["percent_altered"] = (tumor_type_totals[tumor_type]["count"] / tumor_type_totals[tumor_type]["total"]) * 100 # noqa: E501
tumor_type_totals = dict()
with open(self.transformed_case_lists_data_path) as f:
data = csv.reader(f)
headers = next(data)
for row in data:
case_list_name = row[headers.index("case_list_name")]
if ":" in case_list_name:
tumor_type = case_list_name.split(": ")[-1]
sample_ids = row[headers.index("case_list_ids")].split("\t")
tumor_type_totals[tumor_type] = {
"count": 0,
"total": len(sample_ids)
}
for sample_id in sample_ids:
if sample_id in mutation_sample_ids:
tumor_type_totals[tumor_type]["count"] += 1
tumor_type_totals[tumor_type]["percent_altered"] = (tumor_type_totals[tumor_type]["count"] / tumor_type_totals[tumor_type]["total"]) * 100 # noqa: E501

return Response(
data=tumor_type_totals,
source_meta_=self.source_meta
Expand Down
6 changes: 6 additions & 0 deletions evidence/dev/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,9 @@ Requires Variation Normalization to be configured. See the [README](https://gith
```commandline
python3 -m evidence.dev.cli --normalize_cancer_hotspots
```

### Transforming cBioPortal Data

```commandline
python3 -m evidence.dev.cli --transform_cbioportal
```
136 changes: 23 additions & 113 deletions evidence/dev/cli.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,9 @@
"""Dev CLI"""
from timeit import default_timer as timer
from datetime import datetime

import requests
import click
from variation.query import QueryHandler
import pandas as pd

from evidence import logger
from evidence.data_sources import CancerHotspots


def log_and_echo_msg(msg: str, log_level: str = "info") -> None:
"""Log and echo message

:param str msg: Message
:param str log_level: Logging level. Must be either info, warning, or error
"""
if log_level == "info":
logger.info(msg)
elif log_level == "warning":
logger.warning(msg)
else:
logger.error(msg)
click.echo(msg)
from evidence.dev.etl.cancer_hotspots import CancerHotspotsETL, \
CancerHotspotsETLException
from evidence.dev.etl.cbioportal import CBioPortalETL, CBioPortalETLException


@click.command()
Expand All @@ -33,102 +13,32 @@ def log_and_echo_msg(msg: str, log_level: str = "info") -> None:
default=False,
help="Normalize Cancer Hotspots data"
)
def cli(normalize_cancer_hotspots: bool) -> None:
@click.option(
"--transform_cbioportal",
is_flag=True,
default=False,
help="Transform cBioPortal data"
)
def cli(normalize_cancer_hotspots: bool, transform_cbioportal: bool) -> None:
"""Execute CLI methods

:param bool normalize_cancer_hotspots: Determines whether or not to normalize
Cancer Hotspots data
:param bool transform_cbioportal: Determines whether or not to transform cBioPortal
data
"""
if normalize_cancer_hotspots:
normalize_cancer_hotspots_data()


def normalize_cancer_hotspots_data() -> None:
"""Normalize Cancer Hotspots data"""

def download_data(ch: CancerHotspots) -> None:
"""Download Cancer Hotspots data.

:param CancerHotspots ch: Cancer Hotspots data source
"""
if not ch.data_path.exists():
log_and_echo_msg("Downloading Cancer Hotspots data...")
r = requests.get(ch.data_url)
if r.status_code == 200:
with open(ch.data_path, "wb") as f:
f.write(r.content)

def add_vrs_identifier_to_data(ch: CancerHotspots) -> None:
"""Normalize variations in cancer hotspots SNV sheet and adds `vrs_identifier`
column to dataframe. Run manually each time variation-normalizer
or Cancer Hotspots releases a new version.

:param CancerHotspots ch: Cancer Hotspots data source
"""
download_data(ch)
if not ch.data_path.exists():
log_and_echo_msg("Downloading Cancer Hotspots data was unsuccessful",
"warning")
return

snv_hotspots = pd.read_excel(ch.data_path,
sheet_name=ch.og_snv_sheet_name)
indel_hotspots = pd.read_excel(
ch.data_path, sheet_name=ch.og_indel_sheet_name)
variation_normalizer = QueryHandler()

log_and_echo_msg("Normalizing Cancer Hotspots data...")
start = timer()
get_normalized_data(
ch, snv_hotspots, variation_normalizer, ch.new_snv_sheet_name)
get_normalized_data(
ch, indel_hotspots, variation_normalizer, ch.new_indel_sheet_name)
end = timer()
log_and_echo_msg(f"Normalized Cancer Hotspots data in {(end-start):.5f} s")

today = datetime.strftime(datetime.today(), "%Y%m%d")
ch.normalized_data_path = \
ch.src_dir_path / f"normalized_hotspots_v{ch.source_meta.version}_{today}.xls" # noqa: E501
with pd.ExcelWriter(ch.normalized_data_path) as writer:
snv_hotspots.to_excel(
writer, sheet_name=ch.og_snv_sheet_name, index=False)
indel_hotspots.to_excel(
writer, sheet_name=ch.og_indel_sheet_name, index=False)
log_and_echo_msg(f"Successfully normalized Cancer Hotspots data. "
f"Normalized data can be found at: {ch.normalized_data_path}")

def get_normalized_data(ch: CancerHotspots, df: pd.DataFrame,
variation_normalizer: QueryHandler, df_name: str) -> None:
"""Normalize variant and add vrs_identifier column to df

:param CancerHotspots ch: Cancer Hotspots data source
:param pd.DataFrame df: Dataframe to normalize
:param QueryHandler variation_normalizer: Variation Normalizer handler
:param str df_name: Name of df.
Must be either `snv_hotspots` or `indel_hotspots`
"""
df["vrs_identifier"] = None
for i, row in df.iterrows():
if df_name == ch.new_snv_sheet_name:
variation = f"{row['Hugo_Symbol']} {row['ref']}{row['Amino_Acid_Position']}{row['Variant_Amino_Acid'].split(':')[0]}" # noqa: E501
else:
variation = f"{row['Hugo_Symbol']} {row['Variant_Amino_Acid'].split(':')[0]}" # noqa: E501
try:
norm_vd = variation_normalizer.normalize(variation)
except Exception as e:
logger.warning(f"variation-normalizer unable to normalize {variation}: {e}") # noqa: E501
else:
if norm_vd:
norm_vd = norm_vd.dict()
if norm_vd["variation"]["type"] != "Text":
df.at[i, "vrs_identifier"] = norm_vd["variation"]["id"]
else:
logger.warning(f"variation-normalizer unable to normalize: {variation}") # noqa: E501
else:
logger.warning(f"variation-normalizer unable to normalize: {variation}") # noqa: E501

cancer_hotspots = CancerHotspots(ignore_normalized_data=True)
add_vrs_identifier_to_data(cancer_hotspots)
c = CancerHotspotsETL()
try:
c.add_vrs_identifier_to_data()
except CancerHotspotsETLException as e:
click.echo(e)
if transform_cbioportal:
c = CBioPortalETL()
try:
c.transform_data()
except CBioPortalETLException as e:
click.echo(e)


if __name__ == "__main__":
Expand Down
5 changes: 5 additions & 0 deletions evidence/dev/etl/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Module for initializing ETL classes"""
from pathlib import Path

ETL_PATH = Path(__file__).resolve().parents[0]
ETL_DATA_DIR_PATH = ETL_PATH / "data"
Loading