Merge pull request #24 from cancervariants/issue-21-cbioportal

Issue 21: Use downloadable cBioPortal data
cancervariants · Mar 28, 2022 · d279388 · d279388
2 parents 085cfee + 18a8d9c
commit d279388
Show file tree

Hide file tree

Showing 9 changed files with 389 additions and 197 deletions.
diff --git a/.flake8 b/.flake8
@@ -8,6 +8,7 @@ exclude =
   source
   outputs
   evidence/version.py
+  build/*
 inline-quotes = "
 import-order-style = pep8
 application-import-names =

diff --git a/.gitignore b/.gitignore
@@ -132,3 +132,4 @@ pyproject.toml
 
 # data
 evidence/data/*
+evidence/dev/etl/data/*
diff --git a/evidence/data_sources/cbioportal.py b/evidence/data_sources/cbioportal.py
@@ -1,89 +1,143 @@
 """Module for accessing python client for cBioPortal."""
-from typing import Dict, Optional, Union
+from typing import Optional
+from pathlib import Path
+import shutil
+from os import remove
+import csv
 
-import requests
-from bravado.client import SwaggerClient
-from bravado.exception import HTTPNotFound
+import boto3
 
+from evidence import DATA_DIR_PATH, logger
 from evidence.data_sources.base import DataSource
 from evidence.schemas import SourceMeta, Response, Sources
 
 
 class CBioPortal(DataSource):
     """cBioPortal class."""
 
-    def __init__(self, study_id: str = "msk_impact_2017",
-                 api_docs_url: str = "https://www.cbioportal.org/api/api-docs") -> None:
-        """Initialize cBioPortal class
+    def __init__(
+        self, data_url: str = "https://cbioportal-datahub.s3.amazonaws.com/msk_impact_2017.tar.gz",  # noqa: E501
+        src_dir_path: Path = DATA_DIR_PATH / "cbioportal",
+        transformed_mutations_data_path: Optional[Path] = None,
+        transformed_case_lists_data_path: Optional[Path] = None,
+        ignore_transformed_data: bool = False
+    ) -> None:
+        """Initialize cbioportal class
 
-        :param str study_id: The id for the study to retrieve mutation data from
-        :param str api_docs_url: The url to api docs
+        :param str data_url: URL to data file
+        :param Path src_dir_path: Path to cbioportal data directory
+        :param Optional[Path] transformed_mutations_data_path: Path to transformed
+            cbioportal mutations file
+        :param Optional[Path] transformed_case_lists_data_path: Path to transformed
+            cbioportal case_lists file
+        :param bool ignore_transformed_data: `True` if only bare init is needed. This
+            is intended for developers when using the CLI to transform cbioportal data.
+            Ignores path set in `transformed_mutations_data_path` and
+            `transformed_case_lists_data_path`. `False` will load transformed
+            data from s3 and load transformed excel sheet data.
         """
-        self.api_docs_url = api_docs_url
-        self.cbioportal = SwaggerClient.from_url(
-            self.api_docs_url,
-            config={
-                "validate_requests": False,
-                "validate_responses": False,
-                "validate_swagger_spec": False
-            }
+        self.data_url = data_url
+        self.src_dir_path = src_dir_path
+        self.src_dir_path.mkdir(exist_ok=True, parents=True)
+        self.source_meta = SourceMeta(
+            label=Sources.CBIOPORTAL,
+            version="msk_impact_2017"
         )
-        self.cbioportal_dir = dir(self.cbioportal)
-        for a in self.cbioportal_dir:
-            self.cbioportal.__setattr__(a.replace(" ", "_").lower(),
-                                        self.cbioportal.__getattr__(a))
-        self.study_id = study_id
-        self.source_meta = self.source_meta()
-
-    def source_meta(self) -> Optional[SourceMeta]:
-        """Return source meta for cBioPortal"""
-        r = requests.get(self.api_docs_url)
-        if r.status_code == 200:
-            resp = r.json()
-            version = resp["info"]["version"]
-            return SourceMeta(
-                label=Sources.CBIOPORTAL,
-                version=version[:version.index(".", 2)]
-            )
-
-    def cancer_types_summary(self, gene_id: int) -> Response:
+
+        if not ignore_transformed_data:
+            if transformed_mutations_data_path:
+                if transformed_mutations_data_path.exists():
+                    self.transformed_mutations_data_path = \
+                        transformed_mutations_data_path
+                else:
+                    logger.error(f"The supplied path at `transformed_mutations_data_"
+                                 f"path`, {transformed_mutations_data_path}, for "
+                                 f"cBioPortal does not exist.")
+            else:
+                self.get_transformed_data_path(is_mutations=True)
+
+            if not self.transformed_mutations_data_path:
+                raise FileNotFoundError(
+                    "Unable to retrieve path for transformed cBioPortal mutations data")
+
+            if transformed_case_lists_data_path:
+                if transformed_case_lists_data_path.exists():
+                    self.transformed_case_lists_data_path = \
+                        transformed_case_lists_data_path
+                else:
+                    logger.error(f"The supplied path at `transformed_case_lists_data_"
+                                 f"path`, {transformed_case_lists_data_path}, for "
+                                 f"cBioPortal does not exist.")
+            else:
+                self.get_transformed_data_path(is_mutations=False)
+
+            if not self.transformed_case_lists_data_path:
+                raise FileNotFoundError(
+                    "Unable to retrieve path for transformed cBioPortal case_lists"
+                    " data")
+
+    def get_transformed_data_path(self, is_mutations: bool = True) -> None:
+        """Download MSK Impact 2017 mutation and case_lists data from public s3 bucket
+        if it does not already exist in data directory and set the corresponding
+        data path
+
+        :param bool is_mutations: `True` if getting mutations data path. `False` if
+            getting case_lists data path
+        """
+        data_type = "mutations" if is_mutations else "case_lists"
+        logger.info(f"Retrieving transformed {data_type} data from s3 bucket...")
+        s3 = boto3.client("s3")
+        zip_fn = "msk_impact_2017_mutations.csv.zip" if is_mutations else "msk_impact_2017_case_lists.csv.zip"  # noqa: E501
+        zip_path = self.src_dir_path / zip_fn
+        with open(zip_path, "wb") as f:
+            s3.download_fileobj("vicc-normalizers",
+                                f"evidence_normalization/cbioportal/{zip_fn}", f)
+        shutil.unpack_archive(zip_path, self.src_dir_path)
+        remove(zip_path)
+        logger.info(f"Successfully downloaded transformed cBioPortal {data_type} data")
+        data_path = self.src_dir_path / zip_fn[:-4]
+        if is_mutations:
+            self.transformed_mutations_data_path = data_path
+        else:
+            self.transformed_case_lists_data_path = data_path
+
+    def cancer_types_summary(self, hgnc_symbol: str) -> Response:
         """Get cancer types with gene mutations data
 
-        :param int gene_id: Entrez ID for gene
+        :param str hgnc_symbol: HGNC symbol
         :return: Cancer types summary for gene
         """
-        try:
-            self.cbioportal.genes.getGeneUsingGET(geneId=gene_id).result()
-        except HTTPNotFound:
-            return self.format_response(
-                Response(data=dict(), source_meta_=self.source_meta)
-            )
-
-        mutations = self.cbioportal.mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(  # noqa: E501
-            molecularProfileId=f"{self.study_id}_mutations",
-            sampleListId=f"{self.study_id}_all",
-            entrezGeneId=gene_id,
-            projection="DETAILED"
-        ).result()
-        mutations_sample_ids = {m.sampleId for m in mutations}
-
-        samples_lists = self.cbioportal.Sample_Lists.getAllSampleListsInStudyUsingGET(
-            studyId=self.study_id,
-            projection="DETAILED"
-        ).result()
-
-        tumor_type_totals: Dict[str, Dict[str, Union[int, float]]] = dict()
-        for sample_list in samples_lists:
-            if ":" in sample_list.name:
-                tumor_type = sample_list.name.split(": ")[-1]
-                tumor_type_totals[tumor_type] = {
-                    "count": 0,
-                    "total": len(sample_list.sampleIds)
-                }
-                for sample_id in sample_list.sampleIds:
-                    if sample_id in mutations_sample_ids:
-                        tumor_type_totals[tumor_type]["count"] += 1
-                tumor_type_totals[tumor_type]["percent_altered"] = (tumor_type_totals[tumor_type]["count"] / tumor_type_totals[tumor_type]["total"]) * 100  # noqa: E501
-        return self.format_response(
-            Response(data=tumor_type_totals, source_meta_=self.source_meta)
-        )
+        hgnc_symbol = hgnc_symbol.upper()
+
+        mutation_sample_ids = set()
+        with open(self.transformed_mutations_data_path) as f:
+            data = csv.reader(f)
+            headers = next(data)
+            for row in data:
+                if row[headers.index("Hugo_Symbol")] == hgnc_symbol:
+                    sample_id = row[headers.index("Tumor_Sample_Barcode")]
+                    mutation_sample_ids.add(sample_id)
+
+        if not mutation_sample_ids:
+            return self.format_response(Response(data=dict(),
+                                                 source_meta_=self.source_meta))
+
+        tumor_type_totals = dict()
+        with open(self.transformed_case_lists_data_path) as f:
+            data = csv.reader(f)
+            headers = next(data)
+            for row in data:
+                case_list_name = row[headers.index("case_list_name")]
+                if ":" in case_list_name:
+                    tumor_type = case_list_name.split(": ")[-1]
+                    sample_ids = row[headers.index("case_list_ids")].split("\t")
+                    tumor_type_totals[tumor_type] = {
+                        "count": 0,
+                        "total": len(sample_ids)
+                    }
+                    for sample_id in sample_ids:
+                        if sample_id in mutation_sample_ids:
+                            tumor_type_totals[tumor_type]["count"] += 1
+                    tumor_type_totals[tumor_type]["percent_altered"] = (tumor_type_totals[tumor_type]["count"] / tumor_type_totals[tumor_type]["total"]) * 100  # noqa: E501
+        return self.format_response(Response(data=tumor_type_totals,
+                                             source_meta_=self.source_meta))
diff --git a/evidence/dev/README.md b/evidence/dev/README.md
@@ -10,3 +10,9 @@ Requires Variation Normalization to be configured. See the [README](https://gith
 ```commandline
 python3 -m evidence.dev.cli --normalize_cancer_hotspots
 ```
+
+### Transforming cBioPortal Data
+
+```commandline
+python3 -m evidence.dev.cli --transform_cbioportal
+```
diff --git a/evidence/dev/cli.py b/evidence/dev/cli.py
@@ -1,29 +1,9 @@
 """Dev CLI"""
-from timeit import default_timer as timer
-from datetime import datetime
-
-import requests
 import click
-from variation.query import QueryHandler
-import pandas as pd
-
-from evidence import logger
-from evidence.data_sources import CancerHotspots
-
 
-def log_and_echo_msg(msg: str, log_level: str = "info") -> None:
-    """Log and echo message
-
-    :param str msg: Message
-    :param str log_level: Logging level. Must be either info, warning, or error
-    """
-    if log_level == "info":
-        logger.info(msg)
-    elif log_level == "warning":
-        logger.warning(msg)
-    else:
-        logger.error(msg)
-    click.echo(msg)
+from evidence.dev.etl.cancer_hotspots import CancerHotspotsETL, \
+    CancerHotspotsETLException
+from evidence.dev.etl.cbioportal import CBioPortalETL, CBioPortalETLException
 
 
 @click.command()
@@ -33,102 +13,32 @@ def log_and_echo_msg(msg: str, log_level: str = "info") -> None:
     default=False,
     help="Normalize Cancer Hotspots data"
 )
-def cli(normalize_cancer_hotspots: bool) -> None:
+@click.option(
+    "--transform_cbioportal",
+    is_flag=True,
+    default=False,
+    help="Transform cBioPortal data"
+)
+def cli(normalize_cancer_hotspots: bool, transform_cbioportal: bool) -> None:
     """Execute CLI methods
 
     :param bool normalize_cancer_hotspots: Determines whether or not to normalize
         Cancer Hotspots data
+    :param bool transform_cbioportal: Determines whether or not to transform cBioPortal
+        data
     """
     if normalize_cancer_hotspots:
-        normalize_cancer_hotspots_data()
-
-
-def normalize_cancer_hotspots_data() -> None:
-    """Normalize Cancer Hotspots data"""
-
-    def download_data(ch: CancerHotspots) -> None:
-        """Download Cancer Hotspots data.
-
-        :param CancerHotspots ch: Cancer Hotspots data source
-        """
-        if not ch.data_path.exists():
-            log_and_echo_msg("Downloading Cancer Hotspots data...")
-            r = requests.get(ch.data_url)
-            if r.status_code == 200:
-                with open(ch.data_path, "wb") as f:
-                    f.write(r.content)
-
-    def add_vrs_identifier_to_data(ch: CancerHotspots) -> None:
-        """Normalize variations in cancer hotspots SNV sheet and adds `vrs_identifier`
-        column to dataframe. Run manually each time variation-normalizer
-        or Cancer Hotspots releases a new version.
-
-        :param CancerHotspots ch: Cancer Hotspots data source
-        """
-        download_data(ch)
-        if not ch.data_path.exists():
-            log_and_echo_msg("Downloading Cancer Hotspots data was unsuccessful",
-                             "warning")
-            return
-
-        snv_hotspots = pd.read_excel(ch.data_path,
-                                     sheet_name=ch.og_snv_sheet_name)
-        indel_hotspots = pd.read_excel(
-            ch.data_path, sheet_name=ch.og_indel_sheet_name)
-        variation_normalizer = QueryHandler()
-
-        log_and_echo_msg("Normalizing Cancer Hotspots data...")
-        start = timer()
-        get_normalized_data(
-            ch, snv_hotspots, variation_normalizer, ch.new_snv_sheet_name)
-        get_normalized_data(
-            ch, indel_hotspots, variation_normalizer, ch.new_indel_sheet_name)
-        end = timer()
-        log_and_echo_msg(f"Normalized Cancer Hotspots data in {(end-start):.5f} s")
-
-        today = datetime.strftime(datetime.today(), "%Y%m%d")
-        ch.normalized_data_path = \
-            ch.src_dir_path / f"normalized_hotspots_v{ch.source_meta.version}_{today}.xls"  # noqa: E501
-        with pd.ExcelWriter(ch.normalized_data_path) as writer:
-            snv_hotspots.to_excel(
-                writer, sheet_name=ch.og_snv_sheet_name, index=False)
-            indel_hotspots.to_excel(
-                writer, sheet_name=ch.og_indel_sheet_name, index=False)
-        log_and_echo_msg(f"Successfully normalized Cancer Hotspots data. "
-                         f"Normalized data can be found at: {ch.normalized_data_path}")
-
-    def get_normalized_data(ch: CancerHotspots, df: pd.DataFrame,
-                            variation_normalizer: QueryHandler, df_name: str) -> None:
-        """Normalize variant and add vrs_identifier column to df
-
-        :param CancerHotspots ch: Cancer Hotspots data source
-        :param pd.DataFrame df: Dataframe to normalize
-        :param QueryHandler variation_normalizer: Variation Normalizer handler
-        :param str df_name: Name of df.
-            Must be either `snv_hotspots` or `indel_hotspots`
-        """
-        df["vrs_identifier"] = None
-        for i, row in df.iterrows():
-            if df_name == ch.new_snv_sheet_name:
-                variation = f"{row['Hugo_Symbol']} {row['ref']}{row['Amino_Acid_Position']}{row['Variant_Amino_Acid'].split(':')[0]}"  # noqa: E501
-            else:
-                variation = f"{row['Hugo_Symbol']} {row['Variant_Amino_Acid'].split(':')[0]}"  # noqa: E501
-            try:
-                norm_vd = variation_normalizer.normalize(variation)
-            except Exception as e:
-                logger.warning(f"variation-normalizer unable to normalize {variation}: {e}")  # noqa: E501
-            else:
-                if norm_vd:
-                    norm_vd = norm_vd.dict()
-                    if norm_vd["variation"]["type"] != "Text":
-                        df.at[i, "vrs_identifier"] = norm_vd["variation"]["id"]
-                    else:
-                        logger.warning(f"variation-normalizer unable to normalize: {variation}")  # noqa: E501
-                else:
-                    logger.warning(f"variation-normalizer unable to normalize: {variation}")  # noqa: E501
-
-    cancer_hotspots = CancerHotspots(ignore_normalized_data=True)
-    add_vrs_identifier_to_data(cancer_hotspots)
+        c = CancerHotspotsETL()
+        try:
+            c.add_vrs_identifier_to_data()
+        except CancerHotspotsETLException as e:
+            click.echo(e)
+    if transform_cbioportal:
+        c = CBioPortalETL()
+        try:
+            c.transform_data()
+        except CBioPortalETLException as e:
+            click.echo(e)
 
 
 if __name__ == "__main__":
Original file line number	Diff line number	Diff line change
Expand Up		@@ -132,3 +132,4 @@ pyproject.toml

		# data
		evidence/data/*
		evidence/dev/etl/data/*