broadinstitute · alxndrkalinin · May 15, 2023 · Dec 7, 2022 · Dec 8, 2022 · Dec 8, 2022
diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
@@ -0,0 +1,24 @@
+name: Code formatting  # CI gate: fails when a file is not Black-formatted
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  format:
+    name: Black formatting
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4  # release tag, not a moving branch
+    - name: Setup Python
+      uses: actions/setup-python@v5  # release tag, not a moving branch
+      with:
+        python-version: "3.9"  # quoted: a bare 3.10 would be read as float 3.1
+    - name: Update pip
+      run: python -m pip install --upgrade pip
+    - name: Install Black
+      run: pip install "black[jupyter]==22.12.0"  # same release as the pre-commit rev
+    - name: Run Black
+      run: black --config=black.toml --check .  # --check reports only, never rewrites
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@
 *.parquet
 rsconnect
 */__pycache__/*
+data
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,12 @@
+repos:  # hook repositories used by pre-commit
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+    -   id: trailing-whitespace
+    -   id: end-of-file-fixer
+    -   id: requirements-txt-fixer
+-   repo: https://github.com/psf/black
+    rev: 22.12.0  # keep in sync with the Black version pinned in the CI workflow
+    hooks:
+    -   id: black-jupyter  # also formats .ipynb, matching the black[jupyter] CI check
+        args: [--config=black.toml]
diff --git a/0.download-data/0.download-ceres.ipynb b/0.download-data/0.download-ceres.ipynb
@@ -35,14 +35,8 @@
     "figshare_base_url = \"https://ndownloader.figshare.com/files/\"\n",
     "\n",
     "file_info = {\n",
-    "    \"ceres\": {\n",
-    "        \"file_id\": \"24613292\",\n",
-    "        \"output_file\": \"ceres.csv\"\n",
-    "    },\n",
-    "    \"sample_id\": {\n",
-    "        \"file_id\": \"24613394\",\n",
-    "        \"output_file\": \"depmap_sample_info.csv\"\n",
-    "    }\n",
+    "    \"ceres\": {\"file_id\": \"24613292\", \"output_file\": \"ceres.csv\"},\n",
+    "    \"sample_id\": {\"file_id\": \"24613394\", \"output_file\": \"depmap_sample_info.csv\"},\n",
     "}\n",
     "\n",
     "output_dir = pathlib.Path(\"data\")\n",
@@ -61,7 +55,7 @@
     "\n",
     "    download_url = f\"{figshare_base_url}/{file_id}\"\n",
     "    output_file = pathlib.Path(f\"{output_dir}/{output_file}\")\n",
-    "    \n",
+    "\n",
     "    urlretrieve(download_url, output_file)"
    ]
   },

diff --git a/0.download-data/1.download-perturbseq.ipynb b/0.download-data/1.download-perturbseq.ipynb
@@ -34,7 +34,7 @@
    "source": [
     "def download_file(file, base_url, output_dir):\n",
     "    download_url = f\"{base_url}/{file}\"\n",
-    "    print(f\"Now downloading {download_url}...\")\n",
+    "    print(f\"Now downloading {download_url}\")\n",
     "    output_file = pathlib.Path(f\"{output_dir}/{file}\")\n",
     "\n",
     "    urlretrieve(download_url, output_file)"
@@ -72,20 +72,20 @@
     "files = {\n",
     "    \"barcodes\": {\n",
     "        \"download\": f\"{gse_id}_10X_barcodes.tsv.gz\",\n",
-    "        \"rename\": pathlib.Path(f\"{gse_id}/barcodes.tsv\")\n",
+    "        \"rename\": pathlib.Path(f\"{gse_id}/barcodes.tsv\"),\n",
     "    },\n",
     "    \"data\": {\n",
     "        \"download\": f\"{gse_id}_10X_matrix.mtx.gz\",\n",
-    "        \"rename\": pathlib.Path(f\"{gse_id}/matrix.mtx\")\n",
+    "        \"rename\": pathlib.Path(f\"{gse_id}/matrix.mtx\"),\n",
     "    },\n",
     "    \"genes\": {\n",
     "        \"download\": f\"{gse_id}_10X_genes.tsv.gz\",\n",
-    "        \"rename\": pathlib.Path(f\"{gse_id}/genes.tsv\")\n",
+    "        \"rename\": pathlib.Path(f\"{gse_id}/genes.tsv\"),\n",
     "    },\n",
     "    \"other\": {\n",
     "        f\"{gse_id}_cell_identities.csv.gz\",\n",
     "        f\"{gse_id}_sgRNA_barcode_sequences_and_phenotypes.csv.gz\",\n",
-    "    }\n",
+    "    },\n",
     "}\n",
     "\n",
     "files"
@@ -103,8 +103,8 @@
       "Now downloading https://ftp.ncbi.nlm.nih.gov/geo/series/GSE132nnn/GSE132080/suppl//GSE132080_10X_barcodes.tsv.gz...\n",
       "Now downloading https://ftp.ncbi.nlm.nih.gov/geo/series/GSE132nnn/GSE132080/suppl//GSE132080_10X_matrix.mtx.gz...\n",
       "Now downloading https://ftp.ncbi.nlm.nih.gov/geo/series/GSE132nnn/GSE132080/suppl//GSE132080_10X_genes.tsv.gz...\n",
-      "Now downloading https://ftp.ncbi.nlm.nih.gov/geo/series/GSE132nnn/GSE132080/suppl//GSE132080_cell_identities.csv.gz...\n",
-      "Now downloading https://ftp.ncbi.nlm.nih.gov/geo/series/GSE132nnn/GSE132080/suppl//GSE132080_sgRNA_barcode_sequences_and_phenotypes.csv.gz...\n"
+      "Now downloading https://ftp.ncbi.nlm.nih.gov/geo/series/GSE132nnn/GSE132080/suppl//GSE132080_sgRNA_barcode_sequences_and_phenotypes.csv.gz...\n",
+      "Now downloading https://ftp.ncbi.nlm.nih.gov/geo/series/GSE132nnn/GSE132080/suppl//GSE132080_cell_identities.csv.gz...\n"
      ]
     }
    ],
@@ -138,20 +138,57 @@
     "    if data_type != \"other\":\n",
     "        file = files[data_type][\"download\"]\n",
     "        file = pathlib.Path(f\"{output_dir}/{file}\")\n",
-    "        \n",
-    "        rename_and_gunzip_file = pathlib.Path(f\"{output_dir}/{files[data_type]['rename']}\")\n",
+    "\n",
+    "        rename_and_gunzip_file = pathlib.Path(\n",
+    "            f\"{output_dir}/{files[data_type]['rename']}\"\n",
+    "        )\n",
     "        rename_and_gunzip_file.parent.mkdir(exist_ok=True)\n",
-    "        \n",
-    "        print(f\"Now extracting {file} to {rename_and_gunzip_file}...\")\n",
+    "\n",
+    "        print(f\"Now extracting {file} to {rename_and_gunzip_file}\")\n",
     "        with gzip.open(file, \"rb\") as f_in:\n",
     "            with open(rename_and_gunzip_file, \"wb\") as f_out:\n",
     "                shutil.copyfileobj(f_in, f_out)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Now downloading https://static-content.springer.com/esm/art%3A10.1038%2Fs41587-019-0387-5/MediaObjects/41587_2019_387_MOESM3_ESM.zip...\n"
+     ]
+    }
+   ],
+   "source": [
+    "paper_supplement_base_url = \"https://static-content.springer.com/esm/art%3A10.1038%2Fs41587-019-0387-5/MediaObjects\"\n",
+    "paper_supplement_filename = \"41587_2019_387_MOESM3_ESM.zip\"\n",
+    "paper_supplement_dir = \"paper_supplement\"\n",
+    "\n",
+    "download_file(paper_supplement_filename, paper_supplement_base_url, output_dir)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\n",
+    "    f\"Now extracting {paper_supplement_filename} to {output_dir / paper_supplement_dir}\"\n",
+    ")\n",
+    "shutil.unpack_archive(\n",
+    "    output_dir / paper_supplement_filename, output_dir / paper_supplement_dir\n",
+    ")"
+   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -165,7 +202,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.11"
+   "version": "3.10.8"
   }
  },
  "nbformat": 4,

diff --git a/0.download-data/2.process-perturbseq.ipynb b/0.download-data/2.process-perturbseq.ipynb
diff --git a/0.download-data/3.finalize-perturbseq.ipynb b/0.download-data/3.finalize-perturbseq.ipynb
@@ -42,7 +42,9 @@
     "perturbseq_data_dir = pathlib.Path(\"data/perturbseq/\")\n",
     "\n",
     "output_file = pathlib.Path(f\"{perturbseq_data_dir}/{gse_id}_final_analytical.tsv.gz\")\n",
-    "output_bulk_file = pathlib.Path(f\"{perturbseq_data_dir}/{gse_id}_bulk_final_analytical.tsv.gz\")"
+    "output_bulk_file = pathlib.Path(\n",
+    "    f\"{perturbseq_data_dir}/{gse_id}_bulk_final_analytical.tsv.gz\"\n",
+    ")"
    ]
   },
   {
@@ -279,7 +281,9 @@
     "gene_features = gene_exp_df.columns.tolist()\n",
     "gene_features.remove(\"Metadata_barcode\")\n",
     "\n",
-    "gene_exp_df = gene_exp_df.assign(Metadata_sequence=[x.split(\"-\")[0] for x in gene_exp_df.Metadata_barcode])\n",
+    "gene_exp_df = gene_exp_df.assign(\n",
+    "    Metadata_sequence=[x.split(\"-\")[0] for x in gene_exp_df.Metadata_barcode]\n",
+    ")\n",
     "gene_exp_df.columns.name = \"\"\n",
     "\n",
     "meta_features = [\"Metadata_barcode\", \"Metadata_sequence\"]\n",
@@ -441,7 +445,11 @@
     "cell_id_df = pd.read_csv(identity_file, sep=\",\")\n",
     "\n",
     "cell_id_df.columns = [f\"Metadata_{x}\" for x in cell_id_df.columns]\n",
-    "cell_id_df = cell_id_df.assign(Metadata_gene_identity=[str(x).split(\"_\")[0] for x in cell_id_df.Metadata_guide_identity])\n",
+    "cell_id_df = cell_id_df.assign(\n",
+    "    Metadata_gene_identity=[\n",
+    "        str(x).split(\"_\")[0] for x in cell_id_df.Metadata_guide_identity\n",
+    "    ]\n",
+    ")\n",
     "\n",
     "print(cell_id_df.shape)\n",
     "cell_id_df.head()"
@@ -686,7 +694,7 @@
     "    gene_exp_df,\n",
     "    how=\"right\",\n",
     "    right_on=\"Metadata_barcode\",\n",
-    "    left_on=\"Metadata_cell_barcode\"\n",
+    "    left_on=\"Metadata_cell_barcode\",\n",
     ")\n",
     "\n",
     "sc_df = sc_df.reset_index().rename({\"index\": \"Metadata_cell_identity\"}, axis=\"columns\")\n",
@@ -704,10 +712,7 @@
    "source": [
     "# Write the file to disk\n",
     "sc_df.to_csv(\n",
-    "    output_file,\n",
-    "    index=False,\n",
-    "    sep=\"\\t\",\n",
-    "    compression={\"method\": \"gzip\", \"mtime\": 1}\n",
+    "    output_file, index=False, sep=\"\\t\", compression={\"method\": \"gzip\", \"mtime\": 1}\n",
     ")"
    ]
   },
@@ -943,17 +948,21 @@
     "    population_df=sc_df,\n",
     "    strata=[\"Metadata_guide_identity\"],\n",
     "    features=gene_features,\n",
-    "    operation=\"median\"\n",
+    "    operation=\"median\",\n",
     ")\n",
     "\n",
+    "# remove the one row with a NaN Metadata_guide_identity; the x.split call below would fail on it\n",
+    "bulk_df = bulk_df[~bulk_df[\"Metadata_guide_identity\"].isnull()]\n",
+    "\n",
     "# create a column for the gene\n",
-    "bulk_df = (\n",
-    "    bulk_df\n",
-    "    .assign(Metadata_gene_identity=[x.split(\"_\")[0] for x in bulk_df.Metadata_guide_identity])\n",
-    "    .query(\"Metadata_gene_identity != '*'\")\n",
-    ")\n",
+    "bulk_df = bulk_df.assign(\n",
+    "    Metadata_gene_identity=[x.split(\"_\")[0] for x in bulk_df.Metadata_guide_identity]\n",
+    ").query(\"Metadata_gene_identity != '*'\")\n",
     "\n",
-    "bulk_df = bulk_df.reindex([\"Metadata_guide_identity\", \"Metadata_gene_identity\"] + gene_features, axis=\"columns\")\n",
+    "bulk_df = bulk_df.reindex(\n",
+    "    [\"Metadata_guide_identity\", \"Metadata_gene_identity\"] + gene_features,\n",
+    "    axis=\"columns\",\n",
+    ")\n",
     "\n",
     "print(bulk_df.shape)\n",
     "bulk_df.head()"
@@ -967,19 +976,16 @@
    "source": [
     "# Write the file to disk\n",
     "bulk_df.to_csv(\n",
-    "    output_bulk_file,\n",
-    "    index=False,\n",
-    "    sep=\"\\t\",\n",
-    "    compression={\"method\": \"gzip\", \"mtime\": 1}\n",
+    "    output_bulk_file, index=False, sep=\"\\t\", compression={\"method\": \"gzip\", \"mtime\": 1}\n",
     ")"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:grit-benchmark] *",
+   "display_name": "grit-benchmark",
    "language": "python",
-   "name": "conda-env-grit-benchmark-py"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -991,7 +997,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.6"
+   "version": "3.10.8"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "5263a8d172dade9e46e5af2db82be6add0b3c649f4dab478dd2c42518a050092"
+   }
   }
  },
  "nbformat": 4,