MannLabs · mschwoer · Nov 22, 2024 · Nov 22, 2024 · Nov 22, 2024 · Nov 25, 2024
diff --git a/alphabase/anndata/__init__.py b/alphabase/anndata/__init__.py
diff --git a/alphabase/anndata/anndata_factory.py b/alphabase/anndata/anndata_factory.py
@@ -0,0 +1,123 @@
+"""Factory class to convert PSM DataFrames to AnnData format."""
+
+import warnings
+from typing import List, Optional, Union
+
+import anndata as ad
+import numpy as np
+import pandas as pd
+
+from alphabase.psm_reader import PSMReaderBase  # noqa: TCH001
+from alphabase.psm_reader.keys import PsmDfCols
+
+
+class AnnDataFactory:
+    """Factory class to convert AlphaBase PSM DataFrames to AnnData format.
+
+    The factory stores a PSM DataFrame and, on `create_anndata`, pivots it into a
+    (raw name x proteins) matrix of intensity values.
+    """
+
+    def __init__(self, psm_df: pd.DataFrame):
+        """Initialize AnnDataFactory.
+
+        Parameters
+        ----------
+        psm_df : pd.DataFrame
+            AlphaBase PSM DataFrame containing at minimum the columns:
+            - PsmDfCols.RAW_NAME
+            - PsmDfCols.PROTEINS
+            - PsmDfCols.INTENSITY
+
+        Raises
+        ------
+        ValueError
+            If any of the required columns is missing from `psm_df`.
+
+        Warns
+        -----
+        UserWarning
+            If a (raw name, proteins) combination occurs more than once. Only the
+            first intensity of each combination is used by `create_anndata`.
+
+        """
+        required_cols = [PsmDfCols.RAW_NAME, PsmDfCols.PROTEINS, PsmDfCols.INTENSITY]
+        missing_cols = [col for col in required_cols if col not in psm_df.columns]
+        if missing_cols:
+            raise ValueError(f"Missing required columns: {missing_cols}")
+
+        self._psm_df = psm_df
+
+        # Duplicates are counted per (raw name, proteins) pair, i.e. per cell of the
+        # pivot table built in `create_anndata`. Checking the proteins column alone
+        # would flag every protein that was simply measured in more than one raw file.
+        num_duplicated = int(
+            self._psm_df.duplicated(
+                subset=[PsmDfCols.RAW_NAME, PsmDfCols.PROTEINS]
+            ).sum()
+        )
+        if num_duplicated > 0:
+            warnings.warn(
+                f"Found {num_duplicated} duplicated protein groups. Using only first.",
+                stacklevel=2,  # attribute the warning to the code constructing the factory
+            )
+
+    def create_anndata(self) -> ad.AnnData:
+        """Create AnnData object from PSM DataFrame.
+
+        Returns
+        -------
+        ad.AnnData
+            AnnData object where:
+            - obs (rows) are raw names
+            - var (columns) are proteins
+            - X contains intensity values (NaN where a raw name / protein
+              combination has no value)
+
+        """
+        # Pivot to raw names x proteins. If a combination occurs several times,
+        # `aggfunc="first"` keeps the first intensity only.
+        intensity_matrix = pd.pivot_table(
+            self._psm_df,
+            index=PsmDfCols.RAW_NAME,
+            columns=PsmDfCols.PROTEINS,
+            values=PsmDfCols.INTENSITY,
+            aggfunc="first",
+            fill_value=np.nan,
+            dropna=False,
+        )
+
+        # obs/var carry only the index labels; no further annotations are attached.
+        return ad.AnnData(
+            X=intensity_matrix.to_numpy(),
+            obs=pd.DataFrame(index=intensity_matrix.index),
+            var=pd.DataFrame(index=intensity_matrix.columns),
+        )
+
+    @classmethod
+    def from_files(
+        cls,
+        file_paths: Union[str, List[str]],
+        reader_type: str = "maxquant",
+        *,
+        intensity_column: Optional[str] = None,
+        protein_id_column: Optional[str] = None,
+        raw_name_column: Optional[str] = None,
+        **kwargs,
+    ) -> "AnnDataFactory":
+        """Create AnnDataFactory from PSM files.
+
+        Parameters
+        ----------
+        file_paths : Union[str, List[str]]
+            Path(s) to PSM file(s)
+        reader_type : str, optional
+            Type of PSM reader to use, by default "maxquant"
+        intensity_column: str, optional
+            Name of the column storing intensity data. Default is taken from `psm_reader.yaml`
+        protein_id_column: str, optional
+            Name of the column storing protein ids. Default is taken from `psm_reader.yaml`
+        raw_name_column: str, optional
+            Name of the column storing raw (or run) name. Default is taken from `psm_reader.yaml`
+        **kwargs
+            Additional arguments passed to PSM reader
+
+        Returns
+        -------
+        AnnDataFactory
+            Initialized AnnDataFactory instance
+
+        """
+        # NOTE(review): imported locally, presumably to avoid a circular import - confirm.
+        from alphabase.psm_reader.psm_reader import psm_reader_provider
+
+        reader: PSMReaderBase = psm_reader_provider.get_reader(reader_type, **kwargs)
+
+        # Only overrides that were actually given (not None, not empty) are passed on,
+        # so the reader's preconfigured mapping stays in effect for the rest.
+        column_overrides = {
+            PsmDfCols.INTENSITY: intensity_column,
+            PsmDfCols.PROTEINS: protein_id_column,
+            PsmDfCols.RAW_NAME: raw_name_column,
+        }
+        custom_column_mapping = {
+            key: column_name
+            for key, column_name in column_overrides.items()
+            if column_name
+        }
+
+        if custom_column_mapping:
+            reader.add_column_mapping(custom_column_mapping)
+
+        psm_df = reader.load(file_paths)
+        return cls(psm_df)
diff --git a/alphabase/constants/const_files/psm_reader.yaml b/alphabase/constants/const_files/psm_reader.yaml
@@ -180,25 +180,26 @@ msfragger_pepxml:
   mod_mass_tol: 0.1 # Da
   modification_mapping_type: 'maxquant'
 
-diann:
+diann: # 1.8.1
   reader_type: diann
   rt_unit: minute
   fixed_C57: False
   column_mapping:
-    'raw_name': 'Run'
+    'raw_name': 'Run' # TODO: decide whether 'File.Name' should be used instead
     'sequence': 'Stripped.Sequence'
     'charge': 'Precursor.Charge'
     'rt': 'RT'
     'rt_start': 'RT.Start'
     'rt_stop': 'RT.Stop'
     'ccs': 'CCS'
     'mobility': ['IM','IonMobility']
-    'proteins': 'Protein.Names'
+    'proteins': 'Protein.Names' # TODO: decide whether 'Protein.Group' should be used instead
     'uniprot_ids': 'Protein.Ids'
     'genes': 'Genes'
     'scan_num': 'MS2.Scan'
     'score': 'CScore'
     'fdr': 'Q.Value'
+    'intensity': "PG.MaxLFQ"
   mod_seq_columns:
     - "Modified.Sequence"
   modification_mapping_type: 'maxquant'

diff --git a/alphabase/psm_reader/keys.py b/alphabase/psm_reader/keys.py
@@ -44,6 +44,7 @@ class PsmDfCols(metaclass=ConstantsClass):
     MOBILITY = "mobility"
     PEPTIDE_FDR = "peptide_fdr"
     PROTEIN_FDR = "protein_fdr"
+    INTENSITY = "intensity"
 
     RAW_NAME = "raw_name"
     CHARGE = "charge"
@@ -58,9 +59,6 @@ class PsmDfCols(metaclass=ConstantsClass):
     _GENES = "genes"
     _QUERY_ID = "query_id"
 
-    # part of psm_reader_yaml, but not directly referenced
-    _INTENSITY = "intensity"
-
 
 class LibPsmDfCols(metaclass=ConstantsClass):
     """Constants for accessing the columns of a Library PSM dataframe."""

diff --git a/alphabase/psm_reader/psm_reader.py b/alphabase/psm_reader/psm_reader.py
@@ -198,6 +198,10 @@ def set_modification_mapping(
         """
         self._modification_mapper.set_modification_mapping(modification_mapping)
 
+    def add_column_mapping(self, column_mapping: Dict) -> None:
+        """Merge additional column mappings into this reader's column mapping.
+
+        Entries in `column_mapping` take precedence over existing entries with the
+        same key. A new dict is assigned to `self.column_mapping`; the previous
+        mapping object is not modified in place.
+        """
+        # Dict unpacking yields the same merge as `old | new` and also works on
+        # interpreters that lack the dict union operator (Python < 3.9).
+        self.column_mapping = {**self.column_mapping, **column_mapping}
+
     def load(self, _file: Union[List[str], str]) -> pd.DataFrame:
         """Import a single file or multiple files."""
         if isinstance(_file, list):
@@ -444,13 +448,19 @@ def get_reader(
         **kwargs,
     ) -> PSMReaderBase:
         """Get a reader by reader_type."""
-        return self.reader_dict[reader_type.lower()](
-            column_mapping=column_mapping,
-            modification_mapping=modification_mapping,
-            fdr=fdr,
-            keep_decoy=keep_decoy,
-            **kwargs,
-        )
+        try:
+            return self.reader_dict[reader_type.lower()](
+                column_mapping=column_mapping,
+                modification_mapping=modification_mapping,
+                fdr=fdr,
+                keep_decoy=keep_decoy,
+                **kwargs,
+            )
+        except KeyError as e:
+            raise KeyError(
+                f"Unknown reader type '{reader_type}'. Available readers: "
+                f"{', '.join(self.reader_dict.keys())}"
+            ) from e
 
     def get_reader_by_yaml(
         self,

diff --git a/nbs_tests/anndata/tutorial_anndata.ipynb b/nbs_tests/anndata/tutorial_anndata.ipynb
@@ -0,0 +1,171 @@
+{
+ "cells": [
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "# Tutorial: Using AnnDataFactory for Proteomics Data Analysis\n",
+    "\n",
+    "This notebook demonstrates how to use the `AnnDataFactory` class to convert proteomics PSM (peptide-spectrum match) data into the AnnData format, which is widely used in single-cell analysis pipelines."
+   ],
+   "id": "e5bd244a6f88775c"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "import pandas as pd\n",
+    "import tempfile\n",
+    "\n",
+    "from alphabase.psm_reader.keys import PsmDfCols\n",
+    "from alphabase.anndata.anndata_factory import AnnDataFactory\n",
+    "from alphabase.tools.data_downloader import DataShareDownloader\n"
+   ],
+   "id": "84a2980926abae94",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "## 1. Creating an AnnDataFactory from a DataFrame\n",
+    "\n",
+    "First, let's create a sample PSM DataFrame with the required columns and pass it to the `AnnDataFactory` constructor.\n",
+    "\n",
+    "The resulting AnnData object has:\n",
+    "   - Rows (obs) representing samples (raw names)\n",
+    "   - Columns (var) representing proteins\n",
+    "   - X matrix containing intensity values"
+   ],
+   "id": "9647ed76f001bb89"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "# Create sample PSM data\n",
+    "sample_psm_data = {\n",
+    "    PsmDfCols.RAW_NAME: ['sample1', 'sample1', 'sample2', 'sample2'],\n",
+    "    PsmDfCols.PROTEINS: ['proteinA', 'proteinB', 'proteinA', 'proteinB'],\n",
+    "    PsmDfCols.INTENSITY: [100, 200, 150, 250]\n",
+    "}\n",
+    "psm_df = pd.DataFrame(sample_psm_data)\n",
+    "\n",
+    "# Create AnnDataFactory instance\n",
+    "factory = AnnDataFactory(psm_df)\n",
+    "\n",
+    "# Convert to AnnData\n",
+    "adata = factory.create_anndata()\n",
+    "\n",
+    "print(\"AnnData shape:\", adata.shape)\n",
+    "print(\"\\nObservations (samples):\", adata.obs_names)\n",
+    "print(\"\\nVariables (proteins):\", adata.var_names)\n",
+    "print(\"\\nIntensity matrix:\\n\", adata.X)"
+   ],
+   "id": "1aa9f62e70422f32",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "## 2. Loading Data from Files (AlphaDIA Example)\n",
+    "\n",
+    "The AnnDataFactory can also read data directly from PSM files. Here's how to use it with AlphaDIA output:\n",
+    "\n"
+   ],
+   "id": "14a55a3abe72ad83"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "url = \"https://datashare.biochem.mpg.de/s/Hk41INtwBvBl0kP/download?files=alphadia_1.8.1_report_head.tsv\"\n",
+    "with tempfile.TemporaryDirectory() as temp_dir:\n",
+    "    file_path = DataShareDownloader(\n",
+    "        url=url, output_dir=temp_dir\n",
+    "    ).download()\n",
+    "    \n",
+    "\n",
+    "    factory = AnnDataFactory.from_files(\n",
+    "        file_paths=file_path,\n",
+    "        reader_type=\"alphadia\"\n",
+    "    )\n",
+    "\n",
+    "# Convert to AnnData\n",
+    "adata = factory.create_anndata()\n",
+    "\n",
+    "\n",
+    "print(\"AnnData shape:\", adata.shape)\n",
+    "\n",
+    "adata.to_df()\n"
+   ],
+   "id": "d2af73ad60b29601",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "\n",
+    "## 3. Customizing Column Names\n",
+    "\n",
+    "If your input files use column names that differ from those preconfigured for the reader (in `psm_reader.yaml`), you can specify them:"
+   ],
+   "id": "685930b74da70a12"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "url = \"https://datashare.biochem.mpg.de/s/Hk41INtwBvBl0kP/download?files=diann_1.9.0_report_head.tsv\"\n",
+    "\n",
+    "with tempfile.TemporaryDirectory() as temp_dir:\n",
+    "    file_path = DataShareDownloader(\n",
+    "        url=url, output_dir=temp_dir\n",
+    "    ).download()\n",
+    "    \n",
+    "    factory = AnnDataFactory.from_files(\n",
+    "        file_paths=file_path,\n",
+    "        reader_type=\"diann\",\n",
+    "        raw_name_column=\"File.Name\",\n",
+    "        protein_id_column=\"Protein.Group\",\n",
+    "        # intensity_column=\"PG.MaxLFQ\",\n",
+    "    )\n",
+    "    \n",
+    "adata = factory.create_anndata()\n",
+    "\n",
+    "print(\"AnnData shape:\", adata.shape)\n",
+    "\n",
+    "adata.to_df()"
+   ],
+   "id": "767c7268ff800451",
+   "outputs": [],
+   "execution_count": null
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/requirements.txt b/requirements.txt
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
@@ -2,6 +2,7 @@
 # Only usage of fixed versions is allowed, and all dependencies listed here must also be
 # included in `requirements_loose.txt` (enforced by a test).
 # TODO clean the requirements
+anndata==0.11.1
 numba==0.60.0
 numpy<2.0 # test: tolerate_version  # rdkit==2024.3.3 is not compatible with numpy >= 2.0
 pyyaml==6.0.2

diff --git a/requirements/requirements_loose.txt b/requirements/requirements_loose.txt
@@ -1,6 +1,7 @@
 # Dependencies required for running the "loose" version of alphabase.
 # All dependencies that are also included in `requirements.txt` must be added also here (enforced by a test).
 # TODO clean the requirements
+anndata
 numba
 numpy
 pyyaml

diff --git a/tests/integration/reference_data/reference_ad_alphadia_181.parquet b/tests/integration/reference_data/reference_ad_alphadia_181.parquet
diff --git a/tests/integration/reference_data/reference_ad_diann_181.parquet b/tests/integration/reference_data/reference_ad_diann_181.parquet
diff --git a/tests/integration/reference_data/reference_ad_diann_190.parquet b/tests/integration/reference_data/reference_ad_diann_190.parquet
diff --git a/tests/integration/reference_data/reference_diann.parquet b/tests/integration/reference_data/reference_diann.parquet
diff --git a/tests/integration/reference_data/reference_diann_1.8.1_tsv.parquet b/tests/integration/reference_data/reference_diann_1.8.1_tsv.parquet
diff --git a/tests/integration/reference_data/reference_diann_1.9.0_tsv.parquet b/tests/integration/reference_data/reference_diann_1.9.0_tsv.parquet