Add anndata factory #255

Open · wants to merge 17 commits into base: main
Empty file added alphabase/anndata/__init__.py
Empty file.
123 changes: 123 additions & 0 deletions alphabase/anndata/anndata_factory.py
@@ -0,0 +1,123 @@
"""Factory class to convert PSM DataFrames to AnnData format."""

import warnings
from typing import List, Optional, Union

import anndata as ad
import numpy as np
import pandas as pd

from alphabase.psm_reader import PSMReaderBase # noqa: TCH001
from alphabase.psm_reader.keys import PsmDfCols


class AnnDataFactory:
    """Factory class to convert AlphaBase PSM DataFrames to AnnData format."""

    def __init__(self, psm_df: pd.DataFrame):
        """Initialize AnnDataFactory.

        Parameters
        ----------
        psm_df : pd.DataFrame
            AlphaBase PSM DataFrame containing at minimum the columns:
            - PsmDfCols.RAW_NAME
            - PsmDfCols.PROTEINS
            - PsmDfCols.INTENSITY

        """
        required_cols = [PsmDfCols.RAW_NAME, PsmDfCols.PROTEINS, PsmDfCols.INTENSITY]
        missing_cols = [col for col in required_cols if col not in psm_df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        self._psm_df = psm_df
Contributor:

Would it be possible to add optional metadata columns to the .obs and .var attributes by passing obs_columns: Optional[str, List[str]] and var_columns: Optional[str, List[str]] to the factory class?

This would add some complexity, as one would have to validate that the columns are present in the data frame, but other than that one could just use .pivot_table while passing the list of columns.
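A minimal sketch of what this could look like (the `obs_columns` parameter and the helper below are hypothetical, not part of this PR):

from typing import List, Optional, Union

import anndata as ad
import pandas as pd

from alphabase.psm_reader.keys import PsmDfCols


def create_anndata_with_metadata(
    psm_df: pd.DataFrame,
    obs_columns: Optional[Union[str, List[str]]] = None,
) -> ad.AnnData:
    """Sketch: attach per-raw-file metadata columns to .obs (hypothetical API)."""
    obs_columns = [obs_columns] if isinstance(obs_columns, str) else (obs_columns or [])

    # Validate that the requested metadata columns exist in the PSM dataframe
    missing = [c for c in obs_columns if c not in psm_df.columns]
    if missing:
        raise ValueError(f"Missing metadata columns: {missing}")

    # Intensity matrix: raw names x proteins, as in this PR
    pivot_df = pd.pivot_table(
        psm_df,
        index=PsmDfCols.RAW_NAME,
        columns=PsmDfCols.PROTEINS,
        values=PsmDfCols.INTENSITY,
        aggfunc="first",
        dropna=False,
    )

    # One metadata row per raw file, taking the first value within each group
    obs = (
        psm_df.groupby(PsmDfCols.RAW_NAME)[obs_columns].first().reindex(pivot_df.index)
        if obs_columns
        else pd.DataFrame(index=pivot_df.index)
    )

    return ad.AnnData(
        X=pivot_df.values,
        obs=obs,
        var=pd.DataFrame(index=pivot_df.columns),
    )

Compared to the .pivot_table-only idea above, this uses groupby(...).first() for the per-run metadata; either way the column validation mentioned above is needed.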

        duplicated_proteins = self._psm_df[PsmDfCols.PROTEINS].duplicated()
        if duplicated_proteins.sum() > 0:
            warnings.warn(
                f"Found {duplicated_proteins.sum()} duplicated protein groups. Using only first."
            )

    def create_anndata(self) -> ad.AnnData:
        """Create AnnData object from PSM DataFrame.

        Returns
        -------
        ad.AnnData
            AnnData object where:
            - obs (rows) are raw names
            - var (columns) are proteins
            - X contains intensity values

        """
        # Create pivot table: raw names x proteins with intensity values
        pivot_df = pd.pivot_table(
            self._psm_df,
            index=PsmDfCols.RAW_NAME,
            columns=PsmDfCols.PROTEINS,
            values=PsmDfCols.INTENSITY,
            aggfunc="first",
            fill_value=np.nan,
            dropna=False,
        )

        return ad.AnnData(
            X=pivot_df.values,
            obs=pd.DataFrame(index=pivot_df.index),
            var=pd.DataFrame(index=pivot_df.columns),
        )

    @classmethod
    def from_files(
        cls,
        file_paths: Union[str, List[str]],
        reader_type: str = "maxquant",
        *,
        intensity_column: Optional[str] = None,
        protein_id_column: Optional[str] = None,
        raw_name_column: Optional[str] = None,
        **kwargs,
    ) -> "AnnDataFactory":
        """Create AnnDataFactory from PSM files.

        Parameters
        ----------
        file_paths : Union[str, List[str]]
            Path(s) to PSM file(s).
        reader_type : str, optional
            Type of PSM reader to use, by default "maxquant".
        intensity_column : str, optional
            Name of the column storing intensity data. Default is taken from `psm_reader.yaml`.
        protein_id_column : str, optional
            Name of the column storing protein IDs. Default is taken from `psm_reader.yaml`.
        raw_name_column : str, optional
            Name of the column storing the raw (or run) name. Default is taken from `psm_reader.yaml`.
        **kwargs
            Additional arguments passed to the PSM reader.

        Returns
        -------
        AnnDataFactory
            Initialized AnnDataFactory instance.

        """
        from alphabase.psm_reader.psm_reader import psm_reader_provider

        reader: PSMReaderBase = psm_reader_provider.get_reader(reader_type, **kwargs)

        custom_column_mapping = {
            k: v
            for k, v in {
                PsmDfCols.INTENSITY: intensity_column if intensity_column else None,
                PsmDfCols.PROTEINS: protein_id_column if protein_id_column else None,
                PsmDfCols.RAW_NAME: raw_name_column if raw_name_column else None,
            }.items()
            if v is not None
        }

        if custom_column_mapping:
            reader.add_column_mapping(custom_column_mapping)

        psm_df = reader.load(file_paths)
        return cls(psm_df)
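
For orientation, a minimal usage sketch of the API added in this file (the file name is a placeholder; it assumes the selected reader maps the raw name, protein, and intensity columns, e.g. via psm_reader.yaml — the tutorial notebook further down in this PR shows the same flow with downloaded example data):

from alphabase.anndata.anndata_factory import AnnDataFactory

# "psm_report.tsv" is a placeholder for a real search-engine output file
factory = AnnDataFactory.from_files("psm_report.tsv", reader_type="diann")
adata = factory.create_anndata()

# obs = raw file names, var = protein groups, X = intensity matrix (NaN = not quantified)
print(adata.shape, adata.obs_names[:3], adata.var_names[:3])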
7 changes: 4 additions & 3 deletions alphabase/constants/const_files/psm_reader.yaml
@@ -180,25 +180,26 @@ msfragger_pepxml:
  mod_mass_tol: 0.1 # Da
  modification_mapping_type: 'maxquant'

diann:
diann: # 1.8.1
  reader_type: diann
  rt_unit: minute
  fixed_C57: False
  column_mapping:
    'raw_name': 'Run'
    'raw_name': 'Run' # File.Name?
    'sequence': 'Stripped.Sequence'
    'charge': 'Precursor.Charge'
    'rt': 'RT'
    'rt_start': 'RT.Start'
    'rt_stop': 'RT.Stop'
    'ccs': 'CCS'
    'mobility': ['IM','IonMobility']
    'proteins': 'Protein.Names'
    'proteins': 'Protein.Names' # Protein.Group ?
Contributor Author:

which one to use here @GeorgWa @vbrennsteiner ?

and: if we change it, this would be a breaking change .. how to deal with that?

Contributor @lucas-diedrich commented on Jan 22, 2025:

It seems like the difference is whether the UniProt names (Protein.Names) or potentially different identifiers are used, but to me it sounds like the information is the same.

From the official DIA-NN docs:

  • Protein.Group - inferred proteins. See the description of the Protein inference GUI setting and the --relaxed-prot-inf option.

--relaxed-prot-inf instructs DIA-NN to use a very heuristical protein inference algorithm (similar to the one used by FragPipe and many other software tools), wherein DIA-NN aims to make sure that no protein is present simultaneously in multiple protein groups. This mode (i) is recommended for method optimisation & benchmarks, (ii) might be convenient for gene set enrichment analysis and related kinds of downstream processing. However the alternative protein inference strategy of DIA-NN is more reliable for differential expression analyses (this is one of the advantages of DIA-NN). Equivalent to the 'Heuristic protein inference' GUI setting, which is activated by default since DIA-NN 1.8.1

  • Protein.Ids - all proteins matched to the precursor in the library or, in case of library-free search, in the sequence database
  • Protein.Names - names (UniProt names) of the proteins in the Protein.Group
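
Whatever the YAML default ends up being, the new from_files API already lets a user override the protein column per call (sketch; the file name is a placeholder):

from alphabase.anndata.anndata_factory import AnnDataFactory

factory = AnnDataFactory.from_files(
    "diann_report.tsv",  # placeholder for a DIA-NN main report
    reader_type="diann",
    protein_id_column="Protein.Group",  # override the 'Protein.Names' default
)
adata = factory.create_anndata()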

    'uniprot_ids': 'Protein.Ids'
    'genes': 'Genes'
    'scan_num': 'MS2.Scan'
    'score': 'CScore'
    'fdr': 'Q.Value'
    'intensity': "PG.MaxLFQ"
  mod_seq_columns:
    - "Modified.Sequence"
  modification_mapping_type: 'maxquant'
4 changes: 1 addition & 3 deletions alphabase/psm_reader/keys.py
@@ -44,6 +44,7 @@ class PsmDfCols(metaclass=ConstantsClass):
    MOBILITY = "mobility"
    PEPTIDE_FDR = "peptide_fdr"
    PROTEIN_FDR = "protein_fdr"
    INTENSITY = "intensity"

    RAW_NAME = "raw_name"
    CHARGE = "charge"
@@ -58,9 +59,6 @@
    _GENES = "genes"
    _QUERY_ID = "query_id"

    # part of psm_reader_yaml, but not directly referenced
    _INTENSITY = "intensity"


class LibPsmDfCols(metaclass=ConstantsClass):
    """Constants for accessing the columns of a Library PSM dataframe."""
24 changes: 17 additions & 7 deletions alphabase/psm_reader/psm_reader.py
@@ -198,6 +198,10 @@ def set_modification_mapping(
        """
        self._modification_mapper.set_modification_mapping(modification_mapping)

    def add_column_mapping(self, column_mapping: Dict) -> None:
        """Add additional column mappings for the search engine."""
        self.column_mapping = self.column_mapping | column_mapping

    def load(self, _file: Union[List[str], str]) -> pd.DataFrame:
        """Import a single file or multiple files."""
        if isinstance(_file, list):
@@ -444,13 +448,19 @@ def get_reader(
        **kwargs,
    ) -> PSMReaderBase:
        """Get a reader by reader_type."""
        return self.reader_dict[reader_type.lower()](
            column_mapping=column_mapping,
            modification_mapping=modification_mapping,
            fdr=fdr,
            keep_decoy=keep_decoy,
            **kwargs,
        )
        try:
            return self.reader_dict[reader_type.lower()](
                column_mapping=column_mapping,
                modification_mapping=modification_mapping,
                fdr=fdr,
                keep_decoy=keep_decoy,
                **kwargs,
            )
        except KeyError as e:
            raise KeyError(
                f"Unknown reader type '{reader_type}'. Available readers: "
                f"{', '.join(self.reader_dict.keys())}"
            ) from e

    def get_reader_by_yaml(
        self,
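A short sketch of how the two additions in this file behave (the reader and column names are illustrative):

from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import psm_reader_provider

# Unknown reader types now raise an informative KeyError listing the available readers
try:
    psm_reader_provider.get_reader("not_a_reader")
except KeyError as e:
    print(e)

# add_column_mapping merges extra mappings on top of the reader's YAML defaults
reader = psm_reader_provider.get_reader("diann")
reader.add_column_mapping({PsmDfCols.INTENSITY: "PG.MaxLFQ"})

Note that the dict | dict merge used in add_column_mapping requires Python 3.9 or newer.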
171 changes: 171 additions & 0 deletions nbs_tests/anndata/tutorial_anndata.ipynb
@@ -0,0 +1,171 @@
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": [
"# Tutorial: Using AnnDataFactory for Proteomics Data Analysis\n",
"\n",
"This notebook demonstrates how to use the `AnnDataFactory` class to convert proteomics PSM (Peptide Spectrum Matches) data into AnnData format, which is widely used in single-cell analysis pipelines."
],
"id": "e5bd244a6f88775c"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import tempfile\n",
"\n",
"from alphabase.psm_reader.keys import PsmDfCols\n",
"from alphabase.anndata.anndata_factory import AnnDataFactory\n",
"from alphabase.tools.data_downloader import DataShareDownloader\n"
],
"id": "84a2980926abae94",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"### 1. Creating an AnnDataFactory from a DataFrame\n",
"\n",
"First, let's create a sample PSM DataFrame with the required columns and pass it to the `AnnDataFactory` constructor.\n",
"\n",
"The resulting AnnData object has:\n",
" - Rows (obs) representing samples (raw names)\n",
" - Columns (var) representing proteins\n",
" - X matrix containing intensity values"
],
"id": "9647ed76f001bb89"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Create sample PSM data\n",
"sample_psm_data = {\n",
" PsmDfCols.RAW_NAME: ['sample1', 'sample1', 'sample2', 'sample2'],\n",
" PsmDfCols.PROTEINS: ['proteinA', 'proteinB', 'proteinA', 'proteinB'],\n",
" PsmDfCols.INTENSITY: [100, 200, 150, 250]\n",
"}\n",
"psm_df = pd.DataFrame(sample_psm_data)\n",
"\n",
"# Create AnnDataFactory instance\n",
"factory = AnnDataFactory(psm_df)\n",
"\n",
"# Convert to AnnData\n",
"adata = factory.create_anndata()\n",
"\n",
"print(\"AnnData shape:\", adata.shape)\n",
"print(\"\\nObservations (samples):\", adata.obs_names)\n",
"print(\"\\nVariables (proteins):\", adata.var_names)\n",
"print(\"\\nIntensity matrix:\\n\", adata.X)"
],
"id": "1aa9f62e70422f32",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"## 2. Loading Data from Files (AlphaDIA Example)\n",
"\n",
"The AnnDataFactory can also read data directly from PSM files. Here's how to use it with MaxQuant output:\n",
"\n"
],
"id": "14a55a3abe72ad83"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"url = \"https://datashare.biochem.mpg.de/s/Hk41INtwBvBl0kP/download?files=alphadia_1.8.1_report_head.tsv\"\n",
"with tempfile.TemporaryDirectory() as temp_dir:\n",
" file_path = DataShareDownloader(\n",
" url=url, output_dir=temp_dir\n",
" ).download()\n",
" \n",
"\n",
" factory = AnnDataFactory.from_files(\n",
" file_paths=file_path,\n",
" reader_type=\"alphadia\"\n",
" )\n",
"\n",
"# Convert to AnnData\n",
"adata = factory.create_anndata()\n",
"\n",
"\n",
"print(\"AnnData shape:\", adata.shape)\n",
"\n",
"adata.to_df()\n"
],
"id": "d2af73ad60b29601",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"\n",
"## 3. Customizing Column Names\n",
"\n",
"If your input files use different column names than what is preconfigured in `AnnDataFactory`, you can specify them:"
],
"id": "685930b74da70a12"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"url = \"https://datashare.biochem.mpg.de/s/Hk41INtwBvBl0kP/download?files=diann_1.9.0_report_head.tsv\"\n",
"\n",
"with tempfile.TemporaryDirectory() as temp_dir:\n",
" file_path = DataShareDownloader(\n",
" url=url, output_dir=temp_dir\n",
" ).download()\n",
" \n",
" factory = AnnDataFactory.from_files(\n",
" file_paths=file_path,\n",
" reader_type=\"diann\",\n",
" raw_name_column=\"File.Name\",\n",
" protein_id_column=\"Protein.Group\",\n",
" # intensity_column=\"PG.MaxLFQ\",\n",
" )\n",
" \n",
"adata = factory.create_anndata()\n",
"\n",
"print(\"AnnData shape:\", adata.shape)\n",
"\n",
"adata.to_df()"
],
"id": "767c7268ff800451",
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Empty file added requirements.txt
Empty file.
1 change: 1 addition & 0 deletions requirements/requirements.txt
@@ -2,6 +2,7 @@
# Only usage of fixed versions is allowed, and all dependencies listed here must also be
# included in `requirements_loose.txt` (enforced by a test).
# TODO clean the requirements
anndata==0.11.1
numba==0.60.0
numpy<2.0 # test: tolerate_version # rdkit==2024.3.3 is not compatible with numpy >= 2.0
pyyaml==6.0.2
1 change: 1 addition & 0 deletions requirements/requirements_loose.txt
@@ -1,6 +1,7 @@
# Dependencies required for running the "loose" version of alphabase.
# All dependencies that are also included in `requirements.txt` must be added also here (enforced by a test).
# TODO clean the requirements
anndata
numba
numpy
pyyaml
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified tests/integration/reference_data/reference_diann.parquet
Binary file not shown.
Binary file not shown.
Binary file modified tests/integration/reference_data/reference_diann_1.9.0_tsv.parquet
Binary file not shown.