-
Notifications
You must be signed in to change notification settings - Fork 10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add anndata factory #255
base: main
Are you sure you want to change the base?
Add anndata factory #255
Changes from all commits
fe00974
24572c1
0d2042c
1a84472
542cb1b
8ce3ce5
1c71385
d51c7ed
40a44bf
6fdc7ee
1af8e6f
b7e9a90
e833839
6200f12
75e3ef4
131e2de
1e6b3d6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
"""Factory class to convert PSM DataFrames to AnnData format.""" | ||
|
||
import warnings | ||
from typing import List, Optional, Union | ||
|
||
import anndata as ad | ||
import numpy as np | ||
import pandas as pd | ||
|
||
from alphabase.psm_reader import PSMReaderBase # noqa: TCH001 | ||
from alphabase.psm_reader.keys import PsmDfCols | ||
|
||
|
||
class AnnDataFactory:
    """Factory class to convert AlphaBase PSM DataFrames to AnnData format."""

    def __init__(self, psm_df: pd.DataFrame):
        """Initialize AnnDataFactory.

        Parameters
        ----------
        psm_df : pd.DataFrame
            AlphaBase PSM DataFrame containing at minimum the columns:
                - PsmDfCols.RAW_NAME
                - PsmDfCols.PROTEINS
                - PsmDfCols.INTENSITY

        Raises
        ------
        ValueError
            If any of the required columns is missing from `psm_df`.

        Warns
        -----
        UserWarning
            If the same (raw name, proteins) combination occurs in more than
            one row. `create_anndata` keeps only the first value of each
            such combination.

        """
        required_cols = [PsmDfCols.RAW_NAME, PsmDfCols.PROTEINS, PsmDfCols.INTENSITY]
        missing_cols = [col for col in required_cols if col not in psm_df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        self._psm_df = psm_df

        # A protein group legitimately appears once per raw file, so duplicates
        # must be detected per (raw name, proteins) pair. Checking the proteins
        # column alone would flag every protein that is present in more than
        # one run, although no value is discarded for those by the pivot.
        num_duplicated = int(
            self._psm_df.duplicated(
                subset=[PsmDfCols.RAW_NAME, PsmDfCols.PROTEINS]
            ).sum()
        )
        if num_duplicated > 0:
            warnings.warn(
                f"Found {num_duplicated} duplicated protein groups. Using only first.",
                stacklevel=2,
            )

    def create_anndata(self) -> ad.AnnData:
        """Create AnnData object from PSM DataFrame.

        Returns
        -------
        ad.AnnData
            AnnData object where:
                - obs (rows) are raw names
                - var (columns) are proteins
                - X contains intensity values

        """
        # Create pivot table: raw names x proteins with intensity values.
        # aggfunc="first" resolves rows sharing the same (raw name, proteins)
        # pair by keeping a single value; combinations that do not occur in
        # the data are filled with NaN.
        pivot_df = pd.pivot_table(
            self._psm_df,
            index=PsmDfCols.RAW_NAME,
            columns=PsmDfCols.PROTEINS,
            values=PsmDfCols.INTENSITY,
            aggfunc="first",
            fill_value=np.nan,
            dropna=False,
        )

        return ad.AnnData(
            X=pivot_df.values,
            obs=pd.DataFrame(index=pivot_df.index),
            var=pd.DataFrame(index=pivot_df.columns),
        )

    @classmethod
    def from_files(
        cls,
        file_paths: Union[str, List[str]],
        reader_type: str = "maxquant",
        *,
        intensity_column: Optional[str] = None,
        protein_id_column: Optional[str] = None,
        raw_name_column: Optional[str] = None,
        **kwargs,
    ) -> "AnnDataFactory":
        """Create AnnDataFactory from PSM files.

        Parameters
        ----------
        file_paths : Union[str, List[str]]
            Path(s) to PSM file(s)
        reader_type : str, optional
            Type of PSM reader to use, by default "maxquant"
        intensity_column: str, optional
            Name of the column storing intensity data. Default is taken from `psm_reader.yaml`
        protein_id_column: str, optional
            Name of the column storing proteins ids. Default is taken from `psm_reader.yaml`
        raw_name_column: str, optional
            Name of the column storing raw (or run) name. Default is taken from `psm_reader.yaml`
        **kwargs
            Additional arguments passed to PSM reader

        Returns
        -------
        AnnDataFactory
            Initialized AnnDataFactory instance

        """
        from alphabase.psm_reader.psm_reader import psm_reader_provider

        reader: PSMReaderBase = psm_reader_provider.get_reader(reader_type, **kwargs)

        # Only forward overrides the caller actually supplied; unset (None) or
        # empty names leave the reader's configured mapping untouched.
        requested_columns = {
            PsmDfCols.INTENSITY: intensity_column,
            PsmDfCols.PROTEINS: protein_id_column,
            PsmDfCols.RAW_NAME: raw_name_column,
        }
        custom_column_mapping = {
            key: column for key, column in requested_columns.items() if column
        }

        if custom_column_mapping:
            reader.add_column_mapping(custom_column_mapping)

        psm_df = reader.load(file_paths)
        return cls(psm_df)
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -180,25 +180,26 @@ msfragger_pepxml: | |
mod_mass_tol: 0.1 # Da | ||
modification_mapping_type: 'maxquant' | ||
|
||
diann: | ||
diann: # 1.8.1 | ||
reader_type: diann | ||
rt_unit: minute | ||
fixed_C57: False | ||
column_mapping: | ||
'raw_name': 'Run' | ||
'raw_name': 'Run' # File.Name? | ||
'sequence': 'Stripped.Sequence' | ||
'charge': 'Precursor.Charge' | ||
'rt': 'RT' | ||
'rt_start': 'RT.Start' | ||
'rt_stop': 'RT.Stop' | ||
'ccs': 'CCS' | ||
'mobility': ['IM','IonMobility'] | ||
'proteins': 'Protein.Names' | ||
'proteins': 'Protein.Names' # Protein.Group ? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. which one to use here @GeorgWa @vbrennsteiner ? and: if we change it, this would be a breaking change .. how to deal with that? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems like the difference is whether the Uniprot Names (Protein.Names) or potentially different names are utilized, but to me, it sounds like the information is the same. From the official DIANN Docs:
|
||
'uniprot_ids': 'Protein.Ids' | ||
'genes': 'Genes' | ||
'scan_num': 'MS2.Scan' | ||
'score': 'CScore' | ||
'fdr': 'Q.Value' | ||
'intensity': "PG.MaxLFQ" | ||
mod_seq_columns: | ||
- "Modified.Sequence" | ||
modification_mapping_type: 'maxquant' | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"metadata": {}, | ||
"cell_type": "markdown", | ||
"source": [ | ||
"# Tutorial: Using AnnDataFactory for Proteomics Data Analysis\n", | ||
"\n", | ||
"This notebook demonstrates how to use the `AnnDataFactory` class to convert proteomics PSM (Peptide Spectrum Matches) data into AnnData format, which is widely used in single-cell analysis pipelines." | ||
], | ||
"id": "e5bd244a6f88775c" | ||
}, | ||
{ | ||
"metadata": {}, | ||
"cell_type": "code", | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import tempfile\n", | ||
"\n", | ||
"from alphabase.psm_reader.keys import PsmDfCols\n", | ||
"from alphabase.anndata.anndata_factory import AnnDataFactory\n", | ||
"from alphabase.tools.data_downloader import DataShareDownloader\n" | ||
], | ||
"id": "84a2980926abae94", | ||
"outputs": [], | ||
"execution_count": null | ||
}, | ||
{ | ||
"metadata": {}, | ||
"cell_type": "markdown", | ||
"source": [ | ||
"## 1. Creating an AnnDataFactory from a DataFrame\n", | ||
"\n", | ||
"First, let's create a sample PSM DataFrame with the required columns and pass it to the `AnnDataFactory` constructor.\n", | ||
"\n", | ||
"The resulting AnnData object has:\n", | ||
" - Rows (obs) representing samples (raw names)\n", | ||
" - Columns (var) representing proteins\n", | ||
" - X matrix containing intensity values" | ||
], | ||
"id": "9647ed76f001bb89" | ||
}, | ||
{ | ||
"metadata": {}, | ||
"cell_type": "code", | ||
"source": [ | ||
"# Create sample PSM data\n", | ||
"sample_psm_data = {\n", | ||
" PsmDfCols.RAW_NAME: ['sample1', 'sample1', 'sample2', 'sample2'],\n", | ||
" PsmDfCols.PROTEINS: ['proteinA', 'proteinB', 'proteinA', 'proteinB'],\n", | ||
" PsmDfCols.INTENSITY: [100, 200, 150, 250]\n", | ||
"}\n", | ||
"psm_df = pd.DataFrame(sample_psm_data)\n", | ||
"\n", | ||
"# Create AnnDataFactory instance\n", | ||
"factory = AnnDataFactory(psm_df)\n", | ||
"\n", | ||
"# Convert to AnnData\n", | ||
"adata = factory.create_anndata()\n", | ||
"\n", | ||
"print(\"AnnData shape:\", adata.shape)\n", | ||
"print(\"\\nObservations (samples):\", adata.obs_names)\n", | ||
"print(\"\\nVariables (proteins):\", adata.var_names)\n", | ||
"print(\"\\nIntensity matrix:\\n\", adata.X)" | ||
], | ||
"id": "1aa9f62e70422f32", | ||
"outputs": [], | ||
"execution_count": null | ||
}, | ||
{ | ||
"metadata": {}, | ||
"cell_type": "markdown", | ||
"source": [ | ||
"## 2. Loading Data from Files (AlphaDIA Example)\n", | ||
"\n", | ||
"The AnnDataFactory can also read data directly from PSM files. Here's how to use it with AlphaDIA output:\n", | ||
"\n" | ||
], | ||
"id": "14a55a3abe72ad83" | ||
}, | ||
{ | ||
"metadata": {}, | ||
"cell_type": "code", | ||
"source": [ | ||
"url = \"https://datashare.biochem.mpg.de/s/Hk41INtwBvBl0kP/download?files=alphadia_1.8.1_report_head.tsv\"\n", | ||
"with tempfile.TemporaryDirectory() as temp_dir:\n", | ||
" file_path = DataShareDownloader(\n", | ||
" url=url, output_dir=temp_dir\n", | ||
" ).download()\n", | ||
" \n", | ||
"\n", | ||
" factory = AnnDataFactory.from_files(\n", | ||
" file_paths=file_path,\n", | ||
" reader_type=\"alphadia\"\n", | ||
" )\n", | ||
"\n", | ||
"# Convert to AnnData\n", | ||
"adata = factory.create_anndata()\n", | ||
"\n", | ||
"\n", | ||
"print(\"AnnData shape:\", adata.shape)\n", | ||
"\n", | ||
"adata.to_df()\n" | ||
], | ||
"id": "d2af73ad60b29601", | ||
"outputs": [], | ||
"execution_count": null | ||
}, | ||
{ | ||
"metadata": {}, | ||
"cell_type": "markdown", | ||
"source": [ | ||
"\n", | ||
"## 3. Customizing Column Names\n", | ||
"\n", | ||
"If your input files use different column names than those preconfigured for the reader in `psm_reader.yaml`, you can specify them:" | ||
], | ||
"id": "685930b74da70a12" | ||
}, | ||
{ | ||
"metadata": {}, | ||
"cell_type": "code", | ||
"source": [ | ||
"url = \"https://datashare.biochem.mpg.de/s/Hk41INtwBvBl0kP/download?files=diann_1.9.0_report_head.tsv\"\n", | ||
"\n", | ||
"with tempfile.TemporaryDirectory() as temp_dir:\n", | ||
" file_path = DataShareDownloader(\n", | ||
" url=url, output_dir=temp_dir\n", | ||
" ).download()\n", | ||
" \n", | ||
" factory = AnnDataFactory.from_files(\n", | ||
" file_paths=file_path,\n", | ||
" reader_type=\"diann\",\n", | ||
" raw_name_column=\"File.Name\",\n", | ||
" protein_id_column=\"Protein.Group\",\n", | ||
" # intensity_column=\"PG.MaxLFQ\",\n", | ||
" )\n", | ||
" \n", | ||
"adata = factory.create_anndata()\n", | ||
"\n", | ||
"print(\"AnnData shape:\", adata.shape)\n", | ||
"\n", | ||
"adata.to_df()" | ||
], | ||
"id": "767c7268ff800451", | ||
"outputs": [], | ||
"execution_count": null | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 2 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython2", | ||
"version": "2.7.6" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it be possible to add optional metadata columns to the `.obs` and `.var` attributes by passing `obs_columns: Optional[Union[str, List[str]]]` and `var_columns: Optional[Union[str, List[str]]]` to the factory class?
This would add some complexity, as one would have to validate that the columns are present in the data frame, but other than that one could just use `.pivot_table` while passing the list of columns.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
=> https://github.com/orgs/MannLabs/projects/20/views/1?pane=issue&itemId=88563842