Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor readers vi #248

Open
wants to merge 15 commits into
base: refactor_readers_V
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 20 additions & 14 deletions alphabase/constants/const_files/psm_reader.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ maxquant:
reader_type: maxquant
rt_unit: minute
fixed_C57: True
mod_seq_columns:
- 'Modified sequence'
column_mapping:
'sequence': 'Sequence'
'charge': 'Charge'
Expand Down Expand Up @@ -168,6 +170,8 @@ diann:
'scan_num': 'MS2.Scan'
'score': 'CScore'
'fdr': 'Q.Value'
mod_seq_columns:
- "Modified.Sequence"
modification_mapping: 'maxquant'

spectronaut_report:
Expand All @@ -182,19 +186,14 @@ spectronaut_report:
'genes': 'PG.Genes'
'uniprot_ids': 'PG.UniProtIds'
'charge': 'charge'
mod_seq_columns:
- 'ModifiedSequence'
modification_mapping: 'maxquant'

spectronaut:
reader_type: spectronaut
rt_unit: irt
fixed_C57: False
mod_seq_columns:
- 'ModifiedPeptide'
- 'ModifiedSequence'
- 'FullUniModPeptideName'
- 'ModifiedPeptideSequence'
- 'LabeledSequence'
- 'FullUniModPeptideName'
column_mapping:
'raw_name': 'ReferenceRun'
'sequence': ['StrippedPeptide','PeptideSequence']
Expand All @@ -206,19 +205,19 @@ spectronaut:
'proteins': ['Protein Name','ProteinId','ProteinID','ProteinName','ProteinGroup','ProteinGroups']
'uniprot_ids': ['UniProtIds','UniProtID','UniprotId']
'genes': ['Genes','Gene','GeneName','GeneNames']
modification_mapping: 'maxquant'

library_reader_base:
reader_type: library_reader_base
rt_unit: irt
fixed_C57: False
mod_seq_columns:
- 'ModifiedPeptideSequence'
- 'ModifiedPeptide'
- 'ModifiedSequence'
- 'FullUniModPeptideName'
- 'ModifiedPeptideSequence'
- 'LabeledSequence'
- 'FullUniModPeptideName'
modification_mapping: 'maxquant'

library_reader_base:
reader_type: library_reader_base
rt_unit: irt
fixed_C57: False
column_mapping:
'raw_name': 'ReferenceRun'
'sequence': ['PeptideSequence', 'StrippedPeptide']
Expand All @@ -237,6 +236,13 @@ library_reader_base:
'fragment_charge' : ['FragmentCharge', 'FragmentIonCharge', 'ProductCharge', 'ProductIonCharge']
'fragment_series': ['FragmentSeriesNumber','FragmentNumber']
'fragment_loss_type': ['FragmentLossType', 'FragmentIonLossType', 'ProductLossType', 'ProductIonLossType']
mod_seq_columns:
- 'ModifiedPeptideSequence'
- 'ModifiedPeptide'
- 'ModifiedSequence'
- 'FullUniModPeptideName'
- 'LabeledSequence'
- 'FullUniModPeptideName'
modification_mapping: 'maxquant'

sage:
Expand Down
21 changes: 8 additions & 13 deletions alphabase/psm_reader/dia_psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.maxquant_reader import MaxQuantReader
from alphabase.psm_reader.psm_reader import psm_reader_provider, psm_reader_yaml
from alphabase.psm_reader.psm_reader import psm_reader_provider


class SpectronautReader(MaxQuantReader):
Expand All @@ -33,9 +33,6 @@ def __init__( # noqa: PLR0913 many arguments in function definition
**kwargs,
):
"""Initialize SpectronautReader."""
if mod_seq_columns is None:
mod_seq_columns = psm_reader_yaml["spectronaut"]["mod_seq_columns"]

super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
Expand All @@ -47,14 +44,15 @@ def __init__( # noqa: PLR0913 many arguments in function definition
**kwargs,
)

self.mod_seq_column = "ModifiedPeptide"
self._min_max_rt_norm = True

def _load_file(self, filename: str) -> pd.DataFrame:
csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=csv_sep, keep_default_na=False)

self._find_mod_seq_column(df)
self.mod_seq_column = self._get_mod_seq_column(
df
) # TODO: this needs to be removed
if "ReferenceRun" in df.columns:
df.drop_duplicates(
["ReferenceRun", self.mod_seq_column, "PrecursorCharge"], inplace=True
Expand Down Expand Up @@ -84,9 +82,6 @@ def __init__( # noqa: PLR0913 many arguments in function definition
**kwargs,
):
"""SWATH or OpenSWATH library, similar to `SpectronautReader`."""
if mod_seq_columns is None:
mod_seq_columns = psm_reader_yaml["spectronaut"]["mod_seq_columns"]

super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
Expand Down Expand Up @@ -126,7 +121,6 @@ def __init__( # noqa: PLR0913 many arguments in function definition
**kwargs,
)

self.mod_seq_column = "Modified.Sequence"
self._min_max_rt_norm = False

def _load_file(self, filename: str) -> pd.DataFrame:
Expand Down Expand Up @@ -172,15 +166,16 @@ def __init__( # noqa: PLR0913 many arguments in function definition
**kwargs,
)

self.precursor_column = "EG.PrecursorId"
self.mod_seq_column = "ModifiedSequence"

self.precursor_column = "EG.PrecursorId" # TODO: move to yaml
self._min_max_rt_norm = False

def _load_file(self, filename: str) -> pd.DataFrame:
csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=csv_sep, keep_default_na=False)

self.mod_seq_column = self._get_mod_seq_column(
df
) # TODO: this needs to be removed
df[[self.mod_seq_column, PsmDfCols.CHARGE]] = df[
self.precursor_column
].str.split(".", expand=True, n=2)
Expand Down
17 changes: 5 additions & 12 deletions alphabase/psm_reader/maxquant_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,23 +177,17 @@ def __init__( # noqa: PLR0913 many arguments in function definition
deprecated

"""
if mod_seq_columns is None:
mod_seq_columns = [
"Modified sequence"
] # TODO: why not take from psm_reader.yaml?

super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
fdr=fdr,
keep_decoy=keep_decoy,
rt_unit=rt_unit,
mod_seq_columns=mod_seq_columns,
**kwargs,
)

self.fixed_C57 = fixed_C57
self._mod_seq_columns = mod_seq_columns
self.mod_seq_column = "Modified sequence"

def _translate_decoy(self) -> None:
if PsmDfCols.DECOY in self._psm_df.columns:
Expand All @@ -205,22 +199,21 @@ def _load_file(self, filename: str) -> pd.DataFrame:
csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=csv_sep, keep_default_na=False)

self._find_mod_seq_column(df)
df = df[~pd.isna(df["Retention time"])]
df.fillna("", inplace=True)

# remove MBR PSMs as they are currently not supported and will crash import
mapped_columns = self._find_mapped_columns(df)
if "scan_num" in mapped_columns:
scan_num_col = mapped_columns["scan_num"]
if PsmDfCols.SCAN_NUM in mapped_columns:
scan_num_col = mapped_columns[PsmDfCols.SCAN_NUM]
no_ms2_mask = df[scan_num_col] == ""
if (num_no_ms2_mask := np.sum(no_ms2_mask)) > 0:
warnings.warn(
f"Maxquant psm file contains {num_no_ms2_mask} MBR PSMs without MS2 scan. This is not yet supported and rows containing MBR PSMs will be removed."
f"MaxQuant PSM file contains {num_no_ms2_mask} MBR PSMs without MS2 scan. This is not yet supported and rows containing MBR PSMs will be removed."
)
df = df[~no_ms2_mask]
df.reset_index(drop=True, inplace=True)
df[scan_num_col] = df[scan_num_col].astype(int)
df[scan_num_col] = df[scan_num_col].astype(int)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

merci


# if 'K0' in df.columns:
# df['Mobility'] = df['K0'] # Bug in MaxQuant? It should be 1/K0
Expand Down
62 changes: 42 additions & 20 deletions alphabase/psm_reader/psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,15 @@ class PSMReaderBase(ABC):
# the type of modification mapping to be used
_modification_type: Optional[str] = None

def __init__(
def __init__( # noqa: PLR0913 # too many arguments
self,
*,
column_mapping: Optional[dict] = None,
modification_mapping: Optional[dict] = None,
fdr: float = 0.01,
keep_decoy: bool = False,
rt_unit: str = "minute",
mod_seq_columns: Optional[List[str]] = None,
**kwargs,
):
"""The Base class for all PSMReaders.
Expand All @@ -53,20 +54,20 @@ def __init__(
Parameters
----------
column_mapping : dict, optional
A dict that maps alphabase's columns to other search engine's.
A dict that maps alphabase's column names to those of other search engines.
If it is None, this dict will be read from psm_reader.yaml key `column_mapping`.

The key of the column_mapping is alphabase's column name, and
the value could be the column name or a list of column names
in other engine's result.
If it is None, this dict will be init by
`self._init_column_mapping`. The dict values could be
either str or list, for example:
in other engine's result, for example:
```
columns_mapping = {
'sequence': 'NakedSequence',
'charge': 'Charge',
'proteins':['Proteins','UniprotIDs'] # list, this reader will automatically detect all of them.
}
```
If a list is given, the first column name in the list that is present in the data will be mapped to the harmonized column name; the rest will be ignored.
Defaults to None.

modification_mapping : dict, optional
Expand Down Expand Up @@ -96,6 +97,12 @@ def __init__(
The unit of RT in the search engine result.
Defaults to 'minute'.

mod_seq_columns : list, optional
The candidate column names in which to look for modified sequences.
The first column name in the list that is present in the data will be used; the rest will be ignored.
By default read from psm_reader_yaml key "mod_seq_columns".
If it is not found there, an empty list is used.

**kwargs: dict
deprecated

Expand Down Expand Up @@ -139,7 +146,11 @@ def __init__(
self._engine_rt_unit = rt_unit
self._min_irt_value = -100
self._max_irt_value = 200
self._mod_seq_columns = []
self._mod_seq_columns = (
mod_seq_columns
if mod_seq_columns is not None
else psm_reader_yaml[self._reader_type].get("mod_seq_columns", [])
)

for key, value in kwargs.items(): # TODO: remove and remove kwargs
warnings.warn(
Expand Down Expand Up @@ -174,12 +185,13 @@ def set_modification_mapping(
"""
self._modification_mapper.set_modification_mapping(modification_mapping)

def _find_mod_seq_column(self, df: pd.DataFrame) -> None: # called in _load_file
def _get_mod_seq_column(self, df: pd.DataFrame) -> Optional[str]:
"""Get the first column from `_mod_seq_columns` that is a column of `df`."""
for mod_seq_col in self._mod_seq_columns:
if mod_seq_col in df.columns:
self.mod_seq_column = mod_seq_col
break
# TODO: warn if there's more
return mod_seq_col
return None
# TODO: warn if there's more

def _read_column_mapping(self) -> Dict[str, str]:
"""Read column mapping from psm_reader yaml file."""
Expand Down Expand Up @@ -218,6 +230,9 @@ def import_file(self, _file: str) -> pd.DataFrame:

"""
origin_df = self._load_file(_file)

self.mod_seq_column = self._get_mod_seq_column(origin_df)

self._psm_df = pd.DataFrame()

if len(origin_df):
Expand Down Expand Up @@ -304,17 +319,24 @@ def _load_file(self, filename: str) -> pd.DataFrame:

"""

def _find_mapped_columns(self, origin_df: pd.DataFrame) -> Dict[str, str]:
def _find_mapped_columns(self, df: pd.DataFrame) -> Dict[str, str]:
"""Determine the mapping of AlphaBase columns to the columns in the given DataFrame.

For each AlphaBase column name, check if the corresponding search engine-specific
name is in the DataFrame columns. If it is, add it to the mapping.
If the search engine-specific name is a list, use the first column name in the list that is present in the DataFrame.
"""
mapped_columns = {}
for col, map_col in self.column_mapping.items():
if isinstance(map_col, str):
if map_col in origin_df.columns:
mapped_columns[col] = map_col
elif isinstance(map_col, (list, tuple)):
for other_col in map_col:
if other_col in origin_df.columns:
mapped_columns[col] = other_col
for col_alphabase, col_other in self.column_mapping.items():
if isinstance(col_other, str):
if col_other in df.columns:
mapped_columns[col_alphabase] = col_other
elif isinstance(col_other, (list, tuple)):
for other_col in col_other:
if other_col in df.columns:
mapped_columns[col_alphabase] = other_col
break
# TODO: warn if there's more
return mapped_columns

def _translate_columns(self, origin_df: pd.DataFrame) -> None:
Expand Down
9 changes: 1 addition & 8 deletions alphabase/spectral_library/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from alphabase.peptide.mobility import mobility_to_ccs_for_df
from alphabase.psm_reader.keys import LibPsmDfCols, PsmDfCols
from alphabase.psm_reader.maxquant_reader import MaxQuantReader
from alphabase.psm_reader.psm_reader import psm_reader_yaml
from alphabase.spectral_library.base import SpecLibBase


Expand Down Expand Up @@ -88,9 +87,6 @@ def __init__( # noqa: PLR0913 many arguments in function definition
deprecated

"""
if mod_seq_columns is None:
mod_seq_columns = psm_reader_yaml["library_reader_base"]["mod_seq_columns"]

SpecLibBase.__init__(
self,
charged_frag_types=charged_frag_types,
Expand Down Expand Up @@ -248,7 +244,7 @@ def _load_file(self, filename: str) -> pd.DataFrame:
"""Load the spectral library from a csv file."""
csv_sep = self._get_table_delimiter(filename)

df = pd.read_csv(
return pd.read_csv(
filename,
sep=csv_sep,
keep_default_na=False,
Expand All @@ -273,9 +269,6 @@ def _load_file(self, filename: str) -> pd.DataFrame:
"null",
],
)
self._find_mod_seq_column(df)

return df

def _post_process(
self,
Expand Down
Loading
Loading