Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor readers iv #246

Open
wants to merge 8 commits into
base: refactor_readers_III
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion alphabase/constants/const_files/psm_reader.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ maxquant:
'genes': ['Gene Names','Gene names']
'decoy': 'Reverse'
'intensity': 'Intensity'

modification_mapping:
'Dimethyl@K':
- 'K(Dimethyl)'
Expand Down
34 changes: 15 additions & 19 deletions alphabase/psm_reader/dia_psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,10 @@ class SpectronautReader(MaxQuantReader):

Other parameters, please see `MaxQuantReader`
in `alphabase.psm_reader.maxquant_reader`

Parameters
----------
csv_sep : str, optional
Delimiter for TSV/CSV, by default 'tab'

"""

_reader_type = "spectronaut"
_add_unimod_to_mod_mapping = True
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we also add this to the base class?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nevermind :D


def __init__( # noqa: PLR0913 many arguments in function definition
self,
Expand Down Expand Up @@ -56,8 +51,9 @@ def __init__( # noqa: PLR0913 many arguments in function definition
self._min_max_rt_norm = True

def _load_file(self, filename: str) -> pd.DataFrame:
self.csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=self.csv_sep, keep_default_na=False)
csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=csv_sep, keep_default_na=False)

self._find_mod_seq_column(df)
if "ReferenceRun" in df.columns:
df.drop_duplicates(
Expand All @@ -73,6 +69,9 @@ def _load_file(self, filename: str) -> pd.DataFrame:
class SwathReader(SpectronautReader):
"""Reader for SWATH or OpenSWATH library TSV/CSV."""

_reader_type = "spectronaut" # no typo
_add_unimod_to_mod_mapping = True

def __init__( # noqa: PLR0913 many arguments in function definition
self,
*,
Expand Down Expand Up @@ -103,6 +102,7 @@ class DiannReader(SpectronautReader):
"""Reader for DIANN data."""

_reader_type = "diann"
_add_unimod_to_mod_mapping = True

def __init__( # noqa: PLR0913 many arguments in function definition
self,
Expand Down Expand Up @@ -130,8 +130,8 @@ def __init__( # noqa: PLR0913 many arguments in function definition
self._min_max_rt_norm = False

def _load_file(self, filename: str) -> pd.DataFrame:
self.csv_sep = self._get_table_delimiter(filename)
return pd.read_csv(filename, sep=self.csv_sep, keep_default_na=False)
csv_sep = self._get_table_delimiter(filename)
return pd.read_csv(filename, sep=csv_sep, keep_default_na=False)

def _post_process(self) -> None:
super()._post_process()
Expand All @@ -145,15 +145,10 @@ class SpectronautReportReader(MaxQuantReader):

Other parameters, please see `MaxQuantReader`
in `alphabase.psm_reader.maxquant_reader`

Parameters
----------
csv_sep : str, optional
Delimiter for TSV/CSV, by default ','

"""

_reader_type = "spectronaut_report"
_add_unimod_to_mod_mapping = True

def __init__( # noqa: PLR0913 many arguments in function definition
self,
Expand All @@ -178,13 +173,14 @@ def __init__( # noqa: PLR0913 many arguments in function definition
)

self.precursor_column = "EG.PrecursorId"
self.mod_seq_column = "ModifiedSequence"

self._min_max_rt_norm = False

def _load_file(self, filename: str) -> pd.DataFrame:
self.mod_seq_column = "ModifiedSequence"
self.csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=self.csv_sep, keep_default_na=False)
csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=csv_sep, keep_default_na=False)

df[[self.mod_seq_column, PsmDfCols.CHARGE]] = df[
self.precursor_column
].str.split(".", expand=True, n=2)
Expand Down
62 changes: 2 additions & 60 deletions alphabase/psm_reader/maxquant_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import numpy as np
import pandas as pd

from alphabase.constants.modification import MOD_DF
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
Expand All @@ -19,16 +18,6 @@
# make sure all warnings are shown
warnings.filterwarnings("always")

# Lookup table from AlphaBase modification name to a UniMod-style notation.
# Names ending in "@<X>" (single-character site X) map to "<X>(UniMod:<id>)",
# all other names map to "_(UniMod:<id>)".
mod_to_unimod_dict = {}
for mod_name, unimod_id_ in MOD_DF[["mod_name", "unimod_id"]].to_numpy():
    unimod_id = int(unimod_id_)
    # Entries with id -1 are skipped (presumably "no UniMod accession" --
    # confirm against MOD_DF). After int() a comparison with the string
    # "-1" can never match, so only the integer is checked.
    if unimod_id == -1:
        continue
    if mod_name[-2] == "@":
        mod_to_unimod_dict[mod_name] = f"{mod_name[-1]}(UniMod:{unimod_id})"
    else:
        mod_to_unimod_dict[mod_name] = f"_(UniMod:{unimod_id})"


@numba.njit
def replace_parentheses_with_brackets(
Expand Down Expand Up @@ -138,6 +127,7 @@ class MaxQuantReader(PSMReaderBase):
"""Reader for MaxQuant data."""

_reader_type = "maxquant"
_add_unimod_to_mod_mapping = True

def __init__( # noqa: PLR0913 many arguments in function definition
self,
Expand Down Expand Up @@ -206,61 +196,12 @@ def __init__( # noqa: PLR0913 many arguments in function definition
self._mod_seq_columns = mod_seq_columns
self.mod_seq_column = "Modified sequence"

def _find_mod_seq_column(self, df: pd.DataFrame) -> None:
for mod_seq_col in self._mod_seq_columns:
if mod_seq_col in df.columns:
self.mod_seq_column = mod_seq_col
break

def _init_modification_mapping(self) -> None:
    """Initialise `self.modification_mapping` from the "maxquant" section of psm_reader.yaml."""
    yaml_mapping = psm_reader_yaml["maxquant"]["modification_mapping"]
    # deep copy: otherwise the maxquant reader would modify the shared yaml dict inplace
    self.modification_mapping = copy.deepcopy(yaml_mapping)

def set_modification_mapping(
    self, modification_mapping: Optional[dict] = None
) -> None:
    """Set modification mapping.

    The mapping is first handled by the base class. It is then extended
    with the UniMod notations (`_add_all_unimod`) and with alternative
    bracket notations (`_extend_mod_brackets`). Finally
    `self.rev_mod_mapping` is rebuilt from the extended mapping.

    Parameters
    ----------
    modification_mapping : dict, optional
        Passed unchanged to the base-class implementation.

    """
    super().set_modification_mapping(modification_mapping)
    self._add_all_unimod()
    self._extend_mod_brackets()
    # rebuild the reverse lookup so it reflects the extended mapping
    self.rev_mod_mapping = self._get_reversed_mod_mapping()

def _add_all_unimod(self) -> None:
    """Append the UniMod notation of every known modification to `self.modification_mapping`.

    Modifications not yet in the mapping get a new single-item list.
    """
    mapping = self.modification_mapping
    for name, unimod_notation in mod_to_unimod_dict.items():
        mapping.setdefault(name, []).append(unimod_notation)

def _extend_mod_brackets(self) -> None:
"""Update modification_mapping to include different bracket types."""
for key, mod_list in list(self.modification_mapping.items()):
mod_set = set(mod_list)
# extend bracket types of modifications
# K(Acetyl) -> K[Acetyl]
# (Phospho) -> _(Phospho)
# _[Phospho] -> _(Phospho)
for mod in mod_list:
if mod[1] == "(":
mod_set.add(f"{mod[0]}[{mod[2:-1]}]")
elif mod[1] == "[":
mod_set.add(f"{mod[0]}({mod[2:-1]})")

if mod.startswith("_"):
mod_set.add(f"{mod[1:]}")
elif mod.startswith("("):
mod_set.add(f"_{mod}")
mod_set.add(f"[{mod[1:-1]}]")
mod_set.add(f"_[{mod[1:-1]}]")
elif mod.startswith("["):
mod_set.add(f"_{mod}")
mod_set.add(f"({mod[1:-1]})")
mod_set.add(f"_({mod[1:-1]})")

self.modification_mapping[key] = list(mod_set)

def _translate_decoy(self) -> None:
if PsmDfCols.DECOY in self._psm_df.columns:
self._psm_df[PsmDfCols.DECOY] = (
Expand All @@ -270,6 +211,7 @@ def _translate_decoy(self) -> None:
def _load_file(self, filename: str) -> pd.DataFrame:
csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=csv_sep, keep_default_na=False)

self._find_mod_seq_column(df)
df = df[~pd.isna(df["Retention time"])]
df.fillna("", inplace=True)
Expand Down
112 changes: 36 additions & 76 deletions alphabase/psm_reader/psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,71 +14,15 @@
from alphabase.peptide import mobility
from alphabase.peptide.precursor import reset_precursor_df, update_precursor_mz
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.utils import (
MOD_TO_UNIMOD_DICT,
get_extended_modifications,
keep_modifications,
translate_modifications,
)
from alphabase.utils import _get_delimiter
from alphabase.yaml_utils import load_yaml


def translate_other_modification(mod_str: str, mod_dict: dict) -> tuple:
    """Translate modifications of `mod_str` to the AlphaBase format mapped by mod_dict.

    Parameters
    ----------
    mod_str : str
        mod list in str format, separated by ';',
        e.g. ModA;ModB
    mod_dict : dict
        translate mod dict from others to AlphaBase,
        e.g. for pFind, key=['Phospho[S]','Oxidation[M]'],
        value=['Phospho@S','Oxidation@M']

    Returns
    -------
    tuple
        A pair `(translated_mods, unknown_mods)`:

        - `translated_mods`: new mods in AlphaBase format separated by ';',
          or pd.NA if any modification is not in `mod_dict`.
        - `unknown_mods`: list of the modifications of `mod_str` that are
          not in `mod_dict` (empty if all could be translated).

    """
    if mod_str == "":
        return "", []

    mods = mod_str.split(";")
    unknown_mods = [mod for mod in mods if mod not in mod_dict]
    if unknown_mods:
        # a single unknown modification invalidates the whole entry
        return pd.NA, unknown_mods
    return ";".join(mod_dict[mod] for mod in mods), []


def _keep_modifications(mod_str: str, mod_set: set) -> str:
"""Check if modifications of `mod_str` are in `mod_set`.

Parameters
----------
mod_str : str
mod list in str format, seperated by ';',
e.g. Oxidation@M;Phospho@S.
mod_set : set
mod set to check

Returns
-------
str
original `mod_str` if all modifications are in mod_set
else pd.NA.

"""
if not mod_str:
return ""
for mod in mod_str.split(";"):
if mod not in mod_set:
return pd.NA
return mod_str


#: See `psm_reader.yaml <https://github.com/MannLabs/alphabase/blob/main/alphabase/constants/const_files/psm_reader.yaml>`_
psm_reader_yaml = load_yaml(Path(CONST_FILE_FOLDER) / "psm_reader.yaml")

Expand All @@ -89,6 +33,8 @@ class PSMReaderBase(ABC):
# the type of the reader, this references a key in psm_reader.yaml
_reader_type: str

_add_unimod_to_mod_mapping: bool = False

def __init__(
self,
*,
Expand Down Expand Up @@ -192,6 +138,7 @@ def __init__(
self._engine_rt_unit = rt_unit
self._min_irt_value = -100
self._max_irt_value = 200
self._mod_seq_columns = []

for key, value in kwargs.items(): # TODO: remove and remove kwargs
warnings.warn(
Expand Down Expand Up @@ -266,17 +213,41 @@ def set_modification_mapping(
self.modification_mapping = copy.deepcopy(modification_mapping)

self._str_mods_to_lists()

if self._add_unimod_to_mod_mapping:
self._add_all_unimod()
self._extend_mod_brackets()

self.rev_mod_mapping = self._get_reversed_mod_mapping()

def _init_modification_mapping(self) -> None:
self.modification_mapping = {}

def _add_all_unimod(self) -> None:
for mod_name, unimod in MOD_TO_UNIMOD_DICT.items():
if mod_name in self.modification_mapping:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could we reuse the logic used to add custom mods here? I feel like adding unimod, adding reader specific mods and adding custom mods is very similar.

self.modification_mapping[mod_name].append(unimod)
else:
self.modification_mapping[mod_name] = [unimod]

def _extend_mod_brackets(self) -> None:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code (ab in general, not your PR) feels to me like we should start a ModificationMapping class 😆

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree. It also depends on how heavy we are going to use these reader classes. Before, I just used them to get unified training data.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done here #247

"""Update modification_mapping to include different bracket types."""
for key, mod_list in list(self.modification_mapping.items()):
self.modification_mapping[key] = get_extended_modifications(mod_list)

def _str_mods_to_lists(self) -> None:
"""Convert all single strings to lists containing one item in self.modification_mapping."""
for mod, val in list(self.modification_mapping.items()):
if isinstance(val, str):
self.modification_mapping[mod] = [val]

def _find_mod_seq_column(self, df: pd.DataFrame) -> None: # called in _load_file
for mod_seq_col in self._mod_seq_columns:
if mod_seq_col in df.columns:
self.mod_seq_column = mod_seq_col
break
# TODO: warn if there's more

def _get_reversed_mod_mapping(self) -> Dict[str, str]:
"""Create a reverse mapping from the modification format used by the search engine to the AlphaBase format."""
rev_mod_mapping = {}
Expand Down Expand Up @@ -457,20 +428,9 @@ def _translate_columns(self, origin_df: pd.DataFrame) -> None:
self._psm_df[PsmDfCols.SPEC_IDX] = self._psm_df[PsmDfCols.SCAN_NUM] - 1

def _transform_table(self) -> None: # noqa: B027 empty method in an abstract base class
"""Transform the dataframe format if needed.
"""Transform the dataframe format if needed, ddd information inplace into self._psm_df.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(nit) typo: add


Usually only needed in combination with spectral libraries.

Parameters
----------
origin_df : pd.DataFrame
df of other search engines

Returns
-------
None
Add information inplace into self._psm_df

"""

@abstractmethod
Expand Down Expand Up @@ -499,7 +459,7 @@ def _translate_modifications(self) -> None:
"""
self._psm_df[PsmDfCols.MODS], unknown_mods = zip(
*self._psm_df[PsmDfCols.MODS].apply(
translate_other_modification, mod_dict=self.rev_mod_mapping
translate_modifications, mod_dict=self.rev_mod_mapping
)
)

Expand Down Expand Up @@ -568,7 +528,7 @@ def filter_psm_by_modifications(
"Acetyl@Protein_N-term",
}
self._psm_df[PsmDfCols.MODS] = self._psm_df[PsmDfCols.MODS].apply(
_keep_modifications, mod_set=include_mod_set
keep_modifications, mod_set=include_mod_set
)

self._psm_df.dropna(subset=[PsmDfCols.MODS], inplace=True)
Expand Down
Loading
Loading