-
Notifications
You must be signed in to change notification settings - Fork 10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Refactor readers iv #246
base: refactor_readers_III
Are you sure you want to change the base?
Refactor readers iv #246
Changes from all commits
6cfd070
8968d23
baf75dc
e079ffe
84b3091
86c76bd
682b07b
a200913
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,71 +14,15 @@ | |
from alphabase.peptide import mobility | ||
from alphabase.peptide.precursor import reset_precursor_df, update_precursor_mz | ||
from alphabase.psm_reader.keys import PsmDfCols | ||
from alphabase.psm_reader.utils import ( | ||
MOD_TO_UNIMOD_DICT, | ||
get_extended_modifications, | ||
keep_modifications, | ||
translate_modifications, | ||
) | ||
from alphabase.utils import _get_delimiter | ||
from alphabase.yaml_utils import load_yaml | ||
|
||
|
||
def translate_other_modification(mod_str: str, mod_dict: dict) -> str: | ||
"""Translate modifications of `mod_str` to the AlphaBase format mapped by mod_dict. | ||
|
||
Parameters | ||
---------- | ||
mod_str : str | ||
mod list in str format, seperated by ';', | ||
e.g. ModA;ModB | ||
mod_dict : dict | ||
translate mod dict from others to AlphaBase, | ||
e.g. for pFind, key=['Phospho[S]','Oxidation[M]'], | ||
value=['Phospho@S','Oxidation@M'] | ||
|
||
Returns | ||
------- | ||
str | ||
new mods in AlphaBase format seperated by ';'. if any | ||
modification is not in `mod_dict`, return pd.NA. | ||
|
||
""" | ||
if mod_str == "": | ||
return "", [] | ||
ret_mods = [] | ||
unknown_mods = [] | ||
for mod in mod_str.split(";"): | ||
if mod in mod_dict: | ||
ret_mods.append(mod_dict[mod]) | ||
else: | ||
unknown_mods.append(mod) | ||
|
||
if len(unknown_mods) > 0: | ||
return pd.NA, unknown_mods | ||
return ";".join(ret_mods), [] | ||
|
||
|
||
def _keep_modifications(mod_str: str, mod_set: set) -> str: | ||
"""Check if modifications of `mod_str` are in `mod_set`. | ||
|
||
Parameters | ||
---------- | ||
mod_str : str | ||
mod list in str format, seperated by ';', | ||
e.g. Oxidation@M;Phospho@S. | ||
mod_set : set | ||
mod set to check | ||
|
||
Returns | ||
------- | ||
str | ||
original `mod_str` if all modifications are in mod_set | ||
else pd.NA. | ||
|
||
""" | ||
if not mod_str: | ||
return "" | ||
for mod in mod_str.split(";"): | ||
if mod not in mod_set: | ||
return pd.NA | ||
return mod_str | ||
|
||
|
||
#: See `psm_reader.yaml <https://github.com/MannLabs/alphabase/blob/main/alphabase/constants/const_files/psm_reader.yaml>`_ | ||
psm_reader_yaml = load_yaml(Path(CONST_FILE_FOLDER) / "psm_reader.yaml") | ||
|
||
|
@@ -89,6 +33,8 @@ class PSMReaderBase(ABC): | |
# the type of the reader, this references a key in psm_reader.yaml | ||
_reader_type: str | ||
|
||
_add_unimod_to_mod_mapping: bool = False | ||
|
||
def __init__( | ||
self, | ||
*, | ||
|
@@ -192,6 +138,7 @@ def __init__( | |
self._engine_rt_unit = rt_unit | ||
self._min_irt_value = -100 | ||
self._max_irt_value = 200 | ||
self._mod_seq_columns = [] | ||
|
||
for key, value in kwargs.items(): # TODO: remove and remove kwargs | ||
warnings.warn( | ||
|
@@ -266,17 +213,41 @@ def set_modification_mapping( | |
self.modification_mapping = copy.deepcopy(modification_mapping) | ||
|
||
self._str_mods_to_lists() | ||
|
||
if self._add_unimod_to_mod_mapping: | ||
self._add_all_unimod() | ||
self._extend_mod_brackets() | ||
|
||
self.rev_mod_mapping = self._get_reversed_mod_mapping() | ||
|
||
def _init_modification_mapping(self) -> None: | ||
self.modification_mapping = {} | ||
|
||
def _add_all_unimod(self) -> None: | ||
for mod_name, unimod in MOD_TO_UNIMOD_DICT.items(): | ||
if mod_name in self.modification_mapping: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could we reuse the logic used to add custom mods here? I feel like adding unimod, adding reader specific mods and adding custom mods is very similar. |
||
self.modification_mapping[mod_name].append(unimod) | ||
else: | ||
self.modification_mapping[mod_name] = [unimod] | ||
|
||
def _extend_mod_brackets(self) -> None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This code (ab in general, not your PR) feels to me like we should start a ModificationMapping class 😆 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree. It also depends on how heavy we are going to use these reader classes. Before, I just used them to get unified training data. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done here #247 |
||
"""Update modification_mapping to include different bracket types.""" | ||
for key, mod_list in list(self.modification_mapping.items()): | ||
self.modification_mapping[key] = get_extended_modifications(mod_list) | ||
|
||
def _str_mods_to_lists(self) -> None: | ||
"""Convert all single strings to lists containing one item in self.modification_mapping.""" | ||
for mod, val in list(self.modification_mapping.items()): | ||
if isinstance(val, str): | ||
self.modification_mapping[mod] = [val] | ||
|
||
def _find_mod_seq_column(self, df: pd.DataFrame) -> None: # called in _load_file | ||
for mod_seq_col in self._mod_seq_columns: | ||
if mod_seq_col in df.columns: | ||
self.mod_seq_column = mod_seq_col | ||
break | ||
# TODO: warn if there's more | ||
|
||
def _get_reversed_mod_mapping(self) -> Dict[str, str]: | ||
"""Create a reverse mapping from the modification format used by the search engine to the AlphaBase format.""" | ||
rev_mod_mapping = {} | ||
|
@@ -457,20 +428,9 @@ def _translate_columns(self, origin_df: pd.DataFrame) -> None: | |
self._psm_df[PsmDfCols.SPEC_IDX] = self._psm_df[PsmDfCols.SCAN_NUM] - 1 | ||
|
||
def _transform_table(self) -> None: # noqa: B027 empty method in an abstract base class | ||
"""Transform the dataframe format if needed. | ||
"""Transform the dataframe format if needed, ddd information inplace into self._psm_df. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (nit) typo: add |
||
|
||
Usually only needed in combination with spectral libraries. | ||
|
||
Parameters | ||
---------- | ||
origin_df : pd.DataFrame | ||
df of other search engines | ||
|
||
Returns | ||
------- | ||
None | ||
Add information inplace into self._psm_df | ||
|
||
""" | ||
|
||
@abstractmethod | ||
|
@@ -499,7 +459,7 @@ def _translate_modifications(self) -> None: | |
""" | ||
self._psm_df[PsmDfCols.MODS], unknown_mods = zip( | ||
*self._psm_df[PsmDfCols.MODS].apply( | ||
translate_other_modification, mod_dict=self.rev_mod_mapping | ||
translate_modifications, mod_dict=self.rev_mod_mapping | ||
) | ||
) | ||
|
||
|
@@ -568,7 +528,7 @@ def filter_psm_by_modifications( | |
"Acetyl@Protein_N-term", | ||
} | ||
self._psm_df[PsmDfCols.MODS] = self._psm_df[PsmDfCols.MODS].apply( | ||
_keep_modifications, mod_set=include_mod_set | ||
keep_modifications, mod_set=include_mod_set | ||
) | ||
|
||
self._psm_df.dropna(subset=[PsmDfCols.MODS], inplace=True) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should we also add this to the base class?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nevermind :D