MannLabs · mschwoer · Nov 20, 2024 · Nov 20, 2024 · Nov 20, 2024 · Nov 20, 2024
diff --git a/.github/workflows/_run_tests.yml b/.github/workflows/_run_tests.yml
@@ -14,7 +14,7 @@ on:
         required: true
         type: string
 jobs:
-  pre-commit:
+  run-tests:
     runs-on: ${{ inputs.os }}
     steps:
     - uses: actions/checkout@v3

diff --git a/.github/workflows/branch-checks.yaml b/.github/workflows/branch-checks.yaml
@@ -24,3 +24,11 @@ jobs:
       python-version: ${{ matrix.python-version }}
       os: ${{ matrix.os }}
       install-script: "loose_pip_install.sh"
+  get-code-review-input:
+    runs-on: ubuntu-latest
+    #if: contains(github.event.pull_request.labels.*.name, 'code-review')
+    steps:
+      - uses: MannLabs/alphashared/actions/get-code-review-input@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUMBER: ${{ github.event.number }}
diff --git a/alphabase/constants/_const.py b/alphabase/constants/_const.py
@@ -5,6 +5,7 @@
 from alphabase.yaml_utils import load_yaml
 
 CONST_FILE_FOLDER = os.path.join(os.path.dirname(__file__), "const_files")
+PSM_READER_YAML_FILE_NAME = "psm_reader.yaml"
 
 common_const_dict: dict = load_yaml(
     os.path.join(CONST_FILE_FOLDER, "common_constants.yaml")

diff --git a/alphabase/constants/const_files/psm_reader.yaml b/alphabase/constants/const_files/psm_reader.yaml
@@ -13,13 +13,8 @@ alphapept:
     'raw_name': 'raw_name' #parse from `ms_data.hdf`` file
     'fdr': 'q_value'
     'decoy': 'decoy'
-  modification_mapping:
-    'Carbamidomethyl@C': 'cC'
-    'Oxidation@M': 'oxM'
-    'Phospho@S': 'pS'
-    'Phospho@T': 'pT'
-    'Phospho@Y': 'pY'
-    'Acetyl@Protein_N-term': 'a'
+  modification_mapping_type: 'alphapept'
+
 
 maxquant:
   reader_type: maxquant
@@ -49,7 +44,10 @@ maxquant:
     'genes': ['Gene Names','Gene names']
     'decoy': 'Reverse'
     'intensity': 'Intensity'
-  modification_mapping:
+  modification_mapping_type: 'maxquant'
+
+modification_mappings:
+  maxquant:
     'Dimethyl@K':
       - 'K(Dimethyl)'
     'Dimethyl@R':
@@ -103,6 +101,13 @@ maxquant:
     'Deamidated@Q': ['Q(Deamidation (NQ))','Q(de)']
     'GlyGly@K': ['K(GlyGly (K))', 'K(gl)']
     'hydroxyisobutyryl@K': 'K(2-)'
+  alphapept:
+    'Carbamidomethyl@C': 'cC'
+    'Oxidation@M': 'oxM'
+    'Phospho@S': 'pS'
+    'Phospho@T': 'pT'
+    'Phospho@Y': 'pY'
+    'Acetyl@Protein_N-term': 'a'
 
 pfind:
   reader_type: pfind
@@ -119,8 +124,7 @@ pfind:
     'uniprot_ids': 'Proteins'
     'fdr': 'Q-value'
     'decoy': ['Target/Decoy', 'Targe/Decoy']
-  modification_mapping:
-    '': ''
+  modification_mapping_type: 'maxquant'
 
 msfragger_pepxml:
   reader_type: msfragger_pepxml
@@ -136,8 +140,6 @@ msfragger_pepxml:
     'proteins': 'protein'
     'raw_name': 'raw_name'
     'mobility': 'ion_mobility'
-  modification_mapping:
-    '': ''
   mass_mapped_mods:
     - 'Oxidation@M' #other Oxidation@X are not needed here
     - 'Carbamidomethyl@C'
@@ -150,6 +152,7 @@ msfragger_pepxml:
     - 'Dimethyl@K' # Any_N-term is not needed here as it will be infered in-the-fly
     - 'Methyl@E' #an example of a PTM that can be C-term
   mod_mass_tol: 0.1 # Da
+  modification_mapping_type: 'maxquant'
 
 diann:
   reader_type: diann
@@ -172,7 +175,7 @@ diann:
     'fdr': 'Q.Value'
   mod_seq_columns:
     - "Modified.Sequence"
-  modification_mapping: 'maxquant'
+  modification_mapping_type: 'maxquant'
 
 spectronaut_report:
   reader_type: spectronaut_report
@@ -188,7 +191,7 @@ spectronaut_report:
     'charge': 'charge'
   mod_seq_columns:
     - 'ModifiedSequence'
-  modification_mapping: 'maxquant'
+  modification_mapping_type: 'maxquant'
 
 spectronaut:
   reader_type: spectronaut
@@ -212,7 +215,7 @@ spectronaut:
     - 'ModifiedPeptideSequence'
     - 'LabeledSequence'
     - 'FullUniModPeptideName'
-  modification_mapping: 'maxquant'
+  modification_mapping_type: 'maxquant'
 
 library_reader_base:
   reader_type: library_reader_base
@@ -243,11 +246,11 @@ library_reader_base:
     - 'FullUniModPeptideName'
     - 'LabeledSequence'
     - 'FullUniModPeptideName'
-  modification_mapping: 'maxquant'
+  modification_mapping_type: 'maxquant'
 
 sage:
   reader_type: sage
-  rt_unit: minute
+  rt_unit: second
   column_mapping:
     'modified_sequence': 'peptide'
     'sequence': 'stripped_peptide'
@@ -262,3 +265,4 @@ sage:
     'peptide_fdr': 'peptide_q'
     'protein_fdr': 'protein_q'
     'decoy': 'is_decoy'
+  modification_mapping_type: 'maxquant'
diff --git a/alphabase/psm_reader/alphapept_reader.py b/alphabase/psm_reader/alphapept_reader.py
@@ -52,7 +52,6 @@ class AlphaPeptReader(PSMReaderBase):
     """Reader for AlphaPept's *.ms_data.hdf files."""
 
     _reader_type = "alphapept"
-    _modification_type = "alphapept"
 
     def _load_file(self, filename: str) -> pd.DataFrame:
         """Load an AlphaPept output file to a DataFrame."""

diff --git a/alphabase/psm_reader/dia_psm_reader.py b/alphabase/psm_reader/dia_psm_reader.py
@@ -18,7 +18,6 @@ class SpectronautReader(MaxQuantReader):
     _reader_type = "spectronaut"
     _add_unimod_to_mod_mapping = True
     _min_max_rt_norm = True
-    _fixed_c57_default = False
 
     def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
         """Spectronaut-specific preprocessing of output data."""

diff --git a/alphabase/psm_reader/maxquant_reader.py b/alphabase/psm_reader/maxquant_reader.py
@@ -11,6 +11,7 @@
 from alphabase.psm_reader.psm_reader import (
     PSMReaderBase,
     psm_reader_provider,
+    psm_reader_yaml,
 )
 from alphabase.psm_reader.utils import get_column_mapping_for_df
 
@@ -127,8 +128,6 @@ class MaxQuantReader(PSMReaderBase):
 
     _reader_type = "maxquant"
     _add_unimod_to_mod_mapping = True
-    _modification_type = "maxquant"
-    _fixed_c57_default = True
 
     def __init__(  # noqa: PLR0913, D417 # too many arguments in function definition, missing argument descriptions
         self,
@@ -138,7 +137,7 @@ def __init__(  # noqa: PLR0913, D417 # too many arguments in function definition
         mod_seq_columns: Optional[List[str]] = None,
         fdr: float = 0.01,
         keep_decoy: bool = False,
-        rt_unit: str = "minute",
+        rt_unit: Optional[str] = None,
         # MaxQuant reader-specific
         fixed_C57: Optional[bool] = None,  # noqa: N803 TODO: make this  *,fixed_c57  (breaking)
         **kwargs,
@@ -152,7 +151,7 @@ def __init__(  # noqa: PLR0913, D417 # too many arguments in function definition
         fixed_C57 : bool, optional
             If true, the search engine will not show `Carbamidomethyl`
             in the modified sequences.
-            by default True
+            by default read from psm_reader_yaml key `fixed_C57`.
 
         See documentation of `PSMReaderBase` for the rest of parameters.
 
@@ -167,7 +166,11 @@ def __init__(  # noqa: PLR0913, D417 # too many arguments in function definition
             **kwargs,
         )
 
-        self.fixed_C57 = fixed_C57 if fixed_C57 is not None else self._fixed_c57_default
+        self.fixed_C57 = (
+            fixed_C57
+            if fixed_C57 is not None
+            else psm_reader_yaml[self._reader_type]["fixed_C57"]
+        )
 
     def _translate_decoy(self) -> None:
         if PsmDfCols.DECOY in self._psm_df.columns:

diff --git a/alphabase/psm_reader/modification_mapper.py b/alphabase/psm_reader/modification_mapper.py
@@ -15,7 +15,7 @@ def __init__(
         custom_modification_mapping: Optional[Dict[str, str]],
         *,
         reader_yaml: Dict,
-        modification_type: Optional[str],
+        mapping_type: str,
         add_unimod_to_mod_mapping: bool,
     ):
         """Initialize the ModificationMapper.
@@ -35,7 +35,7 @@ def __init__(
         reader_yaml:
             the yaml (read from file) containing the modification mappings
 
-        modification_type:
+        mapping_type:
             the type of modification mapping ("maxquant" or "alphapept")
 
         add_unimod_to_mod_mapping:
@@ -44,7 +44,7 @@ def __init__(
         """
         self._psm_reader_yaml = reader_yaml
         self._add_unimod_to_mod_mapping = add_unimod_to_mod_mapping
-        self._modification_type = modification_type
+        self._mapping_type = mapping_type
 
         self.modification_mapping = None
         self.rev_mod_mapping = None
@@ -95,23 +95,20 @@ def set_modification_mapping(
         ----------
         modification_mapping:
             If dictionary: the current modification_mapping will be overwritten by this.
-            If str: the parameter will be interpreted as a reader type, and the modification_mapping is read from the
-                "modification_mapping" section of the psm_reader_yaml
+            If str: the parameter will be interpreted as a modification_mapping_type, and the mapping is read from the
+                respective key in the "modification_mappings" section of the psm_reader_yaml
 
         """
         if modification_mapping is None:
             self._init_modification_mapping()
         elif isinstance(
-            modification_mapping, str
-        ):  # TODO: remove this overloading of the parameter by introducing yaml key "modification_mapping_type"
-            if modification_mapping in self._psm_reader_yaml:
-                self.modification_mapping = self._psm_reader_yaml[modification_mapping][
-                    "modification_mapping"
-                ]
-            else:
-                raise ValueError(
-                    f"Unknown modification mapping: {modification_mapping}"
-                )
+            modification_mapping,
+            str,  # interpret as modification_mapping_type
+        ):
+            self.modification_mapping = self._psm_reader_yaml["modification_mappings"][
+                modification_mapping
+            ]
+
         else:
             self.modification_mapping = copy.deepcopy(modification_mapping)
 
@@ -125,12 +122,9 @@ def set_modification_mapping(
 
     def _init_modification_mapping(self) -> None:
         """Initialize the modification mapping from the psm_reader_yaml or as an empty dictionary."""
-        if self._modification_type is not None:
-            self.modification_mapping = self._psm_reader_yaml[self._modification_type][
-                "modification_mapping"
-            ]
-        else:
-            self.modification_mapping = {}
+        self.modification_mapping = self._psm_reader_yaml["modification_mappings"][
+            self._mapping_type
+        ]
 
     def _add_all_unimod(self) -> None:
         """Add all unimod modifications to the modification mapping."""

diff --git a/alphabase/psm_reader/msfragger_reader.py b/alphabase/psm_reader/msfragger_reader.py
@@ -21,12 +21,11 @@ def _is_fragger_decoy(proteins: List[str]) -> bool:
     return all(prot.lower().startswith("rev_") for prot in proteins)
 
 
-mass_mapped_mods = psm_reader_yaml["msfragger_pepxml"]["mass_mapped_mods"]
-mod_mass_tol = psm_reader_yaml["msfragger_pepxml"]["mod_mass_tol"]
-
-
 def _get_mods_from_masses(  # noqa: PLR0912, C901 too many branches, too complex TODO: refactor
-    sequence: str, msf_aa_mods: List[str]
+    sequence: str,
+    msf_aa_mods: List[str],
+    mass_mapped_mods: List[str],
+    mod_mass_tol: float,
 ) -> Tuple[str, str, str, str]:
     mods = []
     mod_sites = []
@@ -106,7 +105,7 @@ def __init__(  # noqa: PLR0913, D417 # too many arguments in function definition
         # mod_seq_columns: Optional[List[str]] = None,# TODO: not needed here?
         fdr: float = 0.001,  # refers to E-value in the PepXML
         keep_decoy: bool = False,
-        rt_unit: str = "second",
+        rt_unit: Optional[str] = None,
         # MSFragger reader-specific:
         keep_unknown_aa_mass_diffs: bool = False,
         **kwargs,
@@ -134,7 +133,10 @@ def __init__(  # noqa: PLR0913, D417 # too many arguments in function definition
             rt_unit=rt_unit,
             **kwargs,
         )
-        self.keep_unknown_aa_mass_diffs = keep_unknown_aa_mass_diffs
+        self._keep_unknown_aa_mass_diffs = keep_unknown_aa_mass_diffs
+        # TODO: should those be set via API, too?
+        self._mass_mapped_mods = psm_reader_yaml["msfragger_pepxml"]["mass_mapped_mods"]
+        self._mod_mass_tol = psm_reader_yaml["msfragger_pepxml"]["mod_mass_tol"]
 
     def _translate_modifications(self) -> None:
         pass
@@ -183,11 +185,16 @@ def _load_modifications(self, origin_df: pd.DataFrame) -> None:
             self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES],
         ) = zip(
             *origin_df[["peptide", "modifications"]].apply(
-                lambda x: _get_mods_from_masses(*x), axis=1
+                lambda x: _get_mods_from_masses(
+                    *x,
+                    mass_mapped_mods=self._mass_mapped_mods,
+                    mod_mass_tol=self._mod_mass_tol,
+                ),
+                axis=1,
             )
         )
 
-        if not self.keep_unknown_aa_mass_diffs:
+        if not self._keep_unknown_aa_mass_diffs:
             self._psm_df[PsmDfCols.TO_REMOVE] += (
                 self._psm_df[PsmDfCols.AA_MASS_DIFFS] != ""
             )