Merge pull request #285 from MannLabs/infer-charged-frag-types

fix #279
MannLabs · Jan 21, 2025 · 8a1298c · 8a1298c
2 parents 57e58ab + 1c7633d
commit 8a1298c
Show file tree

Hide file tree

Showing 4 changed files with 220 additions and 94 deletions.
diff --git a/alphabase/spectral_library/base.py b/alphabase/spectral_library/base.py
@@ -683,6 +683,7 @@ def load_hdf(
         hdf_file: str,
         load_mod_seq: bool = True,
         support_legacy_mods_format: bool = True,
+        infer_charged_frag_types: bool = True,
     ):
         """Load the hdf library from hdf_file
 
@@ -702,6 +703,10 @@ def load_hdf(
             Defaults to True.
             DeprecationWarning: future versions will have a different default and eventually this flag will be dropped.
 
+        infer_charged_frag_types : bool, optional
+            If True, infer the charged fragment types from the fragment m/z columns stored in the hdf file. Defaults to True.
+            This is the default because users most likely do not know which charged fragment types the hdf file contains.
+            If False, only those charged frag types in `self.charged_frag_types` that are present in the hdf file are loaded.
         """
         _hdf = HDF_File(hdf_file)
         self._precursor_df: pd.DataFrame = _hdf.library.precursor_df.values
@@ -719,17 +724,18 @@ def load_hdf(
             self._precursor_df[cols] = mod_seq_df[cols]
 
         _fragment_mz_df = _hdf.library.fragment_mz_df.values
-        self._fragment_mz_df = _fragment_mz_df[
-            sort_charged_frag_types(
-                filter_valid_charged_frag_types(_fragment_mz_df.columns.values)
+        if infer_charged_frag_types:
+            self.charged_frag_types = sort_charged_frag_types(
+                filter_valid_charged_frag_types(_fragment_mz_df.columns)
             )
+
+        self._fragment_mz_df = _fragment_mz_df[
+            get_available_columns(_fragment_mz_df, self.charged_frag_types)
         ]
 
         _fragment_intensity_df = _hdf.library.fragment_intensity_df.values
         self._fragment_intensity_df = _fragment_intensity_df[
-            sort_charged_frag_types(
-                filter_valid_charged_frag_types(_fragment_intensity_df.columns.values)
-            )
+            get_available_columns(_fragment_intensity_df, self.charged_frag_types)
         ]
 
     @staticmethod
@@ -822,3 +828,27 @@ def annotate_fragments_from_speclib(
     speclib._fragment_intensity_df = fragment_speclib._fragment_intensity_df.copy()
 
     return speclib
+
+
+def get_available_columns(df, columns):
+    """Get a list of column names that exist in the given dataframe.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The dataframe to check columns against
+    columns : list
+        List of column names to check
+
+    Returns
+    -------
+    list
+        Names from `columns` that exist in the dataframe, in the order given in `columns`
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({'a': [1], 'b': [2]})
+    >>> get_available_columns(df, ['a', 'b', 'c'])
+    ['a', 'b']
+    """
+    return [col for col in columns if col in df.columns]
diff --git a/alphabase/spectral_library/flat.py b/alphabase/spectral_library/flat.py
@@ -11,7 +11,7 @@
     remove_unused_fragments,
     sort_charged_frag_types,
 )
-from alphabase.spectral_library.base import SpecLibBase
+from alphabase.spectral_library.base import SpecLibBase, get_available_columns
 
 
 class SpecLibFlat(SpecLibBase):
@@ -182,7 +182,12 @@ def save_hdf(self, hdf_file: str):
         _hdf.library.fragment_mz_df = self.fragment_mz_df
         _hdf.library.fragment_intensity_df = self.fragment_intensity_df
 
-    def load_hdf(self, hdf_file: str, load_mod_seq: bool = False):
+    def load_hdf(
+        self,
+        hdf_file: str,
+        load_mod_seq: bool = False,
+        infer_charged_frag_types: bool = True,
+    ):
         """Load the hdf library from hdf_file
 
         Parameters
@@ -194,6 +199,11 @@ def load_hdf(self, hdf_file: str, load_mod_seq: bool = False):
             if also load mod_seq_df.
             Defaults to False.
 
+        infer_charged_frag_types : bool, optional
+            If True, infer the charged fragment types from the fragment m/z columns stored in the hdf file. Defaults to True.
+            This is the default because users most likely do not know which charged fragment types the hdf file contains.
+            If False, only those charged frag types in `self.charged_frag_types` that are present in the hdf file are loaded.
+
         """
         super().load_hdf(hdf_file, load_mod_seq=load_mod_seq)
         _hdf = HDF_File(
@@ -202,18 +212,19 @@ def load_hdf(self, hdf_file: str, load_mod_seq: bool = False):
         self._fragment_df = _hdf.library.fragment_df.values
         self._protein_df = _hdf.library.protein_df.values
 
-        _fragment_mz_df = _hdf.library.fragment_mz_df.values
-        self._fragment_mz_df = _fragment_mz_df[
-            sort_charged_frag_types(
-                filter_valid_charged_frag_types(_fragment_mz_df.columns.values)
+        if infer_charged_frag_types:
+            self.charged_frag_types = sort_charged_frag_types(
+                filter_valid_charged_frag_types(_hdf.library.fragment_mz_df.columns)
             )
-        ]
 
         _fragment_intensity_df = _hdf.library.fragment_intensity_df.values
         self._fragment_intensity_df = _fragment_intensity_df[
-            sort_charged_frag_types(
-                filter_valid_charged_frag_types(_fragment_intensity_df.columns.values)
-            )
+            get_available_columns(_fragment_intensity_df, self.charged_frag_types)
+        ]
+
+        _fragment_mz_df = _hdf.library.fragment_mz_df.values
+        self._fragment_mz_df = _fragment_mz_df[
+            get_available_columns(_fragment_mz_df, self.charged_frag_types)
         ]
 
     def get_full_charged_types(self, frag_df: pd.DataFrame) -> list:

diff --git a/nbs_tests/spectral_library/flat_library.ipynb b/nbs_tests/spectral_library/flat_library.ipynb
@@ -196,13 +196,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Check if the fragment intensity dataframes are equal\n",
-    "# since for now the reader uses a predefined charge_types and the \"to_specLibBase\" method infers the charged_types \n",
-    "# We will ignore columns that are full zeros in the target object\n",
     "\n",
-    "non_zero_target_columns = target.fragment_intensity_df.columns[target.fragment_intensity_df.sum() != 0]\n",
-    "\n",
-    "pd.testing.assert_frame_equal(target.fragment_intensity_df[non_zero_target_columns].sort_index(axis=1), back_to_base.fragment_intensity_df.sort_index(axis=1), check_dtype=False)"
+    "pd.testing.assert_frame_equal(target.fragment_intensity_df.sort_index(axis=1), back_to_base.fragment_intensity_df.sort_index(axis=1), check_dtype=False)"
    ]
   },
   {
@@ -213,7 +208,7 @@
    "source": [
     "# Check the non zero values in back_to_base.fragment_mz_df are the same as in target.fragment_mz_df\n",
     "non_zero_indices = back_to_base.fragment_mz_df > 0\n",
-    "pd.testing.assert_frame_equal(target.fragment_mz_df[non_zero_target_columns][non_zero_indices].sort_index(axis=1), back_to_base.fragment_mz_df[non_zero_indices].sort_index(axis=1), check_dtype=False)"
+    "pd.testing.assert_frame_equal(target.fragment_mz_df[non_zero_indices].sort_index(axis=1), back_to_base.fragment_mz_df[non_zero_indices].sort_index(axis=1), check_dtype=False)"
    ]
   },
   {
@@ -224,12 +219,12 @@
    "source": [
     "#Calculate the full fragment_mz for the back_to_base and compare to the original\n",
     "back_to_base.calc_fragment_mz_df()\n",
-    "pd.testing.assert_frame_equal(target.fragment_mz_df[non_zero_target_columns].sort_index(axis=1), back_to_base.fragment_mz_df.sort_index(axis=1), check_dtype=False)"
+    "pd.testing.assert_frame_equal(target.fragment_mz_df.sort_index(axis=1), back_to_base.fragment_mz_df.sort_index(axis=1), check_dtype=False)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -284,7 +279,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.5"
+   "version": "3.11.7"
   }
  },
  "nbformat": 4,