Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor readers vii #249

Merged
merged 6 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions alphabase/psm_reader/alphapept_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,16 +74,24 @@ def __init__(
self.hdf_dataset = "identifications"

def _load_file(self, filename: str) -> pd.DataFrame:
"""Load an AlphaPept output file to a DataFrame."""
with h5py.File(filename, "r") as _hdf:
dataset = _hdf[self.hdf_dataset]
df = pd.DataFrame({col: dataset[col] for col in dataset})
df[PsmDfCols.RAW_NAME] = Path(filename).name[: -len(".ms_data.hdf")]
df["precursor"] = df["precursor"].str.decode("utf-8")
# df['naked_sequence'] = df['naked_sequence'].str.decode('utf-8')
if "scan_no" in df.columns:
df["scan_no"] = df["scan_no"].astype("int")
df["raw_idx"] = df["scan_no"] - 1 # if thermo, use scan-1 as spec_idx
df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(int)

# TODO: make this more stable
df[PsmDfCols.RAW_NAME] = Path(filename).name[: -len(".ms_data.hdf")]

return df

def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""AlphaPept-specific preprocessing of output data."""
df["precursor"] = df["precursor"].str.decode("utf-8")
# df['naked_sequence'] = df['naked_sequence'].str.decode('utf-8')
if "scan_no" in df.columns:
df["scan_no"] = df["scan_no"].astype("int")
df["raw_idx"] = df["scan_no"] - 1 # if thermo, use scan-1 as spec_idx
df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(int)
return df

def _load_modifications(self, df: pd.DataFrame) -> None:
Expand Down
30 changes: 12 additions & 18 deletions alphabase/psm_reader/dia_psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,8 @@ def __init__( # noqa: PLR0913 many arguments in function definition

self._min_max_rt_norm = True

def _load_file(self, filename: str) -> pd.DataFrame:
csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=csv_sep, keep_default_na=False)

self.mod_seq_column = self._get_mod_seq_column(
df
) # TODO: this needs to be removed
def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""Spectronaut-specific preprocessing of output data."""
if "ReferenceRun" in df.columns:
df.drop_duplicates(
["ReferenceRun", self.mod_seq_column, "PrecursorCharge"], inplace=True
Expand Down Expand Up @@ -93,7 +88,7 @@ def __init__( # noqa: PLR0913 many arguments in function definition
)


class DiannReader(SpectronautReader):
class DiannReader(MaxQuantReader):
mschwoer marked this conversation as resolved.
Show resolved Hide resolved
"""Reader for DIANN data."""

_reader_type = "diann"
Expand Down Expand Up @@ -123,9 +118,13 @@ def __init__( # noqa: PLR0913 many arguments in function definition

self._min_max_rt_norm = False

def _load_file(self, filename: str) -> pd.DataFrame:
csv_sep = self._get_table_delimiter(filename)
return pd.read_csv(filename, sep=csv_sep, keep_default_na=False)
def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""DIANN-specific preprocessing of output data.

Nothing to do for DIANN, but the superclass method still needs to be overridden.
TODO: disentangle the inheritance structure.
"""
return df

def _post_process(self) -> None:
super()._post_process()
Expand Down Expand Up @@ -169,13 +168,8 @@ def __init__( # noqa: PLR0913 many arguments in function definition
self.precursor_column = "EG.PrecursorId" # TODO: move to yaml
self._min_max_rt_norm = False

def _load_file(self, filename: str) -> pd.DataFrame:
csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=csv_sep, keep_default_na=False)

self.mod_seq_column = self._get_mod_seq_column(
df
) # TODO: this needs to be removed
def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""Spectronaut report-specific preprocessing of output data."""
df[[self.mod_seq_column, PsmDfCols.CHARGE]] = df[
self.precursor_column
].str.split(".", expand=True, n=2)
Expand Down
6 changes: 2 additions & 4 deletions alphabase/psm_reader/maxquant_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,10 +195,8 @@ def _translate_decoy(self) -> None:
self._psm_df[PsmDfCols.DECOY] == "-"
).astype(np.int8)

def _load_file(self, filename: str) -> pd.DataFrame:
csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=csv_sep, keep_default_na=False)

def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""MaxQuant-specific preprocessing of output data."""
df = df[~pd.isna(df["Retention time"])]
df.fillna("", inplace=True)

Expand Down
20 changes: 11 additions & 9 deletions alphabase/psm_reader/msfragger_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,16 +124,18 @@ def _translate_modifications(self) -> None:
pass

def _load_file(self, filename: str) -> pd.DataFrame:
msf_df = pepxml.DataFrame(filename)
msf_df.fillna("", inplace=True)
if "ion_mobility" in msf_df.columns:
msf_df["ion_mobility"] = msf_df.ion_mobility.astype(float)
msf_df[PsmDfCols.RAW_NAME] = (
msf_df["spectrum"].str.split(".").apply(lambda x: x[0])
)
msf_df["to_remove"] = 0 # TODO: revisit
"""Load a MsFragger output file to a DataFrame."""
return pepxml.DataFrame(filename)

def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""MsFragger-specific preprocessing of output data."""
df.fillna("", inplace=True)
if "ion_mobility" in df.columns:
df["ion_mobility"] = df.ion_mobility.astype(float)
df[PsmDfCols.RAW_NAME] = df["spectrum"].str.split(".").apply(lambda x: x[0])
df["to_remove"] = 0 # TODO: revisit
self.column_mapping[PsmDfCols.TO_REMOVE] = "to_remove"
return msf_df
return df

def _translate_decoy(self) -> None:
self._psm_df[PsmDfCols.DECOY] = (
Expand Down
20 changes: 10 additions & 10 deletions alphabase/psm_reader/pfind_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,16 +120,16 @@ def _translate_modifications(self) -> None:
pass

def _load_file(self, filename: str) -> pd.DataFrame:
pfind_df = pd.read_csv(
filename, index_col=False, sep="\t", keep_default_na=False
)
pfind_df.fillna("", inplace=True)
pfind_df = pfind_df[pfind_df.Sequence != ""]
pfind_df[PsmDfCols.RAW_NAME] = (
pfind_df["File_Name"].str.split(".").apply(lambda x: x[0])
)
pfind_df["Proteins"] = pfind_df["Proteins"].apply(parse_pfind_protein)
return pfind_df
"""Load a pFind output file to a DataFrame."""
return pd.read_csv(filename, index_col=False, sep="\t", keep_default_na=False)

def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""pFind-specific preprocessing of output data."""
df.fillna("", inplace=True)
df = df[df.Sequence != ""]
df[PsmDfCols.RAW_NAME] = df["File_Name"].str.split(".").apply(lambda x: x[0])
df["Proteins"] = df["Proteins"].apply(parse_pfind_protein)
return df

def _translate_decoy(self) -> None:
self._psm_df[PsmDfCols.DECOY] = (
Expand Down
27 changes: 15 additions & 12 deletions alphabase/psm_reader/psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import warnings
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, List, NoReturn, Optional, Set, Type, Union
from typing import Dict, List, Optional, Set, Type, Union

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -151,6 +151,7 @@ def __init__( # noqa: PLR0913 # too many arguments
if mod_seq_columns is not None
else psm_reader_yaml[self._reader_type].get("mod_seq_columns", [])
)
self.mod_seq_column = None

for key, value in kwargs.items(): # TODO: remove and remove kwargs
warnings.warn(
Expand Down Expand Up @@ -231,12 +232,13 @@ def import_file(self, _file: str) -> pd.DataFrame:
"""
origin_df = self._load_file(_file)

self.mod_seq_column = self._get_mod_seq_column(origin_df)

self._psm_df = pd.DataFrame()

if len(origin_df):
# TODO: think about dropping the 'inplace' pattern here
self.mod_seq_column = self._get_mod_seq_column(origin_df)

origin_df = self._pre_process(origin_df)
self._translate_columns(origin_df) # only here
self._transform_table() # only sage
self._translate_decoy() # only sage, mq, msfragger, pfind
Expand All @@ -248,6 +250,9 @@ def import_file(self, _file: str) -> pd.DataFrame:
self._post_process() # here, libraryreader, diann, msfragger
return self._psm_df

def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
return df

def _translate_decoy(self) -> None: # noqa: B027 empty method in an abstract base class
pass

Expand All @@ -256,9 +261,6 @@ def _translate_score(self) -> None: # noqa: B027 empty method in an abstract ba
# to -log(evalue), as score is the larger the better
pass

def _get_table_delimiter(self, filename: str) -> str:
return _get_delimiter(filename)

def _normalize_rt(self) -> None:
if PsmDfCols.RT in self._psm_df.columns:
if self._engine_rt_unit == "second":
Expand Down Expand Up @@ -300,12 +302,11 @@ def normalize_rt_by_raw_name(self) -> None:
df_group[PsmDfCols.RT_NORM] / df_group[PsmDfCols.RT_NORM].max()
)

@abstractmethod
def _load_file(self, filename: str) -> pd.DataFrame:
"""Load original dataframe from PSM filename.
"""Load PSM file into a dataframe.

Different search engines may store PSMs in different ways:
tsv, csv, HDF, XML, ...
Different search engines may store PSMs in different ways: tsv, csv, HDF, XML, ...
This default implementation works for tsv and csv files and thus covers many readers.

Parameters
----------
Expand All @@ -315,9 +316,11 @@ def _load_file(self, filename: str) -> pd.DataFrame:
Returns
-------
pd.DataFrame
loaded dataframe
psm file as dataframe

"""
sep = _get_delimiter(filename)
return pd.read_csv(filename, sep=sep, keep_default_na=False)

def _find_mapped_columns(self, df: pd.DataFrame) -> Dict[str, str]:
"""Determine the mapping of AlphaBase columns to the columns in the given DataFrame.
Expand Down Expand Up @@ -371,7 +374,7 @@ def _transform_table(self) -> None: # noqa: B027 empty method in an abstract ba
"""

@abstractmethod
def _load_modifications(self, origin_df: pd.DataFrame) -> NoReturn:
def _load_modifications(self, origin_df: pd.DataFrame) -> None:
"""Read modification information from 'origin_df'.

Some search engines use modified_sequence, some of them
Expand Down
8 changes: 3 additions & 5 deletions alphabase/psm_reader/sage_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
import logging
import multiprocessing as mp
import re
from abc import ABC
from functools import partial
from typing import Generator, List, NoReturn, Optional, Tuple
from typing import Generator, List, Optional, Tuple

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -562,7 +563,7 @@ def _sage_spec_idx_from_scan_nr(scan_indicator_str: str) -> int:
return int(re.search(r"scan=(\d+)", scan_indicator_str).group(1)) - 1


class SageReaderBase(PSMReaderBase):
class SageReaderBase(PSMReaderBase, ABC):
mschwoer marked this conversation as resolved.
Show resolved Hide resolved
"""Base class for SageReader."""

_reader_type = "sage"
Expand Down Expand Up @@ -592,9 +593,6 @@ def __init__( # noqa: PLR0913 many arguments in function definition
**kwargs,
)

def _load_file(self, filename: str) -> NoReturn:
raise NotImplementedError

def _transform_table(self) -> None:
self._psm_df[PsmDfCols.SPEC_IDX] = self._psm_df[PsmDfCols.SCANNR].apply(
_sage_spec_idx_from_scan_nr
Expand Down
11 changes: 10 additions & 1 deletion alphabase/spectral_library/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from alphabase.psm_reader.keys import LibPsmDfCols, PsmDfCols
from alphabase.psm_reader.maxquant_reader import MaxQuantReader
from alphabase.spectral_library.base import SpecLibBase
from alphabase.utils import _get_delimiter


class LibraryReaderBase(MaxQuantReader, SpecLibBase):
Expand Down Expand Up @@ -242,7 +243,7 @@ def _get_fragment_intensity(self, lib_df: pd.DataFrame) -> pd.DataFrame: # noqa

def _load_file(self, filename: str) -> pd.DataFrame:
"""Load the spectral library from a csv file."""
csv_sep = self._get_table_delimiter(filename)
csv_sep = _get_delimiter(filename)

return pd.read_csv(
filename,
Expand Down Expand Up @@ -270,6 +271,14 @@ def _load_file(self, filename: str) -> pd.DataFrame:
],
)

def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""Library-specific preprocessing of output data.

Nothing to do here, but the superclass method still needs to be overridden.
TODO: disentangle the inheritance structure.
"""
return df

def _post_process(
self,
) -> None:
Expand Down
Loading