Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor readers viii #250

Merged
merged 3 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 4 additions & 21 deletions alphabase/psm_reader/alphapept_reader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Reader for AlphaPept's *.ms_data.hdf files."""

from pathlib import Path
from typing import Optional, Tuple
from typing import Tuple

import h5py
import numba
Expand Down Expand Up @@ -54,29 +54,12 @@ class AlphaPeptReader(PSMReaderBase):
_reader_type = "alphapept"
_modification_type = "alphapept"

def __init__(
self,
*,
column_mapping: Optional[dict] = None,
modification_mapping: Optional[dict] = None,
fdr: float = 0.01,
keep_decoy: bool = False,
**kwargs,
):
"""Reading PSMs from alphapept's *.ms_data.hdf."""
super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
fdr=fdr,
keep_decoy=keep_decoy,
**kwargs,
)
self.hdf_dataset = "identifications"

def _load_file(self, filename: str) -> pd.DataFrame:
"""Load an AlphaPept output file to a DataFrame."""
with h5py.File(filename, "r") as _hdf:
dataset = _hdf[self.hdf_dataset]
dataset = _hdf[
"identifications"
] # TODO: "identifications" could be moved to yaml
df = pd.DataFrame({col: dataset[col] for col in dataset})

# TODO: make this more stable
Expand Down
105 changes: 5 additions & 100 deletions alphabase/psm_reader/dia_psm_reader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
"""Readers for Spectronaut's output library and reports, Swath data and DIANN data."""

from typing import List, Optional

import numpy as np
import pandas as pd

Expand All @@ -19,32 +17,8 @@ class SpectronautReader(MaxQuantReader):

_reader_type = "spectronaut"
_add_unimod_to_mod_mapping = True

def __init__( # noqa: PLR0913 many arguments in function definition
self,
*,
column_mapping: Optional[dict] = None,
modification_mapping: Optional[dict] = None,
fdr: float = 0.01,
keep_decoy: bool = False,
fixed_C57: bool = False, # noqa: N803 TODO: make this *,fixed_c57 (breaking)
mod_seq_columns: Optional[List[str]] = None,
rt_unit: str = "minute",
**kwargs,
):
"""Initialize SpectronautReader."""
super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
fdr=fdr,
keep_decoy=keep_decoy,
mod_seq_columns=mod_seq_columns,
fixed_C57=fixed_C57,
rt_unit=rt_unit,
**kwargs,
)

self._min_max_rt_norm = True
_min_max_rt_norm = True
mschwoer marked this conversation as resolved.
Show resolved Hide resolved
_fixed_c57_default = False
mschwoer marked this conversation as resolved.
Show resolved Hide resolved

def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""Spectronaut-specific preprocessing of output data."""
Expand All @@ -65,58 +39,13 @@ class SwathReader(SpectronautReader):
_reader_type = "spectronaut" # no typo
_add_unimod_to_mod_mapping = True

def __init__( # noqa: PLR0913 many arguments in function definition
self,
*,
column_mapping: Optional[dict] = None,
modification_mapping: Optional[dict] = None,
fdr: float = 0.01,
keep_decoy: bool = False,
fixed_C57: bool = False, # noqa: N803 TODO: make this *,fixed_c57 (breaking)
mod_seq_columns: Optional[List[str]] = None,
**kwargs,
):
"""SWATH or OpenSWATH library, similar to `SpectronautReader`."""
super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
fdr=fdr,
keep_decoy=keep_decoy,
fixed_C57=fixed_C57,
mod_seq_columns=mod_seq_columns,
**kwargs,
)


class DiannReader(MaxQuantReader):
"""Reader for DIANN data."""

_reader_type = "diann"
_add_unimod_to_mod_mapping = True

def __init__( # noqa: PLR0913 many arguments in function definition
self,
*,
column_mapping: Optional[dict] = None,
modification_mapping: Optional[dict] = None,
fdr: float = 0.01,
keep_decoy: bool = False,
fixed_C57: bool = False, # noqa: N803 TODO: make this *,fixed_c57 (breaking)
rt_unit: str = "minute",
**kwargs,
):
"""Similar to `SpectronautReader` but different in column_mapping and modification_mapping."""
super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
fdr=fdr,
keep_decoy=keep_decoy,
fixed_C57=fixed_C57,
rt_unit=rt_unit,
**kwargs,
)

self._min_max_rt_norm = False
_min_max_rt_norm = False

def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""DIANN-specific preprocessing of output data.
Expand All @@ -142,36 +71,12 @@ class SpectronautReportReader(MaxQuantReader):

_reader_type = "spectronaut_report"
_add_unimod_to_mod_mapping = True

def __init__( # noqa: PLR0913 many arguments in function definition
self,
*,
column_mapping: Optional[dict] = None,
modification_mapping: Optional[dict] = None,
fdr: float = 0.01,
keep_decoy: bool = False,
fixed_C57: bool = False, # noqa: N803 TODO: make this *,fixed_c57 (breaking)
rt_unit: str = "minute",
**kwargs,
):
"""Initialize SpectronautReportReader."""
super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
fdr=fdr,
keep_decoy=keep_decoy,
fixed_C57=fixed_C57,
rt_unit=rt_unit,
**kwargs,
)

self.precursor_column = "EG.PrecursorId" # TODO: move to yaml
self._min_max_rt_norm = False
_min_max_rt_norm = False

def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""Spectronaut report-specific preprocessing of output data."""
df[[self.mod_seq_column, PsmDfCols.CHARGE]] = df[
self.precursor_column
"EG.PrecursorId" # TODO: move to yaml
].str.split(".", expand=True, n=2)
df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(np.int8)
return df
Expand Down
41 changes: 10 additions & 31 deletions alphabase/psm_reader/maxquant_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,67 +127,46 @@ class MaxQuantReader(PSMReaderBase):
_reader_type = "maxquant"
_add_unimod_to_mod_mapping = True
_modification_type = "maxquant"
_fixed_c57_default = True

def __init__( # noqa: PLR0913 many arguments in function definition
def __init__( # noqa: PLR0913, D417 # too many arguments in function definition, missing argument descriptions
self,
*,
column_mapping: Optional[dict] = None,
modification_mapping: Optional[dict] = None,
mod_seq_columns: Optional[List[str]] = None,
fdr: float = 0.01,
keep_decoy: bool = False,
fixed_C57: bool = True, # noqa: N803 TODO: make this *,fixed_c57 (breaking)
mod_seq_columns: Optional[List[str]] = None,
rt_unit: str = "minute",
# MaxQuant reader-specific
fixed_C57: Optional[bool] = None, # noqa: N803 TODO: make this *,fixed_c57 (breaking)
**kwargs,
):
"""Reader for MaxQuant msms.txt and evidence.txt.

See documentation of `PSMReaderBase` for more information.

Parameters
----------
column_mapping : dict, optional
By default None. If None, use
`psm_reader_yaml['maxquant']['column_mapping']`
(alphabase.psm_reader.psm_reader_yaml).

modification_mapping : dict, optional
By default None. If None, use
`psm_reader_yaml['maxquant']['modification_mapping']`
(alphabase.psm_reader.psm_reader_yaml).

fdr : float, optional
Load PSMs with FDR < this fdr, by default 0.01

keep_decoy : bool, optional
Whether to keep decoy PSMs, by default False

fixed_C57 : bool, optional
If true, the search engine will not show `Carbamidomethyl`
in the modified sequences.
by default True

mod_seq_columns : list, optional
The columns to find modified sequences,
by default ['Modified sequence']

rt_unit : str, optional
The unit of RT in the search engine result.
Defaults to 'minute'.

**kwargs : dict
deprecated
See documentation of `PSMReaderBase` for the rest of the parameters.

"""
super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
mod_seq_columns=mod_seq_columns,
fdr=fdr,
keep_decoy=keep_decoy,
rt_unit=rt_unit,
mod_seq_columns=mod_seq_columns,
**kwargs,
)

self.fixed_C57 = fixed_C57
self.fixed_C57 = fixed_C57 if fixed_C57 is not None else self._fixed_c57_default

def _translate_decoy(self) -> None:
if PsmDfCols.DECOY in self._psm_df.columns:
Expand Down
20 changes: 18 additions & 2 deletions alphabase/psm_reader/msfragger_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,18 +98,34 @@ class MSFraggerPepXML(PSMReaderBase):

_reader_type = "msfragger_pepxml"

def __init__( # noqa: PLR0913 many arguments in function definition
def __init__( # noqa: PLR0913, D417 # too many arguments in function definition, missing argument descriptions
self,
*,
column_mapping: Optional[dict] = None,
modification_mapping: Optional[dict] = None,
# mod_seq_columns: Optional[List[str]] = None,# TODO: not needed here?
fdr: float = 0.001, # refers to E-value in the PepXML
keep_decoy: bool = False,
rt_unit: str = "second",
# MSFragger reader-specific:
keep_unknown_aa_mass_diffs: bool = False,
**kwargs,
):
"""MSFragger is not fully supported as we can only access the pepxml file."""
"""Initialize the MSFragger reader.

See documentation of `PSMReaderBase` for more information.

MSFragger is not fully supported as we can only access the pepxml file.

Parameters
----------
keep_unknown_aa_mass_diffs:
whether to keep PSMs with unknown amino acid mass differences, default: False


See documentation of `PSMReaderBase` for the rest of the parameters.

"""
super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
Expand Down
18 changes: 0 additions & 18 deletions alphabase/psm_reader/pfind_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,24 +98,6 @@ class pFindReader(PSMReaderBase): # noqa: N801 name `pFindReader` should use Ca

_reader_type = "pfind"

def __init__(
self,
*,
column_mapping: Optional[dict] = None,
modification_mapping: Optional[dict] = None,
fdr: float = 0.01,
keep_decoy: bool = False,
**kwargs,
):
"""Reading PSMs from pFind's *.txt."""
super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
fdr=fdr,
keep_decoy=keep_decoy,
**kwargs,
)

def _translate_modifications(self) -> None:
pass

Expand Down
3 changes: 2 additions & 1 deletion alphabase/psm_reader/psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ class PSMReaderBase(ABC):
# the type of modification mapping to be used
_modification_type: Optional[str] = None

_min_max_rt_norm = False

def __init__( # noqa: PLR0913 # too many arguments
self,
*,
Expand Down Expand Up @@ -142,7 +144,6 @@ def __init__( # noqa: PLR0913 # too many arguments
self._psm_df = pd.DataFrame()
self._keep_fdr = fdr
self._keep_decoy = keep_decoy
self._min_max_rt_norm = False
self._engine_rt_unit = rt_unit
self._min_irt_value = -100
self._max_irt_value = 200
Expand Down
Loading
Loading