From 7a1f4a74519290315bdbd1ddf66adbf3eb23de2c Mon Sep 17 00:00:00 2001
From: anna-charlotte
Date: Fri, 17 Jan 2025 15:05:03 +0100
Subject: [PATCH] move models to new fdr_analysis module

---
 alphadia/fdr_analysis/models/__init__.py |   4 +
 .../models/logistic_regression.py        | 128 ++++++
 .../models/two_step_classifier.py        | 288 +++++++++++++
 alphadia/fdrexperimental.py              | 405 ------------------
 alphadia/workflow/manager.py             |   4 +-
 alphadia/workflow/peptidecentric.py      |   5 +-
 6 files changed, 425 insertions(+), 409 deletions(-)
 create mode 100644 alphadia/fdr_analysis/models/__init__.py
 create mode 100644 alphadia/fdr_analysis/models/logistic_regression.py
 create mode 100644 alphadia/fdr_analysis/models/two_step_classifier.py

diff --git a/alphadia/fdr_analysis/models/__init__.py b/alphadia/fdr_analysis/models/__init__.py
new file mode 100644
index 00000000..1e0053df
--- /dev/null
+++ b/alphadia/fdr_analysis/models/__init__.py
@@ -0,0 +1,4 @@
+from .logistic_regression import LogisticRegressionClassifier
+from .two_step_classifier import TwoStepClassifier
+
+__all__ = ["LogisticRegressionClassifier", "TwoStepClassifier"]
diff --git a/alphadia/fdr_analysis/models/logistic_regression.py b/alphadia/fdr_analysis/models/logistic_regression.py
new file mode 100644
index 00000000..48076622
--- /dev/null
+++ b/alphadia/fdr_analysis/models/logistic_regression.py
@@ -0,0 +1,128 @@
+import logging
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.preprocessing import StandardScaler
+
+from alphadia.fdrexperimental import Classifier
+
+logger = logging.getLogger()
+
+
+class LogisticRegressionClassifier(Classifier):
+    def __init__(self) -> None:
+        """Binary classifier using a logistic regression model."""
+        self.scaler = StandardScaler()
+        self.model = LogisticRegression()
+        self._fitted = False
+
+    @property
+    def fitted(self) -> bool:
+        return self._fitted
+
+    def fit(self, x: np.ndarray, y: np.ndarray) -> None:
+        """Fit the classifier to the data.
+
+        Parameters
+        ----------
+
+        x : np.array, dtype=float
+            Training data of shape (n_samples, n_features).
+
+        y : np.array, dtype=int
+            Target values of shape (n_samples,) or (n_samples, n_classes).
+
+        """
+        x_scaled = self.scaler.fit_transform(x)
+        self.model.fit(x_scaled, y)
+        self._fitted = True
+
+    def predict(self, x: np.ndarray) -> np.ndarray:
+        """Predict the class of the data.
+
+        Parameters
+        ----------
+
+        x : np.array, dtype=float
+            Data of shape (n_samples, n_features).
+
+        Returns
+        -------
+
+        y : np.array, dtype=int
+            Predicted class labels of shape (n_samples,).
+
+        """
+        x_scaled = self.scaler.transform(x)
+        return self.model.predict(x_scaled)
+
+    def predict_proba(self, x: np.ndarray) -> np.ndarray:
+        """Predict the class probabilities of the data.
+
+        Parameters
+        ----------
+
+        x : np.array, dtype=float
+            Data of shape (n_samples, n_features).
+
+        Returns
+        -------
+
+        y : np.array, dtype=float
+            Predicted class probabilities of shape (n_samples, n_classes).
+
+        """
+        x_scaled = self.scaler.transform(x)
+        return self.model.predict_proba(x_scaled)
+
+    def to_state_dict(self) -> dict:
+        """Return the state of the classifier as a dictionary.
+
+        Returns
+        -------
+
+        state_dict : dict
+            Dictionary containing the state of the classifier.
+
+        """
+        state_dict = {"_fitted": self._fitted}
+
+        if self._fitted:
+            state_dict.update(
+                {
+                    "scaler_mean": self.scaler.mean_,
+                    "scaler_var": self.scaler.var_,
+                    "scaler_scale": self.scaler.scale_,
+                    "scaler_n_samples_seen": self.scaler.n_samples_seen_,
+                    "model_coef": self.model.coef_,
+                    "model_intercept": self.model.intercept_,
+                    "model_classes": self.model.classes_,
+                    "is_fitted": self._fitted,
+                }
+            )
+
+        return state_dict
+
+    def from_state_dict(self, state_dict: dict) -> None:
+        """Load the state of the classifier from a dictionary.
+
+        Parameters
+        ----------
+
+        state_dict : dict
+            Dictionary containing the state of the classifier.
+
+        """
+        self._fitted = state_dict["_fitted"]
+
+        if self.fitted:
+            self.scaler = StandardScaler()
+            self.scaler.mean_ = np.array(state_dict["scaler_mean"])
+            self.scaler.var_ = np.array(state_dict["scaler_var"])
+            self.scaler.scale_ = np.array(state_dict["scaler_scale"])
+            self.scaler.n_samples_seen_ = np.array(state_dict["scaler_n_samples_seen"])
+
+            self.model = LogisticRegression()
+            self.model.coef_ = np.array(state_dict["model_coef"])
+            self.model.intercept_ = np.array(state_dict["model_intercept"])
+            self.model.classes_ = np.array(state_dict["model_classes"])
diff --git a/alphadia/fdr_analysis/models/two_step_classifier.py b/alphadia/fdr_analysis/models/two_step_classifier.py
new file mode 100644
index 00000000..62e8141f
--- /dev/null
+++ b/alphadia/fdr_analysis/models/two_step_classifier.py
@@ -0,0 +1,288 @@
+import logging
+
+import numpy as np
+import pandas as pd
+
+from alphadia.fdr import get_q_values, keep_best
+from alphadia.fdrexperimental import Classifier
+
+logger = logging.getLogger()
+
+
+class TwoStepClassifier:
+    def __init__(
+        self,
+        first_classifier: Classifier,
+        second_classifier: Classifier,
+        train_on_top_n: int = 1,
+        first_fdr_cutoff: float = 0.6,
+        second_fdr_cutoff: float = 0.01,
+    ):
+        """
+        A two-step classifier designed to refine classification results by applying a stricter second-stage classification after an initial filtering stage.
+
+        Parameters
+        ----------
+        first_classifier : Classifier
+            The first classifier used to initially filter the data.
+        second_classifier : Classifier
+            The second classifier used to further refine or confirm the classification based on the output from the first classifier.
+        train_on_top_n : int, default=1
+            The number of top candidates that are considered for training of the second classifier.
+        first_fdr_cutoff : float, default=0.6
+            The FDR threshold for the first classifier, determining how selective the first classification step is.
+        second_fdr_cutoff : float, default=0.01
+            The FDR threshold for the second classifier, typically set stricter to ensure high confidence in the final classification results.
+
+        """
+        self.first_classifier = first_classifier
+        self.second_classifier = second_classifier
+        self.first_fdr_cutoff = first_fdr_cutoff
+        self.second_fdr_cutoff = second_fdr_cutoff
+
+        self.train_on_top_n = train_on_top_n
+
+    def fit_predict(
+        self,
+        df: pd.DataFrame,
+        x_cols: list[str],
+        y_col: str = "decoy",
+        group_columns: list[str] | None = None,
+    ) -> pd.DataFrame:
+        """
+        Train the two-step classifier and predict resulting precursors, returning a DataFrame of only the predicted precursors.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            The input DataFrame from which predictions are to be made.
+        x_cols : list[str]
+            List of column names representing the features to be used for prediction.
+        y_col : str, optional
+            The name of the column that denotes the target variable, by default 'decoy'.
+        group_columns : list[str] | None, optional
+            List of column names to group by for FDR calculations. If None, FDR calculations will not be grouped.
+
+        Returns
+        -------
+        pd.DataFrame
+            A DataFrame containing only the predicted precursors.
+
+        """
+        df.dropna(subset=x_cols, inplace=True)
+        df = apply_absolute_transformations(df)
+
+        if self.first_classifier.fitted:
+            X = df[x_cols].to_numpy()
+            df["proba"] = self.first_classifier.predict_proba(X)[:, 1]
+            df_subset = get_entries_below_fdr(
+                df, self.first_fdr_cutoff, group_columns, remove_decoys=False
+            )
+
+            self.second_classifier.epochs = 50
+
+            df_train = df_subset
+            df_predict = df_subset
+
+        else:
+            df_train = df[df["rank"] < self.train_on_top_n]
+            df_predict = df
+
+        self.second_classifier.fit(
+            df_train[x_cols].to_numpy().astype(np.float32),
+            df_train[y_col].to_numpy().astype(np.float32),
+        )
+        X = df_predict[x_cols].to_numpy()
+        df_predict["proba"] = self.second_classifier.predict_proba(X)[:, 1]
+        df_predict = get_entries_below_fdr(
+            df_predict, self.second_fdr_cutoff, group_columns, remove_decoys=False
+        )
+
+        df_targets = df_predict[df_predict["decoy"] == 0]
+
+        self.update_first_classifier(
+            df=get_target_decoy_partners(df_predict, df),
+            x_cols=x_cols,
+            y_col=y_col,
+            group_columns=group_columns,
+        )
+
+        return df_targets
+
+    def update_first_classifier(
+        self,
+        df: pd.DataFrame,
+        x_cols: list[str],
+        y_col: str,
+        group_columns: list[str],
+    ) -> None:
+        """
+        Update the first classifier only if it improves upon the previous version or if it has not been previously fitted.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            DataFrame containing the features and target.
+        x_cols : list[str]
+            List of column names representing the features.
+        y_col : str
+            Name of the column representing the target variable.
+        group_columns : list[str]
+            Columns used to group data for FDR calculation.
+
+        """
+        X = df[x_cols].to_numpy()
+        y = df[y_col].to_numpy()
+
+        previous_n_precursors = -1
+
+        if self.first_classifier.fitted:
+            df["proba"] = self.first_classifier.predict_proba(X)[:, 1]
+            df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns)
+            previous_n_precursors = len(df_targets)
+            previous_state_dict = self.first_classifier.to_state_dict()
+
+        self.first_classifier.fit(X, y)
+
+        df["proba"] = self.first_classifier.predict_proba(X)[:, 1]
+        df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns)
+        current_n_precursors = len(df_targets)
+
+        if previous_n_precursors > current_n_precursors:
+            self.first_classifier.from_state_dict(previous_state_dict)
+
+    @property
+    def fitted(self) -> bool:
+        """Return whether the second classifier has been fitted."""
+        return self.second_classifier.fitted
+
+    def to_state_dict(self) -> dict:
+        """Save classifier state.
+
+        Returns
+        -------
+        dict
+            State dictionary containing both classifiers
+        """
+        return {
+            "first_classifier": self.first_classifier.to_state_dict(),
+            "second_classifier": self.second_classifier.to_state_dict(),
+            "first_fdr_cutoff": self.first_fdr_cutoff,
+            "second_fdr_cutoff": self.second_fdr_cutoff,
+            "train_on_top_n": self.train_on_top_n,
+        }
+
+    def from_state_dict(self, state_dict: dict) -> None:
+        """Load classifier state.
+
+        Parameters
+        ----------
+        state_dict : dict
+            State dictionary containing both classifiers
+        """
+        self.first_classifier.from_state_dict(state_dict["first_classifier"])
+        self.second_classifier.from_state_dict(state_dict["second_classifier"])
+        self.first_fdr_cutoff = state_dict["first_fdr_cutoff"]
+        self.second_fdr_cutoff = state_dict["second_fdr_cutoff"]
+        self.train_on_top_n = state_dict["train_on_top_n"]
+
+
+def get_entries_below_fdr(
+    df: pd.DataFrame, fdr: float, group_columns: list[str], remove_decoys: bool = True
+) -> pd.DataFrame:
+    """
+    Returns entries in the DataFrame based on the FDR threshold and optionally removes decoy entries.
+    If no entries are found below the FDR threshold after filtering, returns the single best entry based on the q-value.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The input DataFrame containing the columns 'proba', 'decoy', and any specified group columns.
+    fdr : float
+        The false discovery rate threshold for filtering entries.
+    group_columns : list
+        List of columns to group by when determining the best entries per group.
+    remove_decoys : bool, optional
+        Specifies whether decoy entries should be removed from the final result. Defaults to True.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame containing entries below the specified FDR threshold, optionally excluding decoys.
+    """
+    df.sort_values("proba", ascending=True, inplace=True)
+    df = keep_best(df, group_columns=group_columns)
+    df = get_q_values(df, "proba", "decoy")
+
+    df_subset = df[df["qval"] < fdr]
+    if remove_decoys:
+        df_subset = df_subset[df_subset["decoy"] == 0]
+
+    # Handle case where no entries are below the FDR threshold
+    if len(df_subset) == 0:
+        df = df[df["decoy"] == 0]
+        df_subset = df.loc[[df["qval"].idxmin()]]
+
+    return df_subset
+
+
+def get_target_decoy_partners(
+    reference_df: pd.DataFrame, full_df: pd.DataFrame, group_by: list[str] | None = None
+) -> pd.DataFrame:
+    """
+    Identifies and returns the corresponding target and decoy partner rows in full_df given the subset reference_df.
+    This function is typically used to find target-decoy partners based on certain criteria like rank and elution group index.
+
+    Parameters
+    ----------
+    reference_df : pd.DataFrame
+        A subset DataFrame that contains reference values for matching.
+    full_df : pd.DataFrame
+        The main DataFrame from which rows will be matched against reference_df.
+    group_by : list[str] | None, optional
+        The columns to group by when performing the match. Defaults to ['rank', 'elution_group_idx'] if None is provided.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame containing rows from full_df that match the grouping criteria.
+
+    """
+    if group_by is None:
+        group_by = ["rank", "elution_group_idx"]
+    valid_tuples = reference_df[group_by]
+    matching_rows = full_df.merge(valid_tuples, on=group_by, how="inner")
+
+    return matching_rows
+
+
+def apply_absolute_transformations(
+    df: pd.DataFrame, columns: list[str] | None = None
+) -> pd.DataFrame:
+    """
+    Applies absolute value transformations to predefined columns in a DataFrame in place.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The input DataFrame containing the data to be transformed.
+    columns : list of str, optional
+        List of column names to transform. Defaults to ['delta_rt', 'top_3_ms2_mass_error', 'mean_ms2_mass_error'].
+
+    Returns
+    -------
+    pd.DataFrame
+        The transformed DataFrame.
+    """
+    if columns is None:
+        columns = ["delta_rt", "top_3_ms2_mass_error", "mean_ms2_mass_error"]
+
+    for col in columns:
+        if col in df.columns:
+            df[col] = np.abs(df[col])
+        else:
+            logger.warning(
+                f"column '{col}' is not present in df, therefore abs() was not applied."
+            )
+
+    return df
\ No newline at end of file
diff --git a/alphadia/fdrexperimental.py b/alphadia/fdrexperimental.py
index a0dbe8cb..07f22ce3 100644
--- a/alphadia/fdrexperimental.py
+++ b/alphadia/fdrexperimental.py
@@ -8,52 +8,15 @@
 # alpha family imports
 # third party imports
 import numpy as np
-import pandas as pd
 import torch
 from sklearn import model_selection
-from sklearn.linear_model import LogisticRegression
-from sklearn.preprocessing import StandardScaler
 from torch import nn, optim
 from torchmetrics.classification import BinaryAUROC
 from tqdm import tqdm
 
-from alphadia.fdr import get_q_values, keep_best
-
 logger = logging.getLogger()
 
 
-def apply_absolute_transformations(
-    df: pd.DataFrame, columns: list[str] | None = None
-) -> pd.DataFrame:
-    """
-    Applies absolute value transformations to predefined columns in a DataFrame inplace.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        The input DataFrame containing the data to be transformed.
-    columns : list of str, optional
-        List of column names to transform. Defaults to ['delta_rt', 'top_3_ms2_mass_error', 'mean_ms2_mass_error'].
-
-    Returns
-    -------
-    pd.DataFrame
-        The transformed DataFrame.
-    """
-    if columns is None:
-        columns = ["delta_rt", "top_3_ms2_mass_error", "mean_ms2_mass_error"]
-
-    for col in columns:
-        if col in df.columns:
-            df[col] = np.abs(df[col])
-        else:
-            logger.warning(
-                f"column '{col}' is not present in df, therefore abs() was not applied."
-            )
-
-    return df
-
-
 class Classifier(ABC):
     """Abstract base class for classifiers.
 
@@ -147,374 +110,6 @@ def from_state_dict(self, state_dict: dict):
         """
 
 
-class TwoStepClassifier:
-    def __init__(
-        self,
-        first_classifier: Classifier,
-        second_classifier: Classifier,
-        train_on_top_n: int = 1,
-        first_fdr_cutoff: float = 0.6,
-        second_fdr_cutoff: float = 0.01,
-    ):
-        """
-        A two-step classifier, designed to refine classification results by applying a stricter second-stage classification after an initial filtering stage.
-
-        Parameters
-        ----------
-        first_classifier : Classifier
-            The first classifier used to initially filter the data.
-        second_classifier : Classifier
-            The second classifier used to further refine or confirm the classification based on the output from the first classifier.
-        train_on_top_n : int, default=1
-            The number of top candidates that are considered for training of the second classifier.
-        first_fdr_cutoff : float, default=0.6
-            The fdr threshold for the first classifier, determining how selective the first classification step is.
-        second_fdr_cutoff : float, default=0.01
-            The fdr threshold for the second classifier, typically set stricter to ensure high confidence in the final classification results.
-
-        """
-        self.first_classifier = first_classifier
-        self.second_classifier = second_classifier
-        self.first_fdr_cutoff = first_fdr_cutoff
-        self.second_fdr_cutoff = second_fdr_cutoff
-
-        self.train_on_top_n = train_on_top_n
-
-    def fit_predict(
-        self,
-        df: pd.DataFrame,
-        x_cols: list[str],
-        y_col: str = "decoy",
-        group_columns: list[str] | None = None,
-    ) -> pd.DataFrame:
-        """
-        Train the two-step classifier and predict resulting precursors, returning a DataFrame of only the predicted precursors.
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            The input DataFrame from which predictions are to be made.
-        x_cols : list[str]
-            List of column names representing the features to be used for prediction.
-        y_col : str, optional
-            The name of the column that denotes the target variable, by default 'decoy'.
-        group_columns : list[str] | None, optional
-            List of column names to group by for fdr calculations;. If None, fdr calculations will not be grouped.
-
-        Returns
-        -------
-        pd.DataFrame
-            A DataFrame containing only the predicted precursors.
-
-        """
-        df.dropna(subset=x_cols, inplace=True)
-        df = apply_absolute_transformations(df)
-
-        if self.first_classifier.fitted:
-            X = df[x_cols].to_numpy()
-            df["proba"] = self.first_classifier.predict_proba(X)[:, 1]
-            df_subset = get_entries_below_fdr(
-                df, self.first_fdr_cutoff, group_columns, remove_decoys=False
-            )
-
-            self.second_classifier.batch_size = 500
-            self.second_classifier.epochs = 50
-
-            df_train = df_subset
-            df_predict = df_subset
-
-        else:
-            df_train = df[df["rank"] < self.train_on_top_n]
-            df_predict = df
-
-        self.second_classifier.fit(
-            df_train[x_cols].to_numpy().astype(np.float32),
-            df_train[y_col].to_numpy().astype(np.float32),
-        )
-
-        df_predict["proba"] = self.second_classifier.predict_proba(
-            df_predict[x_cols].to_numpy()
-        )[:, 1]
-        df_predict = get_entries_below_fdr(
-            df_predict, self.second_fdr_cutoff, group_columns, remove_decoys=False
-        )
-        df_targets = df_predict[df_predict["decoy"] == 0]
-
-        self.update_first_classifier(
-            df=get_target_decoy_partners(df_predict, df),
-            x_cols=x_cols,
-            y_col=y_col,
-            group_columns=group_columns,
-        )
-
-        return df_targets
-
-    def update_first_classifier(
-        self,
-        df: pd.DataFrame,
-        x_cols: list[str],
-        y_col: str,
-        group_columns: list[str],
-    ) -> None:
-        """
-        Update the first classifier only if it improves upon the previous version or if it has not been previously fitted.
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            DataFrame containing the features and target.
-        x_cols : list[str]
-            List of column names representing the features.
-        y_col : str
-            Name of the column representing the target variable.
-        group_columns : list[str]
-            Columns used to group data for FDR calculation.
-
-        """
-        X = df[x_cols].to_numpy()
-        y = df[y_col].to_numpy()
-
-        previous_n_precursors = -1
-
-        if self.first_classifier.fitted:
-            df["proba"] = self.first_classifier.predict_proba(X)[:, 1]
-            df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns)
-            previous_n_precursors = len(df_targets)
-            previous_state_dict = self.first_classifier.to_state_dict()
-
-        self.first_classifier.fit(X, y)
-
-        df["proba"] = self.first_classifier.predict_proba(X)[:, 1]
-        df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns)
-        current_n_precursors = len(df_targets)
-
-        if previous_n_precursors > current_n_precursors:
-            self.first_classifier.from_state_dict(previous_state_dict)
-
-    @property
-    def fitted(self) -> bool:
-        """Return whether both classifiers have been fitted."""
-        return self.second_classifier.fitted
-
-    def to_state_dict(self) -> dict:
-        """Save classifier state.
-
-        Returns
-        -------
-        dict
-            State dictionary containing both classifiers
-        """
-        return {
-            "first_classifier": self.first_classifier.to_state_dict(),
-            "second_classifier": self.second_classifier.to_state_dict(),
-            "first_fdr_cutoff": self.first_fdr_cutoff,
-            "second_fdr_cutoff": self.second_fdr_cutoff,
-            "train_on_top_n": self.train_on_top_n,
-        }
-
-    def from_state_dict(self, state_dict: dict) -> None:
-        """Load classifier state.
-
-        Parameters
-        ----------
-        state_dict : dict
-            State dictionary containing both classifiers
-        """
-        self.first_classifier.from_state_dict(state_dict["first_classifier"])
-        self.second_classifier.from_state_dict(state_dict["second_classifier"])
-        self.first_fdr_cutoff = state_dict["first_fdr_cutoff"]
-        self.second_fdr_cutoff = state_dict["second_fdr_cutoff"]
-        self.train_on_top_n = state_dict["train_on_top_n"]
-
-
-def get_entries_below_fdr(
-    df: pd.DataFrame, fdr: float, group_columns: list[str], remove_decoys: bool = True
-) -> pd.DataFrame:
-    """
-    Returns entries in the DataFrame based on the FDR threshold and optionally removes decoy entries.
-    If no entries are found below the FDR threshold after filtering, returns the single best entry based on the q-value.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        The input DataFrame containing the columns 'proba', 'decoy', and any specified group columns.
-    fdr : float
-        The false discovery rate threshold for filtering entries.
-    group_columns : list
-        List of columns to group by when determining the best entries per group.
-    remove_decoys : bool, optional
-        Specifies whether decoy entries should be removed from the final result. Defaults to True.
-
-    Returns
-    -------
-    pd.DataFrame
-        A DataFrame containing entries below the specified FDR threshold, optionally excluding decoys.
-    """
-    df.sort_values("proba", ascending=True, inplace=True)
-    df = keep_best(df, group_columns=group_columns)
-    df = get_q_values(df, "proba", "decoy")
-
-    df_subset = df[df["qval"] < fdr]
-    if remove_decoys:
-        df_subset = df_subset[df_subset["decoy"] == 0]
-
-    # Handle case where no entries are below the FDR threshold
-    if len(df_subset) == 0:
-        df = df[df["decoy"] == 0]
-        df_subset = df.loc[[df["qval"].idxmin()]]
-
-    return df_subset
-
-
-def get_target_decoy_partners(
-    reference_df: pd.DataFrame, full_df: pd.DataFrame, group_by: list[str] | None = None
-) -> pd.DataFrame:
-    """
-    Identifies and returns the corresponding target and decoy wartner rows in full_df given the subset reference_df/
-    This function is typically used to find target-decoy partners based on certain criteria like rank and elution group index.
-
-    Parameters
-    ----------
-    reference_df : pd.DataFrame
-        A subset DataFrame that contains reference values for matching.
-    full_df : pd.DataFrame
-        The main DataFrame from which rows will be matched against reference_df.
-    group_by : list[str] | None, optional
-        The columns to group by when performing the match. Defaults to ['rank', 'elution_group_idx'] if None is provided.
-
-    Returns
-    -------
-    pd.DataFrame
-        A DataFrame containing rows from full_df that match the grouping criteria.
-
-    """
-    if group_by is None:
-        group_by = ["rank", "elution_group_idx"]
-    valid_tuples = reference_df[group_by]
-    matching_rows = full_df.merge(valid_tuples, on=group_by, how="inner")
-
-    return matching_rows
-
-
-class LogisticRegressionClassifier(Classifier):
-    def __init__(self) -> None:
-        """Binary classifier using a logistic regression model."""
-        self.scaler = StandardScaler()
-        self.model = LogisticRegression()
-        self._fitted = False
-
-    @property
-    def fitted(self) -> bool:
-        return self._fitted
-
-    def fit(self, x: np.ndarray, y: np.ndarray) -> None:
-        """Fit the classifier to the data.
-
-        Parameters
-        ----------
-
-        x : np.array, dtype=float
-            Training data of shape (n_samples, n_features).
-
-        y : np.array, dtype=int
-            Target values of shape (n_samples,) or (n_samples, n_classes).
-
-        """
-        x_scaled = self.scaler.fit_transform(x)
-        self.model.fit(x_scaled, y)
-        self._fitted = True
-
-    def predict(self, x: np.ndarray) -> np.ndarray:
-        """Predict the class of the data.
-
-        Parameters
-        ----------
-
-        x : np.array, dtype=float
-            Data of shape (n_samples, n_features).
-
-        Returns
-        -------
-
-        y : np.array, dtype=float
-            Predicted class probabilities of shape (n_samples, n_classes).
-
-        """
-        x_scaled = self.scaler.transform(x)
-        return self.model.predict(x_scaled)
-
-    def predict_proba(self, x: np.ndarray) -> np.ndarray:
-        """Predict the class probabilities of the data.
-
-        Parameters
-        ----------
-
-        x : np.array, dtype=float
-            Data of shape (n_samples, n_features).
-
-        Returns
-        -------
-
-        y : np.array, dtype=float
-            Predicted class probabilities of shape (n_samples, n_classes).
-
-        """
-        x_scaled = self.scaler.transform(x)
-        return self.model.predict_proba(x_scaled)
-
-    def to_state_dict(self) -> dict:
-        """Return the state of the classifier as a dictionary.
-
-        Returns
-        -------
-
-        dict : dict
-            Dictionary containing the state of the classifier.
-
-        """
-        state_dict = {"_fitted": self._fitted}
-
-        if self._fitted:
-            state_dict.update(
-                {
-                    "scaler_mean": self.scaler.mean_,
-                    "scaler_var": self.scaler.var_,
-                    "scaler_scale": self.scaler.scale_,
-                    "scaler_n_samples_seen": self.scaler.n_samples_seen_,
-                    "model_coef": self.model.coef_,
-                    "model_intercept": self.model.intercept_,
-                    "model_classes": self.model.classes_,
-                    "is_fitted": self._fitted,
-                }
-            )
-
-        return state_dict
-
-    def from_state_dict(self, state_dict: dict) -> None:
-        """Load the state of the classifier from a dictionary.
-
-        Parameters
-        ----------
-
-        dict : dict
-            Dictionary containing the state of the classifier.
-
-        """
-        self._fitted = state_dict["_fitted"]
-
-        if self.fitted:
-            self.scaler = StandardScaler()
-            self.scaler.mean_ = np.array(state_dict["scaler_mean"])
-            self.scaler.var_ = np.array(state_dict["scaler_var"])
-            self.scaler.scale_ = np.array(state_dict["scaler_scale"])
-            self.scaler.n_samples_seen_ = np.array(state_dict["scaler_n_samples_seen"])
-
-            self.model = LogisticRegression()
-            self.model.coef_ = np.array(state_dict["model_coef"])
-            self.model.intercept_ = np.array(state_dict["model_intercept"])
-            self.model.classes_ = np.array(state_dict["model_classes"])
-
-
 class BinaryClassifier(Classifier):
     def __init__(
         self,
diff --git a/alphadia/workflow/manager.py b/alphadia/workflow/manager.py
index 197713fe..78dc5160 100644
--- a/alphadia/workflow/manager.py
+++ b/alphadia/workflow/manager.py
@@ -18,9 +18,9 @@
 # alphadia imports
 import alphadia
-import alphadia.fdrexperimental as fdrx
 from alphadia import fdr
 from alphadia.calibration.property import Calibration, calibration_model_provider
+from alphadia.fdr_analysis.models import TwoStepClassifier
 from alphadia.workflow import reporting
 from alphadia.workflow.config import Config
 
@@ -629,7 +629,7 @@ def __init__(
         self.classifier_store = defaultdict(list)
         self.classifier_base = classifier_base
         self.enable_two_step_classifier = isinstance(
-            classifier_base, fdrx.TwoStepClassifier
+            classifier_base, TwoStepClassifier
         )
 
         self._current_version = -1
diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py
index fe1be321..ffcb4eaf 100644
--- a/alphadia/workflow/peptidecentric.py
+++ b/alphadia/workflow/peptidecentric.py
@@ -15,6 +15,7 @@
 # alphadia imports
 from alphadia import fragcomp, plexscoring, utils
+from alphadia.fdr_analysis.models import LogisticRegressionClassifier, TwoStepClassifier
 from alphadia.peakgroup import search
 from alphadia.workflow import base, manager, optimization
 from alphadia.workflow.config import Config
 
@@ -105,8 +106,8 @@ def get_classifier_base(enable_two_step_classifier: bool = False):
     )
 
     if enable_two_step_classifier:
-        return fdrx.TwoStepClassifier(
-            first_classifier=fdrx.LogisticRegressionClassifier(),
+        return TwoStepClassifier(
+            first_classifier=LogisticRegressionClassifier(),
             second_classifier=nn_classifier,
         )
     else:
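
Usage sketch (illustrative, not part of the patch): the snippet below shows the relocated classes being imported from the new alphadia.fdr_analysis.models path and exercises the to_state_dict()/from_state_dict() round trip defined in logistic_regression.py. It assumes this patch is applied and alphadia is importable; using a second LogisticRegressionClassifier inside TwoStepClassifier is only to keep the sketch self-contained, since the actual workflow passes the neural-network classifier (nn_classifier) as the second stage.

    import numpy as np

    from alphadia.fdr_analysis.models import (
        LogisticRegressionClassifier,
        TwoStepClassifier,
    )

    # Synthetic toy data: 100 samples, 3 features, binary target/decoy labels.
    rng = np.random.default_rng(0)
    x = rng.normal(size=(100, 3))
    y = rng.integers(0, 2, size=100)

    # Fit the logistic regression wrapper, then restore a fresh instance from
    # its state dict and check that the predictions are unchanged.
    clf = LogisticRegressionClassifier()
    clf.fit(x, y)

    restored = LogisticRegressionClassifier()
    restored.from_state_dict(clf.to_state_dict())
    assert np.allclose(clf.predict_proba(x), restored.predict_proba(x))

    # Wire the two-step classifier with the same constructor call that
    # get_classifier_base() in peptidecentric.py uses.
    two_step = TwoStepClassifier(
        first_classifier=LogisticRegressionClassifier(),
        second_classifier=LogisticRegressionClassifier(),
    )
    assert not two_step.fitted  # fitted tracks the second-stage classifier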