From 7a1f4a74519290315bdbd1ddf66adbf3eb23de2c Mon Sep 17 00:00:00 2001
From: anna-charlotte
Date: Fri, 17 Jan 2025 15:05:03 +0100
Subject: [PATCH] move models to new fdr_analysis module

---
 alphadia/fdr_analysis/models/__init__.py |   4 +
 .../models/logistic_regression.py        | 128 ++++++
 .../models/two_step_classifier.py        | 288 +++++++++++++
 alphadia/fdrexperimental.py              | 405 ------------------
 alphadia/workflow/manager.py             |   4 +-
 alphadia/workflow/peptidecentric.py      |   5 +-
 6 files changed, 425 insertions(+), 409 deletions(-)
 create mode 100644 alphadia/fdr_analysis/models/__init__.py
 create mode 100644 alphadia/fdr_analysis/models/logistic_regression.py
 create mode 100644 alphadia/fdr_analysis/models/two_step_classifier.py

diff --git a/alphadia/fdr_analysis/models/__init__.py b/alphadia/fdr_analysis/models/__init__.py
new file mode 100644
index 00000000..1e0053df
--- /dev/null
+++ b/alphadia/fdr_analysis/models/__init__.py
@@ -0,0 +1,4 @@
+from .logistic_regression import LogisticRegressionClassifier
+from .two_step_classifier import TwoStepClassifier
+
+__all__ = ["LogisticRegressionClassifier", "TwoStepClassifier"]
diff --git a/alphadia/fdr_analysis/models/logistic_regression.py b/alphadia/fdr_analysis/models/logistic_regression.py
new file mode 100644
index 00000000..48076622
--- /dev/null
+++ b/alphadia/fdr_analysis/models/logistic_regression.py
@@ -0,0 +1,128 @@
+import logging
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.preprocessing import StandardScaler
+
+from alphadia.fdrexperimental import Classifier
+
+logger = logging.getLogger()
+
+
+class LogisticRegressionClassifier(Classifier):
+    def __init__(self) -> None:
+        """Binary classifier using a logistic regression model."""
+        self.scaler = StandardScaler()
+        self.model = LogisticRegression()
+        self._fitted = False
+
+    @property
+    def fitted(self) -> bool:
+        return self._fitted
+
+    def fit(self, x: np.ndarray, y: np.ndarray) -> None:
+        """Fit the classifier to the data.
+
+        Parameters
+        ----------
+
+        x : np.array, dtype=float
+            Training data of shape (n_samples, n_features).
+
+        y : np.array, dtype=int
+            Target values of shape (n_samples,) or (n_samples, n_classes).
+
+        """
+        x_scaled = self.scaler.fit_transform(x)
+        self.model.fit(x_scaled, y)
+        self._fitted = True
+
+    def predict(self, x: np.ndarray) -> np.ndarray:
+        """Predict the class of the data.
+
+        Parameters
+        ----------
+
+        x : np.array, dtype=float
+            Data of shape (n_samples, n_features).
+
+        Returns
+        -------
+
+        y : np.array, dtype=int
+            Predicted class labels of shape (n_samples,).
+
+        """
+        x_scaled = self.scaler.transform(x)
+        return self.model.predict(x_scaled)
+
+    def predict_proba(self, x: np.ndarray) -> np.ndarray:
+        """Predict the class probabilities of the data.
+
+        Parameters
+        ----------
+
+        x : np.array, dtype=float
+            Data of shape (n_samples, n_features).
+
+        Returns
+        -------
+
+        y : np.array, dtype=float
+            Predicted class probabilities of shape (n_samples, n_classes).
+
+        """
+        x_scaled = self.scaler.transform(x)
+        return self.model.predict_proba(x_scaled)
+
+    def to_state_dict(self) -> dict:
+        """Return the state of the classifier as a dictionary.
+
+        Returns
+        -------
+
+        state_dict : dict
+            Dictionary containing the state of the classifier.
+
+        """
+        state_dict = {"_fitted": self._fitted}
+
+        if self._fitted:
+            state_dict.update(
+                {
+                    "scaler_mean": self.scaler.mean_,
+                    "scaler_var": self.scaler.var_,
+                    "scaler_scale": self.scaler.scale_,
+                    "scaler_n_samples_seen": self.scaler.n_samples_seen_,
+                    "model_coef": self.model.coef_,
+                    "model_intercept": self.model.intercept_,
+                    "model_classes": self.model.classes_,
+                    "is_fitted": self._fitted,
+                }
+            )
+
+        return state_dict
+
+    def from_state_dict(self, state_dict: dict) -> None:
+        """Load the state of the classifier from a dictionary.
+
+        Parameters
+        ----------
+
+        state_dict : dict
+            Dictionary containing the state of the classifier.
+
+        """
+        self._fitted = state_dict["_fitted"]
+
+        if self.fitted:
+            self.scaler = StandardScaler()
+            self.scaler.mean_ = np.array(state_dict["scaler_mean"])
+            self.scaler.var_ = np.array(state_dict["scaler_var"])
+            self.scaler.scale_ = np.array(state_dict["scaler_scale"])
+            self.scaler.n_samples_seen_ = np.array(state_dict["scaler_n_samples_seen"])
+
+            self.model = LogisticRegression()
+            self.model.coef_ = np.array(state_dict["model_coef"])
+            self.model.intercept_ = np.array(state_dict["model_intercept"])
+            self.model.classes_ = np.array(state_dict["model_classes"])
diff --git a/alphadia/fdr_analysis/models/two_step_classifier.py b/alphadia/fdr_analysis/models/two_step_classifier.py
new file mode 100644
index 00000000..62e8141f
--- /dev/null
+++ b/alphadia/fdr_analysis/models/two_step_classifier.py
@@ -0,0 +1,288 @@
+import logging
+
+import numpy as np
+import pandas as pd
+
+from alphadia.fdr import get_q_values, keep_best
+from alphadia.fdrexperimental import Classifier
+
+logger = logging.getLogger()
+
+
+class TwoStepClassifier:
+    def __init__(
+        self,
+        first_classifier: Classifier,
+        second_classifier: Classifier,
+        train_on_top_n: int = 1,
+        first_fdr_cutoff: float = 0.6,
+        second_fdr_cutoff: float = 0.01,
+    ):
+        """
+        A two-step classifier designed to refine classification results by applying a stricter second-stage classification after an initial filtering stage.
+
+        Parameters
+        ----------
+        first_classifier : Classifier
+            The first classifier used to initially filter the data.
+        second_classifier : Classifier
+            The second classifier used to further refine or confirm the classification based on the output from the first classifier.
+        train_on_top_n : int, default=1
+            The number of top candidates that are considered for training of the second classifier.
+        first_fdr_cutoff : float, default=0.6
+            The FDR threshold for the first classifier, determining how selective the first classification step is.
+        second_fdr_cutoff : float, default=0.01
+            The FDR threshold for the second classifier, typically set stricter to ensure high confidence in the final classification results.
+
+        """
+        self.first_classifier = first_classifier
+        self.second_classifier = second_classifier
+        self.first_fdr_cutoff = first_fdr_cutoff
+        self.second_fdr_cutoff = second_fdr_cutoff
+
+        self.train_on_top_n = train_on_top_n
+
+    def fit_predict(
+        self,
+        df: pd.DataFrame,
+        x_cols: list[str],
+        y_col: str = "decoy",
+        group_columns: list[str] | None = None,
+    ) -> pd.DataFrame:
+        """
+        Train the two-step classifier and predict resulting precursors, returning a DataFrame of only the predicted precursors.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            The input DataFrame from which predictions are to be made.
+        x_cols : list[str]
+            List of column names representing the features to be used for prediction.
+        y_col : str, optional
+            The name of the column that denotes the target variable, by default 'decoy'.
+        group_columns : list[str] | None, optional
+            List of column names to group by for FDR calculations. If None, FDR calculations will not be grouped.
+
+        Returns
+        -------
+        pd.DataFrame
+            A DataFrame containing only the predicted precursors.
+
+        """
+        df.dropna(subset=x_cols, inplace=True)
+        df = apply_absolute_transformations(df)
+
+        if self.first_classifier.fitted:
+            X = df[x_cols].to_numpy()
+            df["proba"] = self.first_classifier.predict_proba(X)[:, 1]
+            df_subset = get_entries_below_fdr(
+                df, self.first_fdr_cutoff, group_columns, remove_decoys=False
+            )
+
+            self.second_classifier.epochs = 50
+
+            df_train = df_subset
+            df_predict = df_subset
+
+        else:
+            df_train = df[df["rank"] < self.train_on_top_n]
+            df_predict = df
+
+        self.second_classifier.fit(
+            df_train[x_cols].to_numpy().astype(np.float32),
+            df_train[y_col].to_numpy().astype(np.float32),
+        )
+        X = df_predict[x_cols].to_numpy()
+        df_predict["proba"] = self.second_classifier.predict_proba(X)[:, 1]
+        df_predict = get_entries_below_fdr(
+            df_predict, self.second_fdr_cutoff, group_columns, remove_decoys=False
+        )
+
+        df_targets = df_predict[df_predict["decoy"] == 0]
+
+        self.update_first_classifier(
+            df=get_target_decoy_partners(df_predict, df),
+            x_cols=x_cols,
+            y_col=y_col,
+            group_columns=group_columns,
+        )
+
+        return df_targets
+
+    def update_first_classifier(
+        self,
+        df: pd.DataFrame,
+        x_cols: list[str],
+        y_col: str,
+        group_columns: list[str],
+    ) -> None:
+        """
+        Update the first classifier only if it improves upon the previous version or if it has not been previously fitted.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            DataFrame containing the features and target.
+        x_cols : list[str]
+            List of column names representing the features.
+        y_col : str
+            Name of the column representing the target variable.
+        group_columns : list[str]
+            Columns used to group data for FDR calculation.
+
+        """
+        X = df[x_cols].to_numpy()
+        y = df[y_col].to_numpy()
+
+        previous_n_precursors = -1
+
+        if self.first_classifier.fitted:
+            df["proba"] = self.first_classifier.predict_proba(X)[:, 1]
+            df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns)
+            previous_n_precursors = len(df_targets)
+            previous_state_dict = self.first_classifier.to_state_dict()
+
+        self.first_classifier.fit(X, y)
+
+        df["proba"] = self.first_classifier.predict_proba(X)[:, 1]
+        df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns)
+        current_n_precursors = len(df_targets)
+
+        if previous_n_precursors > current_n_precursors:
+            self.first_classifier.from_state_dict(previous_state_dict)
+
+    @property
+    def fitted(self) -> bool:
+        """Return whether the second classifier has been fitted."""
+        return self.second_classifier.fitted
+
+    def to_state_dict(self) -> dict:
+        """Save classifier state.
+
+        Returns
+        -------
+        dict
+            State dictionary containing both classifiers
+        """
+        return {
+            "first_classifier": self.first_classifier.to_state_dict(),
+            "second_classifier": self.second_classifier.to_state_dict(),
+            "first_fdr_cutoff": self.first_fdr_cutoff,
+            "second_fdr_cutoff": self.second_fdr_cutoff,
+            "train_on_top_n": self.train_on_top_n,
+        }
+
+    def from_state_dict(self, state_dict: dict) -> None:
+        """Load classifier state.
+
+        Parameters
+        ----------
+        state_dict : dict
+            State dictionary containing both classifiers
+        """
+        self.first_classifier.from_state_dict(state_dict["first_classifier"])
+        self.second_classifier.from_state_dict(state_dict["second_classifier"])
+        self.first_fdr_cutoff = state_dict["first_fdr_cutoff"]
+        self.second_fdr_cutoff = state_dict["second_fdr_cutoff"]
+        self.train_on_top_n = state_dict["train_on_top_n"]
+
+
+def get_entries_below_fdr(
+    df: pd.DataFrame, fdr: float, group_columns: list[str], remove_decoys: bool = True
+) -> pd.DataFrame:
+    """
+    Returns entries in the DataFrame based on the FDR threshold and optionally removes decoy entries.
+    If no entries are found below the FDR threshold after filtering, returns the single best entry based on the q-value.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The input DataFrame containing the columns 'proba', 'decoy', and any specified group columns.
+    fdr : float
+        The false discovery rate threshold for filtering entries.
+    group_columns : list
+        List of columns to group by when determining the best entries per group.
+    remove_decoys : bool, optional
+        Specifies whether decoy entries should be removed from the final result. Defaults to True.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame containing entries below the specified FDR threshold, optionally excluding decoys.
+    """
+    df.sort_values("proba", ascending=True, inplace=True)
+    df = keep_best(df, group_columns=group_columns)
+    df = get_q_values(df, "proba", "decoy")
+
+    df_subset = df[df["qval"] < fdr]
+    if remove_decoys:
+        df_subset = df_subset[df_subset["decoy"] == 0]
+
+    # Handle case where no entries are below the FDR threshold
+    if len(df_subset) == 0:
+        df = df[df["decoy"] == 0]
+        df_subset = df.loc[[df["qval"].idxmin()]]
+
+    return df_subset
+
+
+def get_target_decoy_partners(
+    reference_df: pd.DataFrame, full_df: pd.DataFrame, group_by: list[str] | None = None
+) -> pd.DataFrame:
+    """
+    Identifies and returns the corresponding target and decoy partner rows in full_df given the subset reference_df.
+    This function is typically used to find target-decoy partners based on certain criteria like rank and elution group index.
+
+    Parameters
+    ----------
+    reference_df : pd.DataFrame
+        A subset DataFrame that contains reference values for matching.
+    full_df : pd.DataFrame
+        The main DataFrame from which rows will be matched against reference_df.
+    group_by : list[str] | None, optional
+        The columns to group by when performing the match. Defaults to ['rank', 'elution_group_idx'] if None is provided.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame containing rows from full_df that match the grouping criteria.
+
+    """
+    if group_by is None:
+        group_by = ["rank", "elution_group_idx"]
+    valid_tuples = reference_df[group_by]
+    matching_rows = full_df.merge(valid_tuples, on=group_by, how="inner")
+
+    return matching_rows
+
+
+def apply_absolute_transformations(
+    df: pd.DataFrame, columns: list[str] | None = None
+) -> pd.DataFrame:
+    """
+    Applies absolute value transformations to predefined columns in a DataFrame in place.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The input DataFrame containing the data to be transformed.
+    columns : list of str, optional
+        List of column names to transform. Defaults to ['delta_rt', 'top_3_ms2_mass_error', 'mean_ms2_mass_error'].
+
+    Returns
+    -------
+    pd.DataFrame
+        The transformed DataFrame.
+    """
+    if columns is None:
+        columns = ["delta_rt", "top_3_ms2_mass_error", "mean_ms2_mass_error"]
+
+    for col in columns:
+        if col in df.columns:
+            df[col] = np.abs(df[col])
+        else:
+            logger.warning(
+                f"column '{col}' is not present in df, therefore abs() was not applied."
+            )
+
+    return df
\ No newline at end of file
diff --git a/alphadia/fdrexperimental.py b/alphadia/fdrexperimental.py
index a0dbe8cb..07f22ce3 100644
--- a/alphadia/fdrexperimental.py
+++ b/alphadia/fdrexperimental.py
@@ -8,52 +8,15 @@
 # alpha family imports
 # third party imports
 import numpy as np
-import pandas as pd
 import torch
 from sklearn import model_selection
-from sklearn.linear_model import LogisticRegression
-from sklearn.preprocessing import StandardScaler
 from torch import nn, optim
 from torchmetrics.classification import BinaryAUROC
 from tqdm import tqdm
 
-from alphadia.fdr import get_q_values, keep_best
-
 logger = logging.getLogger()
 
 
-def apply_absolute_transformations(
-    df: pd.DataFrame, columns: list[str] | None = None
-) -> pd.DataFrame:
-    """
-    Applies absolute value transformations to predefined columns in a DataFrame inplace.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        The input DataFrame containing the data to be transformed.
-    columns : list of str, optional
-        List of column names to transform. Defaults to ['delta_rt', 'top_3_ms2_mass_error', 'mean_ms2_mass_error'].
-
-    Returns
-    -------
-    pd.DataFrame
-        The transformed DataFrame.
-    """
-    if columns is None:
-        columns = ["delta_rt", "top_3_ms2_mass_error", "mean_ms2_mass_error"]
-
-    for col in columns:
-        if col in df.columns:
-            df[col] = np.abs(df[col])
-        else:
-            logger.warning(
-                f"column '{col}' is not present in df, therefore abs() was not applied."
-            )
-
-    return df
-
-
 class Classifier(ABC):
     """Abstract base class for classifiers.
 
@@ -147,374 +110,6 @@ def from_state_dict(self, state_dict: dict):
         """
 
 
-class TwoStepClassifier:
-    def __init__(
-        self,
-        first_classifier: Classifier,
-        second_classifier: Classifier,
-        train_on_top_n: int = 1,
-        first_fdr_cutoff: float = 0.6,
-        second_fdr_cutoff: float = 0.01,
-    ):
-        """
-        A two-step classifier, designed to refine classification results by applying a stricter second-stage classification after an initial filtering stage.
-
-        Parameters
-        ----------
-        first_classifier : Classifier
-            The first classifier used to initially filter the data.
-        second_classifier : Classifier
-            The second classifier used to further refine or confirm the classification based on the output from the first classifier.
-        train_on_top_n : int, default=1
-            The number of top candidates that are considered for training of the second classifier.
-        first_fdr_cutoff : float, default=0.6
-            The fdr threshold for the first classifier, determining how selective the first classification step is.
-        second_fdr_cutoff : float, default=0.01
-            The fdr threshold for the second classifier, typically set stricter to ensure high confidence in the final classification results.
-
-        """
-        self.first_classifier = first_classifier
-        self.second_classifier = second_classifier
-        self.first_fdr_cutoff = first_fdr_cutoff
-        self.second_fdr_cutoff = second_fdr_cutoff
-
-        self.train_on_top_n = train_on_top_n
-
-    def fit_predict(
-        self,
-        df: pd.DataFrame,
-        x_cols: list[str],
-        y_col: str = "decoy",
-        group_columns: list[str] | None = None,
-    ) -> pd.DataFrame:
-        """
-        Train the two-step classifier and predict resulting precursors, returning a DataFrame of only the predicted precursors.
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            The input DataFrame from which predictions are to be made.
-        x_cols : list[str]
-            List of column names representing the features to be used for prediction.
-        y_col : str, optional
-            The name of the column that denotes the target variable, by default 'decoy'.
-        group_columns : list[str] | None, optional
-            List of column names to group by for fdr calculations;. If None, fdr calculations will not be grouped.
-
-        Returns
-        -------
-        pd.DataFrame
-            A DataFrame containing only the predicted precursors.
-
-        """
-        df.dropna(subset=x_cols, inplace=True)
-        df = apply_absolute_transformations(df)
-
-        if self.first_classifier.fitted:
-            X = df[x_cols].to_numpy()
-            df["proba"] = self.first_classifier.predict_proba(X)[:, 1]
-            df_subset = get_entries_below_fdr(
-                df, self.first_fdr_cutoff, group_columns, remove_decoys=False
-            )
-
-            self.second_classifier.batch_size = 500
-            self.second_classifier.epochs = 50
-
-            df_train = df_subset
-            df_predict = df_subset
-
-        else:
-            df_train = df[df["rank"] < self.train_on_top_n]
-            df_predict = df
-
-        self.second_classifier.fit(
-            df_train[x_cols].to_numpy().astype(np.float32),
-            df_train[y_col].to_numpy().astype(np.float32),
-        )
-
-        df_predict["proba"] = self.second_classifier.predict_proba(
-            df_predict[x_cols].to_numpy()
-        )[:, 1]
-        df_predict = get_entries_below_fdr(
-            df_predict, self.second_fdr_cutoff, group_columns, remove_decoys=False
-        )
-        df_targets = df_predict[df_predict["decoy"] == 0]
-
-        self.update_first_classifier(
-            df=get_target_decoy_partners(df_predict, df),
-            x_cols=x_cols,
-            y_col=y_col,
-            group_columns=group_columns,
-        )
-
-        return df_targets
-
-    def update_first_classifier(
-        self,
-        df: pd.DataFrame,
-        x_cols: list[str],
-        y_col: str,
-        group_columns: list[str],
-    ) -> None:
-        """
-        Update the first classifier only if it improves upon the previous version or if it has not been previously fitted.
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            DataFrame containing the features and target.
-        x_cols : list[str]
-            List of column names representing the features.
-        y_col : str
-            Name of the column representing the target variable.
-        group_columns : list[str]
-            Columns used to group data for FDR calculation.
-
-        """
-        X = df[x_cols].to_numpy()
-        y = df[y_col].to_numpy()
-
-        previous_n_precursors = -1
-
-        if self.first_classifier.fitted:
-            df["proba"] = self.first_classifier.predict_proba(X)[:, 1]
-            df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns)
-            previous_n_precursors = len(df_targets)
-            previous_state_dict = self.first_classifier.to_state_dict()
-
-        self.first_classifier.fit(X, y)
-
-        df["proba"] = self.first_classifier.predict_proba(X)[:, 1]
-        df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns)
-        current_n_precursors = len(df_targets)
-
-        if previous_n_precursors > current_n_precursors:
-            self.first_classifier.from_state_dict(previous_state_dict)
-
-    @property
-    def fitted(self) -> bool:
-        """Return whether both classifiers have been fitted."""
-        return self.second_classifier.fitted
-
-    def to_state_dict(self) -> dict:
-        """Save classifier state.
-
-        Returns
-        -------
-        dict
-            State dictionary containing both classifiers
-        """
-        return {
-            "first_classifier": self.first_classifier.to_state_dict(),
-            "second_classifier": self.second_classifier.to_state_dict(),
-            "first_fdr_cutoff": self.first_fdr_cutoff,
-            "second_fdr_cutoff": self.second_fdr_cutoff,
-            "train_on_top_n": self.train_on_top_n,
-        }
-
-    def from_state_dict(self, state_dict: dict) -> None:
-        """Load classifier state.
-
-        Parameters
-        ----------
-        state_dict : dict
-            State dictionary containing both classifiers
-        """
-        self.first_classifier.from_state_dict(state_dict["first_classifier"])
-        self.second_classifier.from_state_dict(state_dict["second_classifier"])
-        self.first_fdr_cutoff = state_dict["first_fdr_cutoff"]
-        self.second_fdr_cutoff = state_dict["second_fdr_cutoff"]
-        self.train_on_top_n = state_dict["train_on_top_n"]
-
-
-def get_entries_below_fdr(
-    df: pd.DataFrame, fdr: float, group_columns: list[str], remove_decoys: bool = True
-) -> pd.DataFrame:
-    """
-    Returns entries in the DataFrame based on the FDR threshold and optionally removes decoy entries.
-    If no entries are found below the FDR threshold after filtering, returns the single best entry based on the q-value.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        The input DataFrame containing the columns 'proba', 'decoy', and any specified group columns.
-    fdr : float
-        The false discovery rate threshold for filtering entries.
-    group_columns : list
-        List of columns to group by when determining the best entries per group.
-    remove_decoys : bool, optional
-        Specifies whether decoy entries should be removed from the final result. Defaults to True.
-
-    Returns
-    -------
-    pd.DataFrame
-        A DataFrame containing entries below the specified FDR threshold, optionally excluding decoys.
-    """
-    df.sort_values("proba", ascending=True, inplace=True)
-    df = keep_best(df, group_columns=group_columns)
-    df = get_q_values(df, "proba", "decoy")
-
-    df_subset = df[df["qval"] < fdr]
-    if remove_decoys:
-        df_subset = df_subset[df_subset["decoy"] == 0]
-
-    # Handle case where no entries are below the FDR threshold
-    if len(df_subset) == 0:
-        df = df[df["decoy"] == 0]
-        df_subset = df.loc[[df["qval"].idxmin()]]
-
-    return df_subset
-
-
-def get_target_decoy_partners(
-    reference_df: pd.DataFrame, full_df: pd.DataFrame, group_by: list[str] | None = None
-) -> pd.DataFrame:
-    """
-    Identifies and returns the corresponding target and decoy wartner rows in full_df given the subset reference_df/
-    This function is typically used to find target-decoy partners based on certain criteria like rank and elution group index.
-
-    Parameters
-    ----------
-    reference_df : pd.DataFrame
-        A subset DataFrame that contains reference values for matching.
-    full_df : pd.DataFrame
-        The main DataFrame from which rows will be matched against reference_df.
-    group_by : list[str] | None, optional
-        The columns to group by when performing the match. Defaults to ['rank', 'elution_group_idx'] if None is provided.
-
-    Returns
-    -------
-    pd.DataFrame
-        A DataFrame containing rows from full_df that match the grouping criteria.
-
-    """
-    if group_by is None:
-        group_by = ["rank", "elution_group_idx"]
-    valid_tuples = reference_df[group_by]
-    matching_rows = full_df.merge(valid_tuples, on=group_by, how="inner")
-
-    return matching_rows
-
-
-class LogisticRegressionClassifier(Classifier):
-    def __init__(self) -> None:
-        """Binary classifier using a logistic regression model."""
-        self.scaler = StandardScaler()
-        self.model = LogisticRegression()
-        self._fitted = False
-
-    @property
-    def fitted(self) -> bool:
-        return self._fitted
-
-    def fit(self, x: np.ndarray, y: np.ndarray) -> None:
-        """Fit the classifier to the data.
-
-        Parameters
-        ----------
-
-        x : np.array, dtype=float
-            Training data of shape (n_samples, n_features).
-
-        y : np.array, dtype=int
-            Target values of shape (n_samples,) or (n_samples, n_classes).
-
-        """
-        x_scaled = self.scaler.fit_transform(x)
-        self.model.fit(x_scaled, y)
-        self._fitted = True
-
-    def predict(self, x: np.ndarray) -> np.ndarray:
-        """Predict the class of the data.
-
-        Parameters
-        ----------
-
-        x : np.array, dtype=float
-            Data of shape (n_samples, n_features).
-
-        Returns
-        -------
-
-        y : np.array, dtype=float
-            Predicted class probabilities of shape (n_samples, n_classes).
-
-        """
-        x_scaled = self.scaler.transform(x)
-        return self.model.predict(x_scaled)
-
-    def predict_proba(self, x: np.ndarray) -> np.ndarray:
-        """Predict the class probabilities of the data.
-
-        Parameters
-        ----------
-
-        x : np.array, dtype=float
-            Data of shape (n_samples, n_features).
-
-        Returns
-        -------
-
-        y : np.array, dtype=float
-            Predicted class probabilities of shape (n_samples, n_classes).
-
-        """
-        x_scaled = self.scaler.transform(x)
-        return self.model.predict_proba(x_scaled)
-
-    def to_state_dict(self) -> dict:
-        """Return the state of the classifier as a dictionary.
-
-        Returns
-        -------
-
-        dict : dict
-            Dictionary containing the state of the classifier.
-
-        """
-        state_dict = {"_fitted": self._fitted}
-
-        if self._fitted:
-            state_dict.update(
-                {
-                    "scaler_mean": self.scaler.mean_,
-                    "scaler_var": self.scaler.var_,
-                    "scaler_scale": self.scaler.scale_,
-                    "scaler_n_samples_seen": self.scaler.n_samples_seen_,
-                    "model_coef": self.model.coef_,
-                    "model_intercept": self.model.intercept_,
-                    "model_classes": self.model.classes_,
-                    "is_fitted": self._fitted,
-                }
-            )
-
-        return state_dict
-
-    def from_state_dict(self, state_dict: dict) -> None:
-        """Load the state of the classifier from a dictionary.
-
-        Parameters
-        ----------
-
-        dict : dict
-            Dictionary containing the state of the classifier.
-
-        """
-        self._fitted = state_dict["_fitted"]
-
-        if self.fitted:
-            self.scaler = StandardScaler()
-            self.scaler.mean_ = np.array(state_dict["scaler_mean"])
-            self.scaler.var_ = np.array(state_dict["scaler_var"])
-            self.scaler.scale_ = np.array(state_dict["scaler_scale"])
-            self.scaler.n_samples_seen_ = np.array(state_dict["scaler_n_samples_seen"])
-
-            self.model = LogisticRegression()
-            self.model.coef_ = np.array(state_dict["model_coef"])
-            self.model.intercept_ = np.array(state_dict["model_intercept"])
-            self.model.classes_ = np.array(state_dict["model_classes"])
-
-
 class BinaryClassifier(Classifier):
     def __init__(
         self,
diff --git a/alphadia/workflow/manager.py b/alphadia/workflow/manager.py
index 197713fe..78dc5160 100644
--- a/alphadia/workflow/manager.py
+++ b/alphadia/workflow/manager.py
@@ -18,9 +18,9 @@
 # alphadia imports
 import alphadia
-import alphadia.fdrexperimental as fdrx
 from alphadia import fdr
 from alphadia.calibration.property import Calibration, calibration_model_provider
+from alphadia.fdr_analysis.models import TwoStepClassifier
 from alphadia.workflow import reporting
 from alphadia.workflow.config import Config
 
@@ -629,7 +629,7 @@ def __init__(
         self.classifier_store = defaultdict(list)
         self.classifier_base = classifier_base
         self.enable_two_step_classifier = isinstance(
-            classifier_base, fdrx.TwoStepClassifier
+            classifier_base, TwoStepClassifier
         )
 
         self._current_version = -1
diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py
index fe1be321..ffcb4eaf 100644
--- a/alphadia/workflow/peptidecentric.py
+++ b/alphadia/workflow/peptidecentric.py
@@ -15,6 +15,7 @@
 # alphadia imports
 from alphadia import fragcomp, plexscoring, utils
+from alphadia.fdr_analysis.models import LogisticRegressionClassifier, TwoStepClassifier
 from alphadia.peakgroup import search
 from alphadia.workflow import base, manager, optimization
 from alphadia.workflow.config import Config
 
@@ -105,8 +106,8 @@ def get_classifier_base(enable_two_step_classifier: bool = False):
     )
 
     if enable_two_step_classifier:
-        return fdrx.TwoStepClassifier(
-            first_classifier=fdrx.LogisticRegressionClassifier(),
+        return TwoStepClassifier(
+            first_classifier=LogisticRegressionClassifier(),
             second_classifier=nn_classifier,
         )
     else:
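
Usage sketch (illustrative, not part of the patch): the snippet below shows the relocated classes being imported from the new alphadia.fdr_analysis.models path and exercises the to_state_dict()/from_state_dict() round trip defined in logistic_regression.py. It assumes this patch is applied and alphadia is importable; using a second LogisticRegressionClassifier inside TwoStepClassifier is only to keep the sketch self-contained, since the actual workflow passes the neural-network classifier (nn_classifier) as the second stage.

    import numpy as np

    from alphadia.fdr_analysis.models import (
        LogisticRegressionClassifier,
        TwoStepClassifier,
    )

    # Synthetic toy data: 100 samples, 3 features, binary target/decoy labels.
    rng = np.random.default_rng(0)
    x = rng.normal(size=(100, 3))
    y = rng.integers(0, 2, size=100)

    # Fit the logistic regression wrapper, then restore a fresh instance from
    # its state dict and check that the predictions are unchanged.
    clf = LogisticRegressionClassifier()
    clf.fit(x, y)

    restored = LogisticRegressionClassifier()
    restored.from_state_dict(clf.to_state_dict())
    assert np.allclose(clf.predict_proba(x), restored.predict_proba(x))

    # Wire the two-step classifier with the same constructor call that
    # get_classifier_base() in peptidecentric.py uses.
    two_step = TwoStepClassifier(
        first_classifier=LogisticRegressionClassifier(),
        second_classifier=LogisticRegressionClassifier(),
    )
    assert not two_step.fitted  # fitted tracks the second-stage classifier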