Adapt hyperparameter scaling #447

Merged · 6 commits · Jan 24, 2025
5 changes: 5 additions & 0 deletions alphadia/constants/default.yaml
@@ -233,7 +233,12 @@ fdr:
keep_decoys: false
channel_wise_fdr: false
inference_strategy: "heuristic"
# (Experimental)
# uses a two-step classifier consisting of a logistic regression and a neural network
enable_two_step_classifier: false
# (Experimental)
# Optimizes the batch size and learning rate of the neural network
enable_nn_hyperparameter_tuning: false

search_output:
peptide_level_lfq: false
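Both new flags default to `false`. A minimal sketch of how they might be switched on for a run, assuming the shipped `default.yaml` is loaded into a nested dict before the workflow reads it (the file path and override mechanism are illustrative; only the key names come from the diff above):

```python
import yaml  # assumes PyYAML is available

# Illustrative only: load the shipped defaults and flip the two
# experimental FDR flags introduced in this PR.
with open("alphadia/constants/default.yaml") as f:
    config = yaml.safe_load(f)

config["fdr"]["enable_two_step_classifier"] = True
config["fdr"]["enable_nn_hyperparameter_tuning"] = True
```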
4 changes: 2 additions & 2 deletions alphadia/fdrexperimental.py
@@ -908,7 +908,7 @@ def predict_proba(self, x: np.ndarray):
return self.network(torch.Tensor(x)).detach().numpy()


def get_scaled_training_params(df, base_lr=0.001, max_batch=1024, min_batch=64):
def get_scaled_training_params(df, base_lr=0.001, max_batch=4096, min_batch=128):
"""
Scale batch size and learning rate based on dataframe size using square root relationship.

@@ -921,7 +921,7 @@ def get_scaled_training_params(df, base_lr=0.001, max_batch=1024, min_batch=64):
max_batch : int, optional
Maximum batch size (1024 for >= 1M samples), defaults to 1024
min_batch : int, optional
Minimum batch size, defaults to 32
Minimum batch size, defaults to 128

Returns
-------
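For readers of this hunk: the docstring's square-root relationship means the batch size grows linearly with the number of rows up to `max_batch` at 1M samples (clamped below at `min_batch`), and the learning rate is scaled by the square root of the resulting batch-size ratio. The sketch below restates that behaviour with the new defaults; it is an approximation consistent with the updated tests, not a copy of the library implementation.

```python
import numpy as np
import pandas as pd

def scaled_training_params_sketch(df: pd.DataFrame,
                                  base_lr: float = 0.001,
                                  max_batch: int = 4096,
                                  min_batch: int = 128) -> tuple[int, float]:
    """Sketch of the square-root scaling described above (assumed behaviour)."""
    n_samples = len(df)
    # Batch size scales linearly towards max_batch, reached at >= 1M samples,
    # then is clamped to the [min_batch, max_batch] range.
    batch_size = int(np.clip(max_batch * n_samples / 1_000_000, min_batch, max_batch))
    # Learning rate follows the square root of the batch-size ratio.
    learning_rate = base_lr * np.sqrt(batch_size / max_batch)
    return batch_size, learning_rate
```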
20 changes: 16 additions & 4 deletions alphadia/workflow/peptidecentric.py
@@ -97,7 +97,9 @@


def get_classifier_base(
enable_two_step_classifier: bool = False, fdr_cutoff: float = 0.01
enable_two_step_classifier: bool = False,
enable_nn_hyperparameter_tuning: bool = False,
fdr_cutoff: float = 0.01,
):
"""Creates and returns a classifier base instance.

Expand All @@ -106,6 +108,11 @@ def get_classifier_base(
enable_two_step_classifier : bool, optional
If True, uses logistic regression + neural network.
If False (default), uses only neural network.

enable_nn_hyperparameter_tuning: bool, optional
If True, uses hyperparameter tuning for the neural network.
If False (default), uses default hyperparameters for the neural network.

fdr_cutoff : float, optional
The FDR cutoff threshold used by the second classifier when two-step
classification is enabled. Default is 0.01.
Expand All @@ -120,7 +127,7 @@ def get_classifier_base(
batch_size=5000,
learning_rate=0.001,
epochs=10,
experimental_hyperparameter_tuning=True,
experimental_hyperparameter_tuning=enable_nn_hyperparameter_tuning,
)

if enable_two_step_classifier:
@@ -168,8 +175,13 @@ def init_fdr_manager(self):
self.fdr_manager = manager.FDRManager(
feature_columns=feature_columns,
classifier_base=get_classifier_base(
self.config["fdr"]["enable_two_step_classifier"],
self.config["fdr"]["fdr"],
enable_two_step_classifier=self.config["fdr"][
"enable_two_step_classifier"
],
enable_nn_hyperparameter_tuning=self.config["fdr"][
"enable_nn_hyperparameter_tuning"
],
fdr_cutoff=self.config["fdr"]["fdr"],
),
)

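For orientation, a direct call to the extended factory with the new keyword arguments would look roughly as follows (values are illustrative; the function and parameter names come from the diff above):

```python
# Illustrative call; in the workflow these values are read from config["fdr"].
classifier = get_classifier_base(
    enable_two_step_classifier=False,        # default: single neural-network classifier
    enable_nn_hyperparameter_tuning=True,    # tune batch size and learning rate
    fdr_cutoff=0.01,
)
```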
18 changes: 16 additions & 2 deletions gui/workflows/PeptideCentric.v1.json
@@ -72,14 +72,14 @@
"parameters": [
{
"id": "transfer_step_enabled",
"name": "Add 'transfer learning' step",
"name": "Transfer Learning Step (Experimental)",
"value": false,
"description": "Whether to perform a 'transfer learning' step before the first search. All parameters set here will also be used for this step (except those required to switch on the specific behaviour of this step).",
"type": "boolean"
},
{
"id": "mbr_step_enabled",
"name": "Add 'second search' step",
"name": "MBR Search Step (Experimental)",
"value": false,
"description": "Whether to perform a 'second search' step after the first search. All parameters set here will also be used for this step (except those required to switch on the specific behaviour of this step).",
"type": "boolean"
@@ -402,6 +402,20 @@
"value": false,
"description": "If enabled, decoy PSMs will be retained in the output.",
"type": "boolean"
},
{
"id": "enable_two_step_classifier",
"name": "Two Step Classifier (Experimental)",
"value": false,
"description": "If enabled, a two step classifier consisting of a linear filter and a neural network will be used.",
"type": "boolean"
},
{
"id": "enable_nn_hyperparameter_tuning",
"name": "Hyperparameter Tuning (Experimental)",
"value": false,
"description": "If enabled, the hyperparameters of the neural network like the batch size and learning rate will be tuned.",
"type": "boolean"
}
]
},
3 changes: 2 additions & 1 deletion requirements/requirements_loose.txt
@@ -4,7 +4,8 @@ numba
argparse
alpharaw>=0.3.1 # test: tolerate_version
alphatims
alphabase>=1.4.0 # test: tolerate_version
# TODO remove once compatible with alphabase>=1.5.0
alphabase>=1.4.0,<1.5.0 # test: tolerate_version
peptdeep>=1.3.0 # test: tolerate_version
dask==2024.11.2 # test: tolerate_version
progressbar
12 changes: 6 additions & 6 deletions tests/unit_tests/test_fdrx_base.py
@@ -53,14 +53,14 @@ def test_target_decoy_fdr(mock_show):
"n_samples,expected_batch,expected_lr",
[
# Large dataset case (≥1M samples)
(1_000_000, 1024, 0.001),
(2_000_000, 1024, 0.001),
(1_000_000, 4096, 0.001),
(2_000_000, 4096, 0.001),
# Mid-size dataset cases
(500_000, 512, 0.001 * np.sqrt(512 / 1024)), # 50% of max
(250_000, 256, 0.001 * np.sqrt(256 / 1024)), # 25% of max
(500_000, 2048, 0.001 * np.sqrt(2048 / 4096)), # 50% of max
(250_000, 1024, 0.001 * np.sqrt(1024 / 4096)), # 25% of max
# Small dataset cases
(50_000, 64, 0.001 * np.sqrt(64 / 1024)), # Should hit min batch size
(1_000, 64, 0.001 * np.sqrt(64 / 1024)), # Should hit min batch size
(25_000, 128, 0.001 * np.sqrt(128 / 4096)), # Should hit min batch size
(1_000, 128, 0.001 * np.sqrt(128 / 4096)), # Should hit min batch size
],
)
def test_get_scaled_training_params(n_samples, expected_batch, expected_lr):
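As a quick hand check of the updated expectations: with `max_batch = 4096`, 250,000 samples scale to 4096 × 0.25 = 1024, and the learning rate becomes 0.001 × sqrt(1024 / 4096) = 0.0005. A standalone snippet verifying that single case from the formula alone, independent of the library code:

```python
import numpy as np

# Hand-check one mid-size case from the parametrization above.
expected_batch = 4096 * 250_000 // 1_000_000          # -> 1024
expected_lr = 0.001 * np.sqrt(expected_batch / 4096)  # -> 0.0005
assert expected_batch == 1024
assert np.isclose(expected_lr, 0.0005)
```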