Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multistep search #399

Merged
merged 43 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
b1f7442
orchestrate multiple steps in cli.py
mschwoer Dec 10, 2024
68e0c09
add "meta" config
mschwoer Dec 10, 2024
996b164
allow to transfer values between runs
mschwoer Dec 10, 2024
e2d6c9a
create log folder if it does not exist
mschwoer Dec 10, 2024
d01b054
update design
mschwoer Dec 11, 2024
2608a47
small improvements
mschwoer Dec 11, 2024
2eba74a
adapt frontend
mschwoer Dec 11, 2024
6c10667
tweaks around logging
mschwoer Dec 11, 2024
c5c2b87
remove passing of step_name
mschwoer Dec 11, 2024
adc94e1
refactoring
mschwoer Dec 11, 2024
40e5278
add multistep to e2e tests
mschwoer Dec 11, 2024
92e9886
minor refactorings
mschwoer Dec 11, 2024
82fdd79
minor refactorings
mschwoer Dec 11, 2024
efb4794
add tests
mschwoer Dec 11, 2024
04ea0c8
add tests
mschwoer Dec 11, 2024
849fbaa
fix tests
mschwoer Dec 11, 2024
a2771a1
fix tests
mschwoer Dec 11, 2024
2561cd4
some fixes and added TODOs
mschwoer Dec 11, 2024
3f51c30
add type hints
mschwoer Dec 11, 2024
1ed6f83
use correct values to pass to mbr step
mschwoer Dec 11, 2024
1064fa8
take dynamic config from whatever step is first
mschwoer Dec 11, 2024
788c706
make tests a bit easier to get
mschwoer Dec 11, 2024
f9f2911
use correct speclib
mschwoer Dec 12, 2024
a767d7d
fix some business errors
mschwoer Dec 12, 2024
0507fb5
change basic_multistep test data
mschwoer Dec 12, 2024
b2093f8
fix multistep config
mschwoer Dec 12, 2024
8ab017b
make tests easier to debug
mschwoer Dec 12, 2024
b600501
make tests easier to debug
mschwoer Dec 12, 2024
f31676a
improve tests
mschwoer Dec 12, 2024
d6ba82c
switch to reading optmization params from file
mschwoer Dec 12, 2024
6cba262
remove extracting data directly from workflow again
mschwoer Dec 12, 2024
99c5771
remove setting quant_dir
mschwoer Dec 12, 2024
252da3a
rename config in tests
mschwoer Dec 12, 2024
9a3c309
fix e2e tests
mschwoer Dec 12, 2024
3ec80d6
fix extraction of values
mschwoer Dec 13, 2024
feaf857
rename basic_multistep e2e test
mschwoer Dec 13, 2024
57efd0d
refactor how config is being merged
mschwoer Dec 13, 2024
ddab4a2
add docs on multistep search
mschwoer Dec 13, 2024
3c39f15
revert some changes
mschwoer Dec 13, 2024
b2be519
Merge branch 'development' into outline_for_multistep_search
mschwoer Dec 20, 2024
b7572b2
install mono in CI
mschwoer Dec 20, 2024
a55a9e1
fix formatting
mschwoer Dec 20, 2024
1b48ce8
Merge branch 'development' into outline_for_multistep_search
mschwoer Jan 8, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .github/workflows/e2e_testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ jobs:
runs-on: self-hosted
if: contains(github.event.pull_request.labels.*.name, 'test:e2e') || github.event_name == 'push' || github.event_name == 'workflow_dispatch'
strategy:
fail-fast: false
matrix:
# test case name as defined in e2e_test_cases.yaml
test_case: [ "basic", "synchropasef", "astral", "astral_automatic_calibration", ]
test_case: [ "basic", "synchropasef", "astral", "astral_automatic_calibration", "multistep"]
env:
RUN_NAME: alphadia-${{github.sha}}-${{github.run_id}}-${{github.run_attempt}}
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
Expand Down Expand Up @@ -48,6 +49,10 @@ jobs:
conda remove -n $RUN_NAME --all -y
- name: Delete Caches on Error
if: ${{ failure() && steps.pip_installation.conclusion == 'failure' }}
shell: bash -el {0}
run: |
rm -rf ~/.cache/pip
rm -rf ~/.cache/conda

# Exit with error code to fail the job
exit 1
19 changes: 8 additions & 11 deletions alphadia/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@
import re
import sys

import matplotlib
import yaml

import alphadia
from alphadia import utils
from alphadia.exceptions import CustomError
from alphadia.search_plan import SearchPlan
from alphadia.workflow import reporting

logger = logging.getLogger()
Expand Down Expand Up @@ -205,6 +207,7 @@ def run(*args, **kwargs):

raw_path_list = _get_raw_path_list_from_args_and_config(args, user_config)
logger.progress(f"Searching {len(raw_path_list)} files:")

for f in raw_path_list:
logger.progress(f" {os.path.basename(f)}")

Expand All @@ -225,24 +228,18 @@ def run(*args, **kwargs):
if quant_dir is not None:
logger.progress(f"Saving quantification output to {quant_dir=}")

try:
import matplotlib

# important to suppress matplotlib output
matplotlib.use("Agg")
# important to suppress matplotlib output
matplotlib.use("Agg")

from alphadia.planning import Plan

plan = Plan(
try:
SearchPlan(
output_directory,
raw_path_list=raw_path_list,
library_path=library_path,
fasta_path_list=fasta_path_list,
config=user_config,
quant_path=quant_dir,
)

plan.run()
).run_plan()

except Exception as e:
if isinstance(e, CustomError):
Expand Down
5 changes: 5 additions & 0 deletions alphadia/constants/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -423,3 +423,8 @@ calibration_manager:
- mobility_observed
output_columns:
- mobility_calibrated

# scope of default yaml should be one search step
multistep_search:
transfer_step_enabled: False
mbr_step_enabled: False
47 changes: 47 additions & 0 deletions alphadia/constants/multistep.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# configuration for multistep search
# for each of the three steps, the configuration values defined here override the default values and the values defined by the user

# future : default.yaml -> user.yaml -> multistep_user.yaml -> multistep.yaml

transfer:
# library_prediction: # should be done in 99% of cases
# predict: True
transfer_library:
enabled: True
transfer_learning:
enabled: True

# override settings that could have been set by the user:
general:
save_library: False
reuse_quant: False
# TODO: think about enforcing optimization of rt here

library:
# the step following TL needs to have this. It will be forced to True (by code) only if the transfer step was done before
# library_prediction:
# predict: True

# override settings that could have been set by the user:
general:
save_library: False
reuse_quant: False
transfer_library:
enabled: False
transfer_learning:
enabled: False

mbr:
fdr:
inference_strategy: library
search:
target_num_candidates: 5
# override settings that could have been set by the user:
general:
reuse_quant: False
library_prediction:
predict: False
transfer_library:
enabled: False
transfer_learning:
enabled: False
18 changes: 12 additions & 6 deletions alphadia/outputtransform.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,15 @@
)
from alphadia.transferlearning.train import FinetuneManager
from alphadia.workflow import manager, peptidecentric
from alphadia.workflow.config import Config
from alphadia.workflow.managers.raw_file_manager import RawFileManager

# TODO move to a class with the rest of the constants
MS1_ERROR = "ms1_error"
MS2_ERROR = "ms2_error"

OPTIMIZATION_PREFIX = "optimization."

logger = logging.getLogger()


Expand Down Expand Up @@ -306,7 +313,7 @@ class SearchPlanOutput:
TRANSFER_MODEL = "peptdeep.transfer"
TRANSFER_STATS_OUTPUT = "stats.transfer"

def __init__(self, config: dict, output_folder: str):
def __init__(self, config: Config, output_folder: str):
"""Combine individual searches and build combined outputs

In alphaDIA the search plan orchestrates the library building preparation,
Expand Down Expand Up @@ -974,16 +981,15 @@ def _build_run_stat_df(
optimization_manager = manager.OptimizationManager(
path=optimization_manager_path
)
optimization_stats["ms2_error"] = optimization_manager.ms2_error
optimization_stats["ms1_error"] = optimization_manager.ms1_error
optimization_stats[MS2_ERROR] = optimization_manager.ms2_error
optimization_stats[MS1_ERROR] = optimization_manager.ms1_error
optimization_stats["rt_error"] = optimization_manager.rt_error
optimization_stats["mobility_error"] = optimization_manager.mobility_error
else:
logger.warning(f"Error reading optimization manager for {raw_name}")

prefix = "optimization."
for key in ["ms2_error", "ms1_error", "rt_error", "mobility_error"]:
stats[f"{prefix}{key}"] = optimization_stats[key]
for key in [MS2_ERROR, MS1_ERROR, "rt_error", "mobility_error"]:
stats[f"{OPTIMIZATION_PREFIX}{key}"] = optimization_stats[key]

# collect calibration stats
calibration_stats = defaultdict(lambda: np.nan)
Expand Down
67 changes: 41 additions & 26 deletions alphadia/planning.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,14 @@
from alphadia.exceptions import CustomError
from alphadia.workflow import peptidecentric, reporting
from alphadia.workflow.base import WorkflowBase
from alphadia.workflow.config import Config
from alphadia.workflow.config import MULTISTEP_SEARCH, USER_DEFINED, Config

SPECLIB_FILE_NAME = "speclib.hdf"

logger = logging.getLogger()


class Plan:
class Plan: # TODO rename -> SearchStep, planning.py -> search_step.py
def __init__(
self,
output_folder: str,
Expand All @@ -42,6 +44,7 @@ def __init__(
fasta_path_list: list[str] | None = None,
config: dict | Config | None = None,
config_base_path: str | None = None,
extra_config: dict | None = None,
quant_path: str | None = None,
) -> None:
"""Highest level class to plan a DIA Search.
Expand All @@ -64,15 +67,19 @@ def __init__(
list of fasta file locations to build the library from

config_base_path : str, optional
yaml file containing the default config.
user-provided yaml file containing the default config.

config : dict, optional
dict to update the default config. Can be used for debugging purposes etc.
user-provided dict to update the default config. Can be used for debugging purposes etc.

extra_config : dict, optional
dict to update the final config. Used for multistep searches.

quant_path : str, optional
path to directory to save the quantification results (psm & frag parquet files). If not provided, the results are saved in the usual workflow folder

"""

if config is None:
config = {}
if fasta_path_list is None:
Expand All @@ -81,32 +88,29 @@ def __init__(
raw_path_list = []

self.output_folder = output_folder
os.makedirs(output_folder, exist_ok=True)
reporting.init_logging(self.output_folder)

self.raw_path_list = raw_path_list
self.library_path = library_path
self.fasta_path_list = fasta_path_list
self.quant_path = quant_path

self.spectral_library = None

# needs to be done before any logging:
reporting.init_logging(self.output_folder)

self._print_logo()

self._print_environment()

self._config = self._init_config(config, output_folder, config_base_path)
self._config = self._init_config(
config, extra_config, output_folder, config_base_path
)

level_to_set = self._config["general"]["log_level"]
level_code = logging.getLevelName(level_to_set)
logger.setLevel(level_code)
logger.setLevel(logging.getLevelName(self._config["general"]["log_level"]))

self.init_alphabase()
self.load_library()

torch.set_num_threads(self._config["general"]["thread_count"])

def _print_logo(self) -> None:
@staticmethod
def print_logo() -> None: # TODO move elsewhere
"""Print the alphadia logo and version."""
logger.progress(" _ _ ___ ___ _ ")
logger.progress(r" __ _| |_ __| |_ __ _| \_ _| /_\ ")
Expand All @@ -119,6 +123,7 @@ def _print_logo(self) -> None:
def _init_config(
self,
user_config: dict | Config,
extra_config: dict,
output_folder: str,
config_base_path: str | None,
) -> Config:
Expand All @@ -131,20 +136,29 @@ def _init_config(
os.path.dirname(__file__), "constants", "default.yaml"
)

logger.info(f"loading default config from {config_base_path}")
logger.info(f"loading config from {config_base_path}")
config = Config()
config.from_yaml(config_base_path)

config_updates = []
# load update config from dict
if isinstance(user_config, dict):
update_config = Config("user defined")
update_config.from_dict(user_config)
user_config_update = Config(USER_DEFINED)
user_config_update.from_dict(user_config)
config_updates.append(user_config_update)
elif isinstance(user_config, Config):
update_config = user_config
config_updates.append(user_config)
else:
raise ValueError("'config' parameter must be of type 'dict' or 'Config'")

config.update([update_config], print_modifications=True)
if extra_config is not None:
extra_config_update = Config(MULTISTEP_SEARCH)
extra_config_update.from_dict(extra_config)
# need to overwrite user-defined output folder here
extra_config["output"] = output_folder
config_updates.append(extra_config_update)

config.update(config_updates, print_modifications=True)

if "output" not in config:
config["output"] = output_folder
Expand All @@ -169,7 +183,8 @@ def spectral_library(self) -> SpecLibFlat:
def spectral_library(self, spectral_library: SpecLibFlat) -> None:
self._spectral_library = spectral_library

def _print_environment(self) -> None:
@staticmethod
def print_environment() -> None: # TODO move elsewhere
"""Log information about the python environment."""

logger.progress(f"hostname: {socket.gethostname()}")
Expand Down Expand Up @@ -285,7 +300,7 @@ def _parse_modifications(mod_str: str) -> list[str]:
)
spectral_library = multiplexing(spectral_library)

library_path = os.path.join(self.output_folder, "speclib.hdf")
library_path = os.path.join(self.output_folder, SPECLIB_FILE_NAME)
logger.info(f"Saving library to {library_path}")
spectral_library.save_hdf(library_path)

Expand Down Expand Up @@ -344,15 +359,15 @@ def run(
raise e

finally:
if workflow.reporter:
if workflow and workflow.reporter:
workflow.reporter.log_string(f"Finished workflow for {raw_name}")
workflow.reporter.context.__exit__(None, None, None)
del workflow

try:
base_spec_lib = SpecLibBase()
base_spec_lib.load_hdf(
os.path.join(self.output_folder, "speclib.hdf"), load_mod_seq=True
os.path.join(self.output_folder, SPECLIB_FILE_NAME), load_mod_seq=True
)

output = outputtransform.SearchPlanOutput(self.config, self.output_folder)
Expand Down Expand Up @@ -415,7 +430,7 @@ def _process_raw_file(
def _clean(self):
if not self.config["general"]["save_library"]:
try:
os.remove(os.path.join(self.output_folder, "speclib.hdf"))
os.remove(os.path.join(self.output_folder, SPECLIB_FILE_NAME))
except Exception as e:
logger.exception(f"Error deleting library: {e}")

Expand Down
Loading
Loading