From 9b6e545c5439247a6021a403967d8907a7ff5d62 Mon Sep 17 00:00:00 2001 From: Mohamed Sameh Date: Mon, 30 Dec 2024 17:40:39 +0100 Subject: [PATCH] test: unit tests for output accumulator postproc --- tests/unit_tests/conftest.py | 21 +++++++ tests/unit_tests/test_outputaccumulator.py | 66 +++++++++++++++++++++- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py index e6c8ecf3..e5f25adc 100644 --- a/tests/unit_tests/conftest.py +++ b/tests/unit_tests/conftest.py @@ -63,6 +63,7 @@ def mock_precursor_df( df = pd.DataFrame( { + "decoy": decoy, "mz_library": precursor_mz, "rt_library": random_rt, @@ -167,6 +168,26 @@ def mock_fragment_df(n_fragments: int = 10, n_precursor: int = 20): } ) +def mock_fragment_correlation_df(fragments_df: pd.DataFrame): + """Create a mock fragment correlation dataframe as it's found as the individual search outputs + + Parameters + ---------- + + fragments_df : pd.DataFrame + A mock fragment dataframe + + Returns + ------- + + fragment_correlation_df : pd.DataFrame + A mock fragment correlation dataframe + """ + # create random correlation values between 0 and 1 + fragments_shape = fragments_df.shape + fragment_correlation = np.random.rand(*fragments_shape) + fragment_correlation_df = pd.DataFrame(fragment_correlation, columns=fragments_df.columns) + return fragment_correlation_df def pytest_configure(config): test_data_path = os.environ.get("TEST_DATA_DIR", None) diff --git a/tests/unit_tests/test_outputaccumulator.py b/tests/unit_tests/test_outputaccumulator.py index 896aa06d..321443ac 100644 --- a/tests/unit_tests/test_outputaccumulator.py +++ b/tests/unit_tests/test_outputaccumulator.py @@ -5,11 +5,12 @@ import numpy as np import pandas as pd from alphabase.spectral_library.base import SpecLibBase -from conftest import mock_fragment_df, mock_precursor_df +from alphabase.spectral_library.flat import SpecLibFlat +from conftest import mock_fragment_df, 
mock_precursor_df,mock_fragment_correlation_df from alphadia import outputtransform from alphadia.workflow.base import QUANT_FOLDER_NAME - +from alphadia.outputaccumulator import ms2_quality_control def prepare_input_data(): """ @@ -247,3 +248,64 @@ def test_default_column_assignment(): assert built_lib.precursor_df[f"{col}"].equals( built_lib.precursor_df[f"{col}_library"] ), f"{col} != {col}_library" + +def test_non_nan_fragments(): + """ + Test that the accumulated fragments data frame has no nan values + """ + # Given: + config, temp_folder, raw_folders, psm_dfs, fragment_dfs = prepare_input_data() + keep_top = 2 + config["transfer_library"]["top_k_samples"] = keep_top + + # When: + output = outputtransform.SearchPlanOutput(config, temp_folder) + _ = output.build_transfer_library(raw_folders, save=True) + built_lib = SpecLibBase() + built_lib.load_hdf( + os.path.join(temp_folder, f"{output.TRANSFER_OUTPUT}.hdf"), load_mod_seq=True + ) + + # Then: The fragment dataframe should have no nan values + assert not built_lib.fragment_intensity_df.isnull().values.any(), "There are nan values in the fragment dataframe" + + shutil.rmtree(temp_folder) + +def test_use_for_ms2(): + """ + Test that the ms2 quality control is correctly applied by checking the use_for_ms2 column in the precursor_df + """ + # Given: + psm_flat_df = mock_precursor_df(n_precursor=100, with_decoy=True) + fragment_flat_df = mock_fragment_df(n_precursor=100, n_fragments=10) + psm_flat_df = psm_flat_df.sort_values(by="precursor_idx") + fragment_flat_df = fragment_flat_df.sort_values(by="precursor_idx") + psm_flat_df["flat_frag_start_idx"] = np.arange(0, len(psm_flat_df) * 10, 10) + psm_flat_df["flat_frag_stop_idx"] = np.arange(0, len(psm_flat_df) * 10, 10) + 9 + psm_flat_df['nAA'] =psm_flat_df.sequence.str.len().astype(np.int32) + fragment_flat_df["loss_type"] = 0 + flat_spec_lib = SpecLibFlat() + flat_spec_lib._precursor_df = psm_flat_df + flat_spec_lib._fragment_df = fragment_flat_df + spec_lib = 
flat_spec_lib.to_SpecLibBase() + fragment_correlation_base_df = mock_fragment_correlation_df(spec_lib.fragment_intensity_df) + spec_lib._fragment_correlation_df = fragment_correlation_base_df + precursor_correlation_cutoff = 0.5 + fragment_correlation_ratio = 0.75 + + base_precursor_df = spec_lib.precursor_df.copy() + base_fragment_df = spec_lib.fragment_intensity_df.copy() + # When: + ms2_quality_control(spec_lib, precursor_correlation_cutoff, fragment_correlation_ratio) + + # Then: The use_for_ms2 column should be correctly assigned for precursors with median fragment correlation above precursor_correlation_cutoff + target_use_for_ms2 = [] + for frag_start,frag_stop in zip(base_precursor_df["frag_start_idx"],base_precursor_df["frag_stop_idx"]): + frag_corr = fragment_correlation_base_df.iloc[frag_start:frag_stop].values + frag_intensities = base_fragment_df.iloc[frag_start:frag_stop].values + # median corr of non zero intensities + frag_corr = frag_corr[frag_intensities>0] + median_frag_corr = np.median(frag_corr) if len(frag_corr) > 0 else 0 + target_use_for_ms2.append(median_frag_corr > precursor_correlation_cutoff) + + np.testing.assert_array_equal(spec_lib.precursor_df["use_for_ms2"].values, target_use_for_ms2)