Add unit tests for the outputaccumulator post processing. #422

Status: Open. Wants to merge 3 commits into base: development (showing changes from 1 commit).
21 changes: 21 additions & 0 deletions tests/unit_tests/conftest.py
@@ -63,6 +63,7 @@ def mock_precursor_df(

df = pd.DataFrame(
{

"decoy": decoy,
"mz_library": precursor_mz,
"rt_library": random_rt,
@@ -167,6 +168,26 @@ def mock_fragment_df(n_fragments: int = 10, n_precursor: int = 20):
}
)

def mock_fragment_correlation_df(fragments_df: pd.DataFrame):
"""Create a mock fragment correlation dataframe as it's found as the individual search outputs

Parameters
----------

fragments_df : pd.DataFrame
A mock fragment dataframe

Returns
-------

fragment_correlation_df : pd.DataFrame
A mock fragment correlation dataframe
"""
# create random correlation values between 0 and 1
fragments_shape = fragments_df.shape
fragment_correlation = np.random.rand(*fragments_shape)
Review comment (Collaborator):
Could we use a fixed seed here to make the tests reproducible?
If not, it would be good to print out the generated data; otherwise debugging tests will be a nightmare.

Reply (Collaborator, Author):
Currently the mocks for precursor_df, fragment_df and the fragment correlation are completely random.
@GeorgWa, do you think we can fix the seed for all of them?
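For reference, a minimal sketch of one way this mock could be made reproducible, assuming the fixture is free to take an optional seed argument (the `seed` parameter and its default are assumptions, not part of this PR):

```python
import numpy as np
import pandas as pd


def mock_fragment_correlation_df(fragments_df: pd.DataFrame, seed: int = 42) -> pd.DataFrame:
    """Create a mock fragment correlation dataframe with reproducible random values."""
    # a local Generator keeps the seed scoped to this mock instead of
    # mutating numpy's global random state
    rng = np.random.default_rng(seed)
    fragment_correlation = rng.random(fragments_df.shape)
    return pd.DataFrame(fragment_correlation, columns=fragments_df.columns)
```

The same pattern would apply to mock_precursor_df and mock_fragment_df if the seed is to be fixed for all of them.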

fragment_correlation_df = pd.DataFrame(fragment_correlation, columns=fragments_df.columns)
return fragment_correlation_df

def pytest_configure(config):
test_data_path = os.environ.get("TEST_DATA_DIR", None)
66 changes: 64 additions & 2 deletions tests/unit_tests/test_outputaccumulator.py
@@ -5,11 +5,12 @@
import numpy as np
import pandas as pd
from alphabase.spectral_library.base import SpecLibBase
from conftest import mock_fragment_df, mock_precursor_df
from alphabase.spectral_library.flat import SpecLibFlat
from conftest import mock_fragment_df, mock_precursor_df, mock_fragment_correlation_df

from alphadia import outputtransform
from alphadia.workflow.base import QUANT_FOLDER_NAME

from alphadia.outputaccumulator import ms2_quality_control

def prepare_input_data():
"""
@@ -247,3 +248,64 @@ def test_default_column_assignment():
assert built_lib.precursor_df[f"{col}"].equals(
built_lib.precursor_df[f"{col}_library"]
), f"{col} != {col}_library"

def test_non_nan_fragments():
"""
Test that the accumulated fragments data frame has no nan values
"""
# Given:
config, temp_folder, raw_folders, psm_dfs, fragment_dfs = prepare_input_data()
keep_top = 2
config["transfer_library"]["top_k_samples"] = keep_top

# When:
output = outputtransform.SearchPlanOutput(config, temp_folder)
_ = output.build_transfer_library(raw_folders, save=True)
built_lib = SpecLibBase()
built_lib.load_hdf(
os.path.join(temp_folder, f"{output.TRANSFER_OUTPUT}.hdf"), load_mod_seq=True
)

# Then: The fragment dataframe should have no nan values
assert not built_lib.fragment_intensity_df.isnull().values.any(), "There are nan values in the fragment dataframe"

shutil.rmtree(temp_folder)

def test_use_for_ms2():
"""
Test that the ms2 quality control is correctly applied by checking the use_for_ms2 column in the precursor_df
"""
# Given:
psm_flat_df = mock_precursor_df(n_precursor=100, with_decoy=True)
fragment_flat_df = mock_fragment_df(n_precursor=100, n_fragments=10)
psm_flat_df = psm_flat_df.sort_values(by="precursor_idx")
fragment_flat_df = fragment_flat_df.sort_values(by="precursor_idx")
psm_flat_df["flat_frag_start_idx"] = np.arange(0, len(psm_flat_df) * 10, 10)
psm_flat_df["flat_frag_stop_idx"] = np.arange(0, len(psm_flat_df) * 10, 10) + 9
psm_flat_df['nAA'] =psm_flat_df.sequence.str.len().astype(np.int32)
Review comment (Collaborator):
please install the pre-commit hook :-)

Reply (Collaborator, Author):
This is strange; I already have the hooks installed, ran them locally, and all checks passed.
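For context, running the repository's formatter via the pre-commit hook (which exact tool it invokes is an assumption here) would normalize the flagged assignment roughly to:

```python
# spacing around "=" and double quotes, as a formatter such as black or ruff-format would emit
psm_flat_df["nAA"] = psm_flat_df.sequence.str.len().astype(np.int32)
```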

fragment_flat_df["loss_type"] = 0
flat_spec_lib = SpecLibFlat()
flat_spec_lib._precursor_df = psm_flat_df
flat_spec_lib._fragment_df = fragment_flat_df
spec_lib = flat_spec_lib.to_SpecLibBase()
fragment_correlation_base_df = mock_fragment_correlation_df(spec_lib.fragment_intensity_df)
spec_lib._fragment_correlation_df = fragment_correlation_base_df
precursor_correlation_cutoff = 0.5
fragment_correlation_ratio = 0.75

base_precursor_df = spec_lib.precursor_df.copy()
base_fragment_df = spec_lib.fragment_intensity_df.copy()
# When:
ms2_quality_control(spec_lib, precursor_correlation_cutoff, fragment_correlation_ratio)

# Then: The use_for_ms2 column should be correctly assigned for precursors with median fragment correlation above precursor_correlation_cutoff
target_use_for_ms2 = []
for frag_start, frag_stop in zip(base_precursor_df["frag_start_idx"], base_precursor_df["frag_stop_idx"]):
frag_corr = fragment_correlation_base_df.iloc[frag_start:frag_stop].values
frag_intensities = base_fragment_df.iloc[frag_start:frag_stop].values
# median correlation of non-zero intensities
frag_corr = frag_corr[frag_intensities > 0]
median_frag_corr = np.median(frag_corr) if len(frag_corr) > 0 else 0
target_use_for_ms2.append(median_frag_corr > precursor_correlation_cutoff)

np.testing.assert_array_equal(spec_lib.precursor_df["use_for_ms2"].values, target_use_for_ms2)