From 993d25bbcfc0a9880bcedc583eebd9f62862fc67 Mon Sep 17 00:00:00 2001 From: elena-krismer Date: Thu, 31 Aug 2023 08:29:04 +0200 Subject: [PATCH] #231 replace zero with nan parameter --- alphastats/DataSet.py | 6 ++++++ alphastats/loader/AlphaPeptLoader.py | 13 ++++++++----- alphastats/loader/BaseLoader.py | 3 ++- alphastats/loader/DIANNLoader.py | 3 +++ alphastats/loader/FragPipeLoader.py | 10 ++++++++++ alphastats/loader/GenericLoader.py | 6 ++++-- alphastats/loader/MaxQuantLoader.py | 3 +++ alphastats/loader/SpectronautLoader.py | 20 ++++++++++++-------- alphastats/loader/mzTabLoader.py | 5 ++++- 9 files changed, 52 insertions(+), 17 deletions(-) diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py index 73dd7885..60359c41 100644 --- a/alphastats/DataSet.py +++ b/alphastats/DataSet.py @@ -67,6 +67,7 @@ def __init__(self, loader, metadata_path=None, sample_column=None): self.filter_columns = loader.filter_columns self.evidence_df = loader.evidence_df self.gene_names = loader.gene_names + self.replace_zero_with_nan = loader.replace_zero_with_nan # include filtering before self.create_matrix() @@ -169,6 +170,10 @@ def create_matrix(self): # transpose dataframe mat = df.transpose() mat.replace([np.inf, -np.inf], np.nan, inplace=True) + + if self.replace_zero_with_nan: + mat.replace([0], np.nan, inplace=True) + # remove proteins with only zero self.mat = mat.loc[:, (mat != 0).any(axis=0)] self.mat = self.mat.astype(float) @@ -222,6 +227,7 @@ def _save_dataset_info(self): "Contamination columns": self.filter_columns, "Number of removed ProteinGroups due to contaminaton": 0, "Data completeness cut-off": 0, + "Zeros replaced with NaN": self.replace_zero_with_nan, } return preprocessing_dict diff --git a/alphastats/loader/AlphaPeptLoader.py b/alphastats/loader/AlphaPeptLoader.py index a3112729..b2fe4ff3 100644 --- a/alphastats/loader/AlphaPeptLoader.py +++ b/alphastats/loader/AlphaPeptLoader.py @@ -2,7 +2,7 @@ import pandas as pd import numpy as np import logging - 
+from typing import Union class AlphaPeptLoader(BaseLoader): """Loader for AlphaPept outputfiles @@ -11,10 +11,11 @@ class AlphaPeptLoader(BaseLoader): def __init__( self, - file, - intensity_column="[sample]_LFQ", - index_column="Unnamed: 0", # column name to be changed - sep=",", + file:Union[str, pd.DataFrame], + intensity_column:str="[sample]_LFQ", + index_column:str="Unnamed: 0", # column name to be changed + sep:str=",", + replace_zero_with_nan:bool=True, **kwargs ): """Loads Alphapept output: results_proteins.csv. Will add contamination column for further analysis. @@ -23,6 +24,7 @@ def __init__( file (str): AlphaPept output, either results_proteins.csv file or the hdf_file with the protein_table given intensity_column (str, optional): columns where the intensity of the proteins are given. Defaults to "[sample]_LFQ". index_column (str, optional): column indicating the protein groups. Defaults to "Unnamed: 0". + replace_zero_with_nan (bool, optional): whether zero values should be replaced with NaN when loading the data. Defaults to True. sep (str, optional): file separation of file. Defaults to ",". 
""" @@ -33,6 +35,7 @@ def __init__( self.intensity_column = intensity_column self.index_column = index_column + self.replace_zero_with_nan = replace_zero_with_nan self.filter_columns = [] self.confidence_column = None self.software = "AlphaPept" diff --git a/alphastats/loader/BaseLoader.py b/alphastats/loader/BaseLoader.py index d80a53e5..88c3d06c 100644 --- a/alphastats/loader/BaseLoader.py +++ b/alphastats/loader/BaseLoader.py @@ -9,7 +9,7 @@ class BaseLoader: """Parent class of Loaders""" - def __init__(self, file:Union[str, pd.DataFrame], intensity_column:Union[str, list], index_column:str, sep:str): + def __init__(self, file:Union[str, pd.DataFrame], intensity_column:Union[str, list], index_column:str, sep:str, replace_zero_with_nan=True): """BaseLoader for AlphaPept, MaxQuant, Fragpipe, Spectronau and DIANNLoader Args: @@ -30,6 +30,7 @@ def __init__(self, file:Union[str, pd.DataFrame], intensity_column:Union[str, li self.evidence_df = None self.gene_names = None self.ptm_df = None + self.replace_zero_with_nan = replace_zero_with_nan self._add_contamination_column() self._check_if_columns_are_present() self._read_all_columns_as_string() diff --git a/alphastats/loader/DIANNLoader.py b/alphastats/loader/DIANNLoader.py index 3c78e31d..a23109f7 100644 --- a/alphastats/loader/DIANNLoader.py +++ b/alphastats/loader/DIANNLoader.py @@ -13,6 +13,7 @@ def __init__( intensity_column="[sample]", index_column="Protein.Group", sep="\t", + replace_zero_with_nan = True, **kwargs ): """Import DIA-NN output data report.pg_matrix.tsv @@ -21,11 +22,13 @@ def __init__( file (str): DIA-NN output file report.pg_matrix.tsv intensity_column (str, optional): columns containing the intensity column for each experiment. Defaults to "[experiment]". index_column (str, optional): column with the Protein IDs. Defaults to "Protein.Group". + replace_zero_with_nan (bool, optional): whether zero values should be replaced with NaN when loading the data. Defaults to True. 
sep (str, optional): file separation of the input file. Defaults to "\t". """ super().__init__(file, intensity_column, index_column, sep) self.software = "DIANN" + self.replace_zero_with_nan = replace_zero_with_nan self.no_sample_column = [ "PG.Q.value", "Global.PG.Q.value", diff --git a/alphastats/loader/FragPipeLoader.py b/alphastats/loader/FragPipeLoader.py index 18563c6c..22eb55a9 100644 --- a/alphastats/loader/FragPipeLoader.py +++ b/alphastats/loader/FragPipeLoader.py @@ -17,14 +17,24 @@ def __init__( gene_names_column:str="Gene Names", confidence_column:str="Protein Probability", sep:str="\t", + replace_zero_with_nan:bool = True, **kwargs ): + """Loads FragPipe/Philosopher output: combined_protein.tsv + Args: + file (str, pd.DataFrame): FragPipe output, combined_protein.tsv. + intensity_column (str, optional): columns where the intensity of the proteins are given. Defaults to "[sample] MaxLFQ Intensity ". + index_column (str, optional): column indicating the protein groups. Defaults to "Protein". + replace_zero_with_nan (bool, optional): whether zero values should be replaced with NaN when loading the data. Defaults to True. + sep (str, optional): file separation. Defaults to "\t". 
+ """ super().__init__(file, intensity_column, index_column, sep) if gene_names_column in self.rawinput.columns.to_list(): self.gene_names = gene_names_column + self.replace_zero_with_nan = replace_zero_with_nan self.confidence_column = confidence_column self.software = "MSFragger_Philosopher" diff --git a/alphastats/loader/GenericLoader.py b/alphastats/loader/GenericLoader.py index 90b20b40..11cfdb4f 100644 --- a/alphastats/loader/GenericLoader.py +++ b/alphastats/loader/GenericLoader.py @@ -5,13 +5,14 @@ from typing import Union class GenericLoader(BaseLoader): - def __init__(self, file:Union[str, pd.DataFrame], intensity_column:list, index_column:str, sep:str=None): + def __init__(self, file:Union[str, pd.DataFrame], intensity_column:list, index_column:str, sep:str=None, replace_zero_with_nan:bool=True): """Generic Loader for you proteomics data Args: file (Union[str, pd.DataFrame]): path to your proteomics file or pandas.DataFrame intensity_column (list): list of samples with intensity - index_column (str): column with Protein IDs or Gene names, used for indexing + index_column (str): column with Protein IDs or Gene names, used for indexing. + replace_zero_with_nan (bool, optional): whether zero values should be replaced with NaN when loading the data. Defaults to True. 
sep (str): file separation """ @@ -21,6 +22,7 @@ def __init__(self, file:Union[str, pd.DataFrame], intensity_column:list, index_c self.rawinput = pd.read_csv(file, sep=sep, low_memory=False) self.intensity_column = intensity_column self.intensity_column_list = intensity_column + self.replace_zero_with_nan = replace_zero_with_nan self.index_column = index_column self.filter_columns = [] self.confidence_column = None diff --git a/alphastats/loader/MaxQuantLoader.py b/alphastats/loader/MaxQuantLoader.py index 50b48ce0..1d239be2 100644 --- a/alphastats/loader/MaxQuantLoader.py +++ b/alphastats/loader/MaxQuantLoader.py @@ -16,6 +16,7 @@ def __init__( filter_columns:list=["Only identified by site", "Reverse", "Potential contaminant"], confidence_column:str="Q-value", evidence_file=None, + replace_zero_with_nan:bool=True, sep:str="\t", **kwargs ): @@ -27,12 +28,14 @@ def __init__( index_column (str, optional): column with Protein IDs . Defaults to "Protein IDs". filter_columns (list, optional): columns that should be used for filtering. Defaults to ["Only identified by site", "Reverse", "Potential contaminant"]. confidence_column (str, optional): column with the Q-value given. Defaults to "Q-value". + replace_zero_with_nan (bool, optional): whether zero values should be replaced with NaN when loading the data. Defaults to True. sep (str, optional): separation of the input file. Defaults to "\t". 
""" super().__init__(file, intensity_column, index_column, sep) self.filter_columns = filter_columns + self.filter_columns self.confidence_column = confidence_column + self.replace_zero_with_nan = replace_zero_with_nan self.software = "MaxQuant" self._set_filter_columns_to_true_false() self._read_all_columns_as_string() diff --git a/alphastats/loader/SpectronautLoader.py b/alphastats/loader/SpectronautLoader.py index c80fe131..25f04007 100644 --- a/alphastats/loader/SpectronautLoader.py +++ b/alphastats/loader/SpectronautLoader.py @@ -2,6 +2,7 @@ import pandas as pd import numpy as np import logging +from typing import Union class SpectronautLoader(BaseLoader): @@ -10,31 +11,34 @@ class SpectronautLoader(BaseLoader): def __init__( self, - file, - intensity_column="PG.Quantity", - index_column="PG.ProteinGroups", - sample_column="R.FileName", - gene_names_column="PG.Genes", - filter_qvalue=True, - qvalue_cutoff=0.01, + file:Union[str, pd.DataFrame], + intensity_column:str="PG.Quantity", + index_column:str="PG.ProteinGroups", + sample_column:str="R.FileName", + gene_names_column:str="PG.Genes", + filter_qvalue:bool=True, + qvalue_cutoff:float=0.01, + replace_zero_with_nan:bool=True, sep="\t", ): """Loads Spectronaut output. Will add contamination column for further analysis. Args: - file (str): path to Spectronaut outputfile or pandas.DataFrame + file (str, pd.DataFrame): path to Spectronaut outputfile or pandas.DataFrame intensity_column (str, optional): columns where the intensity of the proteins are given. Defaults to "PG.Quantity". index_column (str, optional): column indicating the protein groups. Defaults to "PG.ProteinGroups". sample_column (str, optional): column that contains sample names used for downstream analysis. Defaults to "R.FileName". gene_names_column (str, optional): column with gene names. Defaults to "PG.Genes". filter_qvalue (bool, optional): will filter out the intensities that have greater than qvalue_cutoff in EG.Qvalue column. 
Those intensities will be replaced with zero and will be considered as censored missing values for imputation purpose.. Defaults to True. qvalue_cutoff (float, optional): cut off vaƩie. Defaults to 0.01. + replace_zero_with_nan (bool, optional): whether zero values should be replaced with NaN when loading the data. Defaults to True. sep (str, optional): file separation of file. Defaults to "\t". """ self.software = "Spectronaut" self.intensity_column = intensity_column self.index_column = index_column + self.replace_zero_with_nan = replace_zero_with_nan self.confidence_column = None self.filter_columns = [] self.evidence_df = None diff --git a/alphastats/loader/mzTabLoader.py b/alphastats/loader/mzTabLoader.py index 4f573569..27327c2f 100644 --- a/alphastats/loader/mzTabLoader.py +++ b/alphastats/loader/mzTabLoader.py @@ -1,19 +1,22 @@ from pyteomics import mztab from alphastats.loader.BaseLoader import BaseLoader + class mzTabLoader(BaseLoader): - def __init__(self, file, intensity_column: str="protein_abundance_[sample]", index_column:str="accession"): + def __init__(self, file, intensity_column: str="protein_abundance_[sample]", index_column:str="accession",replace_zero_with_nan:bool=True): """Load mzTab file. Will add contamination column for further analysis. Args: file (str): path to mzTab file. intensity_column (str, optional): columns where the intensity of the proteins are given.. Defaults to "protein_abundance_[sample]". index_column (str, optional): column indicating the protein groups. Defaults to "accession". + replace_zero_with_nan (bool, optional): whether zero values should be replaced with NaN when loading the data. Defaults to True. """ self.filter_columns = [] self.gene_names = None self.intensity_column = intensity_column self.index_column = index_column + self.replace_zero_with_nan = replace_zero_with_nan self.confidence_column = None self.evidence_df = None self.gene_names = None