From 993d25bbcfc0a9880bcedc583eebd9f62862fc67 Mon Sep 17 00:00:00 2001
From: elena-krismer <elena.krismer@hotmail.com>
Date: Thu, 31 Aug 2023 08:29:04 +0200
Subject: [PATCH] #231 replace zero with nan parameter

---
 alphastats/DataSet.py                  |  6 ++++++
 alphastats/loader/AlphaPeptLoader.py   | 13 ++++++++-----
 alphastats/loader/BaseLoader.py        |  3 ++-
 alphastats/loader/DIANNLoader.py       |  3 +++
 alphastats/loader/FragPipeLoader.py    | 10 ++++++++++
 alphastats/loader/GenericLoader.py     |  6 ++++--
 alphastats/loader/MaxQuantLoader.py    |  3 +++
 alphastats/loader/SpectronautLoader.py | 20 ++++++++++++--------
 alphastats/loader/mzTabLoader.py       |  5 ++++-
 9 files changed, 52 insertions(+), 17 deletions(-)

diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py
index 73dd7885..60359c41 100644
--- a/alphastats/DataSet.py
+++ b/alphastats/DataSet.py
@@ -67,6 +67,7 @@ def __init__(self, loader, metadata_path=None, sample_column=None):
         self.filter_columns = loader.filter_columns
         self.evidence_df = loader.evidence_df
         self.gene_names = loader.gene_names
+        self.replace_zero_with_nan = loader.replace_zero_with_nan
 
         # include filtering before
         self.create_matrix()
@@ -169,6 +170,10 @@ def create_matrix(self):
         # transpose dataframe
         mat = df.transpose()
         mat.replace([np.inf, -np.inf], np.nan, inplace=True)
+        
+        if self.replace_zero_with_nan:
+             mat.replace([0], np.nan, inplace=True)
+        
         # remove proteins with only zero
         self.mat = mat.loc[:, (mat != 0).any(axis=0)]
         self.mat = self.mat.astype(float)
@@ -222,6 +227,7 @@ def _save_dataset_info(self):
             "Contamination columns": self.filter_columns,
             "Number of removed ProteinGroups due to contaminaton": 0,
             "Data completeness cut-off": 0,
+            "Zeros replaced with NaN": self.replace_zero_with_nan,
         }
         return preprocessing_dict
 
diff --git a/alphastats/loader/AlphaPeptLoader.py b/alphastats/loader/AlphaPeptLoader.py
index a3112729..b2fe4ff3 100644
--- a/alphastats/loader/AlphaPeptLoader.py
+++ b/alphastats/loader/AlphaPeptLoader.py
@@ -2,7 +2,7 @@
 import pandas as pd
 import numpy as np
 import logging
-
+from typing import Union
 
 class AlphaPeptLoader(BaseLoader):
     """Loader for AlphaPept outputfiles
@@ -11,10 +11,11 @@ class AlphaPeptLoader(BaseLoader):
 
     def __init__(
         self,
-        file,
-        intensity_column="[sample]_LFQ",
-        index_column="Unnamed: 0",  # column name to be changed
-        sep=",",
+        file:Union[str, pd.DataFrame],
+        intensity_column:str="[sample]_LFQ",
+        index_column:str="Unnamed: 0",  # column name to be changed
+        sep:str=",",
+        replace_zero_with_nan:bool=True,
         **kwargs
     ):
         """Loads Alphapept output: results_proteins.csv. Will add contamination column for further analysis.
@@ -23,6 +24,7 @@ def __init__(
             file (str): AlphaPept output, either results_proteins.csv file or the hdf_file with the protein_table given
             intensity_column (str, optional): columns where the intensity of the proteins are given. Defaults to "[sample]_LFQ".
             index_column (str, optional): column indicating the protein groups. Defaults to "Unnamed: 0".
+            replace_zero_with_nan (bool, optional): whether zero intensities are replaced with NaN when the DataSet matrix is created from this loader. Defaults to True.
             sep (str, optional): file separation of file. Defaults to ",".
         """
 
@@ -33,6 +35,7 @@ def __init__(
 
         self.intensity_column = intensity_column
         self.index_column = index_column
+        self.replace_zero_with_nan = replace_zero_with_nan
         self.filter_columns = []
         self.confidence_column = None
         self.software = "AlphaPept"
diff --git a/alphastats/loader/BaseLoader.py b/alphastats/loader/BaseLoader.py
index d80a53e5..88c3d06c 100644
--- a/alphastats/loader/BaseLoader.py
+++ b/alphastats/loader/BaseLoader.py
@@ -9,7 +9,7 @@
 class BaseLoader:
     """Parent class of Loaders"""
 
-    def __init__(self, file:Union[str, pd.DataFrame], intensity_column:Union[str, list], index_column:str, sep:str):
+    def __init__(self, file:Union[str, pd.DataFrame], intensity_column:Union[str, list], index_column:str, sep:str, replace_zero_with_nan=True):
         """BaseLoader for AlphaPept, MaxQuant, Fragpipe, Spectronau and DIANNLoader
 
         Args:
@@ -30,6 +30,7 @@ def __init__(self, file:Union[str, pd.DataFrame], intensity_column:Union[str, li
         self.evidence_df = None
         self.gene_names = None
         self.ptm_df = None
+        self.replace_zero_with_nan = replace_zero_with_nan
         self._add_contamination_column()
         self._check_if_columns_are_present()
         self._read_all_columns_as_string()
diff --git a/alphastats/loader/DIANNLoader.py b/alphastats/loader/DIANNLoader.py
index 3c78e31d..a23109f7 100644
--- a/alphastats/loader/DIANNLoader.py
+++ b/alphastats/loader/DIANNLoader.py
@@ -13,6 +13,7 @@ def __init__(
         intensity_column="[sample]",
         index_column="Protein.Group",
         sep="\t",
+        replace_zero_with_nan = True,
         **kwargs
     ):
         """Import DIA-NN output data report.pg_matrix.tsv
@@ -21,11 +22,13 @@ def __init__(
             file (str): DIA-NN output file report.pg_matrix.tsv
             intensity_column (str, optional): columns containing the intensity column for each experiment. Defaults to "[experiment]".
             index_column (str, optional): column with the Protein IDs. Defaults to "Protein.Group".
+            replace_zero_with_nan (bool, optional): whether zero intensities are replaced with NaN when the DataSet matrix is created from this loader. Defaults to True.
             sep (str, optional): file separation of the input file. Defaults to "\t".
         """
 
         super().__init__(file, intensity_column, index_column, sep)
         self.software = "DIANN"
+        self.replace_zero_with_nan = replace_zero_with_nan
         self.no_sample_column = [
             "PG.Q.value",
             "Global.PG.Q.value",
diff --git a/alphastats/loader/FragPipeLoader.py b/alphastats/loader/FragPipeLoader.py
index 18563c6c..22eb55a9 100644
--- a/alphastats/loader/FragPipeLoader.py
+++ b/alphastats/loader/FragPipeLoader.py
@@ -17,14 +17,24 @@ def __init__(
         gene_names_column:str="Gene Names",
         confidence_column:str="Protein Probability",
         sep:str="\t",
+        replace_zero_with_nan:bool = True,
         **kwargs
     ):
+        """Loads FragPipe/Philosopher output: combined_protein.tsv
+        Args:
+            file (str, pd.DataFrame): FragPipe output, combined_protein.tsv.
+            intensity_column (str, optional): columns where the intensity of the proteins are given. Defaults to "[sample] MaxLFQ Intensity ".
+            index_column (str, optional): column indicating the protein groups. Defaults to "Protein".
+            replace_zero_with_nan (bool, optional): whether zero intensities are replaced with NaN when the DataSet matrix is created from this loader. Defaults to True.
+            sep (str, optional): file separation. Defaults to "\t".
+        """
 
         super().__init__(file, intensity_column, index_column, sep)
 
         if gene_names_column in self.rawinput.columns.to_list():
             self.gene_names = gene_names_column
 
+        self.replace_zero_with_nan = replace_zero_with_nan
         self.confidence_column = confidence_column
         self.software = "MSFragger_Philosopher"
 
diff --git a/alphastats/loader/GenericLoader.py b/alphastats/loader/GenericLoader.py
index 90b20b40..11cfdb4f 100644
--- a/alphastats/loader/GenericLoader.py
+++ b/alphastats/loader/GenericLoader.py
@@ -5,13 +5,14 @@
 from typing import Union
 
 class GenericLoader(BaseLoader):
-    def __init__(self, file:Union[str, pd.DataFrame], intensity_column:list, index_column:str, sep:str=None):
+    def __init__(self, file:Union[str, pd.DataFrame], intensity_column:list, index_column:str, sep:str=None, replace_zero_with_nan:bool=True):
         """Generic Loader for you proteomics data
 
         Args:
             file (Union[str, pd.DataFrame]): path to your proteomics file or pandas.DataFrame
             intensity_column (list): list of samples with intensity
-            index_column (str): column with Protein IDs or Gene names, used for indexing
+            index_column (str): column with Protein IDs or Gene names, used for indexing.
+            replace_zero_with_nan (bool, optional): whether zero intensities are replaced with NaN when the DataSet matrix is created from this loader. Defaults to True.
             sep (str): file separation
         """
 
@@ -21,6 +22,7 @@ def __init__(self, file:Union[str, pd.DataFrame], intensity_column:list, index_c
             self.rawinput = pd.read_csv(file, sep=sep, low_memory=False)
         self.intensity_column = intensity_column
         self.intensity_column_list = intensity_column
+        self.replace_zero_with_nan = replace_zero_with_nan
         self.index_column = index_column
         self.filter_columns = []
         self.confidence_column = None
diff --git a/alphastats/loader/MaxQuantLoader.py b/alphastats/loader/MaxQuantLoader.py
index 50b48ce0..1d239be2 100644
--- a/alphastats/loader/MaxQuantLoader.py
+++ b/alphastats/loader/MaxQuantLoader.py
@@ -16,6 +16,7 @@ def __init__(
         filter_columns:list=["Only identified by site", "Reverse", "Potential contaminant"],
         confidence_column:str="Q-value",
         evidence_file=None,
+        replace_zero_with_nan:bool=True,
         sep:str="\t",
         **kwargs
     ):
@@ -27,12 +28,14 @@ def __init__(
             index_column (str, optional): column with Protein IDs . Defaults to "Protein IDs".
             filter_columns (list, optional): columns that should be used for filtering. Defaults to ["Only identified by site", "Reverse", "Potential contaminant"].
             confidence_column (str, optional): column with the Q-value given. Defaults to "Q-value".
+            replace_zero_with_nan (bool, optional): whether zero intensities are replaced with NaN when the DataSet matrix is created from this loader. Defaults to True.
             sep (str, optional): separation of the input file. Defaults to "\t".
         """
 
         super().__init__(file, intensity_column, index_column, sep)
         self.filter_columns = filter_columns + self.filter_columns
         self.confidence_column = confidence_column
+        self.replace_zero_with_nan = replace_zero_with_nan
         self.software = "MaxQuant"
         self._set_filter_columns_to_true_false()
         self._read_all_columns_as_string()
diff --git a/alphastats/loader/SpectronautLoader.py b/alphastats/loader/SpectronautLoader.py
index c80fe131..25f04007 100644
--- a/alphastats/loader/SpectronautLoader.py
+++ b/alphastats/loader/SpectronautLoader.py
@@ -2,6 +2,7 @@
 import pandas as pd
 import numpy as np
 import logging
+from typing import Union
 
 
 class SpectronautLoader(BaseLoader):
@@ -10,31 +11,34 @@ class SpectronautLoader(BaseLoader):
 
     def __init__(
         self,
-        file,
-        intensity_column="PG.Quantity",
-        index_column="PG.ProteinGroups",
-        sample_column="R.FileName",
-        gene_names_column="PG.Genes",
-        filter_qvalue=True,
-        qvalue_cutoff=0.01,
+        file:Union[str, pd.DataFrame],
+        intensity_column:str="PG.Quantity",
+        index_column:str="PG.ProteinGroups",
+        sample_column:str="R.FileName",
+        gene_names_column:str="PG.Genes",
+        filter_qvalue:bool=True,
+        qvalue_cutoff:float=0.01,
+        replace_zero_with_nan:bool=True,
         sep="\t",
     ):
         """Loads Spectronaut output. Will add contamination column for further analysis.
 
         Args:
-            file (str): path to Spectronaut outputfile or pandas.DataFrame 
+            file (str, pd.DataFrame): path to Spectronaut output file or pandas.DataFrame
             intensity_column (str, optional): columns where the intensity of the proteins are given. Defaults to "PG.Quantity".
             index_column (str, optional): column indicating the protein groups. Defaults to "PG.ProteinGroups".
             sample_column (str, optional): column that contains sample names used for downstream analysis. Defaults to "R.FileName".
             gene_names_column (str, optional): column with gene names. Defaults to "PG.Genes".
             filter_qvalue (bool, optional): will filter out the intensities that have greater than qvalue_cutoff in EG.Qvalue column. Those intensities will be replaced with zero and will be considered as censored missing values for imputation purpose.. Defaults to True.
             qvalue_cutoff (float, optional): cut off vaéie. Defaults to 0.01.
+            replace_zero_with_nan (bool, optional): whether zero intensities are replaced with NaN when the DataSet matrix is created from this loader. Defaults to True.
             sep (str, optional): file separation of file. Defaults to "\t".
         """
 
         self.software = "Spectronaut"
         self.intensity_column = intensity_column
         self.index_column = index_column
+        self.replace_zero_with_nan = replace_zero_with_nan
         self.confidence_column = None
         self.filter_columns = []
         self.evidence_df = None
diff --git a/alphastats/loader/mzTabLoader.py b/alphastats/loader/mzTabLoader.py
index 4f573569..27327c2f 100644
--- a/alphastats/loader/mzTabLoader.py
+++ b/alphastats/loader/mzTabLoader.py
@@ -1,19 +1,22 @@
 from pyteomics import mztab
 from alphastats.loader.BaseLoader import BaseLoader
 
+
 class mzTabLoader(BaseLoader):
-    def __init__(self, file, intensity_column: str="protein_abundance_[sample]", index_column:str="accession"):
+    def __init__(self, file, intensity_column: str="protein_abundance_[sample]", index_column:str="accession",replace_zero_with_nan:bool=True):
         """Load mzTab file. Will add contamination column for further analysis.
 
         Args:
             file (str): path to mzTab file.
             intensity_column (str, optional):  columns where the intensity of the proteins are given.. Defaults to "protein_abundance_[sample]".
             index_column (str, optional): column indicating the protein groups.  Defaults to "accession".
+            replace_zero_with_nan (bool, optional): whether zero intensities are replaced with NaN when the DataSet matrix is created from this loader. Defaults to True.
         """
         self.filter_columns = []
         self.gene_names = None
         self.intensity_column = intensity_column
         self.index_column = index_column
+        self.replace_zero_with_nan = replace_zero_with_nan
         self.confidence_column = None
         self.evidence_df = None
         self.gene_names = None