Skip to content

Commit

Permalink
#231 replace zero with nan parameter
Browse files Browse the repository at this point in the history
  • Loading branch information
elena-krismer committed Aug 31, 2023
1 parent 89843be commit 993d25b
Show file tree
Hide file tree
Showing 9 changed files with 52 additions and 17 deletions.
6 changes: 6 additions & 0 deletions alphastats/DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def __init__(self, loader, metadata_path=None, sample_column=None):
self.filter_columns = loader.filter_columns
self.evidence_df = loader.evidence_df
self.gene_names = loader.gene_names
self.replace_zero_with_nan = loader.replace_zero_with_nan

# include filtering before
self.create_matrix()
Expand Down Expand Up @@ -169,6 +170,10 @@ def create_matrix(self):
# transpose dataframe
mat = df.transpose()
mat.replace([np.inf, -np.inf], np.nan, inplace=True)

if self.replace_zero_with_nan:
mat.replace([0], np.nan, inplace=True)

# remove proteins with only zero
self.mat = mat.loc[:, (mat != 0).any(axis=0)]
self.mat = self.mat.astype(float)
Expand Down Expand Up @@ -222,6 +227,7 @@ def _save_dataset_info(self):
"Contamination columns": self.filter_columns,
"Number of removed ProteinGroups due to contaminaton": 0,
"Data completeness cut-off": 0,
"Zeros replaced with NaN": self.replace_zero_with_nan,
}
return preprocessing_dict

Expand Down
13 changes: 8 additions & 5 deletions alphastats/loader/AlphaPeptLoader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pandas as pd
import numpy as np
import logging

from typing import Union

class AlphaPeptLoader(BaseLoader):
"""Loader for AlphaPept outputfiles
Expand All @@ -11,10 +11,11 @@ class AlphaPeptLoader(BaseLoader):

def __init__(
self,
file,
intensity_column="[sample]_LFQ",
index_column="Unnamed: 0", # column name to be changed
sep=",",
file:Union[str, pd.DataFrame],
intensity_column:str="[sample]_LFQ",
index_column:str="Unnamed: 0", # column name to be changed
sep:str=",",
replace_zero_with_nan:bool=True,
**kwargs
):
"""Loads Alphapept output: results_proteins.csv. Will add contamination column for further analysis.
Expand All @@ -23,6 +24,7 @@ def __init__(
file (str): AlphaPept output, either results_proteins.csv file or the hdf_file with the protein_table given
intensity_column (str, optional): columns where the intensity of the proteins are given. Defaults to "[sample]_LFQ".
index_column (str, optional): column indicating the protein groups. Defaults to "Unnamed: 0".
replace_zero_with_nan (bool, optional): whether zero values should be replaced with NaN when loading the data. Defaults to True.
sep (str, optional): file separation of file. Defaults to ",".
"""

Expand All @@ -33,6 +35,7 @@ def __init__(

self.intensity_column = intensity_column
self.index_column = index_column
self.replace_zero_with_nan = replace_zero_with_nan
self.filter_columns = []
self.confidence_column = None
self.software = "AlphaPept"
Expand Down
3 changes: 2 additions & 1 deletion alphastats/loader/BaseLoader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
class BaseLoader:
"""Parent class of Loaders"""

def __init__(self, file:Union[str, pd.DataFrame], intensity_column:Union[str, list], index_column:str, sep:str):
def __init__(self, file:Union[str, pd.DataFrame], intensity_column:Union[str, list], index_column:str, sep:str, replace_zero_with_nan=True):
"""BaseLoader for AlphaPept, MaxQuant, Fragpipe, Spectronaut and DIANNLoader
Args:
Expand All @@ -30,6 +30,7 @@ def __init__(self, file:Union[str, pd.DataFrame], intensity_column:Union[str, li
self.evidence_df = None
self.gene_names = None
self.ptm_df = None
self.replace_zero_with_nan = replace_zero_with_nan
self._add_contamination_column()
self._check_if_columns_are_present()
self._read_all_columns_as_string()
Expand Down
3 changes: 3 additions & 0 deletions alphastats/loader/DIANNLoader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def __init__(
intensity_column="[sample]",
index_column="Protein.Group",
sep="\t",
replace_zero_with_nan = True,
**kwargs
):
"""Import DIA-NN output data report.pg_matrix.tsv
Expand All @@ -21,11 +22,13 @@ def __init__(
file (str): DIA-NN output file report.pg_matrix.tsv
intensity_column (str, optional): columns containing the intensity column for each experiment. Defaults to "[sample]".
index_column (str, optional): column with the Protein IDs. Defaults to "Protein.Group".
replace_zero_with_nan (bool, optional): whether zero values should be replaced with NaN when loading the data. Defaults to True.
sep (str, optional): file separation of the input file. Defaults to "\t".
"""

super().__init__(file, intensity_column, index_column, sep)
self.software = "DIANN"
self.replace_zero_with_nan = replace_zero_with_nan
self.no_sample_column = [
"PG.Q.value",
"Global.PG.Q.value",
Expand Down
10 changes: 10 additions & 0 deletions alphastats/loader/FragPipeLoader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,24 @@ def __init__(
gene_names_column:str="Gene Names",
confidence_column:str="Protein Probability",
sep:str="\t",
replace_zero_with_nan:bool = True,
**kwargs
):
"""Loads FragPipe/Philosopher output: combined_protein.tsv
Args:
file (str, pd.DataFrame): FragPipe output, combined_protein.tsv.
intensity_column (str, optional): columns where the intensity of the proteins are given. Defaults to "[sample] MaxLFQ Intensity ".
index_column (str, optional): column indicating the protein groups. Defaults to "Protein".
replace_zero_with_nan (bool, optional): whether zero values should be replaced with NaN when loading the data. Defaults to True.
sep (str/optional): file separation. Defaults to "\t".
"""

super().__init__(file, intensity_column, index_column, sep)

if gene_names_column in self.rawinput.columns.to_list():
self.gene_names = gene_names_column

self.replace_zero_with_nan = replace_zero_with_nan
self.confidence_column = confidence_column
self.software = "MSFragger_Philosopher"

Expand Down
6 changes: 4 additions & 2 deletions alphastats/loader/GenericLoader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
from typing import Union

class GenericLoader(BaseLoader):
def __init__(self, file:Union[str, pd.DataFrame], intensity_column:list, index_column:str, sep:str=None):
def __init__(self, file:Union[str, pd.DataFrame], intensity_column:list, index_column:str, sep:str=None, replace_zero_with_nan:bool=True):
"""Generic Loader for your proteomics data
Args:
file (Union[str, pd.DataFrame]): path to your proteomics file or pandas.DataFrame
intensity_column (list): list of samples with intensity
index_column (str): column with Protein IDs or Gene names, used for indexing
index_column (str): column with Protein IDs or Gene names, used for indexing.
replace_zero_with_nan (bool, optional): whether zero values should be replaced with NaN when loading the data. Defaults to True.
sep (str): file separation
"""

Expand All @@ -21,6 +22,7 @@ def __init__(self, file:Union[str, pd.DataFrame], intensity_column:list, index_c
self.rawinput = pd.read_csv(file, sep=sep, low_memory=False)
self.intensity_column = intensity_column
self.intensity_column_list = intensity_column
self.replace_zero_with_nan = replace_zero_with_nan
self.index_column = index_column
self.filter_columns = []
self.confidence_column = None
Expand Down
3 changes: 3 additions & 0 deletions alphastats/loader/MaxQuantLoader.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def __init__(
filter_columns:list=["Only identified by site", "Reverse", "Potential contaminant"],
confidence_column:str="Q-value",
evidence_file=None,
replace_zero_with_nan:bool=True,
sep:str="\t",
**kwargs
):
Expand All @@ -27,12 +28,14 @@ def __init__(
index_column (str, optional): column with Protein IDs . Defaults to "Protein IDs".
filter_columns (list, optional): columns that should be used for filtering. Defaults to ["Only identified by site", "Reverse", "Potential contaminant"].
confidence_column (str, optional): column with the Q-value given. Defaults to "Q-value".
replace_zero_with_nan (bool, optional): whether zero values should be replaced with NaN when loading the data. Defaults to True.
sep (str, optional): separation of the input file. Defaults to "\t".
"""

super().__init__(file, intensity_column, index_column, sep)
self.filter_columns = filter_columns + self.filter_columns
self.confidence_column = confidence_column
self.replace_zero_with_nan = replace_zero_with_nan
self.software = "MaxQuant"
self._set_filter_columns_to_true_false()
self._read_all_columns_as_string()
Expand Down
20 changes: 12 additions & 8 deletions alphastats/loader/SpectronautLoader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
import numpy as np
import logging
from typing import Union


class SpectronautLoader(BaseLoader):
Expand All @@ -10,31 +11,34 @@ class SpectronautLoader(BaseLoader):

def __init__(
self,
file,
intensity_column="PG.Quantity",
index_column="PG.ProteinGroups",
sample_column="R.FileName",
gene_names_column="PG.Genes",
filter_qvalue=True,
qvalue_cutoff=0.01,
file:Union[str, pd.DataFrame],
intensity_column:str="PG.Quantity",
index_column:str="PG.ProteinGroups",
sample_column:str="R.FileName",
gene_names_column:str="PG.Genes",
filter_qvalue:bool=True,
qvalue_cutoff:float=0.01,
replace_zero_with_nan:bool=True,
sep="\t",
):
"""Loads Spectronaut output. Will add contamination column for further analysis.
Args:
file (str): path to Spectronaut outputfile or pandas.DataFrame
file (str, pd.DataFrame): path to Spectronaut outputfile or pandas.DataFrame
intensity_column (str, optional): columns where the intensity of the proteins are given. Defaults to "PG.Quantity".
index_column (str, optional): column indicating the protein groups. Defaults to "PG.ProteinGroups".
sample_column (str, optional): column that contains sample names used for downstream analysis. Defaults to "R.FileName".
gene_names_column (str, optional): column with gene names. Defaults to "PG.Genes".
filter_qvalue (bool, optional): will filter out the intensities that have a value greater than qvalue_cutoff in the EG.Qvalue column. Those intensities will be replaced with zero and will be considered as censored missing values for imputation purposes. Defaults to True.
qvalue_cutoff (float, optional): cut-off value. Defaults to 0.01.
replace_zero_with_nan (bool, optional): whether zero values should be replaced with NaN when loading the data. Defaults to True.
sep (str, optional): file separation of file. Defaults to "\t".
"""

self.software = "Spectronaut"
self.intensity_column = intensity_column
self.index_column = index_column
self.replace_zero_with_nan = replace_zero_with_nan
self.confidence_column = None
self.filter_columns = []
self.evidence_df = None
Expand Down
5 changes: 4 additions & 1 deletion alphastats/loader/mzTabLoader.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,22 @@
from pyteomics import mztab
from alphastats.loader.BaseLoader import BaseLoader


class mzTabLoader(BaseLoader):
def __init__(self, file, intensity_column: str="protein_abundance_[sample]", index_column:str="accession"):
def __init__(self, file, intensity_column: str="protein_abundance_[sample]", index_column:str="accession",replace_zero_with_nan:bool=True):
"""Load mzTab file. Will add contamination column for further analysis.
Args:
file (str): path to mzTab file.
intensity_column (str, optional): columns where the intensity of the proteins are given. Defaults to "protein_abundance_[sample]".
index_column (str, optional): column indicating the protein groups. Defaults to "accession".
replace_zero_with_nan (bool, optional): whether zero values should be replaced with NaN when loading the data. Defaults to True.
"""
self.filter_columns = []
self.gene_names = None
self.intensity_column = intensity_column
self.index_column = index_column
self.replace_zero_with_nan = replace_zero_with_nan
self.confidence_column = None
self.evidence_df = None
self.gene_names = None
Expand Down

0 comments on commit 993d25b

Please sign in to comment.