ENH: Added action normalize that can normalize a `FeatureTable[Freq…

…uency]` with common methods. (#312) * added normalized * typemap properties and plugin setup * lint and add rnanorm to metayaml
qiime2 · Dec 12, 2024 · cb009e7 · cb009e7
1 parent 443490c
commit cb009e7
Show file tree

Hide file tree

Showing 6 changed files with 297 additions and 6 deletions.
diff --git a/ci/recipe/meta.yaml b/ci/recipe/meta.yaml
@@ -31,6 +31,7 @@ requirements:
     - qiime2 {{ qiime2_epoch }}.*
     - q2templates {{ qiime2_epoch }}.*
     - q2-types {{ qiime2_epoch }}.*
+    - rnanorm
 
 test:
   requires:

diff --git a/q2_feature_table/__init__.py b/q2_feature_table/__init__.py
@@ -6,7 +6,7 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
-from ._normalize import rarefy
+from ._normalize import rarefy, normalize
 from ._subsample_ids import subsample_ids
 from ._transform import (presence_absence, relative_frequency, transpose)
 from ._summarize import (summarize, tabulate_seqs, tabulate_sample_frequencies,
@@ -31,4 +31,4 @@
            'filter_seqs', 'subsample_ids', 'rename_ids',
            'filter_features_conditionally', 'split',
            'tabulate_feature_frequencies', 'tabulate_sample_frequencies',
-           'summarize_plus']
+           'summarize_plus', 'normalize']
diff --git a/q2_feature_table/_normalize.py b/q2_feature_table/_normalize.py
@@ -8,6 +8,12 @@
 
 import biom
 
+import os
+
+import pandas as pd
+from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat
+from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ
+
 
 def rarefy(table: biom.Table, sampling_depth: int,
            with_replacement: bool = False) -> biom.Table:
@@ -23,3 +29,95 @@ def rarefy(table: biom.Table, sampling_depth: int,
                          'shallow enough sampling depth.')
 
     return table
+
+
+def normalize(
+    table: pd.DataFrame,
+    method: str,
+    m_trim: float = None,
+    a_trim: float = None,
+    gene_length: SequenceCharacteristicsDirectoryFormat = None,
+) -> pd.DataFrame:
+    # Validate parameter combinations and set trim parameters
+    m_trim, a_trim = _validate_parameters(
+        method, m_trim, a_trim, gene_length)
+
+    # Process gene_lengths input and define methods that need gene_lengths
+    # input
+    if method in ["tpm", "fpkm"]:
+        lengths = _convert_lengths(table, gene_length)
+
+        methods = {
+            "tpm": TPM(gene_lengths=lengths),
+            "fpkm": FPKM(gene_lengths=lengths),
+        }
+
+    # Define remaining methods that don't need gene_lengths input
+    else:
+        methods = {
+            "tmm": TMM(m_trim=m_trim, a_trim=a_trim),
+            "ctf": CTF(m_trim=m_trim, a_trim=a_trim),
+            "uq": UQ(),
+            "cuf": CUF(),
+            "cpm": CPM(),
+        }
+
+    # Run normalization method on frequency table
+    normalized = methods[method].set_output(
+        transform="pandas").fit_transform(table)
+    normalized.index.name = "sample_id"
+
+    return normalized
+
+
+def _validate_parameters(method, m_trim, a_trim, gene_length):
+    # Raise Error if gene-length is missing when using methods TPM or FPKM
+    if method in ["tpm", "fpkm"] and not gene_length:
+        raise ValueError("gene-length input is missing.")
+
+    # Raise Error if gene-length is given when using methods TMM, UQ, CUF,
+    # CPM or CTF
+    if method in ["tmm", "uq", "cuf", "ctf", "cpm"] and gene_length:
+        raise ValueError(
+            "gene-length input can only be used with FPKM and TPM methods."
+        )
+
+    # Raise Error if m_trim or a_trim are given when not using methods
+    # TMM or CTF
+    if ((method not in ["tmm", "ctf"])
+            and (m_trim is not None or a_trim is not None)):
+        raise ValueError(
+            "Parameters m-trim and a-trim can only be used with "
+            "methods TMM and CTF."
+        )
+
+    # Set m_trim and a_trim to their default values for methods
+    # TMM and CTF
+    if method in ["tmm", "ctf"]:
+        m_trim = 0.3 if m_trim is None else m_trim
+        a_trim = 0.05 if a_trim is None else a_trim
+
+    return m_trim, a_trim
+
+
+def _convert_lengths(table, gene_length):
+    # Read in table from sequence_characteristics.tsv as a pd.Series
+    lengths = pd.read_csv(
+        os.path.join(gene_length.path, "sequence_characteristics.tsv"),
+        sep="\t",
+        header=None,
+        names=["index", "values"],
+        index_col="index",
+        skiprows=1,
+    ).squeeze("columns")
+
+    # Check if all gene IDs that are present in the table are also
+    # present in the lengths
+    if not set(table.columns).issubset(set(lengths.index)):
+        only_in_counts = set(table.columns) - set(lengths.index)
+        raise ValueError(
+            f"There are genes present in the FeatureTable that are "
+            f"not present in the gene-length input. Missing lengths "
+            f"for genes: {only_in_counts}"
+        )
+    return lengths
diff --git a/q2_feature_table/citations.bib b/q2_feature_table/citations.bib
@@ -43,3 +43,12 @@ @article{NCBI-BLAST
 year = {2008},
 doi = {10.1093/nar/gkn201},
 }
+
+@misc{Zmrzlikar_RNAnorm_RNA-seq_data_2023,
+author = {Zmrzlikar, Jure and Žganec, Matjaž and Ausec, Luka and Štajdohar, Miha},
+month = jul,
+title = {{RNAnorm: RNA-seq data normalization in Python}},
+url = {https://github.com/genialis/RNAnorm},
+version = {2.1.0},
+year = {2023}
+}
diff --git a/q2_feature_table/plugin_setup.py b/q2_feature_table/plugin_setup.py
@@ -5,16 +5,17 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
-
+from qiime2.core.type import Properties
 from qiime2.plugin import (Plugin, Int, Float, Range, Metadata, Str, Bool,
                            Choices, MetadataColumn, Categorical, List,
                            Citations, TypeMatch, TypeMap, Collection,
                            Visualization)
 
 from q2_types.feature_table import (
-    FeatureTable, Frequency, RelativeFrequency, PresenceAbsence, Composition)
+    FeatureTable, Frequency, RelativeFrequency, PresenceAbsence, Composition,
+    Normalized)
 from q2_types.feature_data import (
-    FeatureData, Sequence, Taxonomy, AlignedSequence)
+    FeatureData, Sequence, Taxonomy, AlignedSequence, SequenceCharacteristics)
 from q2_types.metadata import ImmutableMetadata
 
 import q2_feature_table
@@ -723,3 +724,77 @@
                 "Tabulate sample and feature frequencies.",
     examples={'feature_table_summarize_plus': ex.feature_table_summarize_plus}
 )
+
+P_method, T_normalized_table = TypeMap(
+    {
+        Str
+        % Choices("tpm"): (
+            FeatureTable[Normalized % Properties("TPM")],
+        ),
+        Str
+        % Choices("fpkm"): (
+            FeatureTable[Normalized % Properties("FPKM")],
+        ),
+        Str
+        % Choices("tmm"): (
+            FeatureTable[Normalized % Properties("TMM")],
+        ),
+        Str
+        % Choices("uq"): (
+            FeatureTable[Normalized % Properties("UQ")],
+        ),
+        Str
+        % Choices("cuf"): (
+            FeatureTable[Normalized % Properties("CUF")],
+        ),
+        Str
+        % Choices("ctf"): (
+            FeatureTable[Normalized % Properties("CTF")],
+        ),
+        Str
+        % Choices("cpm"): (
+            FeatureTable[Normalized % Properties("CPM")],
+        ),
+    }
+)
+
+plugin.methods.register_function(
+    function=q2_feature_table.normalize,
+    inputs={
+        "table": FeatureTable[Frequency],
+        "gene_length": FeatureData[
+            SequenceCharacteristics % Properties("length")],
+    },
+    parameters={
+        "method": P_method,
+        "m_trim": Float % Range(
+            0, 1, inclusive_start=True, inclusive_end=True
+        ),
+        "a_trim": Float % Range(
+            0, 1, inclusive_start=True, inclusive_end=True
+        ),
+    },
+    outputs=[("normalized_table", T_normalized_table)],
+    input_descriptions={
+        "table": "Feature table with gene counts.",
+        "gene_length": "Gene lengths of all genes in the feature table.",
+    },
+    parameter_descriptions={
+        "method": "Specify the normalization method to be used. Use FPKM or "
+        "TPM for within comparisons and TMM, UQ, CUF or CTF for between "
+        "sample camparisons. Check https://www.genialis.com/wp-content/uploads"
+        "/2023/12/2023-Normalizing-RNA-seq-data-in-Python-with-RNAnorm.pdf "
+        "for more information on the methods.",
+        "m_trim": "Two sided cutoff for M-values. Can only be used for "
+        "methods TMM and CTF. (default = 0.3)",
+        "a_trim": "Two sided cutoff for A-values. Can only be used for "
+        "methods TMM and CTF. (default = 0.05)",
+    },
+    output_descriptions={
+        "normalized_table": "Feature table normalized with specified method."
+    },
+    name="Normalize FeatureTable",
+    description="Normalize FeatureTable by gene length, library size and "
+    "composition with common methods for RNA-seq.",
+    citations=[citations["Zmrzlikar_RNAnorm_RNA-seq_data_2023"]],
+)
diff --git a/q2_feature_table/tests/test_normalize.py b/q2_feature_table/tests/test_normalize.py
@@ -5,14 +5,20 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
-
+import os
 from unittest import TestCase, main
+from unittest.mock import MagicMock, patch
 
 import numpy as np
 import numpy.testing as npt
+import pandas as pd
 from biom.table import Table
+from pandas._testing import assert_series_equal
+from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat
 
 from q2_feature_table import rarefy
+from q2_feature_table._normalize import (_validate_parameters,
+                                         _convert_lengths, normalize)
 
 
 class RarefyTests(TestCase):
@@ -53,5 +59,107 @@ def test_rarefy_depth_error(self):
             rarefy(t, 50)
 
 
+class NormalizeTests(TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.lengths = pd.Series(
+            {
+                "ARO1": 1356.0,
+                "ARO2": 1173.0,
+            },
+            name="values",
+        )
+        cls.lengths.index.name = "index"
+        cls.table = pd.DataFrame({
+            'ID': ['sample1', 'sample2'],
+            'ARO1': [2.0, 2.0],
+            'ARO2': [0.0, 0.0]
+        }).set_index('ID')
+
+    def test_validate_parameters_uq_with_m_a_trim(self):
+        # Test Error raised if gene-length is given with UQ method
+        with self.assertRaisesRegex(
+                ValueError,
+                "Parameters m-trim and a-trim can only "
+                "be used with methods TMM and CTF.",
+        ):
+            _validate_parameters("uq", 0.2, 0.05, None)
+
+    def test_validate_parameters_tpm_missing_gene_length(self):
+        # Test Error raised if gene-length is missing with TPM method
+        with self.assertRaisesRegex(
+                ValueError, "gene-length input is missing."):
+            _validate_parameters("tpm", None, None, None)
+
+    def test_validate_parameters_tmm_gene_length(self):
+        # Test Error raised if gene-length is given with TMM method
+        with self.assertRaisesRegex(
+                ValueError,
+                "gene-length input can only be used with FPKM and "
+                "TPM methods."
+        ):
+            _validate_parameters(
+                "tmm", None, None, gene_length=MagicMock())
+
+    def test_validate_parameters_default_m_a_trim(self):
+        # Test if m_trim and a_trim get set to default values if None
+        m_trim, a_trim = _validate_parameters("tmm", None, None, None)
+        self.assertEqual(m_trim, 0.3)
+        self.assertEqual(a_trim, 0.05)
+
+    def test_validate_parameters_m_a_trim(self):
+        # Test if m_trim and a_trim are not modified if not None
+        m_trim, a_trim = _validate_parameters("tmm", 0.1, 0.06, None)
+        self.assertEqual(m_trim, 0.1)
+        self.assertEqual(a_trim, 0.06)
+
+    def test_convert_lengths_gene_length(self):
+        # Test _convert_lengths
+        gene_length = SequenceCharacteristicsDirectoryFormat()
+        with open(os.path.join(
+                str(gene_length), "sequence_characteristics.tsv"),
+                  'w') as file:
+            file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0")
+
+        obs = _convert_lengths(self.table, gene_length=gene_length)
+        assert_series_equal(obs, self.lengths)
+
+    def test_convert_lengths_short_gene_length(self):
+        # Test Error raised if gene-length is missing genes
+        gene_length = SequenceCharacteristicsDirectoryFormat()
+        with open(os.path.join(
+                str(gene_length),
+                "sequence_characteristics.tsv"), 'w') as file:
+            file.write("id\tlength\nARO1\t1356.0")
+        with self.assertRaisesRegex(
+                ValueError,
+                "There are genes present in the FeatureTable that are "
+                "not present in the gene-length input. Missing lengths "
+                "for genes: {'ARO2'}",
+        ):
+            _convert_lengths(self.table, gene_length=gene_length)
+
+    @patch("q2_feature_table._normalize.TPM")
+    def test_tpm_fpkm_with_valid_inputs(self, mock_tpm):
+        # Test valid inputs for TPM method
+        gene_length = SequenceCharacteristicsDirectoryFormat()
+        with open(os.path.join(
+                str(gene_length), "sequence_characteristics.tsv"),
+                  'w') as file:
+            file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0")
+        normalize(table=self.table, gene_length=gene_length, method="tpm")
+
+    @patch("q2_feature_table._normalize.TMM")
+    def test_tmm_uq_cuf_ctf_with_valid_inputs(self, mock_tmm):
+        # Test valid inputs for TMM method
+        gene_length = SequenceCharacteristicsDirectoryFormat()
+        with open(os.path.join(
+                str(gene_length), "sequence_characteristics.tsv"),
+                  'w') as file:
+            file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0")
+        normalize(table=self.table, method="tmm", a_trim=0.06, m_trim=0.4)
+
+
 if __name__ == "__main__":
     main()