Skip to content

Commit

Permalink
ENH: Added action normalize that can normalize a `FeatureTable[Freq…
Browse files Browse the repository at this point in the history
…uency]` with common methods. (#312)

* added normalized

* typemap properties and plugin setup

* lint and add rnanorm to metayaml
  • Loading branch information
VinzentRisch authored Dec 12, 2024
1 parent 443490c commit cb009e7
Show file tree
Hide file tree
Showing 6 changed files with 297 additions and 6 deletions.
1 change: 1 addition & 0 deletions ci/recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ requirements:
- qiime2 {{ qiime2_epoch }}.*
- q2templates {{ qiime2_epoch }}.*
- q2-types {{ qiime2_epoch }}.*
- rnanorm

test:
requires:
Expand Down
4 changes: 2 additions & 2 deletions q2_feature_table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from ._normalize import rarefy
from ._normalize import rarefy, normalize
from ._subsample_ids import subsample_ids
from ._transform import (presence_absence, relative_frequency, transpose)
from ._summarize import (summarize, tabulate_seqs, tabulate_sample_frequencies,
Expand All @@ -31,4 +31,4 @@
'filter_seqs', 'subsample_ids', 'rename_ids',
'filter_features_conditionally', 'split',
'tabulate_feature_frequencies', 'tabulate_sample_frequencies',
'summarize_plus']
'summarize_plus', 'normalize']
98 changes: 98 additions & 0 deletions q2_feature_table/_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@

import biom

import os

import pandas as pd
from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat
from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ


def rarefy(table: biom.Table, sampling_depth: int,
with_replacement: bool = False) -> biom.Table:
Expand All @@ -23,3 +29,95 @@ def rarefy(table: biom.Table, sampling_depth: int,
'shallow enough sampling depth.')

return table


def normalize(
table: pd.DataFrame,
method: str,
m_trim: float = None,
a_trim: float = None,
gene_length: SequenceCharacteristicsDirectoryFormat = None,
) -> pd.DataFrame:
# Validate parameter combinations and set trim parameters
m_trim, a_trim = _validate_parameters(
method, m_trim, a_trim, gene_length)

# Process gene_lengths input and define methods that need gene_lengths
# input
if method in ["tpm", "fpkm"]:
lengths = _convert_lengths(table, gene_length)

methods = {
"tpm": TPM(gene_lengths=lengths),
"fpkm": FPKM(gene_lengths=lengths),
}

# Define remaining methods that don't need gene_lengths input
else:
methods = {
"tmm": TMM(m_trim=m_trim, a_trim=a_trim),
"ctf": CTF(m_trim=m_trim, a_trim=a_trim),
"uq": UQ(),
"cuf": CUF(),
"cpm": CPM(),
}

# Run normalization method on frequency table
normalized = methods[method].set_output(
transform="pandas").fit_transform(table)
normalized.index.name = "sample_id"

return normalized


def _validate_parameters(method, m_trim, a_trim, gene_length):
# Raise Error if gene-length is missing when using methods TPM or FPKM
if method in ["tpm", "fpkm"] and not gene_length:
raise ValueError("gene-length input is missing.")

# Raise Error if gene-length is given when using methods TMM, UQ, CUF,
# CPM or CTF
if method in ["tmm", "uq", "cuf", "ctf", "cpm"] and gene_length:
raise ValueError(
"gene-length input can only be used with FPKM and TPM methods."
)

# Raise Error if m_trim or a_trim are given when not using methods
# TMM or CTF
if ((method not in ["tmm", "ctf"])
and (m_trim is not None or a_trim is not None)):
raise ValueError(
"Parameters m-trim and a-trim can only be used with "
"methods TMM and CTF."
)

# Set m_trim and a_trim to their default values for methods
# TMM and CTF
if method in ["tmm", "ctf"]:
m_trim = 0.3 if m_trim is None else m_trim
a_trim = 0.05 if a_trim is None else a_trim

return m_trim, a_trim


def _convert_lengths(table, gene_length):
# Read in table from sequence_characteristics.tsv as a pd.Series
lengths = pd.read_csv(
os.path.join(gene_length.path, "sequence_characteristics.tsv"),
sep="\t",
header=None,
names=["index", "values"],
index_col="index",
skiprows=1,
).squeeze("columns")

# Check if all gene IDs that are present in the table are also
# present in the lengths
if not set(table.columns).issubset(set(lengths.index)):
only_in_counts = set(table.columns) - set(lengths.index)
raise ValueError(
f"There are genes present in the FeatureTable that are "
f"not present in the gene-length input. Missing lengths "
f"for genes: {only_in_counts}"
)
return lengths
9 changes: 9 additions & 0 deletions q2_feature_table/citations.bib
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,12 @@ @article{NCBI-BLAST
year = {2008},
doi = {10.1093/nar/gkn201},
}

@misc{Zmrzlikar_RNAnorm_RNA-seq_data_2023,
author = {Zmrzlikar, Jure and Žganec, Matjaž and Ausec, Luka and Štajdohar, Miha},
month = jul,
title = {{RNAnorm: RNA-seq data normalization in Python}},
url = {https://github.com/genialis/RNAnorm},
version = {2.1.0},
year = {2023}
}
81 changes: 78 additions & 3 deletions q2_feature_table/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,17 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from qiime2.core.type import Properties
from qiime2.plugin import (Plugin, Int, Float, Range, Metadata, Str, Bool,
Choices, MetadataColumn, Categorical, List,
Citations, TypeMatch, TypeMap, Collection,
Visualization)

from q2_types.feature_table import (
FeatureTable, Frequency, RelativeFrequency, PresenceAbsence, Composition)
FeatureTable, Frequency, RelativeFrequency, PresenceAbsence, Composition,
Normalized)
from q2_types.feature_data import (
FeatureData, Sequence, Taxonomy, AlignedSequence)
FeatureData, Sequence, Taxonomy, AlignedSequence, SequenceCharacteristics)
from q2_types.metadata import ImmutableMetadata

import q2_feature_table
Expand Down Expand Up @@ -723,3 +724,77 @@
"Tabulate sample and feature frequencies.",
examples={'feature_table_summarize_plus': ex.feature_table_summarize_plus}
)

P_method, T_normalized_table = TypeMap(
{
Str
% Choices("tpm"): (
FeatureTable[Normalized % Properties("TPM")],
),
Str
% Choices("fpkm"): (
FeatureTable[Normalized % Properties("FPKM")],
),
Str
% Choices("tmm"): (
FeatureTable[Normalized % Properties("TMM")],
),
Str
% Choices("uq"): (
FeatureTable[Normalized % Properties("UQ")],
),
Str
% Choices("cuf"): (
FeatureTable[Normalized % Properties("CUF")],
),
Str
% Choices("ctf"): (
FeatureTable[Normalized % Properties("CTF")],
),
Str
% Choices("cpm"): (
FeatureTable[Normalized % Properties("CPM")],
),
}
)

plugin.methods.register_function(
function=q2_feature_table.normalize,
inputs={
"table": FeatureTable[Frequency],
"gene_length": FeatureData[
SequenceCharacteristics % Properties("length")],
},
parameters={
"method": P_method,
"m_trim": Float % Range(
0, 1, inclusive_start=True, inclusive_end=True
),
"a_trim": Float % Range(
0, 1, inclusive_start=True, inclusive_end=True
),
},
outputs=[("normalized_table", T_normalized_table)],
input_descriptions={
"table": "Feature table with gene counts.",
"gene_length": "Gene lengths of all genes in the feature table.",
},
parameter_descriptions={
"method": "Specify the normalization method to be used. Use FPKM or "
"TPM for within comparisons and TMM, UQ, CUF or CTF for between "
"sample camparisons. Check https://www.genialis.com/wp-content/uploads"
"/2023/12/2023-Normalizing-RNA-seq-data-in-Python-with-RNAnorm.pdf "
"for more information on the methods.",
"m_trim": "Two sided cutoff for M-values. Can only be used for "
"methods TMM and CTF. (default = 0.3)",
"a_trim": "Two sided cutoff for A-values. Can only be used for "
"methods TMM and CTF. (default = 0.05)",
},
output_descriptions={
"normalized_table": "Feature table normalized with specified method."
},
name="Normalize FeatureTable",
description="Normalize FeatureTable by gene length, library size and "
"composition with common methods for RNA-seq.",
citations=[citations["Zmrzlikar_RNAnorm_RNA-seq_data_2023"]],
)
110 changes: 109 additions & 1 deletion q2_feature_table/tests/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,20 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import os
from unittest import TestCase, main
from unittest.mock import MagicMock, patch

import numpy as np
import numpy.testing as npt
import pandas as pd
from biom.table import Table
from pandas._testing import assert_series_equal
from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat

from q2_feature_table import rarefy
from q2_feature_table._normalize import (_validate_parameters,
_convert_lengths, normalize)


class RarefyTests(TestCase):
Expand Down Expand Up @@ -53,5 +59,107 @@ def test_rarefy_depth_error(self):
rarefy(t, 50)


class NormalizeTests(TestCase):

@classmethod
def setUpClass(cls):
cls.lengths = pd.Series(
{
"ARO1": 1356.0,
"ARO2": 1173.0,
},
name="values",
)
cls.lengths.index.name = "index"
cls.table = pd.DataFrame({
'ID': ['sample1', 'sample2'],
'ARO1': [2.0, 2.0],
'ARO2': [0.0, 0.0]
}).set_index('ID')

def test_validate_parameters_uq_with_m_a_trim(self):
# Test Error raised if gene-length is given with UQ method
with self.assertRaisesRegex(
ValueError,
"Parameters m-trim and a-trim can only "
"be used with methods TMM and CTF.",
):
_validate_parameters("uq", 0.2, 0.05, None)

def test_validate_parameters_tpm_missing_gene_length(self):
# Test Error raised if gene-length is missing with TPM method
with self.assertRaisesRegex(
ValueError, "gene-length input is missing."):
_validate_parameters("tpm", None, None, None)

def test_validate_parameters_tmm_gene_length(self):
# Test Error raised if gene-length is given with TMM method
with self.assertRaisesRegex(
ValueError,
"gene-length input can only be used with FPKM and "
"TPM methods."
):
_validate_parameters(
"tmm", None, None, gene_length=MagicMock())

def test_validate_parameters_default_m_a_trim(self):
# Test if m_trim and a_trim get set to default values if None
m_trim, a_trim = _validate_parameters("tmm", None, None, None)
self.assertEqual(m_trim, 0.3)
self.assertEqual(a_trim, 0.05)

def test_validate_parameters_m_a_trim(self):
# Test if m_trim and a_trim are not modified if not None
m_trim, a_trim = _validate_parameters("tmm", 0.1, 0.06, None)
self.assertEqual(m_trim, 0.1)
self.assertEqual(a_trim, 0.06)

def test_convert_lengths_gene_length(self):
# Test _convert_lengths
gene_length = SequenceCharacteristicsDirectoryFormat()
with open(os.path.join(
str(gene_length), "sequence_characteristics.tsv"),
'w') as file:
file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0")

obs = _convert_lengths(self.table, gene_length=gene_length)
assert_series_equal(obs, self.lengths)

def test_convert_lengths_short_gene_length(self):
# Test Error raised if gene-length is missing genes
gene_length = SequenceCharacteristicsDirectoryFormat()
with open(os.path.join(
str(gene_length),
"sequence_characteristics.tsv"), 'w') as file:
file.write("id\tlength\nARO1\t1356.0")
with self.assertRaisesRegex(
ValueError,
"There are genes present in the FeatureTable that are "
"not present in the gene-length input. Missing lengths "
"for genes: {'ARO2'}",
):
_convert_lengths(self.table, gene_length=gene_length)

@patch("q2_feature_table._normalize.TPM")
def test_tpm_fpkm_with_valid_inputs(self, mock_tpm):
# Test valid inputs for TPM method
gene_length = SequenceCharacteristicsDirectoryFormat()
with open(os.path.join(
str(gene_length), "sequence_characteristics.tsv"),
'w') as file:
file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0")
normalize(table=self.table, gene_length=gene_length, method="tpm")

@patch("q2_feature_table._normalize.TMM")
def test_tmm_uq_cuf_ctf_with_valid_inputs(self, mock_tmm):
# Test valid inputs for TMM method
gene_length = SequenceCharacteristicsDirectoryFormat()
with open(os.path.join(
str(gene_length), "sequence_characteristics.tsv"),
'w') as file:
file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0")
normalize(table=self.table, method="tmm", a_trim=0.06, m_trim=0.4)


if __name__ == "__main__":
main()

0 comments on commit cb009e7

Please sign in to comment.