<!-- FUN-01-DGE_comparison_v2.html -->

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />


<title>Fitting Multifactorial Models of Differential Expression</title>

<script src="site_libs/header-attrs-2.20/header-attrs.js"></script>
<script src="site_libs/jquery-3.6.0/jquery-3.6.0.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/cosmo.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<style>h1 {font-size: 34px;}
       h1.title {font-size: 38px;}
       h2 {font-size: 30px;}
       h3 {font-size: 24px;}
       h4 {font-size: 18px;}
       h5 {font-size: 16px;}
       h6 {font-size: 12px;}
       code {color: inherit; background-color: rgba(0, 0, 0, 0.04);}
       pre:not([class]) { background-color: white }</style>
<script src="site_libs/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<link href="site_libs/highlightjs-9.12.0/default.css" rel="stylesheet" />
<script src="site_libs/highlightjs-9.12.0/highlight.js"></script>

<style type="text/css">
  code{white-space: pre-wrap;}
  span.smallcaps{font-variant: small-caps;}
  span.underline{text-decoration: underline;}
  div.column{display: inline-block; vertical-align: top; width: 50%;}
  div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
  ul.task-list{list-style: none;}
    </style>

<style type="text/css">code{white-space: pre;}</style>
<script type="text/javascript">
// Syntax-highlighting bootstrap: does nothing unless highlight.js was loaded
// and exposed as window.hljs.
(function (highlighter) {
  if (!highlighter) {
    return;
  }

  // Configure with an empty `languages` list, then register the on-load hook.
  highlighter.configure({languages: []});
  highlighter.initHighlightingOnLoad();

  // If the document has already finished loading, also trigger highlighting
  // explicitly on the next tick.
  if (document.readyState === "complete") {
    window.setTimeout(function () {
      highlighter.initHighlighting();
    }, 0);
  }
})(window.hljs);
</script>


<style type="text/css">
/* for pandoc --citeproc since 2.11 */
div.csl-bib-body { }
div.csl-entry {
  clear: both;
}
.hanging div.csl-entry {
  margin-left:2em;
  text-indent:-2em;
}
div.csl-left-margin {
  min-width:2em;
  float:left;
}
div.csl-right-inline {
  margin-left:2em;
  padding-left:1em;
}
div.csl-indent {
  margin-left: 2em;
}
</style>

<link rel="stylesheet" href="tutorial.css" type="text/css" />


<style type = "text/css">
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
img {
  max-width:100%;
}
.tabbed-pane {
  padding-top: 12px;
}
.html-widget {
  margin-bottom: 20px;
}
button.code-folding-btn:focus {
  outline: none;
}
summary {
  display: list-item;
}
details > summary > p:only-child {
  display: inline;
}
pre code {
  padding: 0;
}
</style>


<style type="text/css">
.dropdown-submenu {
  position: relative;
}
.dropdown-submenu>.dropdown-menu {
  top: 0;
  left: 100%;
  margin-top: -6px;
  margin-left: -1px;
  border-radius: 0 6px 6px 6px;
}
.dropdown-submenu:hover>.dropdown-menu {
  display: block;
}
.dropdown-submenu>a:after {
  display: block;
  content: " ";
  float: right;
  width: 0;
  height: 0;
  border-color: transparent;
  border-style: solid;
  border-width: 5px 0 5px 5px;
  border-left-color: #cccccc;
  margin-top: 5px;
  margin-right: -10px;
}
.dropdown-submenu:hover>a:after {
  border-left-color: #adb5bd;
}
.dropdown-submenu.pull-left {
  float: none;
}
.dropdown-submenu.pull-left>.dropdown-menu {
  left: -100%;
  margin-left: 10px;
  border-radius: 6px 0 6px 6px;
}
</style>

<script type="text/javascript">
// Manage the active state of the navbar menu based on the current page, and
// offset page content so it is not hidden beneath the fixed-top navbar.
$(document).ready(function () {
  // File name of the current page = last segment of the URL path.
  // Declared with `var` so it stays local instead of leaking onto `window`
  // (an undeclared assignment creates an implicit global and throws in
  // strict mode). `slice` replaces the deprecated `substr`.
  var pathname = window.location.pathname;
  var href = pathname.slice(pathname.lastIndexOf('/') + 1);
  if (href === "") {
    // A path ending in "/" is treated as index.html
    href = "index.html";
  }
  var menuAnchor = $('a[href="' + href + '"]');

  // mark the anchor link active (and if it's in a dropdown, also mark that active)
  var dropdown = menuAnchor.closest('li.dropdown');
  if (window.bootstrap) { // Bootstrap 4+
    menuAnchor.addClass('active');
    dropdown.find('> .dropdown-toggle').addClass('active');
  } else { // Bootstrap 3
    menuAnchor.parent().addClass('active');
    dropdown.addClass('active');
  }

  // Navbar adjustments: pad the body by the navbar height (+15px) and give
  // every section heading matching padding-top / negative margin-top.
  var navHeight = $(".navbar").first().height() + 15;
  var style = document.createElement('style');
  var pt = "padding-top: " + navHeight + "px; ";
  var mt = "margin-top: -" + navHeight + "px; ";
  var css = "";
  // offset scroll position for anchor links (for fixed navbar)
  for (var i = 1; i <= 6; i++) {
    css += ".section h" + i + "{ " + pt + mt + "}\n";
  }
  style.innerHTML = "body {" + pt + "padding-bottom: 40px; }\n" + css;
  document.head.appendChild(style);
});
</script>

<!-- tabsets -->

<style type="text/css">
.tabset-dropdown > .nav-tabs {
  display: inline-table;
  max-height: 500px;
  min-height: 44px;
  overflow-y: auto;
  border: 1px solid #ddd;
  border-radius: 4px;
}

.tabset-dropdown > .nav-tabs > li.active:before, .tabset-dropdown > .nav-tabs.nav-tabs-open:before {
  content: "\e259";
  font-family: 'Glyphicons Halflings';
  display: inline-block;
  padding: 10px;
  border-right: 1px solid #ddd;
}

.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
  content: "\e258";
  font-family: 'Glyphicons Halflings';
  border: none;
}

.tabset-dropdown > .nav-tabs > li.active {
  display: block;
}

.tabset-dropdown > .nav-tabs > li > a,
.tabset-dropdown > .nav-tabs > li > a:focus,
.tabset-dropdown > .nav-tabs > li > a:hover {
  border: none;
  display: inline-block;
  border-radius: 4px;
  background-color: transparent;
}

.tabset-dropdown > .nav-tabs.nav-tabs-open > li {
  display: block;
  float: none;
}

.tabset-dropdown > .nav-tabs > li {
  display: none;
}
</style>

<!-- code folding -->


<style type="text/css">

#TOC {
  margin: 25px 0px 20px 0px;
}
@media (max-width: 768px) {
#TOC {
  position: relative;
  width: 100%;
}
}

@media print {
.toc-content {
  /* see https://github.com/w3c/csswg-drafts/issues/4434 */
  float: right;
}
}

.toc-content {
  padding-left: 30px;
  padding-right: 40px;
}

div.main-container {
  max-width: 1200px;
}

div.tocify {
  width: 20%;
  max-width: 260px;
  max-height: 85%;
}

@media (min-width: 768px) and (max-width: 991px) {
  div.tocify {
    width: 25%;
  }
}

@media (max-width: 767px) {
  div.tocify {
    width: 100%;
    max-width: none;
  }
}

.tocify ul, .tocify li {
  line-height: 20px;
}

.tocify-subheader .tocify-item {
  font-size: 0.90em;
}

.tocify .list-group-item {
  border-radius: 0px;
}

.tocify-subheader {
  display: inline;
}
.tocify-subheader .tocify-item {
  font-size: 0.95em;
}

</style>


</head>

<body>


<div class="container-fluid main-container">


<!-- setup 3col/9col grid for toc_float and main content  -->
<div class="row">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>

<div class="toc-content col-xs-12 col-sm-8 col-md-9">


<div class="navbar navbar-inverse  navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-bs-toggle="collapse" data-target="#navbar" data-bs-target="#navbar">
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html">MarineOmics</a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li>
  <a href="contributions.html">Contributions</a>
</li>
<li>
  <a href="panels.html">Panel Seminars</a>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" data-bs-toggle="dropdown" aria-expanded="false">
    Population Genomics
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="choosing-population-genetics.html">Choosing a Population Genomics Approach</a>
    </li>
    <li>
      <a href="WGS_intro.html">Whole Genome Resequencing</a>
    </li>
    <li>
      <a href="RADseq.html">Reduced Representation Sequencing</a>
    </li>
    <li>
      <a href="poolseq.html">Poolseq</a>
    </li>
    <li>
      <a href="RDAtraitPredictionTutorial.html">Redundancy Analysis (RDA) Trait Prediction</a>
    </li>
  </ul>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" data-bs-toggle="dropdown" aria-expanded="false">
    Functional Genomics
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="DGE_comparison_v2.html">Multifactorial RNAseq</a>
    </li>
  </ul>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" data-bs-toggle="dropdown" aria-expanded="false">
    Genome-Phenome
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li class="dropdown-header">coming soon!</li>
  </ul>
</li>
<li>
  <a href="https://github.com/MarineOmics/marineomics.github.io/discussions">Discussion Forum</a>
</li>
      </ul>
      <ul class="nav navbar-nav navbar-right">
        
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->
<!-- Google Analytics 4. The measurement ID below has the GA4 "G-" form, which
     is loaded through gtag.js; the legacy analytics.js loader used previously
     expects Universal Analytics "UA-" property IDs. -->
<script async src="https://www.googletagmanager.com/gtag/js?id=G-53GH9PV49T"></script>
<script>
  // Commands are queued on dataLayer until gtag.js has loaded.
  window.dataLayer = window.dataLayer || [];
  function gtag() { dataLayer.push(arguments); }
  gtag('js', new Date());

  // Registers the property; the initial page view is sent as part of config.
  gtag('config', 'G-53GH9PV49T');
</script>

<div id="header">


<h1 class="title toc-ignore">Fitting Multifactorial Models of
Differential Expression</h1>
<h3 class="subtitle"><em>Samuel N. Bogan</em></h3>

</div>


<div id="intro-to-multifactorial-rna-seq-models" class="section level1">
<h1>Intro to multifactorial RNA-seq models</h1>
<p>Studies of molecular responses to environmental change increasingly
employ multifactorial experimental designs incorporating different
developmental stages, stressors, populations, or non-linear dynamics
that resolve interactions between variables that shape expression,
physiology, and performance. After an experiment during which a study
organism was exposed to abiotic variation and perhaps other variables,
researchers interested in how gene expression correlates with these
variables will often extract RNA across treatment groups, prepare cDNA
libraries for sequencing, and conduct an analysis of differential
expression (DE) across variables of interest after mapping and counting
RNA-seq reads. DE analysis is achieved by fitting RNA-seq read count
data to statistical models, often using tried-and-true DE packages. Most
packages do not have the flexibility to accommodate a number of common
experimental design components and it can be difficult to navigate their
functionality for fitting multivariate models. When these packages fall
short, determining how to fit customized multifactorial models yourself
can be an additional challenge. Many great walkthroughs exist for
conducting DE analysis using single-variable experimental
designs, but there is a lack of guidance on multifactorial approaches
for testing for DE and modelling gene expression. Regardless of whether
you have RNA-seq data on hand or you’re in the process of planning your
experiment, this walkthrough will help you navigate what DE packages and
model fitting approaches are best suited to your experimental
design.</p>
<p><br></p>
<div id="targeted-audience" class="section level2">
<h2>Targeted audience</h2>
<p>This walkthrough is for both intermediate and expert practitioners of
RNA-seq analyses, and most of all, those new to multifactorial RNA-seq.
We have written this page with the assumption that readers have a basic
familiarity with model fitting and statistical analysis, as well as next
generation sequencing technologies, RNA-seq workflows, and the R
statistical environment.</p>
<p><br></p>
</div>
<div id="variables-we-will-cover" class="section level2">
<h2>Variables we will cover</h2>
<p>This walkthrough can be broken down into different types of predictor
variables that can be incorporated in models of gene expression. In the
order we address them, these variables include:</p>
<ul>
<li>Non-linear fixed effects</li>
<li>Continuous fixed effects</li>
<li>Interactive fixed effects</li>
<li>Random intercepts</li>
<li>Random slopes</li>
</ul>
<p><br></p>
</div>
<div id="packages-we-will-cover" class="section level2">
<h2>Packages we will cover</h2>
<ul>
<li>DESeq2 <span class="citation">(Love, Huber, and Anders 2014)</span>
<a
href="http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html">Link</a></li>
<li>edgeR <span class="citation">(Robinson, McCarthy, and Smyth
2010)</span> <a
href="https://www.bioconductor.org/packages/devel/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf">Link</a></li>
<li>Voom (often called Limma-Voom) <span class="citation">(Law et al.
2014)</span> <a
href="https://ucdavis-bioinformatics-training.github.io/2018-June-RNA-Seq-Workshop/thursday/DE.html">Link</a></li>
</ul>
<p><br></p>
</div>
</div>
<div id="features-of-popular-dge-packages" class="section level1">
<h1>Features of popular DGE packages</h1>
<p>Two important considerations when choosing a DE package to analyze
RNA-seq data are the distribution family used by the package and the
effect types it incorporates into a model. The DE packages we will cover
assume the following read count distributions and are capable of fitting
the following effect types:</p>
<table style="width:100%;">
<colgroup>
<col width="14%" />
<col width="14%" />
<col width="14%" />
<col width="14%" />
<col width="14%" />
<col width="14%" />
<col width="14%" />
</colgroup>
<thead>
<tr class="header">
<th align="left">Program</th>
<th align="left">Distribution</th>
<th align="left">Continuous fixed eff.</th>
<th align="left">Random intercepts</th>
<th align="left">Random slopes</th>
<th align="left">Interactive eff.</th>
<th align="left">Non-linear eff.</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="left">edgeR</td>
<td align="left">Negative binomial generalized linear model</td>
<td align="left">✔</td>
<td align="left">✖</td>
<td align="left">✖</td>
<td align="left">✔</td>
<td align="left">✔</td>
</tr>
<tr class="even">
<td align="left">DESeq2</td>
<td align="left">Negative binomial generalized linear model</td>
<td align="left">✔</td>
<td align="left">✖</td>
<td align="left">✖</td>
<td align="left">✔</td>
<td align="left">✔</td>
</tr>
<tr class="odd">
<td align="left">Voom</td>
<td align="left">Mean-variance linear model</td>
<td align="left">✔</td>
<td align="left">✔</td>
<td align="left">✖</td>
<td align="left">✔</td>
<td align="left">✔</td>
</tr>
</tbody>
</table>
<p><br></p>
<div id="summary-of-approach" class="section level2">
<h2>Summary of approach</h2>
<p>We will walk you through (i) how to conduct essential checks of your
data such as principal components analysis and observing the
distribution of filtered read counts, (ii) how to evaluate each package’s
functionality for multifactorial model fitting, and (iii) how to fit
each type of predictor by performing differential expression tests using
real data. These data come from an experiment that exposed the Antarctic
pteropod (a free-swimming planktonic snail) <em>Limacina helicina
antarctica</em> to three continuous levels of ocean acidification (OA)
for either 0.5 or 7 days <span class="citation">(Bogan, Johnson, and
Hofmann 2020)</span>. These RNA-seq counts are derived from reads mapped
and counted via RSEM <span class="citation">(Li and Dewey 2011)</span>
using a <em>de novo</em> reference transcriptome that includes
transcript isoforms assembled by <span class="citation">Johnson and
Hofmann (2016)</span>.</p>
<p>Throughout this walkthrough, we will provide examples of custom
scripts for fitting linear models, linear mixed models, and generalized
linear models to read count data using for loops in order to provide
functionality where it cannot be met by most differential expression
packages.</p>
<p>At the end of the walkthrough, we demonstrate how to compare model
predictions by different packages for different effect types commonly
used in multifactorial RNAseq analyses.</p>
<p>Let’s start by loading our required packages…</p>
<p><br></p>
<pre class="r"><code>## Unhash and run the code below if you believe you may need to install the packages loaded below

#invisible(lapply(c( &quot;tidyverse&quot;, &quot;ape&quot;, &quot;vegan&quot;, &quot;GGally&quot;,
                   #&quot;rgl&quot;, &quot;adegenet&quot;, &quot;MASS&quot;,
                   #&quot;data.table&quot;, &quot;plyr&quot;, &quot;lmtest&quot;, &quot;reshape2&quot;, &quot;Rmisc&quot;, &quot;lmerTest&quot;,&quot;statmod&quot;),
                 #function(p){
                   #if(! p %in% rownames(installed.packages())) {
                     #install.packages(p)
                   #}
                   #library(p, character.only=TRUE)
                 #}))

#if (!require(&quot;BiocManager&quot;, quietly = TRUE))
    #install.packages(&quot;BiocManager&quot;)

#BiocManager::install(c(&quot;DESeq2&quot;,&quot;edgeR&quot;,&quot;arrayQualityMetrics&quot;))

# Load packages
library(DESeq2)
library(edgeR)
library(tidyverse)
library(ape)
library(vegan)
library(GGally)
library(arrayQualityMetrics)
library(rgl)
library(adegenet)
library(MASS)
library(data.table)
library(plyr)
library(lmtest)
library(reshape2)
library(Rmisc)
library(lmerTest)</code></pre>
</div>
</div>
<div id="filter-and-visualize-read-counts" class="section level1">
<h1>Filter and visualize read counts</h1>
<p>Before model fitting and testing for DE, it is important to visually
inspect read counts pre- and post-filtering. In this walkthrough, we
will plot the distributions of read counts and sample loading to
multidimensional scaling axes (e.g., plotting principal components or
principal coordinates analyses). Looking at these plots, we can (i)
determine whether the distribution of our data matches the assumptions of
model families used by different packages and (ii) determine whether one
or multiple variables within a dataset explain a significant degree of
variation in gene expression across replicates. Depending on the
distribution of your data and the variables that appear important (or
variables you would like to incorporate into your models), we will then
demonstrate how to choose an appropriate package to test for DE.</p>
<p>Most DE packages assume that read counts possess a negative binomial
distribution. The negative binomial distribution is an extension of
distributions for count variables such as the Poisson distribution,
allowing for estimations of “equidispersion” and “overdispersion”, equal
and greater-than-expected variation in expression attributed to
biological variability. However, RNA-seq datasets can exhibit poor fits
with the negative binomial distribution <span class="citation">(Hawinkel
et al. 2020)</span>. It is well worth visualizing and testing the
distributions of RNA-seq read counts before and after filtering. Keep in
mind that the distribution applies not to the raw data, but the
residuals of a model. Additionally, individual models are fit to the
read counts of each gene. Thus, the most accurate representation of how
appropriate a negative binomial distribution is for read counts would
come from plotting the distribution of residuals across all genes
following model fitting. Below is a visual example of negative binomial
distributions that possess different means and variances:</p>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-2-1.png" alt="Example negative binomial distributions with different means and variances" width="672" /></p>
<p>Now let’s first plot the distribution of unfiltered reads from our
dataset:</p>
<p><br></p>
<pre class="r"><code># Read in matrix of RSEM expected read counts
data &lt;- read.delim(&quot;RNA_data/GE2_expected_counts_gene.matrix&quot;, sep = &quot;\t&quot;, header = T, row.names = 1)

# Peek at data to check that it looks okay
head(data)</code></pre>
<pre><code>##                   B7a.genes.results B7b.genes.results B7c.genes.results
## TR100110|c0_g1_i1              4.00             10.00              3.00
## TR101578|c0_g1_i1              0.00              0.00              0.00
## TR1047|c0_g1_i1               14.00             19.00             11.00
## TR105096|c0_g1_i1             14.00             12.00              9.00
## TR107626|c1_g1_i1           4530.93          28139.91          14141.83
## TR11301|c0_g1_i1               0.00              0.00              0.00
##                   B12a.genes.results B12b.genes.results B12c.genes.results
## TR100110|c0_g1_i1               6.00                9.0               8.00
## TR101578|c0_g1_i1               0.00                0.0               0.00
## TR1047|c0_g1_i1                15.00               14.0               9.00
## TR105096|c0_g1_i1              15.00                5.0              10.00
## TR107626|c1_g1_i1           80846.09             5649.4           30756.08
## TR11301|c0_g1_i1                0.00                0.0               0.00
##                   R7a.genes.results R7b.genes.results R7c.genes.results
## TR100110|c0_g1_i1              18.0             15.00              6.00
## TR101578|c0_g1_i1               0.0              0.00              0.00
## TR1047|c0_g1_i1                16.0             16.00             27.00
## TR105096|c0_g1_i1              25.0             15.00              6.00
## TR107626|c1_g1_i1          137592.5          45299.57          14843.47
## TR11301|c0_g1_i1                0.0              0.00              0.00
##                   R12a.genes.results R12b.genes.results R12c.genes.results
## TR100110|c0_g1_i1              11.00               13.0                  9
## TR101578|c0_g1_i1               0.00                0.0                  0
## TR1047|c0_g1_i1                23.00               14.0                 11
## TR105096|c0_g1_i1              14.00               45.0                 12
## TR107626|c1_g1_i1           21210.52           230436.5                  0
## TR11301|c0_g1_i1                0.00                0.0                  0
##                   Y7a.genes.results Y7b.genes.results Y7c.genes.results
## TR100110|c0_g1_i1                 6                 4              12.0
## TR101578|c0_g1_i1                 0                 0               0.0
## TR1047|c0_g1_i1                  20                 3              18.0
## TR105096|c0_g1_i1                 9                 1              13.0
## TR107626|c1_g1_i1                 0                 0          112214.6
## TR11301|c0_g1_i1                  0                 0               0.0
##                   Y12a.genes.results Y12b.genes.results Y12c.genes.results
## TR100110|c0_g1_i1              21.00                 12              12.00
## TR101578|c0_g1_i1               0.00                  0               0.00
## TR1047|c0_g1_i1                25.00                 14               9.00
## TR105096|c0_g1_i1              14.00                 19              24.00
## TR107626|c1_g1_i1           86243.14             148100           74153.54
## TR11301|c0_g1_i1                0.00                  0               0.00</code></pre>
<p>The table of read counts above has rows representing individual
transcripts and columns representing different replicate samples across
treatments.</p>
<p><br></p>
<pre class="r"><code># Name experimental samples: names correspond to pCO2 treatment (300, 600, 900) + days (12 h vs 7 days) + replicate ID
colnames(data) &lt;- c(&quot;300.7.a&quot;, &quot;300.7.b&quot;, &quot;300.7.c&quot;,&quot;300.12.a&quot;, &quot;300.12.b&quot;, &quot;300.12.c&quot;, 
                    &quot;900.7.a&quot;, &quot;900.7.b&quot;, &quot;900.7.c&quot;, &quot;900.12.a&quot;, &quot;900.12.b&quot;, &quot;900.12.c&quot;, 
                    &quot;600.7.a&quot;, &quot;600.7.b&quot;, &quot;600.7.c&quot;, &quot;600.12.a&quot;, &quot;600.12.b&quot;, &quot;600.12.c&quot;)

## Create &#39;targets&#39; dataframe and &#39;Group&#39; factor, expressing experimental variables for DEG analysis
pCO2 &lt;- as.numeric(c( 255, 255, 255, 255, 255, 255,
                      530, 530, 530, 530, 530, 530,
                      918, 918, 918, 918, 918, 918))

treatment &lt;- c(&quot;B&quot;,&quot;B&quot;, &quot;B&quot;, &quot;B&quot;, &quot;B&quot;, &quot;B&quot;,
               &quot;R&quot;, &quot;R&quot;, &quot;R&quot;, &quot;R&quot;, &quot;R&quot;, &quot;R&quot;,
               &quot;Y&quot;, &quot;Y&quot;, &quot;Y&quot;, &quot;Y&quot;, &quot;Y&quot;, &quot;Y&quot;)

day &lt;- as.numeric(c(7, 7, 7, .5, .5, .5,
                    7, 7, 7, .5, .5, .5,
                    7, 7, 7, .5, .5, .5))

targets &lt;- data.frame(pCO2, day, treatment)
targets$grouping &lt;- paste(targets$pCO2, targets$day, sep = &quot;.&quot;)

# The group factor represents the combined levels of an experimental replicate across all variables
Group &lt;- factor(paste(targets$day, targets$pCO2, sep = &quot;_&quot;))

# Data must be rounded to nearest integer in order to be fit for negative binomial distribution
data_input &lt;- round(data)

# Peek at rounded data
head(data_input)</code></pre>
<pre><code>##                   300.7.a 300.7.b 300.7.c 300.12.a 300.12.b 300.12.c 900.7.a
## TR100110|c0_g1_i1       4      10       3        6        9        8      18
## TR101578|c0_g1_i1       0       0       0        0        0        0       0
## TR1047|c0_g1_i1        14      19      11       15       14        9      16
## TR105096|c0_g1_i1      14      12       9       15        5       10      25
## TR107626|c1_g1_i1    4531   28140   14142    80846     5649    30756  137592
## TR11301|c0_g1_i1        0       0       0        0        0        0       0
##                   900.7.b 900.7.c 900.12.a 900.12.b 900.12.c 600.7.a 600.7.b
## TR100110|c0_g1_i1      15       6       11       13        9       6       4
## TR101578|c0_g1_i1       0       0        0        0        0       0       0
## TR1047|c0_g1_i1        16      27       23       14       11      20       3
## TR105096|c0_g1_i1      15       6       14       45       12       9       1
## TR107626|c1_g1_i1   45300   14843    21211   230436        0       0       0
## TR11301|c0_g1_i1        0       0        0        0        0       0       0
##                   600.7.c 600.12.a 600.12.b 600.12.c
## TR100110|c0_g1_i1      12       21       12       12
## TR101578|c0_g1_i1       0        0        0        0
## TR1047|c0_g1_i1        18       25       14        9
## TR105096|c0_g1_i1      13       14       19       24
## TR107626|c1_g1_i1  112215    86243   148100    74154
## TR11301|c0_g1_i1        0        0        0        0</code></pre>
<pre class="r"><code># Plot distribution of unfiltered read counts across all samples 
ggplot(data = data.frame(rowMeans(data_input)),
       aes(x = rowMeans.data_input.)) +
  geom_histogram(fill = &quot;grey&quot;) +
  xlim(0, 500) +
  theme_classic() +
  labs(title = &quot;Distribution of unfiltered reads&quot;) +
  labs(y = &quot;Density&quot;, x = &quot;Raw read counts&quot;,
  title = &quot;Read count distribution: untransformed, unnormalized, unfiltered&quot;)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-4-1.png" alt="Histogram of mean raw read counts per transcript before filtering" width="672" /></p>
<p>As you can see in the above plot, the raw distribution of all read
counts takes on a right-skewed negative binomial distribution similar to
the purple distribution in the example above. While looking at the
distribution of your raw reads is useful, these are not in fact the data
you will be inputting to tests of differential expression. Let’s plot
the distribution of filtered reads normalized by library size, expressed
as log2 counts per million reads (logCPM). These are the reads we will
use in our test, and after we plot their distribution, we will conduct
one more, slightly more robust, test of our data’s fit to the negative
binomial distribution using residuals from fitted models.</p>
<p><br></p>
<pre class="r"><code># Make a DGEList object for edgeR
y &lt;- DGEList(counts = data_input, remove.zeros = TRUE)

# Filter transcripts (rows), not samples: keep a transcript only when its CPM
# is above 0.5 in at least 9 libraries. The original note equates 0.5 cpm with
# ~10 counts in the count file and describes the threshold as 9 of 12 samples.
keep &lt;- rowSums(cpm(y) &gt; .5) &gt;= 9

table(keep)</code></pre>
<pre><code>## keep
## FALSE  TRUE 
## 18871 62579</code></pre>
<pre class="r"><code># Subset to the transcripts flagged in keep; keep.lib.sizes = FALSE so that
# library sizes are recalculated after filtering
y &lt;- y[keep, keep.lib.sizes = FALSE]

# Compute normalization factors for the filtered libraries
y &lt;- calcNormFactors(y)

# Calculate logCPM (log = TRUE, with a prior count of 2)
df_log &lt;- cpm(y, log = TRUE, prior.count = 2)

# Plot distribution of filtered logCPM values (one value per transcript:
# its mean logCPM across libraries)
ggplot(data = data.frame(rowMeans(df_log)), 
       aes(x = rowMeans.df_log.) ) +
  geom_histogram(fill = &quot;grey&quot;) +
  theme_classic() +
  labs(y = &quot;Density&quot;, x = &quot;Filtered read counts (logCPM)&quot;,
       title = &quot;Distribution of normalized, filtered read counts&quot;)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-5-1.png" width="672" /></p>
<p>Our raw data appear to follow a strong negative binomial
distribution! Later, we will see whether this holds for residuals from
fitted negative binomial GLMs.</p>
<p><br></p>
<div id="mds-plot-visualizing-experimental-factors"
class="section level2">
<h2>MDS plot visualizing experimental factors</h2>
<p>Before analyzing our data, it is essential that we look at the
multivariate relationships between our samples based on
transcriptome-wide expression levels. Below is example code and output
for a principal coordinates analysis (PCOA) plot that visualizes
multifactorial RNA-seq replicates according to two predictor variables
across major and minor latent variables or PCOA axes. These predictor
variables, as discussed above, are <em>p</em>CO<span
class="math inline">\({_2}\)</span> and time-in-treatment.</p>
<p><br></p>
<pre class="r"><code># Run a principal coordinates analysis on Euclidean distances between samples.
# df_log is the logCPM matrix computed above; t() puts samples in rows for vegdist().
dds.pcoa &lt;- pcoa(vegdist(t(df_log), method = &quot;euclidean&quot;) / 1000)

# Create df of MDS vector loading
scores &lt;- dds.pcoa$vectors

## Plot pcoa loadings of each sample, grouped by time point and pCO2 treatment

# Calculate % variation explained by each eigenvector.
# Despite its name, cumulative_percent_variance holds one percentage per axis,
# not a running total.
percent &lt;- dds.pcoa$values$Eigenvalues
cumulative_percent_variance &lt;- (percent / sum(percent)) * 100

# Prepare information for pcoa plot, then plot
color &lt;- c(&quot;steelblue1&quot;, &quot;tomato1&quot;, &quot;goldenrod1&quot;)
par(mfrow = c(1, 1))
plot(
  scores[, 1],
  scores[, 2],
  cex = .5,
  cex.axis = 1,
  cex.lab = 1.25,
  xlab = paste(&quot;PC1, &quot;, round(cumulative_percent_variance[1], 2), &quot;%&quot;),
  ylab = paste(&quot;PC2, &quot;, round(cumulative_percent_variance[2], 2), &quot;%&quot;)
)

# Add visual groupings to pcoa plot: one shaded hull per level of targets$treatment
ordihull(
  scores,
  as.factor(targets$treatment),
  border = NULL,
  lty = 2,
  lwd = .5,
  label = FALSE,
  col = color,
  draw = &quot;polygon&quot;,
  alpha = 100,
  cex = .5
)

# Vectors connecting samples in same pCO2 x time group
ordispider(scores, as.factor(targets$grouping), label = FALSE)

# Label sample IDs
ordilabel(scores, cex = 0.5)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-6-1.png" width="672" /></p>
<p>In the PCOA above, blue = low <em>p</em><span
class="math inline">\(CO_{2}\)</span>, yellow = moderate <em>p</em><span
class="math inline">\(CO_{2}\)</span>, and red = high <em>p</em><span
class="math inline">\(CO_{2}\)</span>. From this plot, we can see that
treatment and time both influence multivariate gene expression across
the RNA-seq samples. By and large, samples cluster according to these
two predictors in a manner consistent with what we would expect from our
experimental design if <em>p</em><span
class="math inline">\(CO_{2}\)</span> and time were to affect gene
expression.</p>
<p><br></p>
<pre class="r"><code># Principal component analysis of logCPM with samples as rows
logCPM.pca &lt;- prcomp(t(df_log))

# Percent of total variance carried by each principal component
logCPM.pca.proportionvariances &lt;-
  ((logCPM.pca$sdev ^ 2) / (sum(logCPM.pca$sdev ^ 2))) * 100

## Do treatment groups fully segregate? Wrap samples by pCO2 x time, not just pCO2
# Replot using logCPM.pca: draw empty axes first (type = &quot;n&quot;), then add layers
plot(
  logCPM.pca$x,
  type = &quot;n&quot;,
  main = NA,
  xlab = paste(&quot;PC1, &quot;, round(logCPM.pca.proportionvariances[1], 2), &quot;%&quot;),
  ylab = paste(&quot;PC2, &quot;, round(logCPM.pca.proportionvariances[2], 2), &quot;%&quot;)
)

# One point per sample
points(logCPM.pca$x,
       col = &quot;black&quot;,
       pch = 16,
       cex = 1)

# One fill color per hull drawn for targets$grouping
colors2 &lt;-
  c(&quot;steelblue1&quot;,
    &quot;dodgerblue2&quot;,
    &quot;tomato1&quot;,
    &quot;coral&quot;,
    &quot;goldenrod1&quot;,
    &quot;goldenrod3&quot;)

# Shaded, labelled hull around each group of samples
ordihull(
  logCPM.pca$x,
  targets$grouping,
  border = NULL,
  lty = 2,
  lwd = .5,
  col = colors2,
  draw = &quot;polygon&quot;,
  alpha = 75,
  cex = .5,
  label = TRUE
)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-7-1.png" width="672" /></p>
<p>This plot is only a slight adjustment of the first PCOA. If you’d
like to wrap each multifactorial group and get a better idea of
potential overlap between clusters, the plot and code above can be
applied to do so.</p>
<p>Now that we have conducted QC analyses and plots of the read count
data, it is time to begin model fitting and testing for differential
expression.</p>
<p><br></p>
</div>
</div>
<div id="non-linear-effects" class="section level1">
<h1>Non-linear effects</h1>
<p>Gene expression traits can exhibit non-linear performance curves
across a continuous variable much like other physiological traits.
Studies that incorporate more than two levels of a continuous predictor
provide an opportunity to test for non-linear effects on gene
expression. Similar to performance traits such as growth or metabolic
rates, gene expression can be modelled across a continuous variable by
fitting non-linear ‘performance curves’ to read count data. Rivera
<em>et al.</em> 2021 recently discussed the benefits of fitting
non-linear reaction norms to gene expression data in order to better
model acclimation by tolerant and sensitive genotypes to an
environmental stressor. In a conceptual figure of theirs, shown below,
they depict non-linear variation in expression before, during, and after
exposure to a stressor for four reaction norm phenotypes: (i) a tolerant
conspecific that frontloads (e.g., constitutively upregulates)
stress-induced transcripts, (ii) a tolerant conspecific that exhibits
reduced overall expression of stress-responsive transcripts, (iii) a
sensitive conspecific that upregulates inducible transcripts and cannot
recover post-stress, and (iv) a resilient conspecific that mounts a
transcriptional stress response followed by recovery and downregulation
<span class="citation">(Rivera et al. 2021)</span>.</p>
<p><br></p>
<p><img src="images/Rivera_etal_fig.png" width="1137" height="40%" style="display: block; margin: auto;" />
From Rivera <em>et al.</em> 2021 - “Transcriptomic resilience and
timing. (a) Gene expression reaction norms of four strategies during
recovery after a stressor. We use triangles again for patterns that may
confer tolerance and circles for patterns associated with stress
sensitivity. While all triangle paths show a return to baseline
(resilience) the pink (frontloading) and yellow (dampening) are also
depicting differences in baseline and plasticity and are therefore
labelled differently. (b) Adapted from the rolling ball analogy commonly
used for ecological resilience and depicted in Hodgson et al. (2015).
Each ball represents a gene showing a color-matched expression pattern
in (a). Landscapes represent expression possibilities during a stress
event. In the absence of stress, the ball will settle in a trough,
representing baseline expression levels. Elasticity (rate of return to
the baseline) is represented by the size of the arrow (i.e., larger
arrows have faster rates of return). Pink dotted line is the expression
landscape for the frontloaded ball. (c) Using Torres et al. (2016) loops
through disease space as an alternative framework of an organism’s path
through stress response and recovery. The color gradient represents the
resulting phenotype for a given path through stress and recovery space,
though x-and y-axis can denote any two parameters that are correlated
but with a time lag.”</p>
<p>Non-linear reaction norms can be modelled across time, such as Rivera
<em>et al.</em> have presented, or across multiple levels of an abiotic
variable such as temperature, pH, etc. in order to fit a performance
curve. Performance curves are a fundamental tool in ecological
physiology, and enable more robust hypothesis testing in RNA-seq studies
of environmental acclimation. For example, testing for variation in the
shape of gene expression performance curves between early and long-term
exposure timepoints can provide critical information about the role of
gene expression in acclimation. Below we outline how to fit and test for
non-linear gene expression performance curves in multifactorial RNA-seq
experiments using examples in edgeR, DESeq2, and custom code.</p>
<p>One of the simplest non-linear relationships that can be fitted to
the expression of a transcript across a continuous variable is a
second-order polynomial, otherwise known as a quadratic function, which
can be expressed as <span class="math inline">\(y_{i} = \mu +
\beta_{1}x^2 + \beta_{2}x\)</span> where <span
class="math inline">\(y\)</span> is the abundance of a given transcript
(<span class="math inline">\(i\)</span>), <span
class="math inline">\(\mu\)</span> is the intercept, and <span
class="math inline">\(x\)</span> is the continuous variable. For the
parabola generated by fitting a second-order polynomial, <span
class="math inline">\(\beta_{1}\)</span> &gt; 0 opens the parabola
upwards while <span class="math inline">\(\beta_{1}\)</span> &lt; 0
opens the parabola downwards. The vertex of the parabola is controlled
by <span class="math inline">\(\beta_{2}\)</span> such that when <span
class="math inline">\(\beta_{1}\)</span> is negative, greater <span
class="math inline">\(\beta_{2}\)</span> values result in the vertex
falling at higher values of <span class="math inline">\(x\)</span>.</p>
<p>Quadratic polynomials applied to phenotypic performance curves
commonly possess negative <span class="math inline">\(\beta_{1}\)</span>
values with positive <span class="math inline">\(\beta_{2}\)</span>
values: a downward-opening parabola with a positive vertex. However,
quadratic curves fitted to gene expression data can take on a variety
of positive or negative forms similar to exponential curves, saturating
curves, and parabolas. For instance, the expression of a gene may peak
at an intermediate level of an environmental variable before crashing or
it may exponentially decline across that variable. Such trends may better
model changes in the transcription of a gene compared to a linear model.
To get started, we will fit non-linear second-order polynomials before
testing for whether model predictions for a given gene are significantly
improved by a non-linear model.</p>
<p><br></p>
<div id="non-linear-effects-example-in-edger" class="section level2">
<h2>Non-linear effects: example in edgeR</h2>
<p>Let’s fit a second-order polynomial for the effect of
<em>p</em>CO<span class="math inline">\(_{2}\)</span> using edgeR. Using
differential expression tests, we will then determine whether
<em>p</em>CO<span class="math inline">\(_{2}\)</span> affected a gene’s
rate of change in expression and expression maximum by applying
differential expression tests to <span
class="math inline">\(\beta_{1}\)</span> and <span
class="math inline">\(\beta_{2}\)</span> parameters. By testing for
differential expression attributed to interactions between time and
<span class="math inline">\(\beta_{1}\)</span> or <span
class="math inline">\(\beta_{2}\)</span>, we will then test for whether
these parameters were significantly different across exposure times such
that 0.5 days and 7 days of acclimation to <em>p</em>CO<span
class="math inline">\(_{2}\)</span> altered the rate of change in
expression across <em>p</em>CO<span class="math inline">\(_{2}\)</span>
(<span class="math inline">\(\beta_{1}\)</span>) or the maximum of
expression (<span class="math inline">\(\beta_{2}\)</span>).</p>
<p>Since this is the first time we’re fitting models to our data in this
walkthrough, we will also output some important plots to help us explore
and QC model predictions as we work toward testing for differential
expression attributed to non-linear effects.</p>
<p><br></p>
<pre class="r"><code># Square pCO2 variable
pCO2_2 &lt;- pCO2 ^ 2

# Estimate dispersion coefficients
# NOTE(review): no design matrix is passed here, so the dispersions are not
# estimated relative to the pCO2 x day model fitted later - confirm this is intended
y1 &lt;- estimateDisp(y, robust = TRUE) # Estimate dispersions (robust = TRUE)

# Plot tagwise dispersion overlaid with the mean dispersion and trendline
plotBCV(y1) </code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-9-1.png" width="672" /></p>
<p>The above figure is an output of the plotBCV() function in edgeR,
which visualises the biological coefficient of variation (otherwise
known as dispersion) parameter fitted to individual genes (black
circles), fitted across gene expression level (blue trendline), and
averaged across the entire transcriptome (red horizontal line). The
BCV/dispersion parameter is a necessary parameter, symbolized as <span
class="math inline">\(\theta\)</span> in model specification, that must
be estimated in negative binomial GLMs. <span
class="math inline">\(\theta\)</span> represents the ‘overdispersion’ or
the shape of a negative binomial distribution and is thus important for
defining the distribution of read count data for a given gene during
model fitting. Dispersion or <span class="math inline">\(\theta\)</span>
can be input during model fitting in edgeR using the three estimates
visualized in the above regression.</p>
<p>Now that we’ve estimated dispersion, we will use these estimates to
fit negative binomial GLMs to our read count data using the glmQLFit()
function in edgeR, one of the more robust model fitting functions
offered by edgeR.</p>
<p><br></p>
<pre class="r"><code># Build the design matrix: intercept, pCO2^2, pCO2, and the interaction of each with day.
# Column order follows the formula: 1 = intercept, 2 = pCO2_2, 3 = pCO2,
# 4 = pCO2_2:day, 5 = pCO2:day
design_nl &lt;-
    model.matrix(~ 1 + pCO2_2 + pCO2 + pCO2_2:day + pCO2:day) # Generate multivariate edgeR glm

# Fit quasi-likelihood, neg binom linear regression
nl_fit &lt;-
    glmQLFit(y1, design_nl) # Fit multivariate model to counts

## Test for effect of pCO2 and pCO2^2
nl_pCO2_2 &lt;-
    glmQLFTest(nl_fit,
    coef = 2,
    contrast = NULL,
    poisson.bound = FALSE) # Test the pCO2^2 coefficient (design column 2)
nl_pCO2 &lt;-
    glmQLFTest(nl_fit,
    coef = 3,
    contrast = NULL,
    poisson.bound = FALSE) # Test the pCO2 coefficient (design column 3)

# Classify each transcript as Down / NotSig / Up using FDR-adjusted p-values and a 0.05 cutoff
is.de_nl_pCO2 &lt;-
  decideTestsDGE(nl_pCO2, adjust.method = &quot;fdr&quot;, p.value = 0.05)
is.de_nl_pCO2_2 &lt;-
  decideTestsDGE(nl_pCO2_2, adjust.method = &quot;fdr&quot;, p.value = 0.05)

# Summarize differential expression attributed to pCO2 and pCO2^2
summary(is.de_nl_pCO2)</code></pre>
<pre><code>##         pCO2
## Down   14166
## NotSig 35037
## Up     13376</code></pre>
<pre class="r"><code>summary(is.de_nl_pCO2_2)</code></pre>
<pre><code>##        pCO2_2
## Down    11091
## NotSig  39573
## Up      11915</code></pre>
<p>We have just fit our first GLM to our read count data and have tested
for differential expression across <em>p</em>CO<span
class="math inline">\(_{2}\)</span>. At this stage, it is important to
output a few diagnostic plots. For example, edgeR has the function
‘plotMD()’ which, when input with a differential expression test object
such as a glmQLFTest result, will produce a plot of differential
expression log2 fold change values across gene expression level (log2
counts per million or logCPM). logCPM is a major component of gene
function and statistical power, and is a useful variable to plot in
order to make initial assessments of differential expression
results.</p>
<p>Visualizations of differential expression (logFC) across baseline
logCPM can be produced by the plotMD() (short for “mean-difference”)
function in edgeR. To do so, plotMD() is input with the results of
glmQLFTest, which we used above to test for differential expression
attributable to the parameters <em>p</em>CO<span
class="math inline">\(_{2}\)</span> and <em>p</em>CO<span
class="math inline">\(_{2}\)</span>^2. Let’s produce two such plots for
both parameters where “pCO2_2” represents the <em>p</em>CO<span
class="math inline">\(_{2}\)</span>^2 parameter:</p>
<p><br></p>
<pre class="r"><code># Plot differential expression due to pCO2 and pCO2^2
plotMD(nl_pCO2)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-11-1.png" width="672" /></p>
<pre class="r"><code>plotMD(nl_pCO2_2)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-11-2.png" width="672" /></p>
<p>In the two mean-difference plots above, large colored points
represent genes exhibiting significant differential expression
attributed to <em>p</em>CO<span class="math inline">\(_{2}\)</span> or
<em>p</em>CO<span class="math inline">\(_{2}\)</span>^2 below. As you
can see, there is a lot of differential expression in this experiment
resulting from both of these parameters. Part of the reason for this
large number of significant differentially expressed genes (DEGs) is
that we have not applied a cutoff value for logFC. For example, a logFC
cutoff of 2.0 would not designate any gene with an absolute logFC
value less than 2.0 as a significant DEG. However, deciding on a logFC
cutoff is very tricky! Since we used a continuous predictor for
<em>p</em>CO<span class="math inline">\(_{2}\)</span> during model
fitting, the values in the y-axis of this plot are slopes representing
the rate of change in expression per unit <em>p</em>CO<span
class="math inline">\(_{2}\)</span> (<span
class="math inline">\(\mu\)</span>atm). Determining what slope
represents a significant change in expression requires informed
reasoning and a strong body of prior data. For that reason, we are not
applying a logFC cutoff in this walkthrough.</p>
<p>Next, we will use glmQLFTest() to test for differential expression
attributed to interactions between time and <em>p</em>CO<span
class="math inline">\(_{2}\)</span> or time and <em>p</em>CO<span
class="math inline">\(_{2}\)</span>^2 before producing mean-difference
plots visualizing DEGs derived from these two interactions:</p>
<p><br></p>
<pre class="r"><code>## Test for interactions between time and pCO2 or pCO2^2
# Coefficients are selected by name so that each result object holds the term
# its name refers to (in design_nl, column 4 is pCO2_2:day and column 5 is pCO2:day).
nl_pCO2_2_int &lt;-
  glmQLFTest(nl_fit,
  coef = &quot;pCO2_2:day&quot;,
  contrast = NULL,
  poisson.bound = FALSE) # Time x pCO2^2 interaction
nl_pCO2_int &lt;-
  glmQLFTest(nl_fit,
  coef = &quot;pCO2:day&quot;,
  contrast = NULL,
  poisson.bound = FALSE) # Time x pCO2 interaction

# Classify each transcript as Down / NotSig / Up using FDR-adjusted p-values and a 0.05 cutoff
is.de_nl_pCO2_2_int &lt;-
  decideTestsDGE(nl_pCO2_2_int, adjust.method = &quot;fdr&quot;, p.value = 0.05)
is.de_nl_pCO2_int &lt;-
  decideTestsDGE(nl_pCO2_int, adjust.method = &quot;fdr&quot;, p.value = 0.05)

# Summarize differential expression attributed to the time x pCO2^2 and time x pCO2 interactions
summary(is.de_nl_pCO2_2_int)</code></pre>
<pre><code>##        pCO2_2:day
## Down            2
## NotSig      62573
## Up              4</code></pre>
<pre class="r"><code>summary(is.de_nl_pCO2_int)</code></pre>
<pre><code>##        pCO2:day
## Down        390
## NotSig    61918
## Up          271</code></pre>
<pre class="r"><code># Plot differential expression due to the time x pCO2^2 and time x pCO2 interactions
plotMD(nl_pCO2_2_int)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-12-1.png" width="672" /></p>
<pre class="r"><code>plotMD(nl_pCO2_int)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-12-2.png" width="672" /></p>
<p>In the mean-difference plots above, it is clear that there is
remarkably less differential expression associated with interactions
between time and <em>p</em>CO<span class="math inline">\(_{2}\)</span>
or <em>p</em>CO<span class="math inline">\(_{2}\)</span>^2 compared to
the direct effects of <em>p</em>CO<span
class="math inline">\(_{2}\)</span> and <em>p</em>CO<span
class="math inline">\(_{2}\)</span>^2. This is expected, as the
statistical power available for detecting such interactions is lower
and there are grounds to expect that interactions would be less frequent
during our experiment.</p>
<p>Taking a detour from tests for DEGs attributed to non-linear
interactions, we can now output the residuals of our GLMs in order to
better test for whether our data fit the assumptions of negative
binomial distribution families. If the distribution of residuals from
our GLMs is normal, this indicates that our data meet the assumption of
the negative binomial distribution. We will use the equation for
estimating Pearson residuals: <span class="math display">\[ residual =
\frac{observed - fitted} {\sqrt{fitted(1 + dispersion*fitted)}}
\]</span></p>
<p><br></p>
<pre class="r"><code># Observed counts stored on the fitted model object
y_nl &lt;- nl_fit$counts

# Fitted values from the GLM
mu_nl &lt;- nl_fit$fitted.values

# Dispersion stored on the fitted model object
phi_nl &lt;- nl_fit$dispersion

# Variance term used as the denominator: fitted * (1 + dispersion * fitted)
v_nl &lt;- mu_nl*(1+phi_nl*mu_nl)

# Pearson residual: (observed - fitted) / sqrt(variance)
resid.pearson &lt;- (y_nl-mu_nl) / sqrt(v_nl)

# Plot distribution of Pearson residuals; melt() stacks every column into one
# &quot;value&quot; column, and xlim() restricts the x-axis to -2.5 through 5.0
ggplot(data = melt(as.data.frame(resid.pearson)), aes(x = value)) +
  geom_histogram(fill = &quot;grey&quot;) +
  xlim(-2.5, 5.0) +
  theme_classic() +
  labs(title = &quot;Distribution of negative binomial GLM residuals&quot;,
       x = &quot;Pearson residuals&quot;,
       y = &quot;Density&quot;)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-13-1.png" width="672" /></p>
<p>Our residuals appear to be normally distributed, indicating that our
data fit the negative binomial distribution assumed by the GLM.</p>
<p>Now let’s get back to analyzing non-linear effects on gene
expression!</p>
<p><br></p>
</div>
<div id="plotting-non-linear-effects" class="section level2">
<h2>Plotting non-linear effects</h2>
<p>A variety of non-linear patterns may underlie the significant effects
we’ve observed above. For example, interactions between the vertex of
expression and time may be attributed to variation in expression peaks
across convex curves or the rate of decline in an exponentially
decreasing curve among other patterns. Below we will assign transcripts
exhibiting significant non-linear variation across <em>p</em>CO<span
class="math inline">\(_{2}\)</span> to different groups based on the
values of parameters such as <span
class="math inline">\(\beta_{1}\)</span> and <span
class="math inline">\(\beta_{2}\)</span>. Then we will plot the
expression of these grouped transcripts across <em>p</em>CO<span
class="math inline">\(_{2}\)</span> and time to visualize different
sources of non-linear variation.</p>
<p><br></p>
<pre class="r"><code>## Bin transcripts based on (i) whether they have a significant positive or negative vertex and then (ii) whether they showed significant interactions between beta1 (vertex value) and time.

# Export diff expression data for transcripts with significant DE associated with pCO2^2 parameter.
# n = Inf removes the row cap, so the adjusted p-value cutoff alone decides
# which transcripts are returned (no counts copied by hand from earlier output).
nl_pCO2_2_sig &lt;-
  topTags(
  nl_pCO2_2,
  n = Inf,
  adjust.method = &quot;BH&quot;,
  p.value = 0.05
  )
nl_pCO2_2_sig_geneids &lt;- row.names(nl_pCO2_2_sig) # Gene ids associated with a significant pCO2^2 effect

# Same export for the pCO2 (linear) parameter
nl_pCO2_sig &lt;-
  topTags(
  nl_pCO2,
  n = Inf,
  adjust.method = &quot;BH&quot;,
  p.value = 0.05
  )
nl_pCO2_sig_geneids &lt;- row.names(nl_pCO2_sig) # Gene ids associated with a significant pCO2 effect

# Start from the logCPM matrix as a data frame (one row per transcript, one column per sample)
logCPM_df &lt;- as.data.frame(df_log)

# Reshape to long format with &#39;melt&#39; from reshape2: one row per transcript x sample,
# with the sample name in &#39;variable&#39; and the logCPM in &#39;value&#39;
logCPM_df$geneid &lt;- row.names(logCPM_df)
tab_exp_df &lt;- melt(logCPM_df,
                   id = c(&quot;geneid&quot;))

# Add covariate information for time and pCO2, parsed from the sample name:
# characters 1-3 give the pCO2 label and character 5 is the first character of the time label
tab_exp_df$pCO2 &lt;- substr(tab_exp_df$variable, 1, 3)
tab_exp_df$time &lt;- as.numeric(substr(tab_exp_df$variable, 5, 5))

# Correct pCO2s to exact values: label 300 becomes 255, label 900 becomes 930, anything else 518
tab_exp_df$pCO2 &lt;- as.numeric(ifelse(
  tab_exp_df$pCO2 == &quot;300&quot;,
  255,
  ifelse(tab_exp_df$pCO2 == &quot;900&quot;, 930,
  518)
  ))

# Correct time to exact values: a leading 1 becomes 0.5, anything else becomes 7
tab_exp_df$time &lt;- as.numeric(ifelse(tab_exp_df$time == &quot;1&quot;, 0.5, 7))

# Create Yes/No flags in tab_exp_df marking transcripts with a significant
# pCO2^2 effect and a significant pCO2 effect
tab_exp_df$pCO2_2_sig &lt;-
  ifelse(tab_exp_df$geneid %in% nl_pCO2_2_sig_geneids, &quot;Yes&quot;, &quot;No&quot;)
tab_exp_df$pCO2_sig &lt;-
  ifelse(tab_exp_df$geneid %in% nl_pCO2_sig_geneids, &quot;Yes&quot;, &quot;No&quot;)

# Create a binary variable related to up or down-regulation:
# &quot;Up&quot; when the transcript is in the significant pCO2 table with logFC above 0,
# &quot;Down&quot; for every other transcript
# NOTE(review): &quot;Down&quot; therefore also covers transcripts absent from
# nl_pCO2_sig - confirm this is intended
up_genes &lt;- filter(nl_pCO2_sig$table, logFC &gt; 0)
tab_exp_df$logFC_dir &lt;-
  ifelse(tab_exp_df$geneid %in% row.names(up_genes), &quot;Up&quot;, &quot;Down&quot;)

# Add geneid to nl_pCO2_int$coefficients
# NOTE(review): the result is not used again in this chunk, and a $ assignment
# on the coefficients object may change its type - verify it is still needed
nl_pCO2_int$coefficients$geneid &lt;- row.names(nl_pCO2_int$coefficients)

# Estimate average logCPM per gene per pCO2 level per timepoint; the
# significance and direction flags are carried through as grouping variables
tab_exp_avg &lt;- summarySE(
  measurevar = &quot;value&quot;,
  groupvars = c(&quot;pCO2&quot;,     &quot;time&quot;,       &quot;geneid&quot;, 
                &quot;pCO2_sig&quot;, &quot;pCO2_2_sig&quot;, &quot;logFC_dir&quot;),
  data = tab_exp_df
  )

# First exploratory plot of non-linear expression: one path per transcript with a
# significant pCO2^2 effect, faceted by direction (rows) and exposure time (columns)
ggplot(data = filter(tab_exp_avg, pCO2_2_sig == &quot;Yes&quot;), 
       aes(x = pCO2, y = value)) +
  geom_path(
    alpha = 0.01,
    size = 0.25,
    stat = &quot;identity&quot;,
    aes(group = as.factor(geneid))) +
  facet_grid(logFC_dir ~ time) +
  theme_classic() +
  theme(strip.background = element_blank()) +
  labs(y = &quot;Avg logCPM&quot;, title = &quot;Non-linear changes in GE output by edgeR&quot;)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-14-1.png" width="672" /></p>
<p>Our plot of gene expression across tens of thousands of transcripts
exhibiting significant non-linear changes across <em>p</em>CO<span
class="math inline">\(_{2}\)</span> appears to include many trends that
appear… well, linear. This is a pervasive issue in modeling non-linear
regressions, and one potential pitfall of using outputs from packages
such as edgeR or DESeq2 alone in testing for non-linear effects. The
FDR-adjusted <span class="math inline">\(p\)</span>-values we have used
to determine the significance of non-linear effects essentially tell us the
probability that a parameter value equal to or greater than what we have
fitted could be generated given a random distribution of read counts.
The <span class="math inline">\(p\)</span>-value is not a representation
of the strength of a non-linear effect relative to a linear effect.
Numerous genes that nominally show significant non-linear effects of
<em>p</em>CO<span class="math inline">\(_{2}\)</span> may be only weakly
affected, and a linear effect may in fact be more probable than a
non-linear one despite what our p-values tell us. Instead of asking “for
what genes may there be significant, non-linear effects of
<em>p</em>CO<span class="math inline">\(_{2}\)</span>?”, we should ask
“for what genes <em>should</em> we test for significant, non-linear
effects?”.</p>
<p>One of the best ways to determine whether a non-linear model is
appropriate for a transcript is to determine whether it is more probable
that its expression is linear or non-linear relative to a continuous
predictor. We can calculate this relative probability using a likelihood
ratio test (LRT). In the code chunk below, we will fit linear and 2nd
order non-linear models to the expression of each gene before applying
LRTs to each transcript. We will then further filter our edgeR dataset
based on (i) significant differential expression attributed to
non-linear effects and (ii) a significant LRT ratio supporting
non-linear effects. Then, we will replot the expression levels of this
re-filtered set. The code below fits gaussian linear models to
log2-transformed CPM values, but can be adjusted to fit negative
binomial GLMs to untransformed CPM similar to edgeR and DESeq2 by
using the MASS package to set ‘family = negative.binomial(theta
= <span class="math inline">\(\theta\)</span>)’ where <span
class="math inline">\(\theta\)</span> = the dispersion estimate or
biological coefficient of variation for a given transcript.</p>
<p><br></p>
<pre class="r"><code>## Using dlply, fit linear and non-linear models to each gene
# Create pCO2^2 variable in tab_exp_df
tab_exp_df$pCO2_2 &lt;- tab_exp_df$pCO2^2

# Fit linear models, one per geneid (pCO2, time and their interaction) - should take about 4 minutes
lms &lt;- dlply(tab_exp_df, c(&quot;geneid&quot;), function(df) 
lm(value ~ pCO2 + time + pCO2:time, data = df))

# Fit non-linear models, one per geneid: the linear model plus pCO2^2 and
# its interaction with time - should take about 2 minutes
nlms &lt;- dlply(tab_exp_df, c(&quot;geneid&quot;), function(df) 
lm(value ~ pCO2 + pCO2_2 + time + pCO2:time + pCO2_2:time, data = df))

# Output nlm coefficients into dataframe (one row per geneid, one column per coefficient)
nlms_coeff &lt;- ldply(nlms, coef)
head(nlms_coeff)</code></pre>
<pre><code>##              geneid (Intercept)          pCO2        pCO2_2        time
## 1 TR107626|c1_g1_i1   1.9361224  3.995760e-02 -3.819357e-05  3.39371568
## 2 TR141909|c0_g1_i1   2.5171022 -2.413737e-03  1.468135e-06 -0.18785174
## 3 TR141946|c0_g1_i1  -1.0829071  2.491456e-03 -3.180644e-06 -0.27604730
## 4 TR141946|c0_g1_i2   0.6669886  5.415472e-03 -3.724832e-06  0.11726202
## 5 TR141951|c0_g1_i1   2.0588461 -7.240810e-04  1.457408e-06  0.01615581
## 6 TR141972|c0_g1_i1   0.3458156 -5.983124e-05 -5.072419e-06  0.16036294
##       pCO2:time   pCO2_2:time
## 1 -0.0180847725  1.627584e-05
## 2  0.0007717623 -5.931927e-07
## 3  0.0008763072 -5.730126e-07
## 4 -0.0006632481  5.727356e-07
## 5  0.0000993039 -1.247330e-07
## 6 -0.0001885367  3.957829e-07</code></pre>
<pre class="r"><code>## Apply LRTs to lm&#39;s and nlm&#39;s for each transcript - should take about 2 minutes
# lms and nlms were both built by dlply() over the same geneid grouping, so
# element i of each list is paired here as the same transcript
lrts &lt;- list() # Create list to add LRT results to

for (i in seq_along(lms)) {
  lrts[[i]] &lt;- lrtest(lms[[i]], nlms[[i]]) # Compare linear vs second-order model
}

## Filter lrt results for transcripts with significantly higher likelihoods of nl model
lrt_dfs &lt;- list()

# Turn list of LRT outputs into list of dataframes containing output info
for (i in seq_along(lrts)) {
  lrt_dfs[[i]] &lt;- data.frame(lrts[i])
}

# Create singular dataframe with model outputs for chi-squared and LRT p-value;
# .id records each result&#39;s position in the list as &quot;column_label&quot;
lrt_coeff_df  &lt;- na.omit(bind_rows(lrt_dfs, .id = &quot;column_label&quot;)) # na.omit removes first row of each df, which lacks these data

# Add geneid based on element number from original list of LRT outputs
lrt_coeff_df &lt;- merge(lrt_coeff_df,
                      data.frame(geneid = names(nlms),
                                 column_label = as.character(seq(length(nlms)))),
                      by = &quot;column_label&quot;)

# Apply FDR adjustment to LRT p-values before filtering for sig non-linear effects
lrt_coeff_df$FDR &lt;- p.adjust(lrt_coeff_df$Pr..Chisq., method = &quot;fdr&quot;)

# Filter LRT results for sig FDR coeff... produces 162 genes
lrt_filt &lt;- filter(lrt_coeff_df, FDR &lt; 0.05)

## Plot sig nl genes according to LRT, grouped by timepoint and sign of the pCO2^2 coefficient
# Collect the genes whose fitted pCO2 and pCO2^2 coefficients are positive
pCO2_pos &lt;- filter(nlms_coeff, pCO2 &gt; 0)
pCO2_2_pos &lt;- filter(nlms_coeff, pCO2_2 &gt; 0)

# Bin genes based on positive or negative pCO2 and pCO2^2 betas
# NOTE(review): a positive pCO2^2 coefficient is labelled &quot;Concave&quot; here -
# confirm this matches the intended convention for parabola direction
tab_exp_avg$pCO2_binom &lt;- ifelse(tab_exp_avg$geneid %in% pCO2_pos$geneid, &quot;Positive&quot;, &quot;Negative&quot;)
tab_exp_avg$pCO2_2_binom &lt;- ifelse(tab_exp_avg$geneid %in% pCO2_2_pos$geneid, &quot;Concave&quot;, &quot;Convex&quot;)

# Keep only the gene ids with a significant likelihood of a nl effect in the LRT
LRT_filt_df &lt;- filter(tab_exp_avg, geneid %in% lrt_filt$geneid)

# Plot one path per gene, faceted by curve direction (rows) and exposure time
# (columns), with a loess smoother drawn over each panel
ggplot(data = LRT_filt_df,
       aes(x = pCO2, y = value)) +
  geom_path(
    alpha = .25,
    size = 0.25,
    stat = &quot;identity&quot;,
    aes(group = as.factor(geneid))
    ) +
  facet_grid(pCO2_2_binom ~ time) +
  geom_smooth(method = &quot;loess&quot;, se = TRUE, span = 1) +
  theme_classic() +
  theme(strip.background = element_blank()) +
  labs(y = &quot;Avg logCPM&quot;, title = &quot;Non-linear changes in GE output by LRTs&quot;)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-15-1.png" width="672" /></p>
<pre class="r"><code># Number of distinct gene id&#39;s with significant likelihood of nl effect in LRT... 162 genes
length(unique(LRT_filt_df$geneid))</code></pre>
<pre><code>## [1] 162</code></pre>
<p>The above plot, which visualizes the expression of 162 genes
identified in LRTs to exhibit significant and high likelihoods of
non-linear variation in logCPM across <em>p</em>CO<span
class="math inline">\(_{2}\)</span>, appears distinct from our first
plot of non-linear expression deemed significant by edgeR. The overall
trends appear to be more convex or concave, consistent with
second-order polynomial regressions. This speaks to the strength of LRTs
in identifying probable, non-linear effects or other multifactorial
effects.</p>
<p>Let’s see how many non-linear transcripts identified in LRTs
exhibited significant non-linear effects within our edgeR models. To do
this, we’ll filter down the significant DEGs output by edgeR to include
gene id’s within our significant LRT output. Then, we’ll plot the
expression of this filtered set across <em>p</em>CO<span
class="math inline">\(_{2}\)</span>.</p>
<p><br></p>
<pre class="r"><code># Keep gene id&#39;s with a significant LRT that also show a significant pCO2 or pCO2^2 effect in edgeR
edgeR_LRT_df &lt;- filter(tab_exp_avg,
                       geneid %in% lrt_filt$geneid &amp;
                         (pCO2_sig == &quot;Yes&quot; | pCO2_2_sig == &quot;Yes&quot;))

# Plot one faint path per gene plus a loess trend, faceted by curve bin and time
ggplot(data = edgeR_LRT_df,
       aes(x = pCO2, y = value)) +
  geom_path(
    aes(group = as.factor(geneid)),
    stat = &quot;identity&quot;,
    alpha = .25,
    size = 0.25
    ) +
  facet_grid(pCO2_2_binom ~ time) +
  geom_smooth(method = &quot;loess&quot;, se = TRUE, span = 1) +
  theme_classic() +
  theme(strip.background = element_blank()) +
  labs(y = &quot;Avg logCPM&quot;, title = &quot;Non-linear changes in GE output by edgeR &amp; LRTs&quot;)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-16-1.png" width="672" /></p>
<pre class="r"><code># Number of distinct gene id&#39;s with a significant edgeR effect and a significant LRT... 89 genes
length(unique(edgeR_LRT_df$geneid))</code></pre>
<pre><code>## [1] 89</code></pre>
<p>89 genes exhibited a significant effect of <em>p</em>CO<span
class="math inline">\(_{2}\)</span> or <em>p</em>CO<span
class="math inline">\(_{2}\)</span>^2 and a high likelihood of a
non-linear effect as identified by an LRT. This number is substantially lower
than the tens of thousands of genes identified by edgeR alone and the
162 identified via LRT. Additionally, there only appears to be partial
overlap between edgeR and LRTs, speaking to the difference in the
information you can extract from a significant effect identified in a
generalized linear model (e.g., edgeR) vs. likelihood-based approaches
such as an LRT. Because these two tests provide distinct and valuable
information, the most robust estimate we have laid out for non-linear
effects of <em>p</em>CO<span class="math inline">\(_{2}\)</span> on gene
expression comes from filtering for significant DEGs in edgeR
<em>and</em> significant LRTs.</p>
<p>Which combination of approaches is most informative ultimately
depends on your experimental design and the question you are addressing.
For example, we can explore an additional question that our previous
graphs did not touch on: how does gene expression vary between exposure
times for genes exhibiting significant interactions between
<em>p</em>CO<span class="math inline">\(_{2}\)</span>^2 and time? Let’s
produce an exploratory plot of non-linear expression across 0.5 and 7
days of exposure for such genes identified using edgeR alone.</p>
<p><br></p>
<pre class="r"><code># Export diff expression data for transcripts with significant DE associated with interaction between pCO2^2 and time
nl_pCO2_2_int_sig &lt;- topTags(nl_pCO2_2_int,
                             n = (390 + 271),
                             adjust.method = &quot;BH&quot;,
                             p.value = 0.05)
nl_pCO2_2_int_sig_geneids &lt;- row.names(nl_pCO2_2_int_sig) # geneids associated with sig pCO2^2 x time interaction

# Keep expression rows for gene id&#39;s with a significant pCO2^2 x time interaction in edgeR
edgeR_interaction_df &lt;- filter(tab_exp_avg, geneid %in% nl_pCO2_2_int_sig_geneids)

# Tag each row with a combined geneid_time key
edgeR_interaction_df$gene_id_time &lt;- paste(edgeR_interaction_df$geneid,
                                           edgeR_interaction_df$time,
                                           sep = &quot;_&quot;)

# Summarise logCPM within each time x pCO2 x pCO2^2-bin group
edgeR_interaction_avg &lt;- summarySE(data = edgeR_interaction_df,
                                   measurevar = &quot;value&quot;,
                                   groupvars = c(&quot;time&quot;, &quot;pCO2&quot;, &quot;pCO2_2_binom&quot;))

# Plot group means with +/- se error bars, one line per exposure time, one panel per curve bin
ggplot(data = edgeR_interaction_avg,
       aes(
         x = pCO2,
         y = value,
         group = as.factor(time),
         color = as.factor(time)
         )) +
  geom_path(stat = &quot;identity&quot;) +
  geom_errorbar(aes(ymin = value - se, ymax = value + se), width = 0) +
  geom_point() +
  facet_wrap(~pCO2_2_binom) +
  theme_classic() +
  theme(strip.background = element_blank()) +
  labs(y = &quot;logCPM&quot;, color = &quot;Time (days)&quot;, title = &quot;Interactions between pCO2^2 and exposure time&quot;)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-17-1.png" width="672" /></p>
<p>After visualizing variation in expression across <em>p</em>CO<span
class="math inline">\(_{2}\)</span> for genes identified in edgeR to be
affected by interactions between <em>p</em>CO<span
class="math inline">\(_{2}\)</span>^2 and exposure time, we can see
interesting changes in gene expression curves across time. For example,
genes that exhibited a convex expression curve at 0.5 days of exposure
generally modified their curve such that their expression became
positively exponential relative to <em>p</em>CO<span
class="math inline">\(_{2}\)</span>. One potential explanation for this
change is that the upregulation of some genes under intermediate
<em>p</em>CO<span class="math inline">\(_{2}\)</span> was reduced
following acclimation, while less acclimation may have occurred at the
higher <em>p</em>CO<span class="math inline">\(_{2}\)</span> level after
7 days. Genes with concave expression curves across <em>p</em>CO<span
class="math inline">\(_{2}\)</span> at 0.5 days of exposure exhibited a
marked decrease in expression as the experiment progressed, regardless
of <em>p</em>CO<span class="math inline">\(_{2}\)</span> level. Thus,
the interaction between <em>p</em>CO<span
class="math inline">\(_{2}\)</span>^2 and time may affect ‘convex’ and
‘concave’ genes in entirely different ways, underscoring the importance
of partitioning or binning your data by parameters such as
<em>p</em>CO<span class="math inline">\(_{2}\)</span>^2. Exploring your
data in this way is important for understanding interactive effects in
RNA-seq analyses, regardless of whether or not you are fitting
non-linear or linear continuous effects. Below we discuss further
considerations for modeling interactions between linear continuous
predictors.</p>
<p><br></p>
</div>
</div>
<div id="interactive-effects" class="section level1">
<h1>Interactive effects</h1>
<p>Interactive effects shaping gene expression are common in nature and
are becoming increasingly prevalent in models of gene expression derived
from experimental studies. Below, we go a little bit deeper on
interactive effects by outlining methods for fitting them using
categorical and continuous variables in models of expression. We provide
examples in edgeR, DESeq2, and Voom and point out how specification for
interactive effects differs between these packages. Lastly, we compare
correlations between these programs’ fold change (logFC) predictions and
test statistics. This final comparison of model predictions for
interaction parameters is as much a tutorial on interactive effects as
it is a tutorial on how to visualize differences in DE packages’
results. Please use this section of the code for any comparisons between DE
packages you may want to make for other effect types, not just
interactive effects!</p>
<p><br></p>
<div id="interactive-effects-edger" class="section level2">
<h2>Interactive effects: edgeR</h2>
<p>edgeR and Voom both take the same syntax for interactive effects,
which we define below using the model.matrix() function as ‘design_multi
&lt;- model.matrix( ~1 + pCO2 + pCO2:day )’. Then, we will use this
multifactorial model design in both edgeR and Voom as specified
below:</p>
<p><br></p>
<pre class="r"><code># Build multifactorial design matrix: intercept, pCO2, and a pCO2 x day interaction term
design_multi &lt;- model.matrix(~ 1 + pCO2 + pCO2:day)

# Fit quasi-likelihood, neg binom linear regression to the counts
multi_fit &lt;- glmQLFit(y1, design_multi)

# Test for effect of pCO2 (second column of the design matrix)
tr_pCO2 &lt;- glmQLFTest(multi_fit,
                      coef = 2,
                      contrast = NULL,
                      poisson.bound = FALSE)

# Classify each gene as down, not significant, or up at FDR &lt; 0.05
is.de_tr_pCO2 &lt;- decideTestsDGE(tr_pCO2,
                                adjust.method = &quot;fdr&quot;,
                                p.value = 0.05)

summary(is.de_tr_pCO2)</code></pre>
<pre><code>##         pCO2
## Down       0
## NotSig 62579
## Up         0</code></pre>
<pre class="r"><code># Test for interaction between pCO2 and time (third column of the design matrix)
tr_int &lt;- glmQLFTest(multi_fit,
                     coef = 3,
                     poisson.bound = FALSE)

# Classify each gene as down, not significant, or up at FDR &lt; 0.05
is.de_int &lt;- decideTestsDGE(tr_int,
                            adjust.method = &quot;fdr&quot;,
                            p.value = 0.05)

summary(is.de_int)</code></pre>
<pre><code>##        pCO2:day
## Down       1021
## NotSig    60821
## Up          737</code></pre>
</div>
<div id="interactive-effects-limma-voom" class="section level2">
<h2>Interactive effects: limma-Voom</h2>
<p>Below we will fit the same ‘design_multi’ model (y ~ 1 +
<em>p</em>CO<span class="math inline">\(_{2}\)</span> +
<em>p</em>CO<span class="math inline">\(_{2}\)</span>:days) to our read
counts using Voom rather than edgeR. As stated earlier, Voom fits
interactions between fixed effects using the same syntax as edgeR, so
the ‘design_multi’ object can be used by both programs.</p>
<p>What <em>is</em> different between edgeR and Voom is the manner by
which Voom accounts for variation between replicates. Rather than using
gene-wise or averaged estimations of biological coefficients of
variation and inputting these values as <span
class="math inline">\(\theta\)</span> in a negative binomial GLM, Voom
models what is referred to as the mean-variance relationship and
incorporates this relationship within a linear model rather than a GLM
by “weighting” the accuracy of a given observation based on its level of
expression and modelled variance. This mean-variance relationship can be
plotted by Voom after model fitting as we show below:</p>
<p><br></p>
<pre class="r"><code># Perform Voom transformation and draw the mean-variance trend
Voom &lt;- voom(y, design = design_multi, plot = TRUE)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-19-1.png" width="672" /></p>
<p>When deciding between packages for testing DGE, it can be helpful to
compare model assumptions regarding variance by plotting graphs like
Voom’s mean-variance relationship above and the output of edgeR’s
‘plotBCV()’ function, which we provided an example of earlier and will
replot below for the sake of making this visual comparison. Take note
that the y-axes of these variance plots in Voom (square root of standard
deviation) and edgeR (biological coefficient of variation) are
different:</p>
<p><br></p>
<p>The mean-variance relationships modelled in edgeR and Voom are
generally similar, but it appears that Voom assumes even lesser variance
in expression among genes at the highest expression level. One result of
this may be that greater statistical power is assumed in tests of
differential expression by Voom for genes with high expression relative
to low expression genes. Keep this in mind as we continue to move
forward with fitting parameters for interactions between
<em>p</em>CO<span class="math inline">\(_{2}\)</span> and time in
Voom.</p>
<p><br></p>
<pre class="r"><code># Fit using Voom
lm_Voom_fit &lt;- lmFit(Voom, design_multi)

# Extract the continuous pCO2 coefficient, then apply empirical Bayes smoothing of standard errors
cont_pCO2 &lt;- eBayes(contrasts.fit(lm_Voom_fit, coefficients = &quot;pCO2&quot;))

# Same for the interaction between continuous pCO2 and time variables
cont_pCO2_day &lt;- eBayes(contrasts.fit(lm_Voom_fit, coefficients = &quot;pCO2:day&quot;))

# Output test statistics for every gene
pCO2_results &lt;- topTable(cont_pCO2,
                         coef = &quot;pCO2&quot;,
                         number = Inf,
                         adjust.method = &quot;fdr&quot;)
pCO2_day_results &lt;- topTable(cont_pCO2_day,
                             coef = &quot;pCO2:day&quot;,
                             number = Inf,
                             adjust.method = &quot;fdr&quot;)

# How many DEGs are associated with pCO2
sum(pCO2_results$adj.P.Val &lt; 0.05, na.rm = TRUE)</code></pre>
<pre><code>## [1] 0</code></pre>
<pre class="r"><code># How many DEGs are associated with pCO2:day?
sum(pCO2_day_results$adj.P.Val &lt; 0.05, na.rm = TRUE)</code></pre>
<pre><code>## [1] 1628</code></pre>
<p>Voom has identified 1628 genes whose variation is affected by an
interaction between <em>p</em>CO<span
class="math inline">\(_{2}\)</span> and time, compared to 1758
identified by edgeR.</p>
</div>
<div id="interactive-effects-deseq2" class="section level2">
<h2>Interactive effects: DESeq2</h2>
<p>DESeq2 requires a slightly different syntax for specifying
interactive effects compared to edgeR and Voom.</p>
<p><br></p>
<pre class="r"><code>gcounts &lt;- as.data.frame(data_input)

totalCounts &lt;- colSums(gcounts)

### FLAG GENES WITH LOW COUNTS ###

# Make a DGEList object for edgeR, dropping genes with zero counts
y &lt;- DGEList(counts = data_input, remove.zeros = TRUE)

# Flag genes with more than 0.5 cpm (this is ~10 counts in the count file) in at least 9 samples
keep_g &lt;- rowSums(cpm(gcounts) &gt; 0.5) &gt;= 9

table(keep_g)</code></pre>
<pre><code>## keep_g
## FALSE  TRUE 
## 20632 62579</code></pre>
<pre class="r"><code># The row filter below is commented out, so gcounts is NOT subset by keep_g in this chunk
# (intended option: set keep.lib.sizes = F and recalculate library sizes after filtering)
#gcounts &lt;- gcounts[ keep_g, keep.lib.sizes = FALSE ]

### BUILD A DATAFRAME ASSOCIATING SAMPLE NAMES WITH TREATMENT CONDITIONS ###
# Print the sample metadata table
targets</code></pre>
<pre><code>##    pCO2 day treatment grouping
## 1   255 7.0         B    255.7
## 2   255 7.0         B    255.7
## 3   255 7.0         B    255.7
## 4   255 0.5         B  255.0.5
## 5   255 0.5         B  255.0.5
## 6   255 0.5         B  255.0.5
## 7   530 7.0         R    530.7
## 8   530 7.0         R    530.7
## 9   530 7.0         R    530.7
## 10  530 0.5         R  530.0.5
## 11  530 0.5         R  530.0.5
## 12  530 0.5         R  530.0.5
## 13  918 7.0         Y    918.7
## 14  918 7.0         Y    918.7
## 15  918 7.0         Y    918.7
## 16  918 0.5         Y  918.0.5
## 17  918 0.5         Y  918.0.5
## 18  918 0.5         Y  918.0.5</code></pre>
<pre class="r"><code>### WALD TEST - FULL MODEL ###

# Build the DESeq2 dataset: intercept + pCO2 + day x pCO2 interaction
dds &lt;- DESeqDataSetFromMatrix(countData = gcounts,
                              colData = targets,
                              design = ~ 1 + pCO2 + day:pCO2)

# Regularized-log transform and extract the transformed matrix
rld &lt;- rlog(dds)
rld.df &lt;- assay(rld)

# Wald test for pCO2:day
dds_int &lt;- DESeq(dds, minReplicatesForReplace = Inf)

design &lt;- design(dds_int)

DESeq2_int_result_names &lt;- resultsNames(dds_int)

# Count DEGs due to interaction
DESeq2_int_results &lt;- results(dds_int,
                              name = &quot;pCO2.day&quot;,
                              lfcThreshold = 0,
                              alpha = 0.05)

summary(DESeq2_int_results)</code></pre>
<pre><code>## 
## out of 81450 with nonzero total read count
## adjusted p-value &lt; 0.05
## LFC &gt; 0 (up)       : 2491, 3.1%
## LFC &lt; 0 (down)     : 2494, 3.1%
## outliers [1]       : 1373, 1.7%
## low counts [2]     : 2724, 3.3%
## (mean count &lt; 5)
## [1] see &#39;cooksCutoff&#39; argument of ?results
## [2] see &#39;independentFiltering&#39; argument of ?results</code></pre>
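<p>DESeq2 has identified 4985 genes whose variation is affected by an
interaction between <em>p</em>CO<span
class="math inline">\(_{2}\)</span> and time, compared to 1758
identified by edgeR and 1628 by Voom. This is a ~3-fold greater number
of DEGs predicted by DESeq2 compared to edgeR and Voom. Note, however,
that the low-count filter was not applied to the DESeq2 input above, so
DESeq2 tested 81450 genes while edgeR tested 62579; part of this
difference may reflect the larger set of genes tested.</p>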
<p><br></p>
</div>
</div>
<div id="comparing-test-statistics-interactive-effects"
class="section level1">
<h1>Comparing test statistics: interactive effects</h1>
<p>Differential expression packages can make dramatically different
predictions for both the fold-change and probability of differential
expression. If you find yourself deciding between different packages or
wanting to compare how conservative different approaches are, it helps
to run regressions of model predictions by different packages. Below we
apply the ggpairs() function from GGally <span
class="citation">(Schloerke et al. 2018)</span> to plot a correlation
matrix of logFC values and negative log-transformed p-values attributed
to interactive effects between time and <em>p</em>CO<span
class="math inline">\(_{2}\)</span> predicted by edgeR, DESeq2, and
Voom:</p>
<p><br></p>
<pre class="r"><code># Merge logFC and pval data from each program on geneid (Voom, then edgeR, then DESeq2)
Voom_edgeR_deseq_int_comp &lt;- Reduce(
  function(left, right) merge(left, right, by = &quot;geneid&quot;),
  list(
    data.frame(geneid = row.names(pCO2_day_results),
               Voom_logFC = pCO2_day_results$logFC,
               Voom_pval = pCO2_day_results$P.Value),
    data.frame(geneid = row.names(tr_int$table),
               edgeR_logFC = tr_int$table$logFC,
               edgeR_pval = tr_int$table$PValue),
    data.frame(geneid = row.names(DESeq2_int_results),
               DESeq2_logFC = DESeq2_int_results$log2FoldChange,
               DESeq2_pval = DESeq2_int_results$pvalue)
  )
)

# Append negative natural-log p-values, one column per program
Voom_edgeR_deseq_int_comp[[&quot;Voom_neglogp&quot;]] &lt;- -log(Voom_edgeR_deseq_int_comp[[&quot;Voom_pval&quot;]])
Voom_edgeR_deseq_int_comp[[&quot;edgeR_neglogp&quot;]] &lt;- -log(Voom_edgeR_deseq_int_comp[[&quot;edgeR_pval&quot;]])
Voom_edgeR_deseq_int_comp[[&quot;DESeq2_neglogp&quot;]] &lt;- -log(Voom_edgeR_deseq_int_comp[[&quot;DESeq2_pval&quot;]])

# Panel function for ggpairs: hex-binned counts (log-scaled fill) with a dashed red 1:1 reference line.
# `data` and `mapping` are forwarded to ggplot(); any further arguments are accepted and ignored.
my_fn &lt;- function(data, mapping, ...){
  hex_layer &lt;- geom_hex(aes(fill = stat(log(count))), bins = 100, alpha = 1)
  identity_line &lt;- geom_abline(
    intercept = 0,
    slope = 1,
    color = &quot;red&quot;,
    linetype = 2,
    size = 1,
    alpha = 0.5
  )

  ggplot(data = data, mapping = mapping) +
    hex_layer +
    scale_fill_viridis_c() +
    identity_line
}

# Correlation matrix of negative log p-values (columns 8-10 of the merged table)
pval_pairs &lt;- ggpairs(data = Voom_edgeR_deseq_int_comp,
                      columns = c(&quot;Voom_neglogp&quot;, &quot;edgeR_neglogp&quot;, &quot;DESeq2_neglogp&quot;),
                      mapping = aes(alpha = 0.001),
                      lower = list(continuous = my_fn)) +
  labs(title = &quot;Correlation matrix: interaction p-values&quot;)

# Print with the classic theme and no panel borders
pval_pairs + theme_classic(base_rect_size = 0)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-23-1.png" width="672" /></p>
<p>The above plot is the output of ggpairs() showing correlations
between negative log-transformed p-values attributed to interactive
effects of time x <em>p</em>CO<span class="math inline">\(_{2}\)</span>
estimated between each of the three packages, as well as the
distributions of this parameter for each package, and correlation
statistics. Red dashed lines represent a slope equal to 1. Yellow colors
depict regions of each linear regression with more observations.</p>
<p>Voom and edgeR share the largest correlation between p-values
estimated for an interactive effect of <em>p</em>CO<span
class="math inline">\(_{2}\)</span> : days on differential expression
(R<span class="math inline">\(^2\)</span> = 0.965). The bottom row of
this ggpairs plot shows transformed p-values from DESeq2 on the y-axis,
which appear to be skewed toward lower p-values/higher -log p-values.
This is consistent with the fact that we observed ~3-fold more DEGs
attributed to the <em>p</em>CO<span class="math inline">\(_{2}\)</span>
: days interaction when running DESeq2 compared to edgeR and Voom.</p>
<p>Next, let’s create a ggpairs() plot contrasting logFC estimates of DE
attributed to <em>p</em>CO<span class="math inline">\(_{2}\)</span> :
days in edgeR, Voom, and DESeq2. One might think that logFCs should
demonstrate stronger correlations between programs compared to p-values,
but due to the manners by which read counts are normalized and
transformed in different packages this is not always the case!</p>
<p><br></p>
<pre class="r"><code># Correlation matrix of logFC&#39;s (columns 2, 4 and 6 of the merged table)
logFC_pairs &lt;- ggpairs(data = Voom_edgeR_deseq_int_comp,
                       columns = c(&quot;Voom_logFC&quot;, &quot;edgeR_logFC&quot;, &quot;DESeq2_logFC&quot;),
                       mapping = aes(alpha = 0.001),
                       lower = list(continuous = my_fn)) +
  labs(title = &quot;Correlation matrix: interaction logFC&#39;s&quot;)

# Print with the classic theme and no panel borders
logFC_pairs + theme_classic(base_rect_size = 0)</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-24-1.png" width="672" /></p>
<p>In this pairs plot of logFC estimates across packages, you can see
that R<span class="math inline">\(^2\)</span> values are, on average,
lower than those of p-value correlations between packages. Once again,
the strongest correlation exists between edgeR and Voom. However, the
slope of this correlation is less than 1, indicating that absolute edgeR
logFC’s are lower on average than absolute Voom logFCs. The correlation
pair that shows a slope closest to 1.0 is edgeR vs. DESeq2. edgeR and
DESeq2 normalize read counts in a similar manner, and thus a value
derived from differences in read counts across continuous predictors
such as logFC is likely to be more similar between these packages.</p>
</div>
<div id="random-intercepts-example-in-voom" class="section level1">
<h1>Random Intercepts: example in Voom</h1>
<p>We will skip edgeR and DESeq2 since they cannot fit random
effects.</p>
<p>Random intercepts (often referred to as random effects) can be an
essential component of estimating differential expression in experiments
that incorporate non-independent observations (e.g., repeated measures
and nested designs: individuals within populations or families) or
randomly sampled groups. Most differential expression packages do not
possess functionality for fitting random intercepts, extending GLMs to
generalized linear mixed models or GLMMs, but Voom does! Additional
packages that we don’t cover here also possess random intercept
functionality, including dream <span class="citation">(Hoffman and
Roussos 2021)</span> and <a
href="https://cran.r-project.org/web/packages/glmmSeq/index.html">glmmSeq</a>.</p>
<p>Below is an example of how to fit a random intercept associated with
the time variable of our dataset using Voom:</p>
<p><br></p>
<pre class="r"><code># Fit multifactorial design matrix
# NOTE(review): the output rendered right after this chunk reads
# &quot;Coefficients not estimable: 1 | dayTRUE&quot;, which suggests model.matrix() evaluates
# (1 | day) as a logical expression rather than as a random-intercept term - confirm.
# limma appears to document duplicateCorrelation() with a block argument for
# random effects - TODO verify before relying on this chunk as a mixed model.
design_rand &lt;- model.matrix(~ 1 + pCO2 + (1 | day)) # Design matrix passed to voom() below

# Perform Voom transformation
Voom_rand &lt;- voom(y, design_rand, plot = T)</code></pre>
<pre><code>## Coefficients not estimable: 1 | dayTRUE</code></pre>
<p><img src="DGE_comparison_v2_files/figure-html/unnamed-chunk-25-1.png" width="672" /></p>
<p>If you compare the mean-variance plot from our mixed model above,
with our first mean-variance plot from the Voom model that did not
include a random effect you will notice that the modelled relationship
between variance and mean expression is considerably different between
the two. While the linear model mean-variance plot took on a negative
exponential shape by which earlier increases in expression level
associate with a precipitous decline in variance, our mixed model’s fit
takes on a more logarithmic shape and the raw points appear parabolic.
Here, variance is modelled as remaining somewhat constant during early
increases in expression level before declining more rapidly across
intermediate expression levels. Additionally, Voom predicted a maximum
variance of 2.0 in our linear model while the mixed model above predicts
a maximum of 1.5. This reduction in overall variance may partially be
explained by incorporating a random intercept for time rather than
modelling it as a continuous fixed effect. As we progress through the DE
tests derived from this mixed model, we will anticipate that a reduction in
variance will influence our results, likely by increasing the
statistical power of genewise DE tests, and that modelling time as a
random effect will also modify what and how many genes are identified as
differentially expressed.</p>
<p><br></p>
<pre class="r"><code># Fit linear model to the Voom-transformed data using the design_rand matrix
lm_Voom_fit_rand &lt;- lmFit(object = Voom_rand, design = design_rand)</code></pre>
<pre><code>## Coefficients not estimable: 1 | dayTRUE</code></pre>
<pre class="r"><code># Extract the continuous pCO2 coefficient, then apply empirical Bayes smoothing of standard errors
cont_rand_day &lt;- eBayes(contrasts.fit(lm_Voom_fit_rand, coefficients = &quot;pCO2&quot;))

# Output test statistics for every gene
rand_results &lt;- topTable(cont_rand_day,
                         coef = &quot;pCO2&quot;,
                         number = Inf,
                         adjust.method = &quot;fdr&quot;)

# How many DEG are associated with pCO2 when fitted with the design_rand matrix?
sum(rand_results$adj.P.Val &lt; 0.05, na.rm = TRUE) # number of DE genes</code></pre>
<pre><code>## [1] 3</code></pre>
<p>While the linear Voom model’s DE test output 0 predicted DEG’s
associated with <em>p</em>CO<span class="math inline">\({_2}\)</span>,
the mixed Voom model predicted 3 genes. As we saw earlier when fitting
linear models, the majority of DEG’s in this study were derived from an
interaction between time and <em>p</em>CO<span
class="math inline">\({_2}\)</span>. We modelled this as an interaction
between two continuous fixed effects, but random effects and continuous
effects can also jointly influence outcome variables. Such effects can be
fit in mixed models using ‘random slopes’, which we describe in the next
section.</p>
<p><br></p>
</div>
<div id="random-slopes" class="section level1">
<h1>Random slopes</h1>
<p>Distinct from the random intercept, testing for variation in gene
expression attributed to a random slope effect asks, “Within a set of
groups, does the reaction norm of gene expression across a continuous
predictor vary randomly by group?”. This question is relevant to experimental
designs testing for differences in the plasticity of gene expression
across randomly-selected demographics such as genotypes or families. For
these kinds of groups, modelling an interaction between the continuous
predictor and a categorical effect representing different groups would
be inappropriate, as the use of a categorical effect should be reserved
for groups-of-interest specifically selected in an experiment (e.g.,
different sexes, populations inhabiting distinct environments, etc.). To
our knowledge however, there is no well-documented differential
expression package that enables the fitting of random slopes.</p>
<p>Below we detail custom scripts for fitting gaussian linear mixed
models using lme4 and negative binomial GLMs to read count data. For
practical purposes, we will treat exposure time as a random variable
even though timepoints were not randomly selected in the experiment that
produced our data. We will also apply linear mixed models
using lmer, which is computationally intensive. The models fitted below
are input with a highly filtered and reduced set of read count data.
After fitting a LMM to each gene with a random slope parameter by which
the effect of <em>p</em>CO<span class="math inline">\({_2}\)</span>
randomly varies across time, we will compare the likelihoods of these
models to those of a ‘null’ LMM that only possesses a random intercept
associated with time. We will then count and report the number of genes
that show significantly high likelihoods for random slopes.</p>
<p><br></p>
<pre class="r"><code>## For practical purposes, reduce size of input data in order to run lmer in for loop without using up memory
# Re-filter data: flag genes whose CPM lies strictly between 3 and 10 in at least
# 12 columns of cpm(y) (presumably one column per sample; verify against y)
keep_red &lt;- rowSums(cpm(y) &gt; 3 &amp; cpm(y) &lt; 10) &gt;= 12

# Apply read filter to tab_exp_df: keep only rows whose geneid is among the
# row names flagged TRUE in keep_red
tab_exp_df_filt &lt;- filter(tab_exp_df, geneid %in% 
                            row.names(filter(as.data.frame(keep_red), keep_red == TRUE)))

# Using dlply, fit linear mixed model to tabularized df of log2-transformed CPM values for each transcript
# time is converted to a factor so that it can serve as the grouping variable of the random effects
# The result is a list of fitted models with one element per geneid
# NOTE(review): (pCO2 | time) already includes a random intercept for time, so also listing
# (1 | time) appears to specify that intercept twice; confirm whether (pCO2 | time) alone was intended
tab_exp_df_filt$time &lt;- as.factor(tab_exp_df_filt$time)
rs_lmes &lt;- dlply(tab_exp_df_filt, c(&quot;geneid&quot;), function(df) 
lmer(value ~ pCO2 + (1 | time) + (pCO2 | time), data = df))

# Fit null model without random slope (random intercept for time only), split by geneid in the same way
null_lmes &lt;- dlply(tab_exp_df_filt, c(&quot;geneid&quot;), function(df) 
lmer(value ~ pCO2 + (1 | time), data = df))

# Apply LRT to each gene to test for effect of random slope
# Models are paired by list position, so rs_lmes and null_lmes must hold genes in the same order
# (both come from the same dlply split of tab_exp_df_filt)
# lrtest is presumably lmtest::lrtest; verify which package supplies it
lmer_lrts &lt;- list() # Create list to add LRT results to

for (i in 1:length(rs_lmes)) {
 lmer_lrts[[i]] &lt;- lrtest(rs_lmes[[i]], null_lmes[[i]]) # Apply LRTs with for loop
}

## Filter lrt results for transcripts with significantly higher likelihoods than null model
lmer_lrt_dfs &lt;- list()

# Turn list of LRT outputs into list of dataframes containing output info
# Single-bracket indexing passes a one-element list to data.frame()
for (i in 1:length(lmer_lrts)) {
 lmer_lrt_dfs[[i]] &lt;- data.frame(lmer_lrts[i])
}

# Create singular dataframe with geneids and model outputs for chi-squared and LRT p-value
# bind_rows(.id = ...) stores each source dataframe's list position in column_label
lmer_lrt_coeff_df  &lt;- na.omit(bind_rows(lmer_lrt_dfs, .id = &quot;column_label&quot;)) # na.omit removes first row of each df, which lacks these data

# Add geneid based on element number from original list of LRT outputs
# The lookup table maps each list position (as a character) to the gene name carried by rs_lmes
lmer_lrt_coeff_df &lt;- merge(lmer_lrt_coeff_df,
                      data.frame(geneid = names(rs_lmes),
                                 column_label = as.character(seq(length(rs_lmes)))),
                      by = &quot;column_label&quot;)

# Apply FDR adjustment to LRT p-values before filtering for sig random slope effects
lmer_lrt_coeff_df$FDR &lt;- p.adjust(lmer_lrt_coeff_df$Pr..Chisq., method = &quot;fdr&quot;)

# Filter LRT results for sig FDR coeff...
lmer_lrt_filt &lt;- filter(lmer_lrt_coeff_df, FDR &lt; 0.05)

# How many genes showed significant LRT?... none!
count(lmer_lrt_filt)</code></pre>
<pre><code>## [1] column_label X.Df         LogLik       Df           Chisq       
## [6] Pr..Chisq.   geneid       FDR          freq        
## &lt;0 rows&gt; (or 0-length row.names)</code></pre>
</div>
<div id="conclusion-moving-forward-with-multifactorial-rna-seq"
class="section level1">
<h1>Conclusion: Moving forward with multifactorial RNA-seq</h1>
<p>If you’ve made it this far, you probably found it to be a common
theme that popular DE packages do not provide all of the flexibility
necessary for fitting models to data coming from multifactorial
experimental designs, even some of the more standard designs. It should
be restated that our walkthrough only scratches the surface of (i)
available packages that can facilitate multifactorial DE tests and (ii)
opportunities that custom scripts for model fitting provide. For
example, our example code for fitting linear and linear mixed models
before testing for effects using likelihood ratio tests can easily be
edited to fit negative binomial GLMs such as those used by edgeR and
DESeq2 input with <span class="math inline">\(\theta\)</span> estimates
(e.g., dispersion) from edgeR or custom estimates, and we encourage
readers to try out this strategy! One useful resource for doing so is
<span class="citation">(Rocke et al. 2015)</span>, a preprint describing
transformations of RNA-seq data that enable model fitting using the
‘MASS’ function glm.nb(). If you have personal code that you believe
would make a valuable addition to this walkthrough, don’t hesitate to
post a GitHub issue or submit a pull-request to us. We will review the
suggestions and add your name to the list of this page’s authors! We hope
that this page and others on the website can serve as a living document
that can keep pace with advances in multifactorial RNA-seq analyses,
which lend fidelity to experimental designs that are necessary for
generating reproducible and predictive results.</p>
<p><br></p>
</div>
<div id="references" class="section level1 unnumbered">
<h1 class="unnumbered">References</h1>
<div id="refs" class="references csl-bib-body hanging-indent">
<div id="ref-Bogan2020-qs" class="csl-entry">
Bogan, Samuel N, Kevin M Johnson, and Gretchen E Hofmann. 2020.
<span>“Changes in <span>Genome-Wide</span> Methylation and Gene
Expression in Response to Future <span class="nocase">pCO2</span>
Extremes in the Antarctic Pteropod Limacina Helicina Antarctica.”</span>
<em>Frontiers in Marine Science</em>, January, NA.
</div>
<div id="ref-Hawinkel2020-lk" class="csl-entry">
Hawinkel, Stijn, J C W Rayner, Luc Bijnens, and Olivier Thas. 2020.
<span>“Sequence Count Data Are Poorly Fit by the Negative Binomial
Distribution.”</span> <em>PLoS One</em> 15 (4): e0224909.
</div>
<div id="ref-Hoffman2021-br" class="csl-entry">
Hoffman, Gabriel E, and Panos Roussos. 2021. <span>“Dream: Powerful
Differential Expression Analysis for Repeated Measures Designs.”</span>
<em>Bioinformatics</em> 37 (2): 192–201.
</div>
<div id="ref-Johnson2016-wq" class="csl-entry">
Johnson, Kevin M, and Gretchen E Hofmann. 2016. <span>“A Transcriptome
Resource for the Antarctic Pteropod Limacina Helicina
Antarctica.”</span> <em>Mar. Genomics</em> 28 (August): 25–28.
</div>
<div id="ref-Law2014-ce" class="csl-entry">
Law, Charity W, Yunshun Chen, Wei Shi, and Gordon K Smyth. 2014.
<span>“Voom: Precision Weights Unlock Linear Model Analysis Tools for
<span class="nocase">RNA-seq</span> Read Counts.”</span> <em>Genome
Biol.</em> 15 (2): R29.
</div>
<div id="ref-Li2011-mi" class="csl-entry">
Li, Bo, and Colin N Dewey. 2011. <span>“<span>RSEM</span>: Accurate
Transcript Quantification from <span>RNA-Seq</span> Data with or Without
a Reference Genome.”</span> <em>BMC Bioinformatics</em> 12 (August):
323.
</div>
<div id="ref-Love2014-fy" class="csl-entry">
Love, Michael I, Wolfgang Huber, and Simon Anders. 2014.
<span>“Moderated Estimation of Fold Change and Dispersion for <span
class="nocase">RNA-seq</span> Data with <span>DESeq2</span>.”</span>
<em>Genome Biol.</em> 15 (12): 550.
</div>
<div id="ref-Rivera2021-hn" class="csl-entry">
Rivera, Hanny E, Hannah E Aichelman, James E Fifer, Nicola G Kriefall,
Daniel M Wuitchik, Sara J S Wuitchik, and Sarah W Davies. 2021. <span>“A
Framework for Understanding Gene Expression Plasticity and Its Influence
on Stress Tolerance.”</span> <em>Mol. Ecol.</em> 30 (6): 1381–97.
</div>
<div id="ref-Robinson2010-fd" class="csl-entry">
Robinson, Mark D, Davis J McCarthy, and Gordon K Smyth. 2010.
<span>“edgeR: A Bioconductor Package for Differential Expression
Analysis of Digital Gene Expression Data.”</span>
<em>Bioinformatics</em> 26 (1): 139–40.
</div>
<div id="ref-Rocke2015-uo" class="csl-entry">
Rocke, David M, Luyao Ruan, Yilun Zhang, J Jared Gossett, Blythe
Durbin-Johnson, and Sharon Aviran. 2015. <span>“Excess False Positive
Rates in Methods for Differential Gene Expression Analysis Using
<span>RNA-Seq</span> Data.”</span> <em>bioRxiv</em>.
</div>
<div id="ref-Schloerke2018-rq" class="csl-entry">
Schloerke, B, J Crowley, D Cook, F Briatte, M Marbach, et al. 2018.
<span>“Ggally: Extension to Ggplot2.”</span> <em>R Package Version</em>.
</div>
</div>
</div>


</div>
</div>

</div>

<script>

// Give pandoc tables Bootstrap's condensed table styling. Tables are located
// through their 'tr.odd' rows: row -> enclosing tbody -> enclosing table.
function bootstrapStylePandocTables() {
  var oddRows = $('tr.odd');
  var pandocTables = oddRows.parent('tbody').parent('table');
  pandocTables.addClass('table table-condensed');
}
// Apply the table styling once the DOM is ready.
$(function () {
  bootstrapStylePandocTables();
});


</script>

<!-- tabsets -->

<script>
// Once the DOM is ready, hand off to the tabsets helper, passing "TOC" through.
$(function () {
  window.buildTabsets("TOC");
});

// Once the DOM is ready, make every tab of a dropdown-style tabset toggle the
// 'nav-tabs-open' class on its parent tab list when clicked.
$(function () {
  var dropdownTabs = $('.tabset-dropdown > .nav-tabs > li');
  dropdownTabs.on('click', function () {
    $(this).parent().toggleClass('nav-tabs-open');
  });
});
</script>

<!-- code folding -->

<script>
$(function () {

    // Headers carrying both 'unlisted' and 'unnumbered' are marked 'toc-ignore'
    // (kept for consistency with Pandoc).
    $('.unlisted.unnumbered').addClass('toc-ignore');

    // A 'toc-ignore' class placed on a section div is moved onto that
    // section's h1-h5 child headers.
    var ignoredSections = $('div.section.toc-ignore');
    ignoredSections.removeClass('toc-ignore');
    ignoredSections.children('h1,h2,h3,h4,h5').addClass('toc-ignore');

    // Options handed to the tocify plugin.
    var tocOptions = {
      selectors: "h1,h2,h3",
      theme: "bootstrap3",
      context: '.toc-content',
      // Build an anchor hash from the header text: strip the listed punctuation
      // characters, then replace each whitespace character with an underscore.
      hashGenerator: function (text) {
        var stripped = text.replace(/[.\\/?&!#<>]/g, '');
        return stripped.replace(/\s/g, '_');
      },
      ignoreSelector: ".toc-ignore",
      scrollTo: 0,
      showAndHide: false,
      smoothScroll: true
    };

    // Initialise the table of contents on the #TOC element.
    $("#TOC").tocify(tocOptions);
});
</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    // Create a script element for the hosted MathJax bundle and append it to
    // the document head so that it is loaded dynamically.
    var mathjaxLoader = document.createElement("script");
    mathjaxLoader.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    mathjaxLoader.type = "text/javascript";
    var headElement = document.getElementsByTagName("head")[0];
    headElement.appendChild(mathjaxLoader);
  })();
</script>

</body>
</html>