From 1342860a327cbc82238615e40e39ebb871cfc22f Mon Sep 17 00:00:00 2001 From: lcolladotor Date: Sat, 20 Feb 2016 11:00:10 -0500 Subject: [PATCH] Completed migration of Hippo and Snyder scripts Moved files from https://github.com/leekgroup/derSupplement Updated timing report (less plots, better plots) Updated venn report to show information for both Snyder and Hippo data sets Updated index to reflect the files this repo describes --- .gitignore | 2 + .nojekyll | 0 additional-analyses/feature_counts.R | 54 + check-analysis-time.R | 92 + gff/createGFF.R | 17 + gff/runGFF.sh | 16 + hippo/.gitignore | 2 + hippo/counts-gene/.gitignore | 3 + hippo/counts-gene/counts-gene.R | 30 + hippo/counts-gene/run.sh | 22 + hippo/pnas/.gitignore | 5 + hippo/pnas/compareVsPNAS-gene.Rmd | 570 +++ hippo/pnas/compareVsPNAS-gene.html | 2789 ++++++++++++ hippo/pnas/compareVsPNAS.Rmd | 770 ++++ hippo/pnas/compareVsPNAS.html | 3205 ++++++++++++++ hippo/pnas/runComparison.sh | 13 + .../summaryInfo/run3-v1.0.10/summaryInfo.html | 1330 ++++++ index.Rmd | 173 + index.bib | 126 + index.html | 231 + optional1-sortSam.sh | 60 + optional2-HTSeq.sh | 83 + optional3-summOv.R | 32 + optional3-summOv.sh | 59 + run-all.sh | 40 + snyder/.gitignore | 1 + step1-fullCoverage.R | 86 + step1-fullCoverage.sh | 63 + step2-makeModels.R | 117 + step2-makeModels.sh | 47 + step3-analyzeChr.R | 76 + step3-analyzeChr.sh | 68 + step4-mergeResults.sh | 47 + step5-derfinderReport.sh | 48 + step6-regionMatrix.sh | 64 + step7-regMatVsDERs.Rmd | 430 ++ step7-regMatVsDERs.sh | 59 + step8-coverageToExon.R | 57 + step8-coverageToExon.sh | 64 + step9-summaryInfo.R | 96 + step9-summaryInfo.Rmd | 177 + step9-summaryInfo.sh | 57 + timing/.gitignore | 4 + timing/timing.Rmd | 321 ++ timing/timing.bib | 106 + timing/timing.html | 3862 +++++++++++++++++ venn/venn.Rmd | 216 + venn/venn.bib | 93 + venn/venn.html | 358 ++ 49 files changed, 16241 insertions(+) create mode 100644 .gitignore create mode 100644 .nojekyll create mode 100644 
additional-analyses/feature_counts.R create mode 100644 check-analysis-time.R create mode 100644 gff/createGFF.R create mode 100755 gff/runGFF.sh create mode 100644 hippo/.gitignore create mode 100644 hippo/counts-gene/.gitignore create mode 100644 hippo/counts-gene/counts-gene.R create mode 100755 hippo/counts-gene/run.sh create mode 100644 hippo/pnas/.gitignore create mode 100644 hippo/pnas/compareVsPNAS-gene.Rmd create mode 100644 hippo/pnas/compareVsPNAS-gene.html create mode 100644 hippo/pnas/compareVsPNAS.Rmd create mode 100644 hippo/pnas/compareVsPNAS.html create mode 100755 hippo/pnas/runComparison.sh create mode 100644 hippo/summaryInfo/run3-v1.0.10/summaryInfo.html create mode 100644 index.Rmd create mode 100644 index.bib create mode 100644 index.html create mode 100755 optional1-sortSam.sh create mode 100755 optional2-HTSeq.sh create mode 100644 optional3-summOv.R create mode 100755 optional3-summOv.sh create mode 100755 run-all.sh create mode 100644 snyder/.gitignore create mode 100644 step1-fullCoverage.R create mode 100755 step1-fullCoverage.sh create mode 100644 step2-makeModels.R create mode 100755 step2-makeModels.sh create mode 100755 step3-analyzeChr.R create mode 100755 step3-analyzeChr.sh create mode 100755 step4-mergeResults.sh create mode 100755 step5-derfinderReport.sh create mode 100755 step6-regionMatrix.sh create mode 100644 step7-regMatVsDERs.Rmd create mode 100755 step7-regMatVsDERs.sh create mode 100644 step8-coverageToExon.R create mode 100755 step8-coverageToExon.sh create mode 100644 step9-summaryInfo.R create mode 100644 step9-summaryInfo.Rmd create mode 100644 step9-summaryInfo.sh create mode 100644 timing/.gitignore create mode 100644 timing/timing.Rmd create mode 100644 timing/timing.bib create mode 100644 timing/timing.html create mode 100644 venn/venn.Rmd create mode 100644 venn/venn.bib create mode 100644 venn/venn.html diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..aba553a --- /dev/null +++ 
b/.gitignore @@ -0,0 +1,2 @@ +GenomicState.Hsapiens.UCSC.hg19.knownGene.rda +figure-hippo-reanalysis diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/additional-analyses/feature_counts.R b/additional-analyses/feature_counts.R new file mode 100644 index 0000000..d91df58 --- /dev/null +++ b/additional-analyses/feature_counts.R @@ -0,0 +1,54 @@ +### +# qsub -V -pe local 24 -l jabba,mf=80G,h_vmem=16G,h_stack=256M -cwd -b y R CMD BATCH --no-save feature_counts.R +source("/home/epi/ajaffe/Lieber/lieber_functions_aj.R") + +library(Rsubread) + +## stem +xx=load("/home/epi/ajaffe/Lieber/Projects/RNAseq/UCSD_samples/UCSD_stemcell_pheno.rda") +pdStem = pd +pdStem$bamFile = paste0("/dcs01/ajaffe/UCSC_Epigenome/RNAseq/TopHat/", + pdStem$sample, "_out/accepted_hits.bam") + +## hippo +load("/home/epi/ajaffe/Lieber/Projects/RNAseq/HippoPublic/sra_phenotype_file.rda") +pdHippo = sra +pdHippo = pdHippo[-grep("ED", pdHippo$SampleID),] + +## snyder +bam = list.files("/dcs01/ajaffe/Snyder/RNAseq/TopHat", + pattern="accepted_hits.bam$", recur=TRUE,full.names=TRUE) +names(bam) = list.files("/dcs01/ajaffe/Snyder/RNAseq/TopHat", + pattern="accepted_hits.bam$", recur=TRUE) + +pheno = data.frame(sampleName = c(pdStem$run, pdHippo$SampleID, ss(names(bam), "_")), + Study = c(rep("Stem", nrow(pdStem)), rep("Hippo", nrow(pdHippo)), + rep("Snyder", length(bam))), + bamFile = c(pdStem$bamFile, pdHippo$bamFile, bam), + stringsAsFactors=FALSE) +libSize = getTotalMapped(pheno$bamFile,mc.cores=12) +pheno$totalMapped = libSize$totalMapped +pheno$mitoMapped = libSize$mitoMapped + +## count genes +geneCounts = featureCounts(pheno$bamFile, annot.inbuilt="hg19", + useMetaFeatures = TRUE, nthreads=24) +exonCounts = featureCounts(pheno$bamFile, annot.inbuilt="hg19", + useMetaFeatures = FALSE,nthreads=24) +save(geneCounts,exonCounts, pheno, file="featureCounts_output.rda") + +##### +load("featureCounts_output.rda") + +pheno$geneAlign = 
as.numeric(geneCounts$stat[1,-1] / colSums(geneCounts$stat[,-1])) +pheno$geneAmbig = as.numeric(geneCounts$stat[2,-1] / colSums(geneCounts$stat[,-1])) +pheno$exonAlign = as.numeric(exonCounts$stat[1,-1] / colSums(exonCounts$stat[,-1])) +pheno$exonAmbig = as.numeric(exonCounts$stat[2,-1] / colSums(exonCounts$stat[,-1])) + +signif(tapply(pheno$geneAlign, pheno$Study, mean),3) +signif(tapply(pheno$geneAmbig, pheno$Study, mean),3) +signif(tapply(pheno$exonAlign, pheno$Study, mean),3) +signif(tapply(pheno$exonAmbig, pheno$Study, mean),3) + +### mito map +signif(tapply(pheno$mitoMapped/(pheno$mitoMapped+pheno$totalMapped), pheno$Study, mean),3) diff --git a/check-analysis-time.R b/check-analysis-time.R new file mode 100644 index 0000000..7642520 --- /dev/null +++ b/check-analysis-time.R @@ -0,0 +1,92 @@ +library('ggplot2') +library('getopt') + +## Specify parameters +spec <- matrix(c( + 'experiment', 'e', 1, 'character', 'Experiment', + 'run', 'r', 1, 'character', 'run name', + 'help' , 'h', 0, 'logical', 'Display help' +), byrow=TRUE, ncol=5) +opt <- getopt(spec) + +## if help was asked for print a friendly message +## and exit with a non-zero error code +if (!is.null(opt$help)) { + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +## Check experiment input +stopifnot(opt$experiment %in% c('snyder', 'hippo')) + +chrs <- paste0('chr', c(1:22, 'X', 'Y')) +study <- opt$experiment +run <- opt$run + +timediff <- lapply(chrs, function(chr) { + info <- tryCatch(system(paste0('grep permutation *', study, '*', run, '*', chr, '.e*'), intern = TRUE), warning = function(w) { 'no data'}) + if(info[1] == 'no data') { + info <- tryCatch(system(paste0('grep permutation ', file.path(study, 'derAnalysis', run, chr, 'logs'), '/*', chr, '.e*'), intern = TRUE), warning = function(w) { 'no data'}) + } + if(info[1] == 'no data') return(NULL) + + time <- strptime(gsub('([[:space:]]*calculate.*$)', '', info), + format = '%Y-%m-%d %H:%M:%S') + + idx <- seq_len(length(info) - 1) + 
difftime(time[idx + 1], time[idx], units = 'mins') +}) +names(timediff) <- chrs + + +## Organize time information +chrnum <- gsub('chr', '', chrs) +df <- data.frame(chr = factor(chrnum, levels = chrnum), mean = sapply(timediff, mean), sd = sapply(timediff, sd)) + +## Group by number of rounds per permutation given the number of chunks & cores used +if(!file.exists(file.path(study, 'derAnalysis', run, 'nChunks.Rdata'))) { + nChunks <- sapply(chrs, function(chr) { + if(!file.exists(file.path(study, 'derAnalysis', run, chr, 'coveragePrep.Rdata'))) + return(NA) + load(file.path(study, 'derAnalysis', run, chr, 'coveragePrep.Rdata')) + max(prep$mclapply) + }) + save(nChunks, file = file.path(study, 'derAnalysis', run, 'nChunks.Rdata')) +} else { + load(file.path(study, 'derAnalysis', run, 'nChunks.Rdata')) +} + +if (study == 'brainspan') { + nCores <- c(40, 32, 27, rep(20, 15), 29, rep(20, 4), 2) +} else if (study == 'snyder') { + nCores <- rep(4, 24) +} else if (study == 'hippo') { + nCores <- rep(2, 24) +} +names(nCores) <- chrs + +df$n <- sapply(timediff, length) +df$se <- df$sd / sqrt(df$n) +df$nChunks <- nChunks +df$nCores <- nCores +df$nRound <- factor(ceiling(nChunks / nCores)) + + +## Print info +rownames(df) <- NULL +print(df) + + +## Make plot +pdf(file.path(study, 'derAnalysis', run, paste0('permuteTime-', study, '-', run, '.pdf'))) +ggplot(df, aes(x = chr, y = mean, color = nRound)) + geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.1) + geom_line() + geom_point() + ylab('Time per permutation (minutes)\nMean +- SE') + xlab('Chromosome') + ggtitle(paste('Time info for', study, run)) + scale_y_continuous(breaks=seq(0, ceiling(max(df$mean + df$se, na.rm = TRUE)), 1)) +dev.off() + +print('Expected total number of days per chr and days remaining') +days <- data.frame(chr = chrnum, total = round(df$mean * 1001 / 60 / 24, 1), remaining = round(df$mean * (1001 - df$n - 2 ) / 60 / 24, 1)) +rownames(days) <- NULL +print(days) + + + + diff --git 
a/gff/createGFF.R b/gff/createGFF.R new file mode 100644 index 0000000..a19131c --- /dev/null +++ b/gff/createGFF.R @@ -0,0 +1,17 @@ +# Setup +library("GenomicRanges") +library("rtracklayer") + +load('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.ensembl.GRCh37.p11.rda') +load('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda') + +makeGFF <- function(exonicParts, file) { + message(paste(Sys.time(), "makeGFF: Saving", file)) + export.gff2(exonicParts, file) +} + +makeGFF(GenomicState.Hsapiens.ensembl.GRCh37.p11$fullGenome[ GenomicState.Hsapiens.ensembl.GRCh37.p11$fullGenome$theRegion == "exon"], "GenomicState.Hsapiens.ensembl.GRCh37.p11.exons.gff") +makeGFF(GenomicState.Hsapiens.UCSC.hg19.knownGene$fullGenome[ GenomicState.Hsapiens.UCSC.hg19.knownGene$fullGenome$theRegion == "exon"], "GenomicState.Hsapiens.UCSC.hg19.knownGene.exons.gff") + +proc.time() +sessionInfo() diff --git a/gff/runGFF.sh b/gff/runGFF.sh new file mode 100755 index 0000000..73c3d69 --- /dev/null +++ b/gff/runGFF.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +#$ -cwd +#$ -l jabba,mem_free=10G,h_vmem=50G,h_fsize=40G +#$ -N exonsGFF +#$ -m e + +echo "**** Job starts ****" +date + +## Create GFF files +Rscript createGFF.R + +### Done +echo "**** Job ends ****" +date diff --git a/hippo/.gitignore b/hippo/.gitignore new file mode 100644 index 0000000..1f945ee --- /dev/null +++ b/hippo/.gitignore @@ -0,0 +1,2 @@ +derAnalysis +coverageToExon \ No newline at end of file diff --git a/hippo/counts-gene/.gitignore b/hippo/counts-gene/.gitignore new file mode 100644 index 0000000..12bd0bd --- /dev/null +++ b/hippo/counts-gene/.gitignore @@ -0,0 +1,3 @@ +logs +*Rdata + diff --git a/hippo/counts-gene/counts-gene.R b/hippo/counts-gene/counts-gene.R new file mode 100644 index 0000000..16b0dd5 --- /dev/null +++ b/hippo/counts-gene/counts-gene.R @@ -0,0 +1,30 @@ +## Setup +library('TxDb.Hsapiens.UCSC.hg19.knownGene') 
+library('derfinder') +library('Rsamtools') +library('GenomicAlignments') +library('parallel') +options(mc.cores=24) + +## Exons by gene +ex <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, by = 'gene') + +## Make bamFileList +files <- rawFiles(datadir='/dcs01/ajaffe/Hippo/TopHat', samplepatt="out$", + fileterm="accepted_hits.bam") +names(files) <- gsub('_out', '', names(files)) +bai <- paste0(files, ".bai") +bList <- BamFileList(files, bai) + +## Compute the overlaps +message(paste(Sys.time(), "summarizeOverlaps: Running summarizeOverlaps()")) +summOv <- summarizeOverlaps(ex, bList, mode="Union", + singleEnd=TRUE, ignore.strand=TRUE) + +## Finish +message(paste(Sys.time(), "summarizeOverlaps: Saving summOverlaps")) +save(summOv, file="summOv.Rdata") + +proc.time() +options(width = 120) +devtools::session_info() diff --git a/hippo/counts-gene/run.sh b/hippo/counts-gene/run.sh new file mode 100755 index 0000000..1fd3b24 --- /dev/null +++ b/hippo/counts-gene/run.sh @@ -0,0 +1,22 @@ +#!/bin/bash +#$ -cwd +#$ -m e +#$ -l mem_free=3G,h_vmem=15G,h_fsize=30G +#$ -pe local 24 +#$ -N summOv-hippo-rerun + +echo "**** Job starts ****" +date + +mkdir -p /dcs01/ajaffe/Brain/derRuns/derSoftware/hippo/counts-gene/logs + +## Summarize overlaps +module load R/3.2.x +Rscript counts-gene.R + +# Move log files into the logs directory +mv /dcs01/ajaffe/Brain/derRuns/derSoftware/hippo/counts-gene/summOv-hippo-rerun.* /dcs01/ajaffe/Brain/derRuns/derSoftware/hippo/counts-gene/logs/ + +### Done +echo "**** Job ends ****" +date diff --git a/hippo/pnas/.gitignore b/hippo/pnas/.gitignore new file mode 100644 index 0000000..13370ce --- /dev/null +++ b/hippo/pnas/.gitignore @@ -0,0 +1,5 @@ +*.e* +*.o* +*Rdata +*pdf + diff --git a/hippo/pnas/compareVsPNAS-gene.Rmd b/hippo/pnas/compareVsPNAS-gene.Rmd new file mode 100644 index 0000000..0f0203b --- /dev/null +++ b/hippo/pnas/compareVsPNAS-gene.Rmd @@ -0,0 +1,570 @@ +--- +output: + knitrBootstrap::bootstrap_document: + theme.chooser: TRUE + 
highlight.chooser: TRUE +--- + +Compare vs PNAS at gene level +=============== + + +# Gene-level analysis + +This section has the code for running `edgeR-robust` and `DESeq2` on the Hippo data set using the known genes as features. + + + +This first code chunk loads the necessary data. + + +```{r 'setup', bootstrap.show.code = FALSE, bootstrap.show.message = FALSE} +## Track time spent on making the report +startTime <- Sys.time() + +library('edgeR') +library('DESeq2') +library('GenomicRanges') +library('TxDb.Hsapiens.UCSC.hg19.knownGene') + +## Load data +load("../counts-gene/summOv.Rdata") +load("../derAnalysis/run3-v1.0.10/groupInfo.Rdata") +load("../derAnalysis/run3-v1.0.10/colsubset.Rdata") + +## GenomicState object +if(file.exists('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')) { + load('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda') +} else if(file.exists('../../GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')) { + load('../../GenomicState.Hsapiens.UCSC.hg19.knownGene.rda') +} else { + stop('Missing UCSC hg19 genomic state object') +} + +## Find genes +genes <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, by = 'gene') + +## Round matrix and remove genes with 0s +counts <- assay(summOv)[, colsubset] +nonzero <- sapply(rowSums(counts), function(x) {x > 0}) +``` + + +## DESeq2 + +The following code performs the DESeq2 analysis. Code is based on [edgeR_Robust supplementary code](http://imlspenticton.uzh.ch/robinson_lab/edgeR_robust/). The main change is that it has been modified for the multi-group scenario. 
+ +```{r 'deseq2', bootstrap.show.code = FALSE} +## Round matrix and specify design +dse <- DESeqDataSetFromMatrix(counts[nonzero, ], data.frame(group = groupInfo), ~ group) + +## Perform DE analysis +system.time( dse <- DESeq(dse, test = 'LRT', reduced = ~ 1) ) + +## Extract results +deseq <- genes[nonzero] +mcols(deseq) <- cbind(mcols(deseq), results(dse)) + +## Which are significant? +mcols(deseq)$sig <- mcols(deseq)$padj < 0.05 +mcols(deseq)$sig[is.na(mcols(deseq)$sig)] <- FALSE + +## Save results +save(deseq, file = 'deseq-gene.Rdata') + +## Adjust by Holm +deseq_holm <- deseq +mcols(deseq_holm)$sig <- p.adjust(mcols(deseq_holm)$pvalue, 'holm') < 0.05 +``` + + +## edgeR-robust + +The following code performs the `edgeR-robust` analysis. Code is based on [edgeR_Robust supplementary code](http://imlspenticton.uzh.ch/robinson_lab/edgeR_robust/). The main change is that it has been modified for the multi-group scenario. + + +```{r 'edgeR', bootstrap.show.code = FALSE} +## Determine design matrix +design <- model.matrix(~ groupInfo) + +## Perform DE analysis +d <- DGEList(counts = counts[nonzero, ], group = groupInfo) +d <- calcNormFactors(d) +system.time(dw <- estimateGLMRobustDisp(d, design = design, prior.df = 10, maxit = 6)) +fw <- glmFit(dw, design = design, coef = 2:3) +lrw <- glmLRT(fw, coef = 2:3) + +## Extract results +edger <- genes[nonzero] +mcols(edger) <- cbind(mcols(edger), DataFrame(lrw$table)) +mcols(edger)$pvalue <- lrw$table$PValue +mcols(edger)$padj <- p.adjust(lrw$table$PValue, 'BH') + +## Which are significant? 
+mcols(edger)$sig <- mcols(edger)$padj < 0.05 +mcols(edger)$sig[is.na(mcols(edger)$sig)] <- FALSE + +## Save results +save(edger, file = 'edger-gene.Rdata') + +## Adjust by Holm +edger_holm <- edger +mcols(edger_holm)$sig <- p.adjust(mcols(edger_holm)$pvalue, 'holm') < 0.05 +``` + + + + + + + + +## Overlap + +```{r 'ov-comp-setup', bootstrap.show.code = FALSE} +## Load data +load('../derAnalysis/run3-v1.0.10/fullRegions.Rdata') + +## Some formatting and subsets +names(fullRegions) <- seq_len(length(fullRegions)) +fullRegions$sigFWER <- as.logical(fullRegions$significantFWER) +fullRegs20 <- fullRegions[width(fullRegions) >= 20] + +## Overlap table for all 4 cases +ov_table <- function(ders, counts, query = 'der', minov = 0) { + if(query == 'der') { + if(minov == 0) { + res <- addmargins(table('Significant DER (FWER)' = ders$sigFWER, 'Overlaps sig DE gene' = countOverlaps(ders, counts[mcols(counts)$sig]) > 0)) + } else { + res <- addmargins(table(ders$sigFWER, countOverlaps(ders, counts[mcols(counts)$sig], minoverlap = minov) > 0, dnn = c('Significant DER (FWER)', paste0('Overlaps sig DE gene (min ', minov, 'bp)')))) + } + } else if (query == 'counts') { + if(minov == 0) { + res <- addmargins(table('Significant DE gene' = mcols(counts)$sig, 'Overlaps sig DER (FWER)' = countOverlaps(counts, ders[ders$sigFWER]) > 0)) + } else { + res <- addmargins(table(mcols(counts)$sig[sapply(width(counts), sum) >= minov], countOverlaps(counts[sapply(width(counts), sum) >= minov], ders[ders$sigFWER], minoverlap = minov) > 0, dnn = c('Significant DE gene', paste0('Overlaps sig DER (FWER, min ', minov, 'bp)')))) + } + } + return(res) +} + +## Explore mistmatched cases for DERs vs genes direction +explore_ov <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) { + if(case == 'FALSE-TRUE') { + i <- which(countOverlaps(ders, counts[mcols(counts)$sig], minoverlap = minov) > 0 & !ders$sigFWER) + } else if (case == 'TRUE-FALSE') { + i <- which(!countOverlaps(ders, 
counts[mcols(counts)$sig], minoverlap = minov) > 0 & ders$sigFWER) + } else{ + stop('invalid case') + } + if(length(i) == 0) return("No such cases") + + if(case == 'FALSE-TRUE') { + res <- list( + n_overlaps = table(countOverlaps(ders[i], counts[mcols(counts)$sig], minoverlap = minov)), + width_der = summary(width(ders[i])), + ders_per_gene_table = table(table(subjectHits(findOverlaps(ders[i], counts[mcols(counts)$sig], minoverlap = minov)))), + ders_per_gene = sort(table(subjectHits(findOverlaps(ders[i], counts[mcols(counts)$sig], minoverlap = minov)))), + i = i + ) + } else { + res <- list( + width_der = summary(width(ders[i])), + distance_nearest_sum = summary(mcols(distanceToNearest(ders[i], unlist(counts), ignore.strand = TRUE))$distance), + distance_nearest_sig_sum = summary(mcols(distanceToNearest(ders[i], unlist(counts[mcols(counts)$sig]), ignore.strand = TRUE))$distance), + distance_nearest = distanceToNearest(ders[i], unlist(counts), ignore.strand = TRUE), + distance_nearest_sig = distanceToNearest(ders[i], unlist(counts[mcols(counts)$sig]), ignore.strand = TRUE), + i = i + ) + } + + return(res) +} + +## Explore mistmatched cases for genes vs DERs direction +explore_ov_counts <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) { + counts <- counts[sapply(width(counts), sum) >= minov] + if(case == 'FALSE-TRUE') { + i <- which(countOverlaps(counts, ders[ders$sigFWER], minoverlap = minov) > 0 & !mcols(counts)$sig) + } else if (case == 'TRUE-FALSE') { + i <- which(!countOverlaps(counts, ders[ders$sigFWER], minoverlap = minov) > 0 & mcols(counts)$sig) + } else{ + stop('invalid case') + } + if(length(i) == 0) return("No such cases") + + if(case == 'FALSE-TRUE') { + res <- list( + n_overlaps = table(countOverlaps(counts[i], ders[ders$sigFWER], minoverlap = minov)), + width_gene = summary(sapply(width(counts[i]), sum)), + genes_per_der_table = table(table(subjectHits(findOverlaps(counts[i], ders[ders$sigFWER], minoverlap = minov)))), + genes_per_der = 
sort(table(subjectHits(findOverlaps(counts[i], ders[ders$sigFWER], minoverlap = minov)))), + i = i + ) + } else { + res <- list( + width_gene = summary(sapply(width(counts[i]), sum)), + distance_nearest_sum = summary(mcols(distanceToNearest(unlist(counts[i]), ders, ignore.strand = TRUE))$distance), + distance_nearest_sig_sum = summary(mcols(distanceToNearest(unlist(counts[i]), ders[ders$sigFWER], ignore.strand = TRUE))$distance), + distance_nearest = distanceToNearest(unlist(counts[i]), ders, ignore.strand = TRUE), + distance_nearest_sig = distanceToNearest(unlist(counts[i]), ders[ders$sigFWER], ignore.strand = TRUE), + i = i + ) + } + + return(res) +} + +noNA <- function(x) { + x[!is.na(x)] +} +``` + + +### DESeq2 + + +#### Query: DERs + +We can first compare the results by using the DERs as the query and the genes as the subject. The following output shows the comparison using all DERs and exploring the mismatched cases. Then its repeated using the DERs $\geq$ 20 bp and a minimum overlap of 20bp. 
+ +For the mismatched cases of non-significant DERs overlapping a significant gene, we check: + +* how many genes each DER overlaps, +* the width of the DERs +* the frequency table of how many DERs overlap the same gene + +For the other mismatched case, we check: + +* the width of the DERs +* distance to nearest gene (regardless of gene size) +* distance to nearest significant DE gene (ibidem) + +```{r 'ov-comp-deseq', bootstrap.show.code = FALSE} +## Overlap between DERs and significant DE genes +ov_table(fullRegions, deseq) + +## Explore mismatched cases +#noNA(explore_ov(fullRegions, deseq)[1:3]) +#noNA(explore_ov(fullRegions, deseq, 'TRUE-FALSE')[1:3]) + +## Min 20 bp overlap, using only DERs 20 bp long +ov_table(fullRegs20, deseq, minov = 20L) + +## Explore mismatched cases, min 20bp overlap +noNA(explore_ov(fullRegs20, deseq, minov = 20L)[1:3]) +noNA(explore_ov(fullRegs20, deseq, 'TRUE-FALSE', minov = 20L)[1:3]) + +## Holm vs BH +addmargins(table('DESeq2 Holm' = mcols(deseq_holm)$sig, 'DESeq2 BH' = mcols(deseq)$sig)) + +## Use Holm and min 20 bp ov +ov_table(fullRegs20, deseq_holm, minov = 20L) +``` + +Most of the DERs are shorter than 20bp (`r round(sum(width(fullRegions) < 20) / length(fullRegions) * 100, 2)` percent), so we'll focus on the longer ones. The majority of the mismatches are from non significant DERs that overlap a significant gene. + +As expected, when controlling the FWER instead of the FDR, most of the DE genes are no longer significant. Using FWER-controlled DE genes, most of the DERs 20bp or longer agree with the genes as not being significantly DE. + + +#### Query: genes + +We can now repeat the comparison using the genes as the query and the DERs as the subject. 
+ +For the mismatched cases of non-significant genes overlapping a significant DER, we check: + +* how many DERs each gene overlaps, +* the width of the genes +* the frequency table of how many genes overlap the same DER + +For the other mismatched case, we check: + +* the width of the genes +* distance to nearest DER (regardless of DER size) +* distance to nearest significant DER (ibidem) + +```{r 'ov-comp-deseq-counts', bootstrap.show.code = FALSE} +## Overlap between genes and significant DERs +#ov_table(fullRegions, deseq, 'counts') + +## Explore mismatched cases +#noNA(explore_ov_counts(fullRegions, deseq)[1:3]) +#noNA(explore_ov_counts(fullRegions, deseq, 'TRUE-FALSE')[1:3]) + +## Overlap between genes and significant DERs, min 20 bp +ov_table(fullRegions, deseq, 'counts', 20L) + +## Explore mismatched cases +noNA(explore_ov_counts(fullRegions, deseq, minov = 20L)[1:3]) +noNA(explore_ov_counts(fullRegions, deseq, 'TRUE-FALSE', minov = 20L)[1:3]) + +## Now with Holm +ov_table(fullRegions, deseq_holm, 'counts', 20L) +``` + +From these results, we can see that `derfinder` is more conservative. + + + + +### edgeR-robust + +#### Query: DERs + +Similar comparison using DERs as query and genes as subject with `edgeR-robust` results. 
+ +```{r 'ov-comp-edger', bootstrap.show.code = FALSE} +## Overlap between DERs and significant DE genes +#ov_table(fullRegions, edger) + +## Explore mismatched cases +#noNA(explore_ov(fullRegions, edger)[1:3]) +#noNA(explore_ov(fullRegions, edger, 'TRUE-FALSE')[1:3]) + +## Min 20 bp overlap, using only DERs 20 bp long +ov_table(fullRegs20, edger, minov = 20L) + +## Explore mismatched cases, min 20bp overlap +noNA(explore_ov(fullRegs20, edger, minov = 20L)[1:3]) +noNA(explore_ov(fullRegs20, edger, 'TRUE-FALSE', minov = 20L)[1:3]) + +## Holm vs BH +addmargins(table('edgeR Holm' = mcols(edger_holm)$sig, 'edger BH' = mcols(edger)$sig)) + +## With Holm, 20bp +ov_table(fullRegs20, edger_holm, minov = 20L) +``` + +The results are fairly similar to those from using `DESeq2`. + +#### Query: genes + +Similar comparison using genes as query and DERs as subject with `edgeR-robust` results. + +```{r 'ov-comp-edger-counts', bootstrap.show.code = FALSE} +## Overlap between genes and significant DERs +#ov_table(fullRegions, edger, 'counts') + +## Explore mismatched cases +#noNA(explore_ov_counts(fullRegions, edger)[1:3]) +#noNA(explore_ov_counts(fullRegions, edger, 'TRUE-FALSE')[1:3]) + +## Overlap between genes and significant DERs, min 20 bp +ov_table(fullRegions, edger, 'counts', 20L) + +## Explore mismatched cases +noNA(explore_ov_counts(fullRegions, edger, minov = 20L)[1:3]) +noNA(explore_ov_counts(fullRegions, edger, 'TRUE-FALSE', minov = 20L)[1:3]) + +## With Holm, 20 bp +ov_table(fullRegions, edger_holm, 'counts', 20L) +``` + + +### overall + +While the DERs vs genes results are fairly similar between `edgeR-robust` and `DESeq2`, as shown below the number of mismatched cases is high compared to the number of cases where both counts-based methods agree. This is also true when controlling the FWER to determine significance. 
+ +```{r 'deseq-vs-edger'} +## edgeR vs DESeq2 +addmargins(table('edgeR-robust (FDR)' = mcols(edger)$sig, 'DESeq2 (FDR)' = mcols(deseq)$sig)) + +## Control FWER +addmargins(table('edgeR-robust (FWER)' = mcols(edger_holm)$sig, 'DESeq2 (FWER)' = mcols(deseq_holm)$sig)) + +## Only sig if both edgeR and DEseq2 say it is +both <- deseq +mcols(both)$sig <- mcols(both)$sig & mcols(edger)$sig + +## Same, for holm +both_holm <- deseq_holm +mcols(both_holm)$sig <- mcols(both_holm)$sig & mcols(edger_holm)$sig +``` + +We can consider a gene to be DE only if both `edgeR-robust` and `DESeq2` find that it is significantly DE. The next sections use this information. + +#### Query: DERs + +```{r 'ov-comp-both', bootstrap.show.code = FALSE} +## Overlap between DERs and significant DE genes +#ov_table(fullRegions, both) + +## Explore mismatched cases +#noNA(explore_ov(fullRegions, both)[1:3]) +#noNA(explore_ov(fullRegions, both, 'TRUE-FALSE')[1:3]) + +## Min 20 bp overlap, using only DERs 20 bp long +ov_table(fullRegs20, both, minov = 20L) + +## Explore mismatched cases, min 20bp overlap +noNA(explore_ov(fullRegs20, both, minov = 20L)[1:3]) +noNA(explore_ov(fullRegs20, both, 'TRUE-FALSE', minov = 20L)[1:3]) + +## Holm vs BH +addmargins(table('Both Holm' = mcols(both_holm)$sig, 'Both BH' = mcols(both)$sig)) + +## Use Holm and min 20 bp ov +ov_table(fullRegs20, both_holm, minov = 20L) +``` + +The trends observed previously are maintained in this comparison with a reduction of cases where the gene is DE. This is expected due to the non-perfect agreement between `DESeq2` and `edgeR-robust`. 
+ + +```{r 'regionPlot-setup', bootstrap.show.code = FALSE, bootstrap.show.message = FALSE} +library('TxDb.Hsapiens.UCSC.hg19.knownGene') +library('derfinder') +library('derfinderHelper') +library('derfinderPlot') +load('../derAnalysis/run3-v1.0.10/models.Rdata') +load('../derAnalysis/run3-v1.0.10/chr22/optionsStats.Rdata') +load("../CoverageInfo/fullCov.Rdata") + +def.par <- par() +def.par <- def.par[-which(names(def.par) %in% c('cin', 'cra', 'csi', 'cxy', 'din', 'page'))] + +regPlot <- function(region, title) { + ## Calculate F-stats + range <- start(region):end(region) + dat <- fullCov[[as.character(seqnames(region))]][range, colsubset] + + ## Log2 transform + for(i in seq_len(length(groupInfo))) dat[[i]] <- log2(dat[[i]] + 32) + + ## Calculate f-stats + fstats <- as.numeric(fstats.apply(data = dat, mod = models$mod, mod0 = models$mod0)) + + ## Find annotation + annoReg <- annotateRegions(region, GenomicState.Hsapiens.UCSC.hg19.knownGene$fullGenome, verbose = FALSE) + symbol <- mcols(annoReg$annotationList[[1]])$symbol + symbol <- as.character(noNA(symbol)[[1]]) + if(length(symbol) > 1) symbol <- symbol[1] + symbol <- ifelse(is.null(symbol), NA, symbol) + ## Remove symbol name because it gets chomped on the plot + mcols(annoReg$annotationList[[1]])$symbol <- NA + + par(def.par) + + ## Plot long gene + plotRegionCoverage(region, getRegionCoverage(fullCov, region, verbose = FALSE), groupInfo, data.frame(name = title, distance = NA, region = symbol), annoReg, verbose = FALSE, ask = FALSE, txdb = TxDb.Hsapiens.UCSC.hg19.knownGene) + + ## Add F-stat track + par(fig = c(0, 1, 0.065, 0.125), new = TRUE, xaxt = 'n', oma = c(0, 0, 0, 0), mar = c(0, 4.5, 0, 1.1)) + plot(y = fstats, x = range, ylab = 'F-stat', type = 'l', xlab = '', bty = 'n', ylim = c(0, max(fstats[is.finite(fstats)], optionsStats$cutoffFstatUsed) * 1.1), las = 2, yaxt = 'n') + y.max <- round(max(c(optionsStats$cutoffFstatUsed, fstats[is.finite(fstats)]), na.rm = TRUE), 0) + axis(2, at = c(0, round(y.max 
/ 2, 0), y.max), c(0, round(y.max / 2, 0), y.max), las = 2, tick = TRUE) + abline(h = optionsStats$cutoffFstatUsed, col = 'red') + abline(h = 0, col = 'grey') +} + +sortWidth <- function(regions) { + regions[order(width(regions), decreasing = TRUE)] +} +``` + +We can now make plots to explore some DERs for each of the cases. + +```{r 'query-der-plots', fig.width = 10, fig.height = 7, bootstrap.show.code = FALSE} +query_der_plots <- function() { + sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[mcols(both)$sig], minoverlap = 20L) > 0 & fullRegs20$sigFWER])[1:10], function(reg) { + regPlot(reg, 'DER query: DE agreement') + }) + + sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[mcols(both)$sig], minoverlap = 20L) == 0 & !fullRegs20$sigFWER])[1:10], function(reg) { + regPlot(reg, 'DER query: not DE agreement') + }) + + sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[mcols(both)$sig], minoverlap = 20L) == 0 & fullRegs20$sigFWER])[1:10], function(reg) { + regPlot(reg, 'DER query: only gene not DE') + }) + + sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[mcols(both)$sig], minoverlap = 20L) > 0 & !fullRegs20$sigFWER])[1:10], function(reg) { + regPlot(reg, 'DER query: only gene DE') + }) +} +pdf(file = 'query_der_plots_gene.pdf', width = 10, height = 7) +query_der_plots() +dev.off() + +query_der_plots() +``` + + +#### Query: genes + +As was shown with either `DESeq2` or `edgeR-robust` results, `derfinder` is more conservative than the counts-based methods. 
+ +```{r 'ov-comp-both-counts', bootstrap.show.code = FALSE} +## Overlap between genes and significant DERs, min 20 bp +ov_table(fullRegions, both, 'counts', 20L) + +## Explore mismatched cases +noNA(explore_ov_counts(fullRegions, both, minov = 20L)[1:3]) +noNA(explore_ov_counts(fullRegions, both, 'TRUE-FALSE', minov = 20L)[1:3]) + +## With Holm, 20 bp +ov_table(fullRegions, both_holm, 'counts', 20L) +``` + +We can now visually explore some genes (maximum 10kb long) for each of the four cases. + + +```{r 'query-gene-plots', fig.width = 10, fig.height = 7, bootstrap.show.code = FALSE} + +max_kb <- function(x, limit = 1e4) { + x[width(x) <= limit] +} + +selectRegions <- function(x) { + max_kb(sortWidth(unlist(range(x)))) +} + +query_gene_plots <- function() { + sapply(selectRegions(both[sapply(width(both), sum) >= 20 & mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) > 0])[1:10], function(reg) { + regPlot(reg, 'Gene query: DE agreement') + }) + + sapply(selectRegions(both[sapply(width(both), sum) >= 20 & !mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) == 0])[1:10], function(reg) { + regPlot(reg, 'Gene query: not DE agreement') + }) + + sapply(selectRegions(both[sapply(width(both), sum) >= 20 & !mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) > 0])[1:10], function(reg) { + regPlot(reg, 'Gene query: only gene not DE') + }) + + sapply(selectRegions(both[sapply(width(both), sum) >= 20 & mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) == 0])[1:10], function(reg) { + regPlot(reg, 'Gene query: only gene DE') + }) +} +pdf(file = 'query_gene_plots.pdf', width = 10, height = 7) +query_gene_plots() +dev.off() + +query_gene_plots() +``` + + + +# Reproducibility + +Date the report was generated. 
+ +```{r reproducibility1, echo=FALSE, bootstrap.show.code=FALSE} +## Date the report was generated +Sys.time() +``` + +Wallclock time spent generating the report. + +```{r "reproducibility2", echo=FALSE, bootstrap.show.code=FALSE} +## Processing time in seconds +totalTime <- diff(c(startTime, Sys.time())) +round(totalTime, digits=3) +``` + +`R` session information. + +```{r "reproducibility3", echo=FALSE, bootstrap.show.code=FALSE, bootstrap.show.message=FALSE} +## Session info +options(width=120) +devtools::session_info() +``` diff --git a/hippo/pnas/compareVsPNAS-gene.html b/hippo/pnas/compareVsPNAS-gene.html new file mode 100644 index 0000000..1efdd9c --- /dev/null +++ b/hippo/pnas/compareVsPNAS-gene.html @@ -0,0 +1,2789 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+

Compare vs PNAS at gene level

+

Gene-level analysis

+

This section has the code for running edgeR-robust and DESeq2 on the simulation data set using the known genes as features.

+

This first code chunk loads the necessary data.

+
+ +
## Track time spent on making the report
+startTime <- Sys.time()
+
+library('edgeR')
+ +
## Loading required package: limma
+
+ +
library('DESeq2')
+ +
## Loading required package: S4Vectors
+## Loading required package: stats4
+## Loading required package: BiocGenerics
+## Loading required package: parallel
+## 
+## Attaching package: 'BiocGenerics'
+## 
+## The following objects are masked from 'package:parallel':
+## 
+##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
+##     clusterExport, clusterMap, parApply, parCapply, parLapply,
+##     parLapplyLB, parRapply, parSapply, parSapplyLB
+## 
+## The following object is masked from 'package:limma':
+## 
+##     plotMA
+## 
+## The following object is masked from 'package:stats':
+## 
+##     xtabs
+## 
+## The following objects are masked from 'package:base':
+## 
+##     anyDuplicated, append, as.data.frame, as.vector, cbind,
+##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
+##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
+##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
+##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
+##     table, tapply, union, unique, unlist, unsplit
+## 
+## Loading required package: IRanges
+## Loading required package: GenomicRanges
+## Loading required package: GenomeInfoDb
+## Loading required package: Rcpp
+## Loading required package: RcppArmadillo
+
+ +
library('GenomicRanges')
+library('TxDb.Hsapiens.UCSC.hg19.knownGene')
+ +
## Loading required package: GenomicFeatures
+## Loading required package: AnnotationDbi
+## Loading required package: Biobase
+## Welcome to Bioconductor
+## 
+##     Vignettes contain introductory material; view with
+##     'browseVignettes()'. To cite Bioconductor, see
+##     'citation("Biobase")', and for packages 'citation("pkgname")'.
+
+ +
## Load data
+load("../counts-gene/summOv.Rdata")
+load("../derAnalysis/run3-v1.0.10/groupInfo.Rdata")
+load("../derAnalysis/run3-v1.0.10/colsubset.Rdata")
+
+## GenomicState object
+if(file.exists('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')) {
+    load('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')
+} else if(file.exists('../../GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')) {
+    load('../../GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')
+} else {
+    stop('Missing UCSC hg19 genomic state object')
+}
+
+## Find genes
+genes <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, by = 'gene')
+
+## Round matrix and remove genes with 0s
+counts <- assay(summOv)[, colsubset]
+nonzero <- sapply(rowSums(counts), function(x) {x > 0})
+
+

DESeq2

+

The following code performs the DESeq2 analysis. Code is based on edgeR_Robust supplementary code. The main change is that it has been modified for the multi-group scenario.

+
+ +
## Round matrix and specify design
+dse <- DESeqDataSetFromMatrix(counts[nonzero, ], data.frame(group = groupInfo), ~ group)
+
+## Perform DE analysis
+system.time( dse <- DESeq(dse, test = 'LRT', reduced = ~ 1) )
+ +
## estimating size factors
+## estimating dispersions
+## gene-wise dispersion estimates
+## mean-dispersion relationship
+## final dispersion estimates
+## fitting model and testing
+## -- replacing outliers and refitting for 70 genes
+## -- DESeq argument 'minReplicatesForReplace' = 7 
+## -- original counts are preserved in counts(dds)
+## estimating dispersions
+## fitting model and testing
+
+ +
##    user  system elapsed 
+##  32.601   0.844  33.560
+
+ +
## Extract results
+deseq <- genes[nonzero]
+mcols(deseq) <- cbind(mcols(deseq), results(dse))
+
+## Which are significant?
+mcols(deseq)$sig <- mcols(deseq)$padj < 0.05
+mcols(deseq)$sig[is.na(mcols(deseq)$sig)] <- FALSE
+
+## Save results
+save(deseq, file = 'deseq-gene.Rdata')
+
+## Adjust by Holm
+deseq_holm <- deseq
+mcols(deseq_holm)$sig <- p.adjust(mcols(deseq_holm)$pvalue, 'holm') < 0.05
+
+

edgeR-robust

+

The following code performs the edgeR-robust analysis. Code is based on edgeR_Robust supplementary code. The main change is that it has been modified for the multi-group scenario.

+
+ +
## Determine design matrix
+design <- model.matrix(~ groupInfo)
+
+## Perform DE analysis
+d <- DGEList(counts = counts[nonzero, ], group = groupInfo)
+d <- calcNormFactors(d)
+system.time(dw <- estimateGLMRobustDisp(d, design = design, prior.df = 10, maxit = 6))
+ +
##    user  system elapsed 
+##  76.999   4.804  82.360
+
+ +
fw <- glmFit(dw, design = design, coef = 2:3)
+lrw <- glmLRT(fw, coef = 2:3)
+
+## Extract results
+edger <- genes[nonzero]
+mcols(edger) <- cbind(mcols(edger), DataFrame(lrw$table))
+mcols(edger)$pvalue <-  lrw$table$PValue
+mcols(edger)$padj <- p.adjust(lrw$table$PValue, 'BH')
+
+## Which are significant?
+mcols(edger)$sig <- mcols(edger)$padj < 0.05
+mcols(edger)$sig[is.na(mcols(edger)$sig)] <- FALSE
+
+## Save results
+save(edger, file = 'edger-gene.Rdata')
+
+## Adjust by Holm
+edger_holm <- edger
+mcols(edger_holm)$sig <- p.adjust(mcols(edger_holm)$pvalue, 'holm') < 0.05
+
+

Overlap

+
+ +
## Load data
+load('../derAnalysis/run3-v1.0.10/fullRegions.Rdata')
+
+## Some formatting and subsets
+names(fullRegions) <- seq_len(length(fullRegions))
+fullRegions$sigFWER <- as.logical(fullRegions$significantFWER)
+fullRegs20 <- fullRegions[width(fullRegions) >= 20]
+
+## Overlap table for all 4 cases
+## Cross-tabulate DER significance against significant-gene overlap (or the
+## reverse), with margins added.
+##
+## ders:   ranges of DERs with a logical 'sigFWER' column.
+## counts: gene-level ranges with a logical 'sig' metadata column (in this
+##         report, a per-gene list of exons).
+## query:  'der' tabulates DERs vs overlap with significant genes;
+##         'counts' tabulates genes vs overlap with significant DERs.
+## minov:  minimum overlap in bp. When non-zero and query = 'counts', genes
+##         whose summed exon width is below minov are excluded.
+##
+## Returns the table produced by addmargins(). Stops with 'invalid query'
+## for any other value of 'query' (previously this surfaced as an
+## "object 'res' not found" error).
+ov_table <- function(ders, counts, query = 'der', minov = 0) {
+    if(query == 'der') {
+        ## Only significant DE genes are used as the overlap subject
+        sigGenes <- counts[mcols(counts)$sig]
+        if(minov == 0) {
+            res <- addmargins(table('Significant DER (FWER)' = ders$sigFWER, 'Overlaps sig DE gene' = countOverlaps(ders, sigGenes) > 0))
+        } else {
+            res <- addmargins(table(ders$sigFWER, countOverlaps(ders, sigGenes, minoverlap = minov) > 0, dnn = c('Significant DER (FWER)', paste0('Overlaps sig DE gene (min ', minov, 'bp)'))))
+        }
+    } else if (query == 'counts') {
+        ## Only FWER-significant DERs are used as the overlap subject
+        sigDers <- ders[ders$sigFWER]
+        if(minov == 0) {
+            res <- addmargins(table('Significant DE gene' = mcols(counts)$sig, 'Overlaps sig DER (FWER)' = countOverlaps(counts, sigDers) > 0))
+        } else {
+            ## Genes shorter than minov can never satisfy the minimum overlap
+            longEnough <- sapply(width(counts), sum) >= minov
+            res <- addmargins(table(mcols(counts)$sig[longEnough], countOverlaps(counts[longEnough], sigDers, minoverlap = minov) > 0, dnn = c('Significant DE gene', paste0('Overlaps sig DER (FWER, min ', minov, 'bp)'))))
+        }
+    } else {
+        stop('invalid query')
+    }
+    return(res)
+}
+
+## Explore mismatched cases for DERs vs genes direction
+## Summarise the DERs whose significance disagrees with the genes they
+## overlap (DERs are the query, significant genes the subject).
+##
+## case = 'FALSE-TRUE': DER not significant, but overlaps a significant gene.
+## case = 'TRUE-FALSE': DER significant, but overlaps no significant gene.
+## Any other value stops with 'invalid case'.
+##
+## Returns the string "No such cases" when nothing matches, otherwise a list
+## whose last element 'i' holds the indices of the matching DERs.
+explore_ov <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) {
+    falseTrue <- case == 'FALSE-TRUE'
+    if(!falseTrue && case != 'TRUE-FALSE') stop('invalid case')
+
+    sigGenes <- counts[mcols(counts)$sig]
+    hitsSigGene <- countOverlaps(ders, sigGenes, minoverlap = minov) > 0
+    i <- if(falseTrue) which(hitsSigGene & !ders$sigFWER) else which(!hitsSigGene & ders$sigFWER)
+    if(length(i) == 0) return("No such cases")
+
+    picked <- ders[i]
+    derWidth <- summary(width(picked))
+
+    if(falseTrue) {
+        ## Which significant gene each selected DER lands on
+        ovPicked <- findOverlaps(picked, sigGenes, minoverlap = minov)
+        return(list(
+            n_overlaps = table(countOverlaps(picked, sigGenes, minoverlap = minov)),
+            width_der = derWidth,
+            ders_per_gene_table = table(table(subjectHits(ovPicked))),
+            ders_per_gene = sort(table(subjectHits(ovPicked))),
+            i = i
+        ))
+    }
+
+    ## Distances are strand-agnostic: once to any gene, once to significant genes only
+    nearAny <- distanceToNearest(picked, unlist(counts), ignore.strand = TRUE)
+    nearSig <- distanceToNearest(picked, unlist(sigGenes), ignore.strand = TRUE)
+    list(
+        width_der = derWidth,
+        distance_nearest_sum = summary(mcols(nearAny)$distance),
+        distance_nearest_sig_sum = summary(mcols(nearSig)$distance),
+        distance_nearest = nearAny,
+        distance_nearest_sig = nearSig,
+        i = i
+    )
+}
+
+## Explore mismatched cases for genes vs DERs direction
+## Summarise the genes whose significance disagrees with the DERs they
+## overlap (genes are the query, FWER-significant DERs the subject).
+##
+## case = 'FALSE-TRUE': gene not significant, but overlaps a significant DER.
+## case = 'TRUE-FALSE': gene significant, but overlaps no significant DER.
+## Any other value stops with 'invalid case'.
+##
+## Returns the string "No such cases" when nothing matches, otherwise a list
+## whose last element 'i' indexes into the width-filtered 'counts'.
+explore_ov_counts <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) {
+    ## Genes whose summed width is below minov cannot reach the minimum overlap
+    counts <- counts[sapply(width(counts), sum) >= minov]
+
+    falseTrue <- case == 'FALSE-TRUE'
+    if(!falseTrue && case != 'TRUE-FALSE') stop('invalid case')
+
+    sigDers <- ders[ders$sigFWER]
+    hitsSigDer <- countOverlaps(counts, sigDers, minoverlap = minov) > 0
+    geneSig <- mcols(counts)$sig
+    i <- if(falseTrue) which(hitsSigDer & !geneSig) else which(!hitsSigDer & geneSig)
+    if(length(i) == 0) return("No such cases")
+
+    picked <- counts[i]
+    geneWidth <- summary(sapply(width(picked), sum))
+
+    if(falseTrue) {
+        ## Which significant DER each selected gene lands on
+        ovPicked <- findOverlaps(picked, sigDers, minoverlap = minov)
+        return(list(
+            n_overlaps = table(countOverlaps(picked, sigDers, minoverlap = minov)),
+            width_gene = geneWidth,
+            genes_per_der_table = table(table(subjectHits(ovPicked))),
+            genes_per_der = sort(table(subjectHits(ovPicked))),
+            i = i
+        ))
+    }
+
+    ## Distances are strand-agnostic: once to any DER, once to significant DERs only
+    flat <- unlist(picked)
+    nearAny <- distanceToNearest(flat, ders, ignore.strand = TRUE)
+    nearSig <- distanceToNearest(flat, sigDers, ignore.strand = TRUE)
+    list(
+        width_gene = geneWidth,
+        distance_nearest_sum = summary(mcols(nearAny)$distance),
+        distance_nearest_sig_sum = summary(mcols(nearSig)$distance),
+        distance_nearest = nearAny,
+        distance_nearest_sig = nearSig,
+        i = i
+    )
+}
+
+## Drop the NA entries of x, keeping everything else in its original order.
+noNA <- function(x) {
+    present <- !is.na(x)
+    x[present]
+}
+
+

DESeq2

+

Query: DERs

+

We can first compare the results by using the DERs as the query and the genes as the subject. The following output shows the comparison using all DERs and exploring the mismatched cases. Then it's repeated using the DERs  ≥  20 bp and a minimum overlap of 20bp.

+

For the mismatched cases of non-significant DERs overlapping a significant gene, we check:

+
    +
  • how many genes each DER overlaps,
  • +
  • the width of the DERs
  • +
  • the frequency table of how many DERs overlap the same gene
  • +
+

For the other mismatched case, we check:

+
    +
  • the width of the DERs
  • +
  • distance to nearest gene (regardless of gene size)
  • +
  • distance to nearest significant DE gene (ibidem)
  • +
+
+ +
## Overlap between DERs and significant DE genes
+ov_table(fullRegions, deseq)
+ +
##                       Overlaps sig DE gene
+## Significant DER (FWER) FALSE  TRUE   Sum
+##                  FALSE 13285 18178 31463
+##                  TRUE    285   230   515
+##                  Sum   13570 18408 31978
+
+ +
## Explore mismatched cases
+#noNA(explore_ov(fullRegions, deseq)[1:3])
+#noNA(explore_ov(fullRegions, deseq, 'TRUE-FALSE')[1:3])
+
+## Min 20 bp overlap, using only DERs 20 bp long
+ov_table(fullRegs20, deseq, minov = 20L)
+ +
##                       Overlaps sig DE gene (min 20bp)
+## Significant DER (FWER) FALSE TRUE  Sum
+##                  FALSE  1026 1252 2278
+##                  TRUE    285  230  515
+##                  Sum    1311 1482 2793
+
+ +
## Explore mismatched cases, min 20bp overlap
+noNA(explore_ov(fullRegs20, deseq, minov = 20L)[1:3])
+ +
## $n_overlaps
+## 
+##    1    2 
+## 1245    7 
+## 
+## $width_der
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   20.00   23.00   29.00   30.05   36.00   62.00 
+## 
+## $ders_per_gene_table
+## 
+##   1   2   3   4   5   6   7   8   9  10  21 
+## 528 127  52  24  14  10   3   3   1   2   1
+
+ +
noNA(explore_ov(fullRegs20, deseq, 'TRUE-FALSE', minov = 20L)[1:3])
+ +
## $width_der
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   29.00   47.00   59.00   66.76   77.00  182.00 
+## 
+## $distance_nearest_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##       0       0     933   16980   16990 1385000 
+## 
+## $distance_nearest_sig_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##      54    5925   23620   83210   65500 2651000
+
+ +
## Holm vs BH
+addmargins(table('DESeq2 Holm' = mcols(deseq_holm)$sig, 'DESeq2 BH' = mcols(deseq)$sig))
+ +
##            DESeq2 BH
+## DESeq2 Holm FALSE  TRUE   Sum
+##       FALSE 14042  7463 21505
+##       TRUE      0   311   311
+##       Sum   14042  7774 21816
+
+ +
## Use Holm and min 20 bp ov
+ov_table(fullRegs20, deseq_holm, minov = 20L)
+ +
##                       Overlaps sig DE gene (min 20bp)
+## Significant DER (FWER) FALSE TRUE  Sum
+##                  FALSE  2057  221 2278
+##                  TRUE    442   73  515
+##                  Sum    2499  294 2793
+
+
+

Most of the DERs are shorter than 20bp (91.27 percent), so we'll focus on the longer ones. The majority of the mismatches are from non significant DERs that overlap a significant gene.

+

As expected, when controlling the FWER instead of the FDR, most of the DE genes are no longer significant. Using FWER-controlled DE genes, most of the DERs 20bp or longer agree with the genes as not being significantly DE.

+

Query: genes

+

We can now repeat the comparison using the genes as the query and the DERs as the subject.

+

For the mismatched cases of non-significant genes overlapping a significant DER, we check:

+
    +
  • how many DERs each gene overlaps,
  • +
  • the width of the genes
  • +
  • the frequency table of how many genes overlap the same DER
  • +
+

For the other mismatched case, we check:

+
    +
  • the width of the genes
  • +
  • distance to nearest DER (regardless of DER size)
  • +
  • distance to nearest significant DER (ibidem)
  • +
+
+ +
## Overlap between genes and significant DERs
+#ov_table(fullRegions, deseq, 'counts')
+
+## Explore mismatched cases
+#noNA(explore_ov_counts(fullRegions, deseq)[1:3])
+#noNA(explore_ov_counts(fullRegions, deseq, 'TRUE-FALSE')[1:3])
+
+## Overlap between genes and significant DERs, min 20 bp
+ov_table(fullRegions, deseq, 'counts', 20L)
+ +
##                    Overlaps sig DER (FWER, min 20bp)
+## Significant DE gene FALSE  TRUE   Sum
+##               FALSE 13940   102 14042
+##               TRUE   7598   176  7774
+##               Sum   21538   278 21816
+
+ +
## Explore mismatched cases
+noNA(explore_ov_counts(fullRegions, deseq, minov = 20L)[1:3])
+ +
## $n_overlaps
+## 
+##  1  2  3  4  6 11 
+## 73 21  4  2  1  1 
+## 
+## $width_gene
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##     493    1488    3452    4354    7060   13630 
+## 
+## $genes_per_der_table
+## 
+##   1  20 
+## 132   1
+
+ +
noNA(explore_ov_counts(fullRegions, deseq, 'TRUE-FALSE', minov = 20L)[1:3])
+ +
## $width_gene
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##      54    2386    3770    4722    5941  118100 
+## 
+## $distance_nearest_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##       0    6176   31160  138800  119100 5384000 
+## 
+## $distance_nearest_sig_sum
+##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
+##       52   759100  2443000  5405000  6433000 48690000
+
+ +
## Now with Holm
+ov_table(fullRegions, deseq_holm, 'counts', 20L)
+ +
##                    Overlaps sig DER (FWER, min 20bp)
+## Significant DE gene FALSE  TRUE   Sum
+##               FALSE 21277   228 21505
+##               TRUE    261    50   311
+##               Sum   21538   278 21816
+
+
+

From these results, we can see that derfinder is more conservative.

+

edgeR-robust

+

Query: DERs

+

Similar comparison using DERs as query and genes as subject with edgeR-robust results.

+
+ +
## Overlap between DERs and significant DE genes
+#ov_table(fullRegions, edger)
+
+## Explore mismatched cases
+#noNA(explore_ov(fullRegions, edger)[1:3])
+#noNA(explore_ov(fullRegions, edger, 'TRUE-FALSE')[1:3])
+
+## Min 20 bp overlap, using only DERs 20 bp long
+ov_table(fullRegs20, edger, minov = 20L)
+ +
##                       Overlaps sig DE gene (min 20bp)
+## Significant DER (FWER) FALSE TRUE  Sum
+##                  FALSE   911 1367 2278
+##                  TRUE    227  288  515
+##                  Sum    1138 1655 2793
+
+ +
## Explore mismatched cases, min 20bp overlap
+noNA(explore_ov(fullRegs20, edger, minov = 20L)[1:3])
+ +
## $n_overlaps
+## 
+##    1    2 
+## 1354   13 
+## 
+## $width_der
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   20.00   23.00   29.00   30.19   36.00   62.00 
+## 
+## $ders_per_gene_table
+## 
+##   1   2   3   4   5   6   7   8   9  10  21 
+## 563 136  61  27  16  12   4   3   1   2   1
+
+ +
noNA(explore_ov(fullRegs20, edger, 'TRUE-FALSE', minov = 20L)[1:3])
+ +
## $width_der
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   32.00   46.00   58.00   64.83   74.00  174.00 
+## 
+## $distance_nearest_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##       0       0    6121   21320   20470 1385000 
+## 
+## $distance_nearest_sig_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##      69    7492   24510   74750   43900 2089000
+
+ +
## Holm vs BH
+addmargins(table('edgeR Holm' = mcols(edger_holm)$sig, 'edger BH' = mcols(edger)$sig))
+ +
##           edger BH
+## edgeR Holm FALSE  TRUE   Sum
+##      FALSE 13397  7887 21284
+##      TRUE      0   532   532
+##      Sum   13397  8419 21816
+
+ +
## With Holm, 20bp
+ov_table(fullRegs20, edger_holm, minov = 20L)
+ +
##                       Overlaps sig DE gene (min 20bp)
+## Significant DER (FWER) FALSE TRUE  Sum
+##                  FALSE  2067  211 2278
+##                  TRUE    442   73  515
+##                  Sum    2509  284 2793
+
+
+

The results are fairly similar to those from using DESeq2.

+

Query: genes

+

Similar comparison using genes as query and DERs as subject with edgeR-robust results.

+
+ +
## Overlap between genes and significant DERs
+#ov_table(fullRegions, edger, 'counts')
+
+## Explore mismatched cases
+#noNA(explore_ov_counts(fullRegions, edger)[1:3])
+#noNA(explore_ov_counts(fullRegions, edger, 'TRUE-FALSE')[1:3])
+
+## Overlap between genes and significant DERs, min 20 bp
+ov_table(fullRegions, edger, 'counts', 20L)
+ +
##                    Overlaps sig DER (FWER, min 20bp)
+## Significant DE gene FALSE  TRUE   Sum
+##               FALSE 13325    72 13397
+##               TRUE   8213   206  8419
+##               Sum   21538   278 21816
+
+ +
## Explore mismatched cases
+noNA(explore_ov_counts(fullRegions, edger, minov = 20L)[1:3])
+ +
## $n_overlaps
+## 
+##  1  2  3  4 
+## 57 11  2  2 
+## 
+## $width_gene
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##     493    1579    3758    4619    7095   13630 
+## 
+## $genes_per_der_table
+## 
+##  1 19 
+## 74  1
+
+ +
noNA(explore_ov_counts(fullRegions, edger, 'TRUE-FALSE', minov = 20L)[1:3])
+ +
## $width_gene
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##      46    2282    3631    4514    5667  118100 
+## 
+## $distance_nearest_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##       0    5977   31080  141900  122200 5384000 
+## 
+## $distance_nearest_sig_sum
+##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
+##       52   737100  2360000  5379000  6422000 48690000
+
+ +
## With Holm, 20 bp
+ov_table(fullRegions, edger_holm, 'counts', 20L)
+ +
##                    Overlaps sig DER (FWER, min 20bp)
+## Significant DE gene FALSE  TRUE   Sum
+##               FALSE 21059   225 21284
+##               TRUE    479    53   532
+##               Sum   21538   278 21816
+
+
+

overall

+

While the DERs vs genes results are fairly similar between edgeR-robust and DESeq2, as shown below the number of mismatched cases is high compared to the number of cases where both counts-based methods agree. This is also true when controlling the FWER to determine significance.

+
+ +
## edgeR vs DESeq2
+addmargins(table('edgeR-robust (FDR)' = mcols(edger)$sig, 'DESeq2 (FDR)' = mcols(deseq)$sig))
+ +
##                   DESeq2 (FDR)
+## edgeR-robust (FDR) FALSE  TRUE   Sum
+##              FALSE 12877   520 13397
+##              TRUE   1165  7254  8419
+##              Sum   14042  7774 21816
+
+ +
## Control FWER
+addmargins(table('edgeR-robust (FWER)' = mcols(edger_holm)$sig, 'DESeq2 (FWER)' = mcols(deseq_holm)$sig))
+ +
##                    DESeq2 (FWER)
+## edgeR-robust (FWER) FALSE  TRUE   Sum
+##               FALSE 21234    50 21284
+##               TRUE    271   261   532
+##               Sum   21505   311 21816
+
+ +
## Only sig if both edgeR and DEseq2 say it is
+both <- deseq
+mcols(both)$sig <- mcols(both)$sig & mcols(edger)$sig
+
+## Same, for holm
+both_holm <- deseq_holm
+mcols(both_holm)$sig <- mcols(both_holm)$sig & mcols(edger_holm)$sig
+
+

We can consider a gene to be DE only if both edgeR-robust and DESeq2 find that it is significantly DE. The next sections use this information.

+

Query: DERs

+
+ +
## Overlap between DERs and significant DE genes
+#ov_table(fullRegions, both)
+
+## Explore mismatched cases
+#noNA(explore_ov(fullRegions, both)[1:3])
+#noNA(explore_ov(fullRegions, both, 'TRUE-FALSE')[1:3])
+
+## Min 20 bp overlap, using only DERs 20 bp long
+ov_table(fullRegs20, both, minov = 20L)
+ +
##                       Overlaps sig DE gene (min 20bp)
+## Significant DER (FWER) FALSE TRUE  Sum
+##                  FALSE  1063 1215 2278
+##                  TRUE    286  229  515
+##                  Sum    1349 1444 2793
+
+ +
## Explore mismatched cases, min 20bp overlap
+noNA(explore_ov(fullRegs20, both, minov = 20L)[1:3])
+ +
## $n_overlaps
+## 
+##    1    2 
+## 1208    7 
+## 
+## $width_der
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   20.00   23.00   29.00   30.15   36.00   62.00 
+## 
+## $ders_per_gene_table
+## 
+##   1   2   3   4   5   6   7   8   9  10  21 
+## 513 121  52  23  14   9   3   3   1   2   1
+
+ +
noNA(explore_ov(fullRegs20, both, 'TRUE-FALSE', minov = 20L)[1:3])
+ +
## $width_der
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   29.00   47.00   59.50   66.84   77.00  182.00 
+## 
+## $distance_nearest_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##       0       0     924   16930   16970 1385000 
+## 
+## $distance_nearest_sig_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##      54    7028   26650   89060   72640 2651000
+
+ +
## Holm vs BH
+addmargins(table('Both Holm' = mcols(both_holm)$sig, 'Both BH' = mcols(both)$sig))
+ +
##          Both BH
+## Both Holm FALSE  TRUE   Sum
+##     FALSE 14562  6993 21555
+##     TRUE      0   261   261
+##     Sum   14562  7254 21816
+
+ +
## Use Holm and min 20 bp ov
+ov_table(fullRegs20, both_holm, minov = 20L)
+ +
##                       Overlaps sig DE gene (min 20bp)
+## Significant DER (FWER) FALSE TRUE  Sum
+##                  FALSE  2109  169 2278
+##                  TRUE    454   61  515
+##                  Sum    2563  230 2793
+
+
+

The trends observed previously are maintained in this comparison with a reduction of cases where the gene is DE. This is expected due to the non-perfect agreement between DESeq2 and edgeR-robust.

+
+ +
library('TxDb.Hsapiens.UCSC.hg19.knownGene')
+library('derfinder')
+ +
## Find out what's changed in derfinder with
+## news(Version == "1.1.17", package = "derfinder")
+
+ +
library('derfinderHelper')
+library('derfinderPlot')
+ +
## Find out what's changed in derfinderPlot with
+## news(Version == "1.1.6", package = "derfinderPlot")
+
+ +
load('../derAnalysis/run3-v1.0.10/models.Rdata')
+load('../derAnalysis/run3-v1.0.10/chr22/optionsStats.Rdata')
+load("../CoverageInfo/fullCov.Rdata")
+
+def.par <- par()
+def.par <- def.par[-which(names(def.par) %in% c('cin', 'cra', 'csi', 'cxy', 'din', 'page'))]
+
+## Plot the coverage of one region and overlay a track of per-base
+## F-statistics underneath it.
+##
+## region: a single genomic range (start(), end() and seqnames() are read).
+## title:  label passed to plotRegionCoverage() as the 'name' column.
+##
+## Reads these objects from the calling environment rather than taking them
+## as arguments: fullCov, colsubset, groupInfo, models, optionsStats,
+## GenomicState.Hsapiens.UCSC.hg19.knownGene, def.par and noNA().
+## Called for its plotting side effects; draws on the active device.
+regPlot <- function(region, title) {
+    ## Extract the per-base coverage of the region (input for the F-stats)
+    range <- start(region):end(region)
+    dat <- fullCov[[as.character(seqnames(region))]][range, colsubset]
+
+    ## Log2 transform with an offset of 32; loops over as many columns as
+    ## groupInfo has entries (presumably one per sample -- TODO confirm)
+    for(i in seq_len(length(groupInfo))) dat[[i]] <- log2(dat[[i]] + 32) 
+
+    ## Calculate f-stats comparing models$mod against models$mod0
+    fstats <- as.numeric(fstats.apply(data = dat, mod = models$mod, mod0 = models$mod0))
+
+    ## Find annotation and keep the first non-NA gene symbol
+    annoReg <- annotateRegions(region, GenomicState.Hsapiens.UCSC.hg19.knownGene$fullGenome, verbose = FALSE)
+    symbol <- mcols(annoReg$annotationList[[1]])$symbol
+    symbol <- as.character(noNA(symbol)[[1]])
+    if(length(symbol) > 1) symbol <- symbol[1]
+    ## NOTE(review): after as.character() the value is never NULL, so the
+    ## is.null() test is always FALSE; presumably meant as an NA fallback
+    ## when no symbol is found -- confirm before changing
+    symbol <- ifelse(is.null(symbol), NA, symbol)
+    ## Remove symbol name because it gets chomped on the plot
+    mcols(annoReg$annotationList[[1]])$symbol <- NA
+    
+    ## Reset graphical parameters to the def.par snapshot, since the F-stat
+    ## track below changes fig/oma/mar
+    par(def.par)
+
+    ## Plot the region coverage; the symbol is shown through the 'region' column
+    plotRegionCoverage(region, getRegionCoverage(fullCov, region, verbose = FALSE), groupInfo, data.frame(name = title, distance = NA, region = symbol), annoReg, verbose = FALSE, ask = FALSE, txdb = TxDb.Hsapiens.UCSC.hg19.knownGene)
+
+    ## Add F-stat track: new = TRUE draws into a thin horizontal strip
+    ## (fig) of the same page without starting a new plot
+    par(fig = c(0, 1, 0.065, 0.125), new = TRUE, xaxt = 'n', oma = c(0, 0, 0, 0), mar = c(0, 4.5, 0, 1.1))
+    ## ylim covers both the finite F-stats and the cutoff so the cutoff line stays visible
+    plot(y = fstats, x = range, ylab = 'F-stat', type = 'l', xlab = '', bty = 'n', ylim = c(0, max(fstats[is.finite(fstats)], optionsStats$cutoffFstatUsed) * 1.1), las = 2, yaxt = 'n')
+    y.max <- round(max(c(optionsStats$cutoffFstatUsed, fstats[is.finite(fstats)]), na.rm = TRUE), 0)
+    ## Three ticks only: 0, half of the maximum and the maximum
+    axis(2, at = c(0, round(y.max / 2, 0), y.max), c(0, round(y.max / 2, 0), y.max), las = 2, tick = TRUE)
+    ## Red line marks the F-stat cutoff used in the analysis; grey marks zero
+    abline(h = optionsStats$cutoffFstatUsed, col = 'red')
+    abline(h = 0, col = 'grey')
+}
+
+## Reorder 'regions' from widest to narrowest.
+sortWidth <- function(regions) {
+    widestFirst <- order(width(regions), decreasing = TRUE)
+    regions[widestFirst]
+}
+
+

We can now make plots to explore some DERs for each of the cases.

+
+ +
## Plot up to 10 of the widest DERs (>= 20 bp) for each of the four
+## agreement / disagreement cases between DER significance and overlap with
+## a gene called DE by both counts-based methods.
+##
+## Reads fullRegs20 and both from the calling environment and draws through
+## regPlot(). Returns the result of the last sapply() call, as before.
+query_der_plots <- function() {
+    ## Same overlap test for all four cases, so compute it once
+    ovSigGene <- countOverlaps(fullRegs20, both[mcols(both)$sig], minoverlap = 20L) > 0
+    sigDer <- fullRegs20$sigFWER
+
+    ## head() instead of [1:10] so a case with fewer than 10 DERs does not fail
+    plotTop <- function(keep, title) {
+        sapply(head(sortWidth(fullRegs20[keep]), 10), function(reg) {
+            regPlot(reg, title)
+        })
+    }
+
+    plotTop(ovSigGene & sigDer, 'DER query: DE agreement')
+    plotTop(!ovSigGene & !sigDer, 'DER query: not DE agreement')
+    plotTop(!ovSigGene & sigDer, 'DER query: only gene not DE')
+    plotTop(ovSigGene & !sigDer, 'DER query: only gene DE')
+}
+pdf(file = 'query_der_plots_gene.pdf', width = 10, height = 7)
+query_der_plots()
+ +
## $`535`
+## NULL
+## 
+## $`540`
+## NULL
+## 
+## $`625`
+## NULL
+## 
+## $`594`
+## NULL
+## 
+## $`589`
+## NULL
+## 
+## $`574`
+## NULL
+## 
+## $`610`
+## NULL
+## 
+## $`616`
+## NULL
+## 
+## $`626`
+## NULL
+## 
+## $`533`
+## NULL
+
+ +
dev.off()
+ +
## pdf 
+##   2
+
+ +
query_der_plots()
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+ +
## $`535`
+## NULL
+## 
+## $`540`
+## NULL
+## 
+## $`625`
+## NULL
+## 
+## $`594`
+## NULL
+## 
+## $`589`
+## NULL
+## 
+## $`574`
+## NULL
+## 
+## $`610`
+## NULL
+## 
+## $`616`
+## NULL
+## 
+## $`626`
+## NULL
+## 
+## $`533`
+## NULL
+
+
+

Query: genes

+

As was shown with either DESeq2 or edgeR-robust results, derfinder is more conservative than the counts-based methods.

+
+ +
## Overlap between genes and significant DERs, min 20 bp
+ov_table(fullRegions, both, 'counts', 20L)
+ +
##                    Overlaps sig DER (FWER, min 20bp)
+## Significant DE gene FALSE  TRUE   Sum
+##               FALSE 14459   103 14562
+##               TRUE   7079   175  7254
+##               Sum   21538   278 21816
+
+ +
## Explore mismatched cases
+noNA(explore_ov_counts(fullRegions, both, minov = 20L)[1:3])
+ +
## $n_overlaps
+## 
+##  1  2  3  4  6 11 
+## 74 21  4  2  1  1 
+## 
+## $width_gene
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##     493    1507    3495    4369    7055   13630 
+## 
+## $genes_per_der_table
+## 
+##   1  20 
+## 133   1
+
+ +
noNA(explore_ov_counts(fullRegions, both, 'TRUE-FALSE', minov = 20L)[1:3])
+ +
## $width_gene
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##      54    2355    3709    4636    5822  118100 
+## 
+## $distance_nearest_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##       0    5670   29530  137500  115400 5384000 
+## 
+## $distance_nearest_sig_sum
+##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
+##       52   756600  2439000  5465000  6498000 48690000
+
+ +
## With Holm, 20 bp
+ov_table(fullRegions, both_holm, 'counts', 20L)
+ +
##                    Overlaps sig DER (FWER, min 20bp)
+## Significant DE gene FALSE  TRUE   Sum
+##               FALSE 21321   234 21555
+##               TRUE    217    44   261
+##               Sum   21538   278 21816
+
+
+

We can now visually explore some genes (maximum 10kb long) for each of the four cases.

+
+ +
max_kb <- function(x, limit = 1e4) {
+    ## Keep only the elements of x whose width() is at most `limit`
+    ## (the default of 1e4 matches the "maximum 10kb long" filter in the text)
+    short_enough <- width(x) <= limit
+    x[short_enough]
+}
+
+selectRegions <- function(x) {
+    ## Take range() of x and flatten it with unlist(), then order the result
+    ## with sortWidth() (defined elsewhere in the report) and drop whatever
+    ## max_kb() rejects with its default limit
+    spans <- unlist(range(x))
+    max_kb(sortWidth(spans))
+}
+
+query_gene_plots <- function() {
+    ## Draw up to 10 regions per case, via regPlot(), for the four combinations
+    ## of "gene is significant" x "gene overlaps a significant DER by >= 20 bp".
+    ## Reads `both` and `fullRegions` from the enclosing environment.
+    ## Returns the sapply() result of the last case, as before.
+
+    ## Masks shared by all four cases: computed once instead of once per case
+    wide_enough <- sapply(width(both), sum) >= 20
+    gene_sig <- mcols(both)$sig
+    hits_sig_der <- countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) > 0
+
+    plot_case <- function(keep, title) {
+        ## head() rather than [1:10]: a case with fewer than 10 regions is
+        ## plotted in full instead of failing on out-of-bounds indices
+        sapply(head(selectRegions(both[keep]), 10), function(reg) {
+            regPlot(reg, title)
+        })
+    }
+
+    plot_case(wide_enough & gene_sig & hits_sig_der, 'Gene query: DE agreement')
+
+    plot_case(wide_enough & !gene_sig & !hits_sig_der, 'Gene query: not DE agreement')
+
+    plot_case(wide_enough & !gene_sig & hits_sig_der, 'Gene query: only gene not DE')
+
+    plot_case(wide_enough & gene_sig & !hits_sig_der, 'Gene query: only gene DE')
+}
+pdf(file = 'query_gene_plots.pdf', width = 10, height = 7)
+query_gene_plots()
+ +
## $`401505`
+## NULL
+## 
+## $`1581`
+## NULL
+## 
+## $`26168`
+## NULL
+## 
+## $`55299`
+## NULL
+## 
+## $`4708`
+## NULL
+## 
+## $`2524`
+## NULL
+## 
+## $`112703`
+## NULL
+## 
+## $`100506746`
+## NULL
+## 
+## $`54989`
+## NULL
+## 
+## $`363`
+## NULL
+
+ +
dev.off()
+ +
## pdf 
+##   2
+
+ +
query_gene_plots()
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+ +
## $`401505`
+## NULL
+## 
+## $`1581`
+## NULL
+## 
+## $`26168`
+## NULL
+## 
+## $`55299`
+## NULL
+## 
+## $`4708`
+## NULL
+## 
+## $`2524`
+## NULL
+## 
+## $`112703`
+## NULL
+## 
+## $`100506746`
+## NULL
+## 
+## $`54989`
+## NULL
+## 
+## $`363`
+## NULL
+
+
+

Reproducibility

+

Date the report was generated.

+
+ +
## [1] "2015-03-30 21:24:43 EDT"
+
+
+

Wallclock time spent generating the report.

+
+ +
## Time difference of 29.945 mins
+
+
+

R session information.

+
+ +
## Session info-----------------------------------------------------------------------------------------------------------
+
+ +
##  setting  value                                             
+##  version  R Under development (unstable) (2014-11-01 r66923)
+##  system   x86_64, darwin10.8.0                              
+##  ui       X11                                               
+##  language (EN)                                              
+##  collate  en_US.UTF-8                                       
+##  tz       America/New_York
+
+ +
## Packages---------------------------------------------------------------------------------------------------------------
+
+ +
##  package                           * version     date       source                                    
+##  acepack                             1.3.3.3     2013-05-03 CRAN (R 3.2.0)                            
+##  annotate                            1.45.4      2015-03-21 Bioconductor                              
+##  AnnotationDbi                     * 1.29.20     2015-03-19 Bioconductor                              
+##  Biobase                           * 2.27.3      2015-03-27 Bioconductor                              
+##  BiocGenerics                      * 0.13.10     2015-03-27 Bioconductor                              
+##  BiocParallel                        1.1.21      2015-03-24 Bioconductor                              
+##  biomaRt                             2.23.5      2014-11-22 Bioconductor                              
+##  Biostrings                          2.35.12     2015-03-26 Bioconductor                              
+##  biovizBase                          1.15.3      2015-03-30 Bioconductor                              
+##  bitops                              1.0.6       2013-08-17 CRAN (R 3.2.0)                            
+##  BSgenome                            1.35.20     2015-03-27 Bioconductor                              
+##  bumphunter                          1.7.6       2015-03-13 Github (lcolladotor/bumphunter@37d10e7)   
+##  cluster                             2.0.1       2015-01-31 CRAN (R 3.2.0)                            
+##  codetools                           0.2.11      2015-03-10 CRAN (R 3.2.0)                            
+##  colorout                          * 1.0.2       2014-11-03 local                                     
+##  colorspace                          1.2.6       2015-03-11 CRAN (R 3.2.0)                            
+##  DBI                                 0.3.1       2014-09-24 CRAN (R 3.2.0)                            
+##  derfinder                         * 1.1.17      2015-03-14 Github (lcolladotor/derfinder@3532e0c)    
+##  derfinderHelper                   * 1.1.6       2015-03-15 Bioconductor                              
+##  derfinderPlot                     * 1.1.6       2015-03-14 Github (lcolladotor/derfinderPlot@1319754)
+##  DESeq2                            * 1.7.45      2015-03-25 Bioconductor                              
+##  devtools                            1.6.1       2014-10-07 CRAN (R 3.2.0)                            
+##  dichromat                           2.0.0       2013-01-24 CRAN (R 3.2.0)                            
+##  digest                              0.6.8       2014-12-31 CRAN (R 3.2.0)                            
+##  doRNG                               1.6         2014-03-07 CRAN (R 3.2.0)                            
+##  edgeR                             * 3.9.14      2015-03-27 Bioconductor                              
+##  evaluate                            0.5.5       2014-04-29 CRAN (R 3.2.0)                            
+##  foreach                             1.4.2       2014-04-11 CRAN (R 3.2.0)                            
+##  foreign                             0.8.63      2015-02-20 CRAN (R 3.2.0)                            
+##  formatR                             1.0         2014-08-25 CRAN (R 3.2.0)                            
+##  Formula                             1.2.0       2015-01-20 CRAN (R 3.2.0)                            
+##  futile.logger                       1.4         2015-03-21 CRAN (R 3.2.0)                            
+##  futile.options                      1.0.0       2010-04-06 CRAN (R 3.2.0)                            
+##  genefilter                          1.49.2      2014-10-21 Bioconductor                              
+##  geneplotter                         1.45.0      2014-10-14 Bioconductor                              
+##  GenomeInfoDb                      * 1.3.16      2015-03-27 Bioconductor                              
+##  GenomicAlignments                   1.3.32      2015-03-18 Bioconductor                              
+##  GenomicFeatures                   * 1.19.36     2015-03-30 Bioconductor                              
+##  GenomicFiles                        1.3.14      2015-03-07 Bioconductor                              
+##  GenomicRanges                     * 1.19.48     2015-03-27 Bioconductor                              
+##  GGally                              0.4.8       2014-08-26 CRAN (R 3.2.0)                            
+##  ggbio                               1.15.2      2015-03-24 Bioconductor                              
+##  ggplot2                             1.0.0       2014-05-21 CRAN (R 3.2.0)                            
+##  graph                               1.45.2      2015-03-01 Bioconductor                              
+##  gridExtra                           0.9.1       2012-08-09 CRAN (R 3.2.0)                            
+##  gtable                              0.1.2       2012-12-05 CRAN (R 3.2.0)                            
+##  Hmisc                               3.14.5      2014-09-12 CRAN (R 3.2.0)                            
+##  htmltools                           0.2.6       2014-09-08 CRAN (R 3.2.0)                            
+##  IRanges                           * 2.1.43      2015-03-07 Bioconductor                              
+##  iterators                           1.0.7       2014-04-11 CRAN (R 3.2.0)                            
+##  knitr                               1.7         2014-10-13 CRAN (R 3.2.0)                            
+##  knitrBootstrap                      1.0.0       2014-11-03 Github (jimhester/knitrBootstrap@76c41f0) 
+##  lambda.r                            1.1.7       2015-03-20 CRAN (R 3.2.0)                            
+##  lattice                             0.20.30     2015-02-22 CRAN (R 3.2.0)                            
+##  latticeExtra                        0.6.26      2013-08-15 CRAN (R 3.2.0)                            
+##  limma                             * 3.23.11     2015-03-15 Bioconductor                              
+##  locfit                              1.5.9.1     2013-04-20 CRAN (R 3.2.0)                            
+##  markdown                            0.7.4       2014-08-24 CRAN (R 3.2.0)                            
+##  MASS                                7.3.40      2015-03-21 CRAN (R 3.2.0)                            
+##  Matrix                              1.1.5.1     2015-03-23 CRAN (R 3.2.0)                            
+##  matrixStats                         0.14.0      2015-02-14 CRAN (R 3.2.0)                            
+##  mime                                0.3         2015-03-29 CRAN (R 3.2.0)                            
+##  munsell                             0.4.2       2013-07-11 CRAN (R 3.2.0)                            
+##  nnet                                7.3.9       2015-02-11 CRAN (R 3.2.0)                            
+##  OrganismDbi                         1.9.15      2015-03-30 Bioconductor                              
+##  pkgmaker                            0.22        2014-05-14 CRAN (R 3.2.0)                            
+##  plyr                                1.8.1       2014-02-26 CRAN (R 3.2.0)                            
+##  proto                               0.3.10      2012-12-22 CRAN (R 3.2.0)                            
+##  qvalue                              1.99.0      2015-03-30 Bioconductor                              
+##  RBGL                                1.43.0      2014-10-14 Bioconductor                              
+##  RColorBrewer                        1.1.2       2014-12-07 CRAN (R 3.2.0)                            
+##  Rcpp                              * 0.11.5      2015-03-06 CRAN (R 3.2.0)                            
+##  RcppArmadillo                     * 0.4.650.1.1 2015-02-26 CRAN (R 3.2.0)                            
+##  RCurl                               1.95.4.5    2014-12-28 CRAN (R 3.2.0)                            
+##  registry                            0.2         2012-01-24 CRAN (R 3.2.0)                            
+##  reshape                             0.8.5       2014-04-23 CRAN (R 3.2.0)                            
+##  reshape2                            1.4.1       2014-12-06 CRAN (R 3.2.0)                            
+##  rmarkdown                           0.3.3       2014-09-17 CRAN (R 3.2.0)                            
+##  rngtools                            1.2.4       2014-03-06 CRAN (R 3.2.0)                            
+##  rpart                               4.1.9       2015-02-24 CRAN (R 3.2.0)                            
+##  Rsamtools                           1.19.49     2015-03-27 Bioconductor                              
+##  RSQLite                             1.0.0       2014-10-25 CRAN (R 3.2.0)                            
+##  rstudioapi                          0.2         2014-12-31 CRAN (R 3.2.0)                            
+##  rtracklayer                         1.27.10     2015-03-27 Bioconductor                              
+##  S4Vectors                         * 0.5.22      2015-03-06 Bioconductor                              
+##  scales                              0.2.4       2014-04-22 CRAN (R 3.2.0)                            
+##  stringr                             0.6.2       2012-12-06 CRAN (R 3.2.0)                            
+##  survival                            2.38.1      2015-02-24 CRAN (R 3.2.0)                            
+##  TxDb.Hsapiens.UCSC.hg19.knownGene * 3.0.0       2014-09-26 Bioconductor                              
+##  VariantAnnotation                   1.13.46     2015-03-26 Bioconductor                              
+##  XML                                 3.98.1.1    2013-06-20 CRAN (R 3.2.0)                            
+##  xtable                              1.7.4       2014-09-12 CRAN (R 3.2.0)                            
+##  XVector                           * 0.7.4       2015-02-08 Bioconductor                              
+##  yaml                                2.1.13      2014-06-12 CRAN (R 3.2.0)                            
+##  zlibbioc                            1.13.3      2015-03-23 Bioconductor
+
+
+
+
+ + +
+
+ +
+ + +
+ + diff --git a/hippo/pnas/compareVsPNAS.Rmd b/hippo/pnas/compareVsPNAS.Rmd new file mode 100644 index 0000000..3796d4b --- /dev/null +++ b/hippo/pnas/compareVsPNAS.Rmd @@ -0,0 +1,770 @@ +--- +output: + knitrBootstrap::bootstrap_document: + theme.chooser: TRUE + highlight.chooser: TRUE +--- + +Compare vs PNAS +=============== + + +# Counts-based analysis + +This section has the code for running `edgeR-robust` and `DESeq2` on the simulation data set using the known exons as features. + + +This first code chunk loads the necessary data. + + +```{r 'setup', bootstrap.show.code = FALSE, bootstrap.show.message = FALSE} +## Track time spent on making the report +startTime <- Sys.time() + +library('edgeR') +library('DESeq2') +library('GenomicRanges') + +## Load data +load("../coverageToExon/covToEx-ucsc.Rdata") +load("../derAnalysis/run3-v1.0.10/groupInfo.Rdata") +load("../derAnalysis/run3-v1.0.10/colsubset.Rdata") + +## GenomicState object +if(file.exists('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')) { + load('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda') +} else if(file.exists('../../GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')) { + load('../../GenomicState.Hsapiens.UCSC.hg19.knownGene.rda') +} else { + stop('Missing UCSC hg19 genomic state object') +} + +## Annotation used +exons <- GenomicState.Hsapiens.UCSC.hg19.knownGene$fullGenome +exons <- exons[exons$theRegion == 'exon'] + +## Round matrix and remove exons with 0s +counts <- round(covToEx[, colsubset]) +nonzero <- sapply(rowSums(counts), function(x) {x > 0}) +``` + + +## DESeq2 + +The following code performs the DESeq2 analysis. Code is based on [edgeR_Robust supplementary code](http://imlspenticton.uzh.ch/robinson_lab/edgeR_robust/). The main change is that it has been modified for the multi-group scenario. 
+ +```{r 'deseq2', bootstrap.show.code = FALSE} +## Round matrix and specify design +dse <- DESeqDataSetFromMatrix(counts[nonzero, ], data.frame(group = groupInfo), ~ group) + +## Perform DE analysis +system.time( dse <- DESeq(dse, test = 'LRT', reduced = ~ 1) ) + +## Extract results +deseq <- exons[nonzero] +mcols(deseq) <- cbind(mcols(deseq), results(dse)) + +## Which are significant? +deseq$sig <- deseq$padj < 0.05 +deseq$sig[is.na(deseq$sig)] <- FALSE + +## Save results +save(deseq, file = 'deseq.Rdata') + +## Adjust by Holm +deseq_holm <- deseq +deseq_holm$sig <- p.adjust(deseq_holm$pvalue, 'holm') < 0.05 +``` + + +## edgeR-robust + +The following code performs the `edgeR-robust` analysis. Code is based on [edgeR_Robust supplementary code](http://imlspenticton.uzh.ch/robinson_lab/edgeR_robust/). The main change is that it has been modified for the multi-group scenario. + + +```{r 'edgeR', bootstrap.show.code = FALSE} +## Determine design matrix +design <- model.matrix(~ groupInfo) + +## Perform DE analysis +d <- DGEList(counts = counts[nonzero, ], group = groupInfo) +d <- calcNormFactors(d) +system.time(dw <- estimateGLMRobustDisp(d, design = design, prior.df = 10, maxit = 6)) +fw <- glmFit(dw, design = design, coef = 2:3) +lrw <- glmLRT(fw, coef = 2:3) + +## Extract results +edger <- exons[nonzero] +mcols(edger) <- cbind(mcols(edger), DataFrame(lrw$table)) +edger$pvalue <- lrw$table$PValue +edger$padj <- p.adjust(lrw$table$PValue, 'BH') + +## Which are significant? 
+edger$sig <- edger$padj < 0.05 +edger$sig[is.na(edger$sig)] <- FALSE + +## Save results +save(edger, file = 'edger.Rdata') + +## Adjust by Holm +edger_holm <- edger +edger_holm$sig <- p.adjust(edger_holm$pvalue, 'holm') < 0.05 +``` + + + + + + + + +## Overlap + +```{r 'ov-comp-setup', bootstrap.show.code = FALSE} +## Load data +load('../derAnalysis/run3-v1.0.10/fullRegions.Rdata') + +## Some formatting and subsets +names(fullRegions) <- seq_len(length(fullRegions)) +fullRegions$sigFWER <- as.logical(fullRegions$significantFWER) +fullRegs20 <- fullRegions[width(fullRegions) >= 20] + +## Overlap table for all 4 cases +ov_table <- function(ders, counts, query = 'der', minov = 0) { + if(query == 'der') { + if(minov == 0) { + res <- addmargins(table('Significant DER (FWER)' = ders$sigFWER, 'Overlaps sig DE exon' = countOverlaps(ders, counts[counts$sig]) > 0)) + } else { + res <- addmargins(table(ders$sigFWER, countOverlaps(ders, counts[counts$sig], minoverlap = minov) > 0, dnn = c('Significant DER (FWER)', paste0('Overlaps sig DE exon (min ', minov, 'bp)')))) + } + } else if (query == 'counts') { + if(minov == 0) { + res <- addmargins(table('Significant DE exon' = counts$sig, 'Overlaps sig DER (FWER)' = countOverlaps(counts, ders[ders$sigFWER]) > 0)) + } else { + res <- addmargins(table(counts$sig[width(counts) >= minov], countOverlaps(counts[width(counts) >= minov], ders[ders$sigFWER], minoverlap = minov) > 0, dnn = c('Significant DE exon', paste0('Overlaps sig DER (FWER, min ', minov, 'bp)')))) + } + } + return(res) +} + +## Explore mistmatched cases for DERs vs Exons direction +explore_ov <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) { + if(case == 'FALSE-TRUE') { + i <- which(countOverlaps(ders, counts[counts$sig], minoverlap = minov) > 0 & !ders$sigFWER) + } else if (case == 'TRUE-FALSE') { + i <- which(!countOverlaps(ders, counts[counts$sig], minoverlap = minov) > 0 & ders$sigFWER) + } else{ + stop('invalid case') + } + if(length(i) == 0) return("No 
such cases") + + if(case == 'FALSE-TRUE') { + res <- list( + n_overlaps = table(countOverlaps(ders[i], counts[counts$sig], minoverlap = minov)), + width_der = summary(width(ders[i])), + ders_per_exon_table = table(table(subjectHits(findOverlaps(ders[i], counts[counts$sig], minoverlap = minov)))), + ders_per_exon = sort(table(subjectHits(findOverlaps(ders[i], counts[counts$sig], minoverlap = minov)))), + i = i + ) + } else { + res <- list( + width_der = summary(width(ders[i])), + distance_nearest_sum = summary(mcols(distanceToNearest(ders[i], counts, ignore.strand = TRUE))$distance), + distance_nearest_sig_sum = summary(mcols(distanceToNearest(ders[i], counts[counts$sig], ignore.strand = TRUE))$distance), + distance_nearest = distanceToNearest(ders[i], counts, ignore.strand = TRUE), + distance_nearest_sig = distanceToNearest(ders[i], counts[counts$sig], ignore.strand = TRUE), + i = i + ) + } + + return(res) +} + +## Explore mistmatched cases for Exons vs DERs direction +explore_ov_counts <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) { + counts <- counts[width(counts) >= minov] + if(case == 'FALSE-TRUE') { + i <- which(countOverlaps(counts, ders[ders$sigFWER], minoverlap = minov) > 0 & !counts$sig) + } else if (case == 'TRUE-FALSE') { + i <- which(!countOverlaps(counts, ders[ders$sigFWER], minoverlap = minov) > 0 & counts$sig) + } else{ + stop('invalid case') + } + if(length(i) == 0) return("No such cases") + + if(case == 'FALSE-TRUE') { + res <- list( + n_overlaps = table(countOverlaps(counts[i], ders[ders$sigFWER], minoverlap = minov)), + width_exon = summary(width(counts[i])), + exons_per_der_table = table(table(subjectHits(findOverlaps(counts[i], ders[ders$sigFWER], minoverlap = minov)))), + exons_per_der = sort(table(subjectHits(findOverlaps(counts[i], ders[ders$sigFWER], minoverlap = minov)))), + i = i + ) + } else { + res <- list( + width_exon = summary(width(counts[i])), + distance_nearest_sum = summary(mcols(distanceToNearest(counts[i], ders, 
ignore.strand = TRUE))$distance), + distance_nearest_sig_sum = summary(mcols(distanceToNearest(counts[i], ders[ders$sigFWER], ignore.strand = TRUE))$distance), + distance_nearest = distanceToNearest(counts[i], ders, ignore.strand = TRUE), + distance_nearest_sig = distanceToNearest(counts[i], ders[ders$sigFWER], ignore.strand = TRUE), + i = i + ) + } + + return(res) +} + +noNA <- function(x) { + x[!is.na(x)] +} +``` + + +### DESeq2 + + +#### Query: DERs + +We can first compare the results by using the DERs as the query and the exons as the subject. The following output shows the comparison using all DERs and exploring the mismatched cases. Then its repeated using the DERs $\geq$ 20 bp and a minimum overlap of 20bp. + +For the mismatched cases of non-significant DERs overlapping a significant exon, we check: + +* how many exons each DER overlaps, +* the width of the DERs +* the frequency table of how many DERs overlap the same exon + +For the other mismatched case, we check: + +* the width of the DERs +* distance to nearest exon (regardless of exon size) +* distance to nearest significant DE exon (ibidem) + +```{r 'ov-comp-deseq', bootstrap.show.code = FALSE} +## Overlap between DERs and significant DE exons +ov_table(fullRegions, deseq) + +## Explore mismatched cases +#noNA(explore_ov(fullRegions, deseq)[1:3]) +#noNA(explore_ov(fullRegions, deseq, 'TRUE-FALSE')[1:3]) + +## Min 20 bp overlap, using only DERs 20 bp long +ov_table(fullRegs20, deseq, minov = 20L) + +## Explore mismatched cases, min 20bp overlap +noNA(explore_ov(fullRegs20, deseq, minov = 20L)[1:3]) +noNA(explore_ov(fullRegs20, deseq, 'TRUE-FALSE', minov = 20L)[1:3]) + +## Holm vs BH +addmargins(table('DESeq2 Holm' = deseq_holm$sig, 'DESeq2 BH' = deseq$sig)) + +## Use Holm and min 20 bp ov +ov_table(fullRegs20, deseq_holm, minov = 20L) +``` + +Most of the DERs are shorter than 20bp (`r round(sum(width(fullRegions) < 20) / length(fullRegions) * 100, 2)` percent), so we'll focus on the longer ones. 
The majority of the mismatches are from non significant DERs that overlap a significant exon. + +As expected, when controlling the FWER instead of the FDR, most of the DE exons are no longer significant. Using FWER-controlled DE exons, most of the DERs 20bp or longer agree with the exons as not being significantly DE. + + +#### Query: exons + +We can now repeat the comparison using the exons as the query and the DERs as the subject. + +For the mismatched cases of non-significant exons overlapping a significant DER, we check: + +* how many DERs each exon overlaps, +* the width of the exons +* the frequency table of how many exons overlap the same DER + +For the other mismatched case, we check: + +* the width of the exons +* distance to nearest DER (regardless of DER size) +* distance to nearest significant DER (ibidem) + +```{r 'ov-comp-deseq-counts', bootstrap.show.code = FALSE} +## Overlap between exons and significant DERs +#ov_table(fullRegions, deseq, 'counts') + +## Explore mismatched cases +#noNA(explore_ov_counts(fullRegions, deseq)[1:3]) +#noNA(explore_ov_counts(fullRegions, deseq, 'TRUE-FALSE')[1:3]) + +## Overlap between exons and significant DERs, min 20 bp +ov_table(fullRegions, deseq, 'counts', 20L) + +## Explore mismatched cases +noNA(explore_ov_counts(fullRegions, deseq, minov = 20L)[1:3]) +noNA(explore_ov_counts(fullRegions, deseq, 'TRUE-FALSE', minov = 20L)[1:3]) + +## Now with Holm +ov_table(fullRegions, deseq_holm, 'counts', 20L) +``` + +From these results, we can see that `derfinder` is more conservative. + + + + +### edgeR-robust + +#### Query: DERs + +Similar comparison using DERs as query and exons as subject with `edgeR-robust` results. 
+ +```{r 'ov-comp-edger', bootstrap.show.code = FALSE} +## Overlap between DERs and significant DE exons +#ov_table(fullRegions, edger) + +## Explore mismatched cases +#noNA(explore_ov(fullRegions, edger)[1:3]) +#noNA(explore_ov(fullRegions, edger, 'TRUE-FALSE')[1:3]) + +## Min 20 bp overlap, using only DERs 20 bp long +ov_table(fullRegs20, edger, minov = 20L) + +## Explore mismatched cases, min 20bp overlap +noNA(explore_ov(fullRegs20, edger, minov = 20L)[1:3]) +noNA(explore_ov(fullRegs20, edger, 'TRUE-FALSE', minov = 20L)[1:3]) + +## Holm vs BH +addmargins(table('edgeR Holm' = edger_holm$sig, 'edger BH' = edger$sig)) + +## With Holm, 20bp +ov_table(fullRegs20, edger_holm, minov = 20L) +``` + +The results are fairly similar to those from using `DESeq2`. + +#### Query: exons + +Similar comparison using exons as query and DERs as subject with `edgeR-robust` results. + +```{r 'ov-comp-edger-counts', bootstrap.show.code = FALSE} +## Overlap between exons and significant DERs +#ov_table(fullRegions, edger, 'counts') + +## Explore mismatched cases +#noNA(explore_ov_counts(fullRegions, edger)[1:3]) +#noNA(explore_ov_counts(fullRegions, edger, 'TRUE-FALSE')[1:3]) + +## Overlap between exons and significant DERs, min 20 bp +ov_table(fullRegions, edger, 'counts', 20L) + +## Explore mismatched cases +noNA(explore_ov_counts(fullRegions, edger, minov = 20L)[1:3]) +noNA(explore_ov_counts(fullRegions, edger, 'TRUE-FALSE', minov = 20L)[1:3]) + +## With Holm, 20 bp +ov_table(fullRegions, edger_holm, 'counts', 20L) +``` + + +### overall + +While the DERs vs exons results are fairly similar between `edgeR-robust` and `DESeq2`, as shown below the number of mismatched cases is high compared to the number of cases both counts-based methods agree. This is also true when controlling the FWER to determine significance. 
+ +```{r 'deseq-vs-edger'} +## edgeR vs DESeq2 +addmargins(table('edgeR-robust (FDR)' = edger$sig, 'DESeq2 (FDR)' = deseq$sig)) + +## Control FWER +addmargins(table('edgeR-robust (FWER)' = edger_holm$sig, 'DESeq2 (FWER)' = deseq_holm$sig)) + +## Only sig if both edgeR and DEseq2 say it is +both <- deseq +both$sig <- both$sig & edger$sig + +## Same, for holm +both_holm <- deseq_holm +both_holm$sig <- both_holm$sig & edger_holm$sig +``` + +We can consider an exon to be DE only if both `edgeR-robust` and `DESeq2` find that it is significantly DE. The next sections use this information. + +#### Query: DERs + +```{r 'ov-comp-both', bootstrap.show.code = FALSE} +## Overlap between DERs and significant DE exons +#ov_table(fullRegions, both) + +## Explore mismatched cases +#noNA(explore_ov(fullRegions, both)[1:3]) +#noNA(explore_ov(fullRegions, both, 'TRUE-FALSE')[1:3]) + +## Min 20 bp overlap, using only DERs 20 bp long +ov_table(fullRegs20, both, minov = 20L) + +## Explore mismatched cases, min 20bp overlap +noNA(explore_ov(fullRegs20, both, minov = 20L)[1:3]) +noNA(explore_ov(fullRegs20, both, 'TRUE-FALSE', minov = 20L)[1:3]) + +## Holm vs BH +addmargins(table('Both Holm' = both_holm$sig, 'Both BH' = both$sig)) + +## Use Holm and min 20 bp ov +ov_table(fullRegs20, both_holm, minov = 20L) +``` + +The trends observed previously are maintained in this comparison with a reduction of cases where the exon is DE. This is expected due to the non-perfect agreement between `DESeq2` and `edgeR-robust`. 
+ + +```{r 'regionPlot-setup', bootstrap.show.code = FALSE, bootstrap.show.message = FALSE} +library('TxDb.Hsapiens.UCSC.hg19.knownGene') +library('derfinder') +library('derfinderHelper') +library('derfinderPlot') +load('../derAnalysis/run3-v1.0.10/models.Rdata') +load('../derAnalysis/run3-v1.0.10/chr22/optionsStats.Rdata') +load("../CoverageInfo/fullCov.Rdata") + +def.par <- par() +def.par <- def.par[-which(names(def.par) %in% c('cin', 'cra', 'csi', 'cxy', 'din', 'page'))] + +regPlot <- function(region, title) { + ## Calculate F-stats + range <- start(region):end(region) + dat <- fullCov[[as.character(seqnames(region))]][range, colsubset] + + ## Log2 transform + for(i in seq_len(length(groupInfo))) dat[[i]] <- log2(dat[[i]] + 32) + + ## Calculate f-stats + fstats <- as.numeric(fstats.apply(data = dat, mod = models$mod, mod0 = models$mod0)) + + ## Find annotation + annoReg <- annotateRegions(region, GenomicState.Hsapiens.UCSC.hg19.knownGene$fullGenome, verbose = FALSE) + symbol <- mcols(annoReg$annotationList[[1]])$symbol + symbol <- as.character(noNA(symbol)[[1]]) + if(length(symbol) > 1) symbol <- symbol[1] + symbol <- ifelse(is.null(symbol), NA, symbol) + ## Remove symbol name because it gets chomped on the plot + mcols(annoReg$annotationList[[1]])$symbol <- NA + + par(def.par) + + ## Plot long exon + plotRegionCoverage(region, getRegionCoverage(fullCov, region, verbose = FALSE), groupInfo, data.frame(name = title, distance = NA, region = symbol), annoReg, verbose = FALSE, ask = FALSE, txdb = TxDb.Hsapiens.UCSC.hg19.knownGene) + + ## Add F-stat track + par(fig = c(0, 1, 0.065, 0.125), new = TRUE, xaxt = 'n', oma = c(0, 0, 0, 0), mar = c(0, 4.5, 0, 1.1)) + plot(y = fstats, x = range, ylab = 'F-stat', type = 'l', xlab = '', bty = 'n', ylim = c(0, max(fstats[is.finite(fstats)], optionsStats$cutoffFstatUsed) * 1.1), las = 2, yaxt = 'n') + y.max <- round(max(c(optionsStats$cutoffFstatUsed, fstats[is.finite(fstats)]), na.rm = TRUE), 0) + axis(2, at = c(0, round(y.max 
/ 2, 0), y.max), c(0, round(y.max / 2, 0), y.max), las = 2, tick = TRUE) + abline(h = optionsStats$cutoffFstatUsed, col = 'red') + abline(h = 0, col = 'grey') +} + +sortWidth <- function(regions) { + regions[order(width(regions), decreasing = TRUE)] +} +``` + +We can now make plots to explore some DERs for each of the cases. + +```{r 'query-der-plots', fig.width = 10, fig.height = 7, bootstrap.show.code = FALSE, dev = 'CairoPNG'} +query_der_plots <- function() { + sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[both$sig], minoverlap = 20L) > 0 & fullRegs20$sigFWER])[1:10], function(reg) { + regPlot(reg, 'DER query: DE agreement') + }) + + sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[both$sig], minoverlap = 20L) == 0 & !fullRegs20$sigFWER])[1:10], function(reg) { + regPlot(reg, 'DER query: not DE agreement') + }) + + sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[both$sig], minoverlap = 20L) == 0 & fullRegs20$sigFWER])[1:10], function(reg) { + regPlot(reg, 'DER query: only exon not DE') + }) + + sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[both$sig], minoverlap = 20L) > 0 & !fullRegs20$sigFWER])[1:10], function(reg) { + regPlot(reg, 'DER query: only exon DE') + }) +} +pdf(file = 'query_der_plots.pdf', width = 10, height = 7) +query_der_plots() +dev.off() + +query_der_plots() +``` + + +#### Query: exons + +As was shown with either `DESeq2` or `edgeR-robust` results, `derfinder` is more conservative than the counts-based methods. + +```{r 'ov-comp-both-counts', bootstrap.show.code = FALSE} +## Overlap between exons and significant DERs, min 20 bp +ov_table(fullRegions, both, 'counts', 20L) + +## Explore mismatched cases +noNA(explore_ov_counts(fullRegions, both, minov = 20L)[1:3]) +noNA(explore_ov_counts(fullRegions, both, 'TRUE-FALSE', minov = 20L)[1:3]) + +## With Holm, 20 bp +ov_table(fullRegions, both_holm, 'counts', 20L) +``` + +We can now visually explore some exons for each of the four cases. 
+
+
+```{r 'query-exon-plots', fig.width = 10, fig.height = 7, bootstrap.show.code = FALSE, dev = 'CairoPNG'}
+## Plot the widest exons (>= 20 bp) for each of the four agreement cases
+## between exons found DE by both counts-based methods and significant DERs
+query_exon_plots <- function() {
+    ## Computed once instead of once per case
+    longEnough <- width(both) >= 20
+    isSig <- both$sig
+    hasDER <- countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) > 0
+
+    ## Plot up to the 10 widest exons selected by 'keep'. head() is used
+    ## instead of [1:10] so a case with fewer than 10 exons does not fail
+    ## with an out-of-bounds subscript.
+    plotWidest <- function(keep, title) {
+        sapply(head(sortWidth(both[keep]), 10), function(reg) {
+            regPlot(reg, title)
+        })
+    }
+
+    plotWidest(longEnough & isSig & hasDER, 'Exon query: DE agreement')
+    plotWidest(longEnough & !isSig & !hasDER, 'Exon query: not DE agreement')
+    plotWidest(longEnough & !isSig & hasDER, 'Exon query: only exon not DE')
+    plotWidest(longEnough & isSig & !hasDER, 'Exon query: only exon DE')
+}
+pdf(file = 'query_exon_plots.pdf', width = 10, height = 7)
+query_exon_plots()
+dev.off()
+
+query_exon_plots()
+```
+
+
+
+
+
+# Finding regions of interest
+
+
+The code in this section is partially based on `/home/epi/ajaffe/Lieber/Projects/RNAseq/HippoPublic/clean_previous_hits.R`.
+
+
+First we find the regions of the genome corresponding to the genes of interest. That is, the genes from the original paper that were differentially expressed in the conditions we are analyzing.
+
+```{r pkgs, bootstrap.show.message=FALSE, bootstrap.show.code = FALSE}
+## Required pkgs
+library("GenomicRanges")
+library("ggbio")
+library("reshape2")
+library("plyr")
+library("scales")
+library("TxDb.Hsapiens.UCSC.hg19.knownGene")
+library("org.Hs.eg.db")
+```
+
+
+```{r identify, bootstrap.show.code = FALSE}
+## Gene symbol names of interest
+#### Original names
+## symbols <- c("HIST1H4E", "RN7SK", "CDR1", "SNORD89", "SNORA73A", "SCARNA17", "PAPD1", "CACNB2", "LRCH4", "SNORD42A", "SNORA47", "LENG8", "FAM123A", "HIVEP3", "HNRPH1", "ZGPAT", "ERF", "SNORD116-29", "C9orf139", "C9orf3", "KCNA2", "EXOC6B", "CENTB5", "TAOK2", "TNRC6C", "ADAMTS4", "MSH4", "C16orf72", "CCR5")
+
+## http://www.genenames.org/data/hgnc_data.php?hgnc_id=25532
+## http://www.genenames.org/data/hgnc_data.php?hgnc_id=26360
+## http://www.genenames.org/data/hgnc_data.php?hgnc_id=5041
+## http://www.genenames.org/data/hgnc_data.php?hgnc_id=16754
+
+## Updated symbols
+symbols <- c("HIST1H4E", "RN7SK", "CDR1", "SNORD89", "SNORA73A", "SCARNA17", "MTPAP", "CACNB2", "LRCH4", "SNORD42A", "SNORA47", "LENG8", "AMER2", "HIVEP3", "HNRNPH1", "ZGPAT", "ERF", "SNORD116-29", "C9orf139", "C9orf3", "KCNA2", "EXOC6B", "ACAP3", "TAOK2", "TNRC6C", "ADAMTS4", "MSH4", "C16orf72", "CCR5")
+
+
+## Map gene symbol names to entrezid's
+
+entrezKeys <- keys(org.Hs.eg.db, keytype = "ENTREZID")
+map <- select(org.Hs.eg.db, entrezKeys, "SYMBOL", keytype = "ENTREZID")
+## Row of the first match for each symbol (NA when the symbol is absent),
+## named by symbol. match() replaces the sapply()/which()/ifelse() idiom.
+idx <- setNames(match(symbols, map$SYMBOL), symbols)
+ids <- map$ENTREZID[idx]
+names(ids) <- names(idx)
+
+## Remove those not-found
+ids <- ids[!is.na(ids)]
+
+## Find the exons
+
+txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+exonsUCSC <- exons(txdb, vals = list(gene_id = ids), columns = c("gene_id", "exon_id", "tx_name", "tx_id"))
+
+## Some exons list several gene ids: keep, for each exon, the first gene id
+## that is one of the genes of interest. Taking exactly one id per exon
+## guarantees that 'gids' stays aligned with 'exonsUCSC' for split() below.
+gids <- vapply(as.list(exonsUCSC$gene_id), function(g) {
+    g[g %in% ids][1]
+}, character(1), USE.NAMES = FALSE)
+
+
+## split into list by EID
+exonListUCSC <- split(exonsUCSC, gids)
+exonListUCSC <- exonListUCSC[ ids[ids %in% gids] ]
+### drop duplicated exons
+# exonListUCSC = lapply(exonListUCSC, function(x) x[!duplicated(x)])
+# identical(names(exonListUCSC), ids[ids %in% gids]) # TRUE
+
+## Not found
+notFound <- ids[!ids %in% gids]
+notFound
+
+## Find them manually
+# http://www.ncbi.nlm.nih.gov/gene/125050
+# http://www.ncbi.nlm.nih.gov/gene/6080
+missingRegs <- GRanges(seqnames=c("chr6", "chr1"), ranges=IRanges(start=c(52860418, 28833877), end=c(52860749, 28834083)))
+## The manual coordinates above only make sense if exactly these genes are
+## the ones without exons in the TxDb; stop instead of mislabelling them
+stopifnot(length(notFound) == length(missingRegs))
+toAdd <- split(missingRegs, seq_along(missingRegs))
+names(toAdd) <- notFound
+
+## Reduce to min/max per gene
+windows <- c(GRangesList(lapply(exonListUCSC, range)), toAdd)
+
+## Save for later use
+save(windows, ids, idx, file="windows.Rdata")
+```
+
+# Original genes
+
+In this section, we make a plot for each gene showing the coverage data and whether `derfinder` identified candidate DERs as described in the main text.
+
+```{r plots, message=FALSE, fig.width=20, fig.height=10, dev="CairoPNG", bootstrap.show.code = FALSE}
+## Find chrs used
+chrs <- as.character(unique(unlist(seqnames(windows), use.names=FALSE)))
+
+## Build ideograms
+data(hg19IdeogramCyto, package = "biovizBase")
+p.ideos <- lapply(chrs, function(xx) {
+    plotIdeogram(hg19IdeogramCyto, xx)
+})
+names(p.ideos) <- chrs
+
+
+## Filter data: keep only the chromosomes and samples that are plotted
+fullCovSmall <- lapply(chrs, function(chr) {
+    fullCov[[chr]][, colsubset]
+})
+names(fullCovSmall) <- chrs
+rm(fullCov)
+
+## Main plotting function
+##
+## Builds a multi-track view (ideogram, per-sample coverage, mean coverage by
+## group, candidate regions and, when available, transcripts) around a window.
+##   cluster       range to display; it is extended on each side by
+##                 min(maxExtend, width(cluster))
+##   regions       candidate regions; those overlapping the window are shown
+##   titleName     label appended to the plot title
+##   coverageInfo  coverage for the chromosome of 'cluster', indexed as
+##                 [position, sample]
+##   groupInfo     factor with the group of each sample
+##   titleUse      "pval", "qval" or "fwer": which significance column fills
+##                 the regions track; any other value gives no fill
+##   txdb          transcript database for the transcripts track, or NULL
+##   p.ideogram    pre-built ideogram; built from biovizBase when NULL
+##   maxExtend     maximum extension (bp) on each side of the cluster
+##   colsubset     columns of coverageInfo to use; defaults to all samples
+##   forceLarge    plot even when the window exceeds 100 kb
+## Returns the tracks object, or (invisibly) the window length when the
+## window is too large and forceLarge is FALSE.
+plotClusterCustom <- function(cluster, regions, titleName, coverageInfo, groupInfo, titleUse="fwer", txdb=NULL, p.ideogram=NULL, maxExtend=300L, colsubset=NULL, forceLarge=FALSE) {
+
+    stopifnot(is.factor(groupInfo))
+    if(is.null(colsubset)) colsubset <- seq_along(groupInfo)
+
+    ## Window length
+    l <- width(cluster) + 2 * min(maxExtend, width(cluster))
+
+    ## Scalar condition: use && (short-circuits when forceLarge is TRUE)
+    if(!forceLarge && l > 1e5) {
+        message("No plot will be made because the data is too large. The window size exceeds 100 kb.")
+        return(invisible(l))
+    }
+
+    wh <- resize(cluster, l, fix="center")
+    title <- paste("Window view for ENTREZ Symbol", titleName)
+
+    ## Plot the ideogram if not supplied
+    if(is.null(p.ideogram)) {
+        chr <- as.character(seqnames(wh))
+        ## Now load the ideogram info
+        hg19IdeogramCyto <- NULL
+        load(system.file("data", "hg19IdeogramCyto.rda", package="biovizBase", mustWork=TRUE))
+        p.ideogram <- plotIdeogram(hg19IdeogramCyto, chr)
+    }
+
+    ## Regions found (from the view)
+    neighbors <- regions[queryHits(findOverlaps(regions, wh))]
+    if(length(neighbors) == 0) {
+        ## No candidate region in the window: show the window itself, unfilled
+        neighbors <- wh
+        neighbors$significant <- NA
+        neighbors$significantQval <- NA
+        neighbors$significantFWER <- NA
+    }
+    ## Same fill scale for the three significance flavours
+    sigFill <- scale_fill_manual(values=c("chartreuse4", "wheat2"), limits=c("TRUE", "FALSE"))
+    if(titleUse == "pval") {
+        p.region <- autoplot(neighbors, aes(fill=significant)) + sigFill
+    } else if(titleUse == "qval") {
+        p.region <- autoplot(neighbors, aes(fill=significantQval)) + sigFill
+    } else if(titleUse == "fwer") {
+        p.region <- autoplot(neighbors, aes(fill=significantFWER)) + sigFill
+    } else {
+        p.region <- autoplot(neighbors)
+    }
+
+    ## Graphical parameters
+    nGroups <- length(levels(groupInfo))
+
+    ## Construct the coverage plot
+    pos <- start(wh):end(wh)
+    rawData <- as.data.frame(coverageInfo[pos, colsubset])
+    rawData$position <- pos
+    covData <- melt(rawData, id.vars="position")
+    ## melt() stacks the samples one after the other, so repeat each group
+    ## label once per position
+    covData$group <- rep(groupInfo, each=nrow(rawData))
+    p.coverage <- ggplot(covData, aes(x=position, y=value, group=variable, colour=group)) + geom_line(alpha=1/nGroups) + scale_y_continuous(trans=log2_trans())
+
+    ## Construct mean by group coverage plot
+    meanCoverage <- ddply(covData, c("position", "group"), summarise, meanCov=mean(value))
+    p.meanCov <- ggplot(meanCoverage, aes(x=position, y=meanCov, colour=group)) + geom_line(alpha=1/max(1, 1/2 * nGroups)) + scale_y_continuous(trans=log2_trans())
+
+    ## Annotation info and final plot
+    if(is.null(txdb)) {
+        p.transcripts <- FALSE
+    } else {
+        ## The tryCatch is needed because not all regions overlap a transcript
+        p.transcripts <- tryCatch(autoplot(txdb, which = wh, names.expr = "tx_name(gene_id)"), error = function(e) { FALSE })
+    }
+    if(!is.logical(p.transcripts)) {
+        result <- tracks(p.ideogram, "Coverage" = p.coverage, "Mean coverage" = p.meanCov, "Regions" = p.region, "tx_name\n(gene_id)" = p.transcripts, heights = c(2, 4, 4, 1.5, 3), xlim=wh, title=title) + ylab("") + theme_tracks_sunset()
+    } else {
+        result <- tracks(p.ideogram, "Coverage" = p.coverage, "Mean coverage" = p.meanCov, "Regions" = p.region, heights = c(2, 5, 5, 2), xlim=wh, title=title) + ylab("") + theme_tracks_sunset()
+    }
+    return(result)
+}
+
+## Plotting function: draw the i-th window using the pre-built ideogram and
+## the filtered coverage of its chromosome
+regionClusterPlot <- function(i, tUse="fwer") {
+    ## Chr specific selections
+    chr <- as.character(seqnames(windows[[i]]))
+    p.ideo <- p.ideos[[chr]]
+    covInfo <- fullCovSmall[[chr]]
+
+    ## Make the plot
+    p <- plotClusterCustom(windows[[i]], regions=fullRegions, titleName=names(ids)[ids == names(windows)[i]], coverageInfo=covInfo, groupInfo=groupInfo, titleUse=tUse, txdb=txdb, p.ideogram=p.ideo, forceLarge=TRUE)
+    print(p)
+    rm(p.ideo, covInfo)
+
+    return(invisible(TRUE))
+}
+
+## Make plots
+for(i in seq_along(windows)) {
+    regionClusterPlot(i)
+}
+```
+
+
+
+
+# Reproducibility
+
+Date the report was generated.
+
+```{r reproducibility1, echo=FALSE, bootstrap.show.code=FALSE}
+## Date the report was generated
+Sys.time()
+```
+
+Wallclock time spent generating the report.
+
+```{r "reproducibility2", echo=FALSE, bootstrap.show.code=FALSE}
+## Processing time in seconds
+totalTime <- diff(c(startTime, Sys.time()))
+round(totalTime, digits=3)
+```
+
+`R` session information.
+ +```{r "reproducibility3", echo=FALSE, bootstrap.show.code=FALSE, bootstrap.show.message=FALSE} +## Session info +options(width=120) +devtools::session_info() +``` diff --git a/hippo/pnas/compareVsPNAS.html b/hippo/pnas/compareVsPNAS.html new file mode 100644 index 0000000..f8287ba --- /dev/null +++ b/hippo/pnas/compareVsPNAS.html @@ -0,0 +1,3205 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+

Compare vs PNAS

+

Counts-based analysis

+

This section has the code for running edgeR-robust and DESeq2 on the Hippo data set using the known exons as features.

+

This first code chunk loads the necessary data.

+
+ +
## Track time spent on making the report
+startTime <- Sys.time()
+
+library('edgeR')
+ +
## Loading required package: limma
+## Loading required package: methods
+
+ +
library('DESeq2')
+ +
## Loading required package: S4Vectors
+## Loading required package: stats4
+## Loading required package: BiocGenerics
+## Loading required package: parallel
+## 
+## Attaching package: 'BiocGenerics'
+## 
+## The following objects are masked from 'package:parallel':
+## 
+##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
+##     clusterExport, clusterMap, parApply, parCapply, parLapply,
+##     parLapplyLB, parRapply, parSapply, parSapplyLB
+## 
+## The following object is masked from 'package:limma':
+## 
+##     plotMA
+## 
+## The following object is masked from 'package:stats':
+## 
+##     xtabs
+## 
+## The following objects are masked from 'package:base':
+## 
+##     anyDuplicated, append, as.data.frame, as.vector, cbind,
+##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
+##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
+##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
+##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
+##     table, tapply, union, unique, unlist, unsplit
+## 
+## Loading required package: IRanges
+## Loading required package: GenomicRanges
+## Loading required package: GenomeInfoDb
+## Loading required package: Rcpp
+## Loading required package: RcppArmadillo
+
+ +
library('GenomicRanges')
+
+## Load data
+load("../coverageToExon/covToEx-ucsc.Rdata")
+load("../derAnalysis/run3-v1.0.10/groupInfo.Rdata")
+load("../derAnalysis/run3-v1.0.10/colsubset.Rdata")
+
+## GenomicState object
+if(file.exists('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')) {
+    load('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')
+} else if(file.exists('../../GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')) {
+    load('../../GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')
+} else {
+    stop('Missing UCSC hg19 genomic state object')
+}
+
+## Annotation used
+exons <- GenomicState.Hsapiens.UCSC.hg19.knownGene$fullGenome
+exons <- exons[exons$theRegion == 'exon']
+
+## Round the exon coverage matrix to counts for the selected samples and
+## flag the exons with at least one count; only those are tested below
+counts <- round(covToEx[, colsubset])
+## rowSums() is already vectorized, so compare it directly. Unlike sapply(),
+## this always returns a logical vector (sapply() gives list() when there
+## are no rows) and avoids one function call per exon.
+nonzero <- rowSums(counts) > 0
+
+

DESeq2

+

The following code performs the DESeq2 analysis. Code is based on edgeR_Robust supplementary code. The main change is that it has been modified for the multi-group scenario.

+
+ +
## Round matrix and specify design
+dse <- DESeqDataSetFromMatrix(counts[nonzero, ], data.frame(group = groupInfo), ~ group)
+ +
## converting counts to integer mode
+
+ +
## Perform DE analysis
+system.time( dse <- DESeq(dse, test = 'LRT', reduced = ~ 1) )
+ +
## estimating size factors
+## estimating dispersions
+## gene-wise dispersion estimates
+## mean-dispersion relationship
+## final dispersion estimates
+## fitting model and testing
+## -- replacing outliers and refitting for 194 genes
+## -- DESeq argument 'minReplicatesForReplace' = 7 
+## -- original counts are preserved in counts(dds)
+## estimating dispersions
+## fitting model and testing
+
+ +
##    user  system elapsed 
+## 680.447   2.572 683.900
+
+ +
## Extract results
+deseq <- exons[nonzero]
+mcols(deseq) <- cbind(mcols(deseq), results(dse))
+
+## Which are significant?
+deseq$sig <- deseq$padj < 0.05
+deseq$sig[is.na(deseq$sig)] <- FALSE
+
+## Save results
+save(deseq, file = 'deseq.Rdata')
+
+## Adjust by Holm (controls the FWER instead of the FDR)
+deseq_holm <- deseq
+deseq_holm$sig <- p.adjust(deseq_holm$pvalue, 'holm') < 0.05
+## Exons without a p-value are not significant, as done for the BH results;
+## an NA flag would break subsetting with counts[counts$sig] later on
+deseq_holm$sig[is.na(deseq_holm$sig)] <- FALSE
+
+

edgeR-robust

+

The following code performs the edgeR-robust analysis. Code is based on edgeR_Robust supplementary code. The main change is that it has been modified for the multi-group scenario.

+
+ +
## Determine design matrix
+design <- model.matrix(~ groupInfo)
+
+## Perform DE analysis
+d <- DGEList(counts = counts[nonzero, ], group = groupInfo)
+d <- calcNormFactors(d)
+system.time(dw <- estimateGLMRobustDisp(d, design = design, prior.df = 10, maxit = 6))
+ +
##     user   system  elapsed 
+## 1555.359    9.853 1567.200
+
+ +
fw <- glmFit(dw, design = design, coef = 2:3)
+lrw <- glmLRT(fw, coef = 2:3)
+
+## Extract results
+edger <- exons[nonzero]
+mcols(edger) <- cbind(mcols(edger), DataFrame(lrw$table))
+edger$pvalue <-  lrw$table$PValue
+edger$padj <- p.adjust(lrw$table$PValue, 'BH')
+
+## Which are significant?
+edger$sig <- edger$padj < 0.05
+edger$sig[is.na(edger$sig)] <- FALSE
+
+## Save results
+save(edger, file = 'edger.Rdata')
+
+## Adjust by Holm (controls the FWER instead of the FDR)
+edger_holm <- edger
+edger_holm$sig <- p.adjust(edger_holm$pvalue, 'holm') < 0.05
+## Exons without a p-value are not significant, as done for the BH results;
+## an NA flag would break subsetting with counts[counts$sig] later on
+edger_holm$sig[is.na(edger_holm$sig)] <- FALSE
+
+

Overlap

+
+ +
## Load data
+load('../derAnalysis/run3-v1.0.10/fullRegions.Rdata')
+
+## Some formatting and subsets
+names(fullRegions) <- seq_len(length(fullRegions))
+fullRegions$sigFWER <- as.logical(fullRegions$significantFWER)
+fullRegs20 <- fullRegions[width(fullRegions) >= 20]
+
+## Overlap table for all 4 cases
+##
+## Cross-tabulates the significance of the query features against whether
+## they overlap a significant feature of the other kind, with margins added.
+##   ders    DERs with a logical 'sigFWER' metadata column
+##   counts  exons with a logical 'sig' metadata column
+##   query   'der' (DERs are the query, default) or 'counts' (exons are)
+##   minov   minimum overlap in bp; when it is not 0 and query = 'counts',
+##           exons narrower than minov are left out of the table
+## Returns the result of addmargins(table(...)).
+ov_table <- function(ders, counts, query = 'der', minov = 0) {
+    if(query == 'der') {
+        if(minov == 0) {
+            res <- addmargins(table('Significant DER (FWER)' = ders$sigFWER, 'Overlaps sig DE exon' = countOverlaps(ders, counts[counts$sig]) > 0))
+        } else {
+            res <- addmargins(table(ders$sigFWER, countOverlaps(ders, counts[counts$sig], minoverlap = minov) > 0, dnn = c('Significant DER (FWER)', paste0('Overlaps sig DE exon (min ', minov, 'bp)'))))
+        }
+    } else if (query == 'counts') {
+        if(minov == 0) {
+            res <- addmargins(table('Significant DE exon' = counts$sig, 'Overlaps sig DER (FWER)' = countOverlaps(counts, ders[ders$sigFWER]) > 0))
+        } else {
+            res <- addmargins(table(counts$sig[width(counts) >= minov], countOverlaps(counts[width(counts) >= minov], ders[ders$sigFWER], minoverlap = minov) > 0, dnn = c('Significant DE exon', paste0('Overlaps sig DER (FWER, min ', minov, 'bp)'))))
+        }
+    } else {
+        ## Previously an unknown query fell through both branches and failed
+        ## with "object 'res' not found"; fail with a clear message instead
+        stop('invalid query')
+    }
+    return(res)
+}
+
+## Explore mismatched cases for DERs vs Exons direction
+##
+##   ders    DERs with a logical 'sigFWER' metadata column
+##   counts  exons with a logical 'sig' metadata column
+##   case    'FALSE-TRUE': non-significant DERs overlapping a significant
+##           exon; 'TRUE-FALSE': significant DERs overlapping none
+##   minov   minimum overlap in bp
+## Returns a list of summaries for the mismatched DERs (their indexes are in
+## element 'i'), or the string "No such cases" when there are none.
+explore_ov <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) {
+    derOverlapsExon <- case == 'FALSE-TRUE'
+    if(!derOverlapsExon && case != 'TRUE-FALSE') stop('invalid case')
+
+    sigExons <- counts[counts$sig]
+    hasSigExon <- countOverlaps(ders, sigExons, minoverlap = minov) > 0
+    i <- if(derOverlapsExon) {
+        which(hasSigExon & !ders$sigFWER)
+    } else {
+        which(!hasSigExon & ders$sigFWER)
+    }
+    if(length(i) == 0) return("No such cases")
+
+    mismatched <- ders[i]
+    if(derOverlapsExon) {
+        ## Overlaps are computed once and reused for both exon summaries
+        ov <- findOverlaps(mismatched, sigExons, minoverlap = minov)
+        return(list(
+            n_overlaps = table(countOverlaps(mismatched, sigExons, minoverlap = minov)),
+            width_der = summary(width(mismatched)),
+            ders_per_exon_table = table(table(subjectHits(ov))),
+            ders_per_exon = sort(table(subjectHits(ov))),
+            i = i
+        ))
+    }
+
+    ## Distances to the nearest exon and to the nearest significant exon
+    nearestAny <- distanceToNearest(mismatched, counts, ignore.strand = TRUE)
+    nearestSig <- distanceToNearest(mismatched, sigExons, ignore.strand = TRUE)
+    list(
+        width_der = summary(width(mismatched)),
+        distance_nearest_sum = summary(mcols(nearestAny)$distance),
+        distance_nearest_sig_sum = summary(mcols(nearestSig)$distance),
+        distance_nearest = nearestAny,
+        distance_nearest_sig = nearestSig,
+        i = i
+    )
+}
+
+## Explore mismatched cases for Exons vs DERs direction
+##
+##   ders    DERs with a logical 'sigFWER' metadata column
+##   counts  exons with a logical 'sig' metadata column; exons narrower than
+##           minov are dropped first
+##   case    'FALSE-TRUE': non-significant exons overlapping a significant
+##           DER; 'TRUE-FALSE': significant exons overlapping none
+##   minov   minimum overlap in bp
+## Returns a list of summaries for the mismatched exons (their indexes, after
+## the width filter, are in element 'i'), or the string "No such cases".
+explore_ov_counts <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) {
+    counts <- counts[width(counts) >= minov]
+    exonOverlapsDER <- case == 'FALSE-TRUE'
+    if(!exonOverlapsDER && case != 'TRUE-FALSE') stop('invalid case')
+
+    sigDERs <- ders[ders$sigFWER]
+    hasSigDER <- countOverlaps(counts, sigDERs, minoverlap = minov) > 0
+    i <- if(exonOverlapsDER) {
+        which(hasSigDER & !counts$sig)
+    } else {
+        which(!hasSigDER & counts$sig)
+    }
+    if(length(i) == 0) return("No such cases")
+
+    mismatched <- counts[i]
+    if(exonOverlapsDER) {
+        ## Overlaps are computed once and reused for both DER summaries
+        ov <- findOverlaps(mismatched, sigDERs, minoverlap = minov)
+        return(list(
+            n_overlaps = table(countOverlaps(mismatched, sigDERs, minoverlap = minov)),
+            width_exon = summary(width(mismatched)),
+            exons_per_der_table = table(table(subjectHits(ov))),
+            exons_per_der = sort(table(subjectHits(ov))),
+            i = i
+        ))
+    }
+
+    ## Distances to the nearest DER and to the nearest significant DER
+    nearestAny <- distanceToNearest(mismatched, ders, ignore.strand = TRUE)
+    nearestSig <- distanceToNearest(mismatched, sigDERs, ignore.strand = TRUE)
+    list(
+        width_exon = summary(width(mismatched)),
+        distance_nearest_sum = summary(mcols(nearestAny)$distance),
+        distance_nearest_sig_sum = summary(mcols(nearestSig)$distance),
+        distance_nearest = nearestAny,
+        distance_nearest_sig = nearestSig,
+        i = i
+    )
+}
+
+## Drop the missing entries of x. Used on the first three elements of the
+## explore_ov*() results, which are NA when "No such cases" is returned.
+noNA <- function(x) {
+    isMissing <- is.na(x)
+    x[!isMissing]
+}
+
+

DESeq2

+

Query: DERs

+

We can first compare the results by using the DERs as the query and the exons as the subject. The following output shows the comparison using all DERs and exploring the mismatched cases. Then it is repeated using the DERs  ≥  20 bp and a minimum overlap of 20 bp.

+

For the mismatched cases of non-significant DERs overlapping a significant exon, we check:

+
    +
  • how many exons each DER overlaps,
  • +
  • the width of the DERs
  • +
  • the frequency table of how many DERs overlap the same exon
  • +
+

For the other mismatched case, we check:

+
    +
  • the width of the DERs
  • +
  • distance to nearest exon (regardless of exon size)
  • +
  • distance to nearest significant DE exon (ibidem)
  • +
+
+ +
## Overlap between DERs and significant DE exons
+ov_table(fullRegions, deseq)
+ +
##                       Overlaps sig DE exon
+## Significant DER (FWER) FALSE  TRUE   Sum
+##                  FALSE 16671 14792 31463
+##                  TRUE    255   260   515
+##                  Sum   16926 15052 31978
+
+ +
## Explore mismatched cases
+#noNA(explore_ov(fullRegions, deseq)[1:3])
+#noNA(explore_ov(fullRegions, deseq, 'TRUE-FALSE')[1:3])
+
+## Min 20 bp overlap, using only DERs 20 bp long
+ov_table(fullRegs20, deseq, minov = 20L)
+ +
##                       Overlaps sig DE exon (min 20bp)
+## Significant DER (FWER) FALSE TRUE  Sum
+##                  FALSE  1127 1151 2278
+##                  TRUE    255  260  515
+##                  Sum    1382 1411 2793
+
+ +
## Explore mismatched cases, min 20bp overlap
+noNA(explore_ov(fullRegs20, deseq, minov = 20L)[1:3])
+ +
## $n_overlaps
+## 
+##    1    2 
+## 1131   20 
+## 
+## $width_der
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   20.00   23.00   29.00   30.22   36.00   62.00 
+## 
+## $ders_per_exon_table
+## 
+##   1   2   3   4   5   6 
+## 775 109  27  12   5   4
+
+ +
noNA(explore_ov(fullRegs20, deseq, 'TRUE-FALSE', minov = 20L)[1:3])
+ +
## $width_der
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   29.00   49.00   60.00   67.99   77.00  182.00 
+## 
+## $distance_nearest_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##       0       0     290   11010   16860  302400 
+## 
+## $distance_nearest_sig_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##      54    7859   26350   60970   44550 2106000
+
+ +
## Holm vs BH
+addmargins(table('DESeq2 Holm' = deseq_holm$sig, 'DESeq2 BH' = deseq$sig))
+ +
##            DESeq2 BH
+## DESeq2 Holm  FALSE   TRUE    Sum
+##       FALSE 199844  14185 214029
+##       TRUE       4    387    391
+##       Sum   199848  14572 214420
+
+ +
## Use Holm and min 20 bp ov
+ov_table(fullRegs20, deseq_holm, minov = 20L)
+ +
##                       Overlaps sig DE exon (min 20bp)
+## Significant DER (FWER) FALSE TRUE  Sum
+##                  FALSE  2143  135 2278
+##                  TRUE    433   82  515
+##                  Sum    2576  217 2793
+
+
+

Most of the DERs are shorter than 20bp (91.27 percent), so we'll focus on the longer ones. The majority of the mismatches are from non significant DERs that overlap a significant exon.

+

As expected, when controlling the FWER instead of the FDR, most of the DE exons are no longer significant. Using FWER-controlled DE exons, most of the DERs 20bp or longer agree with the exons as not being significantly DE.

+

Query: exons

+

We can now repeat the comparison using the exons as the query and the DERs as the subject.

+

For the mismatched cases of non-significant exons overlapping a significant DER, we check:

+
    +
  • how many DERs each exon overlaps,
  • +
  • the width of the exons
  • +
  • the frequency table of how many exons overlap the same DER
  • +
+

For the other mismatched case, we check:

+
    +
  • the width of the exons
  • +
  • distance to nearest DER (regardless of DER size)
  • +
  • distance to nearest significant DER (ibidem)
  • +
+
+ +
## Overlap between exons and significant DERs
+#ov_table(fullRegions, deseq, 'counts')
+
+## Explore mismatched cases
+#noNA(explore_ov_counts(fullRegions, deseq)[1:3])
+#noNA(explore_ov_counts(fullRegions, deseq, 'TRUE-FALSE')[1:3])
+
+## Overlap between exons and significant DERs, min 20 bp
+ov_table(fullRegions, deseq, 'counts', 20L)
+ +
##                    Overlaps sig DER (FWER, min 20bp)
+## Significant DE exon  FALSE   TRUE    Sum
+##               FALSE 199497     98 199595
+##               TRUE   14338    232  14570
+##               Sum   213835    330 214165
+
+ +
## Explore mismatched cases
+noNA(explore_ov_counts(fullRegions, deseq, minov = 20L)[1:3])
+ +
## $n_overlaps
+## 
+##  1  2  3  9 
+## 87  8  2  1 
+## 
+## $width_exon
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##    51.0   154.8   655.0  1328.0  1899.0  9761.0 
+## 
+## $exons_per_der_table
+## 
+##   1 
+## 118
+
+ +
noNA(explore_ov_counts(fullRegions, deseq, 'TRUE-FALSE', minov = 20L)[1:3])
+ +
## $width_exon
+##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
+##     21.0    143.0    332.5   1160.0   1578.0 205000.0 
+## 
+## $distance_nearest_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##       0       0    6440   86290   57270 5353000 
+## 
+## $distance_nearest_sig_sum
+##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
+##       54   526200  2054000  4818000  5809000 48690000      229
+
+ +
## Now with Holm
+ov_table(fullRegions, deseq_holm, 'counts', 20L)
+ +
##                    Overlaps sig DER (FWER, min 20bp)
+## Significant DE exon  FALSE   TRUE    Sum
+##               FALSE 213513    261 213774
+##               TRUE     322     69    391
+##               Sum   213835    330 214165
+
+
+

From these results, we can see that derfinder is more conservative.

+

edgeR-robust

+

Query: DERs

+

Similar comparison using DERs as query and exons as subject with edgeR-robust results.

+
+ +
## Overlap between DERs and significant DE exons
+#ov_table(fullRegions, edger)
+
+## Explore mismatched cases
+#noNA(explore_ov(fullRegions, edger)[1:3])
+#noNA(explore_ov(fullRegions, edger, 'TRUE-FALSE')[1:3])
+
+## Min 20 bp overlap, using only DERs 20 bp long
+ov_table(fullRegs20, edger, minov = 20L)
+ +
##                       Overlaps sig DE exon (min 20bp)
+## Significant DER (FWER) FALSE TRUE  Sum
+##                  FALSE  1256 1022 2278
+##                  TRUE    263  252  515
+##                  Sum    1519 1274 2793
+
+ +
## Explore mismatched cases, min 20bp overlap
+noNA(explore_ov(fullRegs20, edger, minov = 20L)[1:3])
+ +
## $n_overlaps
+## 
+##    1    2 
+## 1006   16 
+## 
+## $width_der
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   20.00   24.00   30.00   30.46   36.00   58.00 
+## 
+## $ders_per_exon_table
+## 
+##   1   2   3   4   5   6  19 
+## 677  95  24   9   4   4   1
+
+ +
noNA(explore_ov(fullRegs20, edger, 'TRUE-FALSE', minov = 20L)[1:3])
+ +
## $width_der
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##    32.0    49.0    59.0    66.0    75.5   174.0 
+## 
+## $distance_nearest_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##       0       0     172   10680   16300  302400 
+## 
+## $distance_nearest_sig_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##      54    4856   21090   50470   42110 2106000
+
+ +
## Holm vs BH
+addmargins(table('edgeR Holm' = edger_holm$sig, 'edger BH' = edger$sig))
+ +
##           edger BH
+## edgeR Holm  FALSE   TRUE    Sum
+##      FALSE 197071  16827 213898
+##      TRUE       0    522    522
+##      Sum   197071  17349 214420
+
+ +
## With Holm, 20bp
+ov_table(fullRegs20, edger_holm, minov = 20L)
+ +
##                       Overlaps sig DE exon (min 20bp)
+## Significant DER (FWER) FALSE TRUE  Sum
+##                  FALSE  2155  123 2278
+##                  TRUE    450   65  515
+##                  Sum    2605  188 2793
+
+
+

The results are fairly similar to those from using DESeq2.

+

Query: exons

+

Similar comparison using exons as query and DERs as subject with edgeR-robust results.

+
+ +
## Overlap between exons and significant DERs
+#ov_table(fullRegions, edger, 'counts')
+
+## Explore mismatched cases
+#noNA(explore_ov_counts(fullRegions, edger)[1:3])
+#noNA(explore_ov_counts(fullRegions, edger, 'TRUE-FALSE')[1:3])
+
+## Overlap between exons and significant DERs, min 20 bp
+ov_table(fullRegions, edger, 'counts', 20L)
+ +
##                    Overlaps sig DER (FWER, min 20bp)
+## Significant DE exon  FALSE   TRUE    Sum
+##               FALSE 196707    114 196821
+##               TRUE   17128    216  17344
+##               Sum   213835    330 214165
+
+ +
## Explore mismatched cases
+noNA(explore_ov_counts(fullRegions, edger, minov = 20L)[1:3])
+ +
## $n_overlaps
+## 
+##   1   2   3   4 
+## 105   7   1   1 
+## 
+## $width_exon
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##    66.0   181.2   954.0  1396.0  1908.0  9761.0 
+## 
+## $exons_per_der_table
+## 
+##   1   2 
+## 124   1
+
+ +
noNA(explore_ov_counts(fullRegions, edger, 'TRUE-FALSE', minov = 20L)[1:3])
+ +
## $width_exon
+##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
+##     21.0    125.0    211.5    941.3   1066.0 205000.0 
+## 
+## $distance_nearest_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##       0     148   14720  115400   89810 5353000 
+## 
+## $distance_nearest_sig_sum
+##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
+##       54   612100  2251000  5073000  6170000 48690000      288
+
+ +
## With Holm, 20 bp
+ov_table(fullRegions, edger_holm, 'counts', 20L)
+ +
##                    Overlaps sig DER (FWER, min 20bp)
+## Significant DE exon  FALSE   TRUE    Sum
+##               FALSE 213371    272 213643
+##               TRUE     464     58    522
+##               Sum   213835    330 214165
+
+
+

overall

+

While the DERs vs exons results are fairly similar between edgeR-robust and DESeq2, as shown below the number of mismatched cases is high compared to the number of cases where both counts-based methods agree. This is also true when controlling the FWER to determine significance.

+
+ +
## edgeR vs DESeq2
+addmargins(table('edgeR-robust (FDR)' = edger$sig, 'DESeq2 (FDR)' = deseq$sig))
+ +
##                   DESeq2 (FDR)
+## edgeR-robust (FDR)  FALSE   TRUE    Sum
+##              FALSE 194436   2635 197071
+##              TRUE    5412  11937  17349
+##              Sum   199848  14572 214420
+
+ +
## Control FWER
+addmargins(table('edgeR-robust (FWER)' = edger_holm$sig, 'DESeq2 (FWER)' = deseq_holm$sig))
+ +
##                    DESeq2 (FWER)
+## edgeR-robust (FWER)  FALSE   TRUE    Sum
+##               FALSE 213832     66 213898
+##               TRUE     197    325    522
+##               Sum   214029    391 214420
+
+ +
## Only sig if both edgeR and DEseq2 say it is
+both <- deseq
+both$sig <- both$sig & edger$sig
+
+## Same, for holm
+both_holm <- deseq_holm
+both_holm$sig <- both_holm$sig & edger_holm$sig
+
+

We can consider an exon to be DE only if both edgeR-robust and DESeq2 find that it is significantly DE. The next sections use this information.

+

Query: DERs

+
+ +
## Overlap between DERs and significant DE exons
+#ov_table(fullRegions, both)
+
+## Explore mismatched cases
+#noNA(explore_ov(fullRegions, both)[1:3])
+#noNA(explore_ov(fullRegions, both, 'TRUE-FALSE')[1:3])
+
+## Min 20 bp overlap, using only DERs 20 bp long
+ov_table(fullRegs20, both, minov = 20L)
+ +
##                       Overlaps sig DE exon (min 20bp)
+## Significant DER (FWER) FALSE TRUE  Sum
+##                  FALSE  1326  952 2278
+##                  TRUE    286  229  515
+##                  Sum    1612 1181 2793
+
+ +
## Explore mismatched cases, min 20bp overlap
+noNA(explore_ov(fullRegs20, both, minov = 20L)[1:3])
+ +
## $n_overlaps
+## 
+##   1   2 
+## 936  16 
+## 
+## $width_der
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   20.00   24.00   30.00   30.49   36.00   58.00 
+## 
+## $ders_per_exon_table
+## 
+##   1   2   3   4   5   6 
+## 647  89  21   9   4   4
+
+ +
noNA(explore_ov(fullRegs20, both, 'TRUE-FALSE', minov = 20L)[1:3])
+ +
## $width_der
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   29.00   50.00   60.00   67.59   76.75  182.00 
+## 
+## $distance_nearest_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##       0       0       0    9817   15010  302400 
+## 
+## $distance_nearest_sig_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##      54    6497   23720   61260   43820 2106000
+
+ +
## Holm vs BH
+addmargins(table('Both Holm' = both_holm$sig, 'Both BH' = both$sig))
+ +
##          Both BH
+## Both Holm  FALSE   TRUE    Sum
+##     FALSE 202479  11616 214095
+##     TRUE       4    321    325
+##     Sum   202483  11937 214420
+
+ +
## Use Holm and min 20 bp ov
+ov_table(fullRegs20, both_holm, minov = 20L)
+ +
##                       Overlaps sig DE exon (min 20bp)
+## Significant DER (FWER) FALSE TRUE  Sum
+##                  FALSE  2185   93 2278
+##                  TRUE    456   59  515
+##                  Sum    2641  152 2793
+
+
+

The trends observed previously are maintained in this comparison, with a reduction in the number of cases where the exon is DE. This is expected given the imperfect agreement between DESeq2 and edgeR-robust.

+
+ +
library('TxDb.Hsapiens.UCSC.hg19.knownGene')
+ +
## Loading required package: GenomicFeatures
+## Loading required package: AnnotationDbi
+## Loading required package: Biobase
+## Welcome to Bioconductor
+## 
+##     Vignettes contain introductory material; view with
+##     'browseVignettes()'. To cite Bioconductor, see
+##     'citation("Biobase")', and for packages 'citation("pkgname")'.
+## 
+## 
+## Attaching package: 'AnnotationDbi'
+## 
+## The following object is masked from 'package:GenomeInfoDb':
+## 
+##     species
+
+ +
library('derfinder')
+library('derfinderHelper')
+library('derfinderPlot')
+load('../derAnalysis/run3-v1.0.10/models.Rdata')
+load('../derAnalysis/run3-v1.0.10/chr22/optionsStats.Rdata')
+load("../CoverageInfo/fullCov.Rdata")
+
+def.par <- par() ## Snapshot of the current graphical parameters; regPlot() resets to it with par(def.par)
+def.par <- def.par[-which(names(def.par) %in% c('cin', 'cra', 'csi', 'cxy', 'din', 'page'))] ## Drop the read-only parameters, which cannot be set through par()
+
+regPlot <- function(region, title) { ## Plot the coverage of `region` (labelled with `title`) plus an F-stat track; reads the globals fullCov, colsubset, groupInfo, models, optionsStats and def.par
+    ## Positions spanned by the region and their coverage for the samples in colsubset
+    range <- start(region):end(region)
+    dat <- fullCov[[as.character(seqnames(region))]][range, colsubset]
+
+    ## Log2 transform each sample's coverage with an offset of 32
+    for(i in seq_len(length(groupInfo))) dat[[i]] <- log2(dat[[i]] + 32) 
+
+    ## Calculate the F-stats for these positions, comparing models$mod against models$mod0
+    fstats <- as.numeric(fstats.apply(data = dat, mod = models$mod, mod0 = models$mod0))
+
+    ## Find the annotation and keep a single gene symbol (NA if none) to label the plot
+    annoReg <- annotateRegions(region, GenomicState.Hsapiens.UCSC.hg19.knownGene$fullGenome, verbose = FALSE)
+    symbol <- mcols(annoReg$annotationList[[1]])$symbol
+    symbol <- as.character(noNA(symbol)[[1]])
+    if(length(symbol) > 1) symbol <- symbol[1]
+    symbol <- ifelse(is.null(symbol), NA, symbol)
+    ## Remove symbol name because it gets chomped on the plot
+    mcols(annoReg$annotationList[[1]])$symbol <- NA
+    
+    par(def.par) ## Reset the graphical parameters changed by a previous call (see the F-stat track below)
+
+    ## Plot the coverage of the region by group, labelled with the title and the gene symbol
+    plotRegionCoverage(region, getRegionCoverage(fullCov, region, verbose = FALSE), groupInfo, data.frame(name = title, distance = NA, region = symbol), annoReg, verbose = FALSE, ask = FALSE, txdb = TxDb.Hsapiens.UCSC.hg19.knownGene)
+
+    ## Add F-stat track as a thin strip over the existing figure; the red line marks optionsStats$cutoffFstatUsed and the grey line marks 0
+    par(fig = c(0, 1, 0.065, 0.125), new = TRUE, xaxt = 'n', oma = c(0, 0, 0, 0), mar = c(0, 4.5, 0, 1.1))
+    plot(y = fstats, x = range, ylab = 'F-stat', type = 'l', xlab = '', bty = 'n', ylim = c(0, max(fstats[is.finite(fstats)], optionsStats$cutoffFstatUsed) * 1.1), las = 2, yaxt = 'n')
+    y.max <- round(max(c(optionsStats$cutoffFstatUsed, fstats[is.finite(fstats)]), na.rm = TRUE), 0)
+    axis(2, at = c(0, round(y.max / 2, 0), y.max), c(0, round(y.max / 2, 0), y.max), las = 2, tick = TRUE)
+    abline(h = optionsStats$cutoffFstatUsed, col = 'red')
+    abline(h = 0, col = 'grey')
+}
+
+sortWidth <- function(regions) { ## Return `regions` reordered from widest to narrowest
+    regions[order(width(regions), decreasing = TRUE)]
+}
+
+

We can now make plots to explore some DERs for each of the cases.

+
+ +
query_der_plots <- function() { ## Plot the 10 widest regions of fullRegs20 for each of four cases; "overlap" means at least 20 bp shared with an exon flagged in both$sig. Returns the last sapply() result (a list of NULLs)
+    sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[both$sig], minoverlap = 20L) > 0 & fullRegs20$sigFWER])[1:10], function(reg) {
+        regPlot(reg, 'DER query: DE agreement') ## sigFWER is TRUE and the region overlaps a significant exon
+    })
+
+    sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[both$sig], minoverlap = 20L) == 0 & !fullRegs20$sigFWER])[1:10],  function(reg) {
+        regPlot(reg, 'DER query: not DE agreement') ## sigFWER is FALSE and the region overlaps no significant exon
+    })
+
+    sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[both$sig], minoverlap = 20L) == 0 & fullRegs20$sigFWER])[1:10], function(reg) {
+        regPlot(reg, 'DER query: only exon not DE') ## sigFWER is TRUE but the region overlaps no significant exon
+    })
+
+    sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[both$sig], minoverlap = 20L) > 0 & !fullRegs20$sigFWER])[1:10], function(reg) {
+        regPlot(reg, 'DER query: only exon DE') ## sigFWER is FALSE but the region overlaps a significant exon
+    })
+}
+pdf(file = 'query_der_plots.pdf', width = 10, height = 7)
+query_der_plots()
+ +
## $`540`
+## NULL
+## 
+## $`625`
+## NULL
+## 
+## $`518`
+## NULL
+## 
+## $`589`
+## NULL
+## 
+## $`610`
+## NULL
+## 
+## $`616`
+## NULL
+## 
+## $`533`
+## NULL
+## 
+## $`643`
+## NULL
+## 
+## $`696`
+## NULL
+## 
+## $`520`
+## NULL
+
+ +
dev.off()
+ +
## pdf 
+##   2
+
+ +
query_der_plots()
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+ +
## $`540`
+## NULL
+## 
+## $`625`
+## NULL
+## 
+## $`518`
+## NULL
+## 
+## $`589`
+## NULL
+## 
+## $`610`
+## NULL
+## 
+## $`616`
+## NULL
+## 
+## $`533`
+## NULL
+## 
+## $`643`
+## NULL
+## 
+## $`696`
+## NULL
+## 
+## $`520`
+## NULL
+
+
+

Query: exons

+

As was shown with either DESeq2 or edgeR-robust results, derfinder is more conservative than the counts-based methods.

+
+ +
## Overlap between exons and significant DERs, min 20 bp
+ov_table(fullRegions, both, 'counts', 20L)
+ +
##                    Overlaps sig DER (FWER, min 20bp)
+## Significant DE exon  FALSE   TRUE    Sum
+##               FALSE 202105    125 202230
+##               TRUE   11730    205  11935
+##               Sum   213835    330 214165
+
+ +
## Explore mismatched cases
+noNA(explore_ov_counts(fullRegions, both, minov = 20L)[1:3])
+ +
## $n_overlaps
+## 
+##   1   2   3   4   9 
+## 112   9   2   1   1 
+## 
+## $width_exon
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##      51     180     890    1354    1905    9761 
+## 
+## $exons_per_der_table
+## 
+##   1   2 
+## 147   1
+
+ +
noNA(explore_ov_counts(fullRegions, both, 'TRUE-FALSE', minov = 20L)[1:3])
+ +
## $width_exon
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##      21     139     321    1179    1595  205000 
+## 
+## $distance_nearest_sum
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##       0       0    6672   89960   60730 5353000 
+## 
+## $distance_nearest_sig_sum
+##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
+##       54   535600  2094000  4886000  5941000 48690000      199
+
+ +
## With Holm, 20 bp
+ov_table(fullRegions, both_holm, 'counts', 20L)
+ +
##                    Overlaps sig DER (FWER, min 20bp)
+## Significant DE exon  FALSE   TRUE    Sum
+##               FALSE 213563    277 213840
+##               TRUE     272     53    325
+##               Sum   213835    330 214165
+
+
+

We can now visually explore some exons for each of the four cases.

+
+ +
query_exon_plots <- function() { ## Plot the 10 widest exons of `both` (width >= 20) for each of four cases; "overlap" means at least 20 bp shared with a region whose sigFWER is TRUE. Returns the last sapply() result (a list of NULLs)
+    sapply(sortWidth(both[width(both) >= 20 & both$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) > 0])[1:10], function(reg) {
+        regPlot(reg, 'Exon query: DE agreement') ## exon is significant and overlaps a significant region
+    })
+
+    sapply(sortWidth(both[width(both) >= 20 & !both$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) == 0])[1:10], function(reg) {
+        regPlot(reg, 'Exon query: not DE agreement') ## exon is not significant and overlaps no significant region
+    })
+
+    sapply(sortWidth(both[width(both) >= 20 & !both$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) > 0])[1:10], function(reg) {
+        regPlot(reg, 'Exon query: only exon not DE') ## exon is not significant but overlaps a significant region
+    })
+
+    sapply(sortWidth(both[width(both) >= 20 & both$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) == 0])[1:10], function(reg) {
+        regPlot(reg, 'Exon query: only exon DE') ## exon is significant but overlaps no significant region
+    })
+}
+pdf(file = 'query_exon_plots.pdf', width = 10, height = 7)
+query_exon_plots()
+ +
## $`290397`
+## NULL
+## 
+## $`312032`
+## NULL
+## 
+## $`473746`
+## NULL
+## 
+## $`262083`
+## NULL
+## 
+## $`69671`
+## NULL
+## 
+## $`77501`
+## NULL
+## 
+## $`97002`
+## NULL
+## 
+## $`183187`
+## NULL
+## 
+## $`105322`
+## NULL
+## 
+## $`462539`
+## NULL
+
+ +
dev.off()
+ +
## pdf 
+##   2
+
+ +
query_exon_plots()
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+ +
## $`290397`
+## NULL
+## 
+## $`312032`
+## NULL
+## 
+## $`473746`
+## NULL
+## 
+## $`262083`
+## NULL
+## 
+## $`69671`
+## NULL
+## 
+## $`77501`
+## NULL
+## 
+## $`97002`
+## NULL
+## 
+## $`183187`
+## NULL
+## 
+## $`105322`
+## NULL
+## 
+## $`462539`
+## NULL
+
+
+

Finding regions of interest

+

The code in this section is partially based on /home/epi/ajaffe/Lieber/Projects/RNAseq/HippoPublic/clean_previous_hits.R.

+

First we find the regions of the genome corresponding to the genes of interest. That is, the genes from the original paper that were differentially expressed in the conditions we are analyzing.

+
+ +
## Required pkgs
+library("GenomicRanges")
+library("ggbio")
+ +
## Loading required package: ggplot2
+## Need specific help about ggbio? try mailing 
+##  the maintainer or visit http://tengfei.github.com/ggbio/
+## 
+## Attaching package: 'ggbio'
+## 
+## The following objects are masked from 'package:ggplot2':
+## 
+##     geom_bar, geom_rect, geom_segment, ggsave, stat_bin,
+##     stat_identity, xlim
+
+ +
library("reshape2")
+library("plyr")
+ +
## 
+## Attaching package: 'plyr'
+## 
+## The following object is masked from 'package:IRanges':
+## 
+##     desc
+## 
+## The following object is masked from 'package:S4Vectors':
+## 
+##     rename
+
+ +
library("scales")
+ +
## 
+## Attaching package: 'scales'
+## 
+## The following object is masked from 'package:ggbio':
+## 
+##     rescale
+
+ +
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
+library("org.Hs.eg.db")
+ +
## Loading required package: DBI
+
+
+
+ +
## Gene symbol names of interest
+#### Original names
+## symbols <- c("HIST1H4E", "RN7SK", "CDR1", "SNORD89", "SNORA73A", "SCARNA17", "PAPD1", "CACNB2", "LRCH4", "SNORD42A", "SNORA47", "LENG8", "FAM123A", "HIVEP3", "HNRPH1", "ZGPAT", "ERF", "SNORD116-29", "C9orf139", "C9orf3", "KCNA2", "EXOC6B", "CENTB5", "TAOK2", "TNRC6C", "ADAMTS4", "MSH4", "C16orf72", "CCR5")
+
+## http://www.genenames.org/data/hgnc_data.php?hgnc_id=25532
+## http://www.genenames.org/data/hgnc_data.php?hgnc_id=26360
+## http://www.genenames.org/data/hgnc_data.php?hgnc_id=5041
+## http://www.genenames.org/data/hgnc_data.php?hgnc_id=16754
+
+## Updated symbols
+symbols <- c("HIST1H4E", "RN7SK", "CDR1", "SNORD89", "SNORA73A", "SCARNA17", "MTPAP", "CACNB2", "LRCH4", "SNORD42A", "SNORA47", "LENG8", "AMER2", "HIVEP3", "HNRNPH1", "ZGPAT", "ERF", "SNORD116-29", "C9orf139", "C9orf3", "KCNA2", "EXOC6B", "ACAP3", "TAOK2", "TNRC6C", "ADAMTS4", "MSH4", "C16orf72", "CCR5")
+
+
+## Map gene symbol names to Entrez gene ids using the full ENTREZID/SYMBOL table from org.Hs.eg.db
+
+keys <- keys(org.Hs.eg.db, keytype = "ENTREZID")
+columns <- c("SYMBOL")
+map <- select(org.Hs.eg.db, keys, columns, keytype = "ENTREZID")
+idx <- sapply(symbols, function(x) { 
+    res <- which(map$SYMBOL == x)
+    ifelse(length(res) > 0, res, NA) ## NA when the symbol is absent; ifelse() with a length-1 test keeps only the first matching row
+})
+ids <- map$ENTREZID[idx]
+names(ids) <- names(idx)
+
+## Remove the symbols that were not found in the table
+ids <- ids[!is.na(ids)]
+
+## Find the exons annotated to the gene ids of interest
+
+txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+exonsUCSC <- exons(txdb, vals = list(gene_id =  ids), columns = c("gene_id", "exon_id", "tx_name", "tx_id"))
+
+## Some exons list several gene ids; keep only the ids of interest (assumes one remains per exon, since split() below needs one label per exon)
+gids <- as.list(exonsUCSC$gene_id)
+for(i in seq(along=gids)) gids[[i]] <- gids[[i]][which(gids[[i]] %in% ids)]
+gids <- unlist(gids)
+
+
+## Split the exons into a list by Entrez gene id, then order the list as in ids
+exonListUCSC <- split(exonsUCSC, gids)
+exonListUCSC <- exonListUCSC[ ids[ids %in% gids] ]
+### drop duplicated exons
+# exonListUCSC = lapply(exonListUCSC, function(x) x[!duplicated(x)])
+# identical(names(exonListUCSC), ids[ids %in% gids]) # TRUE
+
+## Genes of interest for which no exons were returned
+ids[which(!ids %in% gids)]
+ +
##    RN7SK SNORA73A 
+## "125050"   "6080"
+
+ +
## Add the genes without exons manually, as one hard-coded range each (see the NCBI gene pages below)
+# http://www.ncbi.nlm.nih.gov/gene/125050
+# http://www.ncbi.nlm.nih.gov/gene/6080
+missing <- GRanges(seqnames=c("chr6", "chr1"), ranges=IRanges(start=c(52860418, 28833877), end=c(52860749, 28834083)))
+toAdd <- split(missing, 1:2)
+names(toAdd) <- ids[which(!ids %in% gids)] ## NOTE(review): relies on the not-found ids coming out in the same order as the ranges above (chr6 first, then chr1)
+
+## Reduce each gene's exons with range() (min/max per gene), then append the manual ranges
+windows <- c(GRangesList(lapply(exonListUCSC, range)), toAdd)
+ +
## 
+## Attaching package: 'XVector'
+## 
+## The following object is masked from 'package:plyr':
+## 
+##     compact
+
+ +
## Save for later use
+save(windows, ids, idx, file="windows.Rdata")
+
+

Original genes

+

In this section, we make a plot for each gene showing the coverage data and whether derfinder identified candidate DERs as described in the main text.

+
+ +
## Find the chromosomes that contain at least one window
+chrs <- as.character(unique(unlist(seqnames(windows), use.names=FALSE)))
+
+## Build one ideogram per chromosome up front; regionClusterPlot() looks them up by chromosome name
+data(hg19IdeogramCyto, package = "biovizBase")
+p.ideos <- lapply(chrs, function(xx) { 
+    plotIdeogram(hg19IdeogramCyto, xx)
+})
+names(p.ideos) <- chrs
+
+
+## Keep only the coverage for those chromosomes and the columns in colsubset
+fullCovSmall <- lapply(chrs, function(chr) {
+    fullCov[[chr]][, colsubset]
+})
+names(fullCovSmall) <- chrs
+rm(fullCov) ## Drop the full coverage object now that the subset has been extracted
+
+## Main plotting function
+plotClusterCustom <- function(cluster, regions, titleName, coverageInfo, groupInfo, titleUse="fwer", txdb=NULL, p.ideogram=NULL, maxExtend=300L, colsubset=NULL, forceLarge=FALSE) {
+    ## Build a tracks() plot around `cluster`: ideogram, per-sample coverage, mean coverage by group, overlapping `regions` and (if txdb is given) transcripts. groupInfo must be a factor; titleUse ("pval", "qval" or "fwer") picks the column used to fill the regions. Returns the tracks object, or invisibly the window length when no plot is made
+    stopifnot(is.factor(groupInfo))
+    if(is.null(colsubset)) colsubset <- seq_len(length(groupInfo))
+    
+    ## Window length: the cluster width plus, on each side, the smaller of maxExtend and the cluster width
+    l <-  width(cluster) + 2 * min(maxExtend, width(cluster))
+    
+    if(l > 1e5 & !forceLarge) {
+        message(paste("No plot will be made because the data is too large. The window size exceeds 100 kb."))
+        return(invisible(l))
+    }
+    
+    wh <- resize(cluster, l, fix="center")
+    title <- paste("Window view for ENTREZ Symbol", titleName)
+    
+    ## Plot the ideogram if not supplied
+    if(is.null(p.ideogram)) {
+        chr <- as.character(seqnames(wh))
+        ## Now load the ideogram info
+        hg19IdeogramCyto <- NULL
+        load(system.file("data", "hg19IdeogramCyto.rda", package="biovizBase", mustWork=TRUE))
+        p.ideogram <- plotIdeogram(hg19IdeogramCyto, chr)
+    }
+    
+    ## Regions overlapping the window; if there are none, use the window itself with NA significance columns
+    neighbors <- regions[queryHits(findOverlaps(regions, wh))]
+    if(length(neighbors) == 0) {
+        neighbors <- wh
+        neighbors$significant <- NA
+        neighbors$significantQval <- NA
+        neighbors$significantFWER <- NA
+    } 
+    if(titleUse == "pval") {
+        p.region <- autoplot(neighbors, aes(fill=significant)) + 
+        scale_fill_manual(values=c("chartreuse4", "wheat2"), limits=c("TRUE", "FALSE")) 
+    } else if (titleUse == "qval" ){
+        p.region <- autoplot(neighbors, aes(fill=significantQval)) +
+        scale_fill_manual(values=c("chartreuse4", "wheat2"), limits=c("TRUE", "FALSE")) 
+    } else if (titleUse == "fwer" ){
+        p.region <- autoplot(neighbors, aes(fill=significantFWER)) +
+        scale_fill_manual(values=c("chartreuse4", "wheat2"), limits=c("TRUE", "FALSE")) 
+    } else {
+        p.region <- autoplot(neighbors)
+    }
+
+    ## Number of groups, used below to set the line transparency
+    nGroups <- length(levels(groupInfo))
+    
+    ## Construct the coverage plot: one line per column of coverageInfo, coloured by group, on a log2 y axis
+    pos <- start(wh):end(wh)
+    rawData <- as.data.frame(coverageInfo[pos, colsubset])
+    rawData$position <- pos
+    covData <- melt(rawData, id.vars="position")
+    covData$group <- rep(groupInfo, each=nrow(rawData))
+    p.coverage <- ggplot(covData, aes(x=position, y=value, group=variable, colour=group)) + geom_line(alpha=1/nGroups) + scale_y_continuous(trans=log2_trans())
+    
+    ## Construct mean by group coverage plot
+    meanCoverage <- ddply(covData, c("position", "group"), summarise, meanCov=mean(value))
+    p.meanCov <- ggplot(meanCoverage, aes(x=position, y=meanCov, colour=group)) + geom_line(alpha=1/max(1, 1/2 * nGroups)) + scale_y_continuous(trans=log2_trans())
+    
+    ## Annotation info and final plot; p.transcripts is FALSE when there is no txdb or the transcript plot fails
+    if(is.null(txdb)) {
+        p.transcripts <- FALSE
+    } else {
+        ## The tryCatch is needed because not all regions overlap a transcript
+        p.transcripts <- tryCatch(autoplot(txdb, which = wh, names.expr = "tx_name(gene_id)"), error = function(e) { FALSE })
+    }   
+    if(!is.logical(p.transcripts)) {
+        result <- tracks(p.ideogram, "Coverage" = p.coverage, "Mean coverage" = p.meanCov, "Regions" = p.region, "tx_name\n(gene_id)" = p.transcripts, heights = c(2, 4, 4, 1.5, 3), xlim=wh, title=title) + ylab("") + theme_tracks_sunset()       
+    } else {
+        result <- tracks(p.ideogram, "Coverage" = p.coverage, "Mean coverage" = p.meanCov, "Regions" = p.region, heights = c(2, 5, 5, 2), xlim=wh, title=title) + ylab("") + theme_tracks_sunset()
+    }
+    return(result)  
+}
+
+## Plotting function
+regionClusterPlot <- function(i, tUse="fwer") { ## Print the plotClusterCustom() plot for the i-th element of `windows`; reads the globals windows, p.ideos, fullCovSmall, fullRegions, ids, groupInfo and txdb
+    ## Select the ideogram and the coverage for the chromosome of this window
+    chr <- as.character(seqnames(windows[[i]]))
+    p.ideo <- p.ideos[[chr]]
+    covInfo <- fullCovSmall[[chr]]
+    
+    ## Make the plot, titled with the name in ids whose value matches the window's name; forceLarge=TRUE skips the 100 kb window limit
+    p <- plotClusterCustom(windows[[i]], regions=fullRegions, titleName=names(ids)[ids == names(windows)[i]], coverageInfo=covInfo, groupInfo=groupInfo, titleUse=tUse, txdb=txdb, p.ideogram=p.ideo, forceLarge=TRUE)
+    print(p)
+    rm(p.ideo, covInfo)
+    
+    return(invisible(TRUE)) 
+}
+
+## Make plots
+for(i in seq_len(length(windows))) {
+    regionClusterPlot(i)
+}
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+

Reproducibility

+

Date the report was generated.

+
+ +
## [1] "2015-03-14 04:52:25 EDT"
+
+
+

Wallclock time spent generating the report.

+
+ +
## Time difference of 3.266 hours
+
+
+

R session information.

+
+ +
## Session info -----------------------------------------------------------------------------------------------------------
+
+ +
##  setting  value                                      
+##  version  R version 3.1.1 Patched (2014-10-16 r66782)
+##  system   x86_64, linux-gnu                          
+##  ui       X11                                        
+##  language (EN)                                       
+##  collate  en_US.UTF-8                                
+##  tz       
+
+ +
## Packages ---------------------------------------------------------------------------------------------------------------
+
+ +
##  package                           * version     date       source                                            
+##  acepack                           * 1.3-3.3     2013-05-03 CRAN (R 3.1.1)                                    
+##  annotate                          * 1.44.0      2014-10-15 Bioconductor                                      
+##  AnnotationDbi                       1.28.1      2014-10-29 Bioconductor                                      
+##  base64enc                         * 0.1-2       2014-06-26 CRAN (R 3.1.0)                                    
+##  BatchJobs                         * 1.5         2014-10-30 CRAN (R 3.1.1)                                    
+##  BBmisc                            * 1.9         2015-02-03 CRAN (R 3.1.1)                                    
+##  Biobase                             2.26.0      2014-10-15 Bioconductor                                      
+##  BiocGenerics                        0.12.1      2014-11-15 Bioconductor                                      
+##  BiocParallel                      * 1.0.3       2015-02-09 Bioconductor                                      
+##  biomaRt                           * 2.22.0      2014-10-15 Bioconductor                                      
+##  Biostrings                        * 2.34.1      2014-12-13 Bioconductor                                      
+##  biovizBase                        * 1.14.1      2014-12-14 Bioconductor                                      
+##  bitops                            * 1.0-6       2013-08-17 CRAN (R 3.1.0)                                    
+##  brew                              * 1.0-6       2011-04-13 CRAN (R 3.1.0)                                    
+##  BSgenome                          * 1.34.1      2014-12-31 Bioconductor                                      
+##  bumphunter                        * 1.6.0       2014-10-15 Bioconductor                                      
+##  Cairo                             * 1.5-6       2014-06-26 CRAN (R 3.1.0)                                    
+##  checkmate                         * 1.5.1       2014-12-14 CRAN (R 3.1.1)                                    
+##  cluster                           * 1.15.3      2014-09-04 CRAN (R 3.1.1)                                    
+##  codetools                         * 0.2-9       2014-08-21 CRAN (R 3.1.1)                                    
+##  colorspace                        * 1.2-6       2015-03-11 CRAN (R 3.1.1)                                    
+##  DBI                                 0.3.1       2014-09-24 CRAN (R 3.1.1)                                    
+##  derfinder                           1.0.10      2015-03-14 Bioconductor                                      
+##  derfinderHelper                     1.0.4       2014-11-05 Github (lcolladotor/derfinderHelper@27bcfe6)      
+##  derfinderPlot                       1.0.3       2015-02-22 Github (lcolladotor/derfinderPlot-release@d1dd4bd)
+##  DESeq2                              1.6.3       2015-03-12 Bioconductor                                      
+##  devtools                          * 1.7.0       2015-01-17 CRAN (R 3.1.1)                                    
+##  dichromat                         * 2.0-0       2013-01-24 CRAN (R 3.1.0)                                    
+##  digest                            * 0.6.8       2014-12-31 CRAN (R 3.1.1)                                    
+##  doRNG                             * 1.6         2014-03-07 CRAN (R 3.1.0)                                    
+##  edgeR                               3.8.6       2015-03-12 Bioconductor                                      
+##  evaluate                          * 0.5.5       2014-04-29 CRAN (R 3.1.0)                                    
+##  fail                              * 1.2         2013-09-19 CRAN (R 3.1.0)                                    
+##  foreach                           * 1.4.2       2014-04-11 CRAN (R 3.1.0)                                    
+##  foreign                           * 0.8-61      2014-03-28 CRAN (R 3.1.1)                                    
+##  formatR                           * 1.0         2014-08-25 CRAN (R 3.1.1)                                    
+##  Formula                           * 1.2-0       2015-01-20 CRAN (R 3.1.1)                                    
+##  genefilter                        * 1.48.1      2014-10-17 Bioconductor                                      
+##  geneplotter                       * 1.44.0      2014-10-15 Bioconductor                                      
+##  GenomeInfoDb                        1.2.4       2014-12-20 Bioconductor                                      
+##  GenomicAlignments                 * 1.2.2       2015-03-04 Bioconductor                                      
+##  GenomicFeatures                     1.18.3      2014-12-17 Bioconductor                                      
+##  GenomicFiles                      * 1.2.1       2015-02-08 Bioconductor                                      
+##  GenomicRanges                       1.18.4      2015-01-08 Bioconductor                                      
+##  GGally                            * 0.5.0       2014-12-02 CRAN (R 3.1.1)                                    
+##  ggbio                               1.14.0      2014-11-04 Bioconductor                                      
+##  ggplot2                             1.0.0       2014-05-21 CRAN (R 3.1.0)                                    
+##  graph                             * 1.44.1      2014-12-10 Bioconductor                                      
+##  gridExtra                         * 0.9.1       2012-08-09 CRAN (R 3.1.1)                                    
+##  gtable                            * 0.1.2       2012-12-05 CRAN (R 3.1.0)                                    
+##  Hmisc                             * 3.15-0      2015-02-16 CRAN (R 3.1.1)                                    
+##  htmltools                         * 0.2.6       2014-09-08 CRAN (R 3.1.1)                                    
+##  IRanges                             2.0.1       2014-12-13 Bioconductor                                      
+##  iterators                         * 1.0.7       2014-04-11 CRAN (R 3.1.0)                                    
+##  knitr                             * 1.9         2015-01-20 CRAN (R 3.1.1)                                    
+##  knitrBootstrap                    * 1.0.0       2014-11-19 Github (jimhester/knitrBootstrap@76c41f0)         
+##  labeling                          * 0.3         2014-08-23 CRAN (R 3.1.1)                                    
+##  lattice                           * 0.20-29     2014-04-04 CRAN (R 3.1.1)                                    
+##  latticeExtra                      * 0.6-26      2013-08-15 CRAN (R 3.1.0)                                    
+##  limma                               3.22.7      2015-03-13 Bioconductor                                      
+##  locfit                            * 1.5-9.1     2013-04-20 CRAN (R 3.1.0)                                    
+##  markdown                          * 0.7.4       2014-08-24 CRAN (R 3.1.1)                                    
+##  MASS                              * 7.3-35      2014-09-30 CRAN (R 3.1.1)                                    
+##  Matrix                            * 1.1-4       2014-06-15 CRAN (R 3.1.1)                                    
+##  matrixStats                       * 0.14.0      2015-02-14 CRAN (R 3.1.1)                                    
+##  mime                              * 0.2         2014-09-26 CRAN (R 3.1.1)                                    
+##  munsell                           * 0.4.2       2013-07-11 CRAN (R 3.1.0)                                    
+##  nnet                              * 7.3-8       2014-03-28 CRAN (R 3.1.1)                                    
+##  OrganismDbi                       * 1.8.1       2015-03-11 Bioconductor                                      
+##  org.Hs.eg.db                        3.0.0       2014-09-27 Bioconductor                                      
+##  pkgmaker                          * 0.22        2014-05-14 CRAN (R 3.1.0)                                    
+##  plyr                                1.8.1       2014-02-26 CRAN (R 3.1.0)                                    
+##  proto                             * 0.3-10      2012-12-22 CRAN (R 3.1.0)                                    
+##  qvalue                            * 1.43.0      2015-03-06 Bioconductor                                      
+##  RBGL                              * 1.42.0      2014-10-15 Bioconductor                                      
+##  RColorBrewer                      * 1.1-2       2014-12-07 CRAN (R 3.1.1)                                    
+##  Rcpp                                0.11.5      2015-03-06 CRAN (R 3.1.1)                                    
+##  RcppArmadillo                       0.4.650.1.1 2015-02-26 CRAN (R 3.1.1)                                    
+##  RCurl                             * 1.95-4.5    2014-12-06 CRAN (R 3.1.1)                                    
+##  registry                          * 0.2         2012-01-24 CRAN (R 3.1.0)                                    
+##  reshape                           * 0.8.5       2014-04-23 CRAN (R 3.1.0)                                    
+##  reshape2                            1.4.1       2014-12-06 CRAN (R 3.1.1)                                    
+##  rmarkdown                           0.5.1       2015-01-26 CRAN (R 3.1.1)                                    
+##  rngtools                          * 1.2.4       2014-03-06 CRAN (R 3.1.0)                                    
+##  rpart                             * 4.1-8       2014-03-28 CRAN (R 3.1.1)                                    
+##  Rsamtools                         * 1.18.3      2015-03-04 Bioconductor                                      
+##  RSQLite                             1.0.0       2014-10-25 CRAN (R 3.1.1)                                    
+##  rstudioapi                        * 0.2         2014-12-31 CRAN (R 3.1.1)                                    
+##  rtracklayer                       * 1.26.2      2014-11-12 Bioconductor                                      
+##  S4Vectors                           0.4.0       2014-10-15 Bioconductor                                      
+##  scales                              0.2.4       2014-04-22 CRAN (R 3.1.0)                                    
+##  sendmailR                         * 1.2-1       2014-09-21 CRAN (R 3.1.1)                                    
+##  stringr                           * 0.6.2       2012-12-06 CRAN (R 3.1.0)                                    
+##  survival                          * 2.37-7      2014-01-22 CRAN (R 3.1.1)                                    
+##  TxDb.Hsapiens.UCSC.hg19.knownGene   3.0.0       2014-09-29 Bioconductor                                      
+##  VariantAnnotation                 * 1.12.9      2015-01-22 Bioconductor                                      
+##  XML                               * 3.98-1.1    2013-06-20 CRAN (R 3.1.0)                                    
+##  xtable                            * 1.7-4       2014-09-12 CRAN (R 3.1.1)                                    
+##  XVector                             0.6.0       2014-10-15 Bioconductor                                      
+##  yaml                              * 2.1.13      2014-06-12 CRAN (R 3.1.1)                                    
+##  zlibbioc                          * 1.12.0      2014-10-15 Bioconductor
+
+
+
+
+ + +
+
+ +
+ + +
+ + diff --git a/hippo/pnas/runComparison.sh b/hippo/pnas/runComparison.sh new file mode 100755 index 0000000..d5231f3 --- /dev/null +++ b/hippo/pnas/runComparison.sh @@ -0,0 +1,13 @@ +#!/bin/bash +#$ -cwd +#$ -m e +#$ -l mem_free=30G,h_vmem=100G +#$ -N PNAS-hippo +echo "**** Job starts ****" +date + +# Generate HTML +Rscript -e "library(rmarkdown); render('compareVsPNAS.Rmd', clean = FALSE)" + +echo "**** Job ends ****" +date diff --git a/hippo/summaryInfo/run3-v1.0.10/summaryInfo.html b/hippo/summaryInfo/run3-v1.0.10/summaryInfo.html new file mode 100644 index 0000000..ab4cde5 --- /dev/null +++ b/hippo/summaryInfo/run3-v1.0.10/summaryInfo.html @@ -0,0 +1,1330 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Summary info

+
## Options: opt$example holds R code as text and is evaluated into an R object here
+opt$example <- eval(parse(text=opt$example)) ## NOTE(review): eval(parse()) runs whatever code the option contains; confirm opt$example only comes from a trusted caller
+example <- opt$example
+names(example) <- NULL
+
+
+## Required libs
+suppressMessages(library("GenomicRanges"))
+suppressMessages(library("ggbio"))
+suppressMessages(library("TxDb.Hsapiens.UCSC.hg19.knownGene"))
+suppressMessages(library("derfinder"))
+suppressMessages(library("derfinderPlot"))
+ +

Summary information for hippo data set, analysis run3-v1.0.10 showcasing best clusters 3, 4, 8 which illustrate a coverage dip, the complex relationship with annotation, and a potentially extended UTR respectively.

+

Number of filtered bases

+
## Extract data from log files: grep the filterData lines out of the fullCov logs, dropping the lines that report 0 percent
+reads <- system(paste0('grep filterData ', file.path(rootdir, opt$short, 'CoverageInfo', 'logs'), '/fullCov-*.e* | grep -v "that 0 percent"'), intern=TRUE)
+filt <- data.frame(original=as.integer(gsub("were | rows", "", regmatches(reads, regexpr("were [0-9]* rows", reads)))), filtered=as.integer(gsub("are | rows", "", regmatches(reads, regexpr("are [0-9]* rows", reads)))))  ## original = integer after the word were, filtered = integer after the word are, taken from each matching log line
+
+## How many were filtered?
+## What is the percent filtered?
+## Percent remaining?
+filtered <- colSums(filt)  ## column totals over all matched log lines
+summ <- c(
+    'Filtered' = filtered["original"] - filtered["filtered"],  ## prints as Filtered.original because c() joins the outer name with the inner element name
+    'PercentFilt' = (filtered["original"] - filtered["filtered"]) / filtered["original"] * 100,
+    'PercentRemaining' = 100 - (filtered["original"] - filtered["filtered"]) / filtered["original"] * 100  ## complement of PercentFilt
+)
+summ
##         Filtered.original      PercentFilt.original 
+##              3.059211e+09              9.882204e+01 
+## PercentRemaining.original 
+##              1.177965e+00
+
+ +

Number of candidate regions

+
## Load regions data
+load(file.path(rootdir, opt$short, 'derAnalysis', opt$run, 'fullRegions.Rdata'))
+
+## How many candidate regions?
+nRegs  <- c('cDERsN' = length(fullRegions))
+nRegs
## cDERsN 
+##  31978
+
+ +

Number of DE regions

+

As determined by q-value < 0.10

+
+
## How many regions DE? Judged by q-value
+## What is the percent of regions DE among the candidate ones?
+qval <- c(
+    'nDE' = sum(fullRegions$significantQval == TRUE),
+    'percentDE' = sum(fullRegions$significantQval == TRUE) / length(fullRegions) * 100
+    )
+qval
##        nDE  percentDE 
+## 1965.00000    6.14485
+
+ +

As determined by FWER adjusted p-value < 0.05

+
+
## How many regions DE? Judged by FWER adjusted p-value
+## What is the percent of regions DE among the candidate ones?
+fwer <- c(
+    'nDE' = sum(fullRegions$significantFWER == TRUE),
+    'percentDE' = sum(fullRegions$significantFWER == TRUE) / length(fullRegions) * 100
+    )
+fwer
##        nDE  percentDE 
+## 515.000000   1.610482
+
## Save results
+save(summ, nRegs, qval, fwer, file=file.path(resdir, "summaryResults.Rdata"))
+ + +

Example regions from each case

+
+
## Load full coverage data
+load(file.path(rootdir, opt$short, 'CoverageInfo', 'fullCov.Rdata'))
+
+## Load options
+load(file.path(rootdir, opt$short, 'derAnalysis', opt$run, 'chr1', 'optionsStats.Rdata'))
+
+## For ggplot
+tmp <- fullRegions
+names(tmp) <- seq_len(length(tmp))
+regions.df <- as.data.frame(tmp)
+regions.df$width <- width(tmp)
+rm(tmp)
+
+## Select clusters by cluster area: sum the area of the regions in each cluster and keep the 20 clusters with the largest totals
+df <- data.frame(area=fullRegions$area, clusterChr=paste0(as.integer(fullRegions$cluster), chr=as.character(seqnames(fullRegions))))  ## key = cluster number pasted with the chromosome name; the chr= label is just another paste0() input and does not change the key
+regionClustAreas <- tapply(df$area, df$clusterChr, sum)  ## total area per cluster key
+bestArea <- sapply(names(head(sort(regionClustAreas, decreasing=TRUE), 20)), function(y) { which(df$clusterChr == y)[[1]]})  ## for each top cluster (at most 20), the index in fullRegions of its first region
+
+## Graphical setup: ideograms 
+## Load ideogram info
+data(hg19IdeogramCyto, package = "biovizBase")
+ideos.set <- as.character(unique(seqnames(fullRegions[bestArea])))
+p.ideos <- lapply(ideos.set, function(xx) { 
+    plotIdeogram(hg19IdeogramCyto, xx)
+})
+names(p.ideos) <- ideos.set
+
+
+## Graphical setup: transcription database
+txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+
+## Graphical setup: main plotting function. Calls plotCluster() for region idx (an index into fullRegions), prints the plot, and saves it under resdir as exampleRegion<idx>.Rdata and exampleRegion<idx>.pdf. tUse is passed on as titleUse. Reads the globals fullRegions, regions.df, fullCov, optionsStats, txdb, p.ideos and resdir. Returns TRUE invisibly.
+regionClusterPlot <- function(idx, tUse="fwer") {
+    ## Chr specific selections: ideogram and coverage for the chromosome of region idx
+    chr <- as.character(seqnames(fullRegions[idx]))
+    p.ideo <- p.ideos[[chr]]  ## ideogram built earlier with plotIdeogram() for this chromosome
+    covInfo <- fullCov[[chr]]  ## element of the loaded fullCov object for this chromosome
+    
+    ## Make the plot
+    p <- plotCluster(idx, regions=fullRegions, annotation=regions.df, coverageInfo=covInfo, groupInfo=optionsStats$groupInfo, titleUse=tUse, txdb=txdb, p.ideogram=p.ideo)
+    print(p)  ## first print goes to the current graphics device (presumably the report figure -- confirm)
+    
+    ## Save .Rdata: keeps the plot object p itself so it can be reloaded later
+    save(p, file=file.path(resdir, paste0("exampleRegion", idx, ".Rdata")) )
+    
+    ## Save as pdf: the plot is printed a second time, now into a 20 x 10 pdf device
+    pdf(file=file.path(resdir, paste0("exampleRegion", idx, ".pdf")), width=20, height=10)
+    print(p)
+    dev.off()
+    rm(p.ideo, covInfo)  ## drop the local references before returning
+    
+    return(invisible(TRUE)) ## called for its side effects only
+}
+
+## Genome plots: opt$example indexes into bestArea, so each value picks a cluster by its position in the area-sorted list
+for(idx in opt$example) {
+    regionClusterPlot(bestArea[idx], "fwer")  ## bestArea[idx] is the fullRegions index of the first region of that cluster
+}
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+

Reproducibility

+

Date the report was generated.

+
## [1] "2014-11-24 16:38:41 EST"
+
+ +

Wallclock time spent generating the report.

+
+
## Time difference of 3.941 mins
+
+ +

R session information.

+
+
## Session info
+options(width=120)
+devtools::session_info()
## Session info-----------------------------------------------------------------------------------------------------------
+
##  setting  value                                      
+##  version  R version 3.1.1 Patched (2014-10-16 r66782)
+##  system   x86_64, linux-gnu                          
+##  ui       X11                                        
+##  language (EN)                                       
+##  collate  en_US.UTF-8                                
+##  tz       
+
## Packages---------------------------------------------------------------------------------------------------------------
+
##  package                           * version  date       source                                      
+##  acepack                             1.3.3.3  2013-05-03 CRAN (R 3.1.1)                              
+##  AnnotationDbi                     * 1.28.1   2014-10-29 Bioconductor                                
+##  base64enc                           0.1.2    2014-06-26 CRAN (R 3.1.0)                              
+##  BatchJobs                           1.5      2014-10-30 CRAN (R 3.1.1)                              
+##  BBmisc                              1.8      2014-10-30 CRAN (R 3.1.1)                              
+##  Biobase                           * 2.26.0   2014-10-15 Bioconductor                                
+##  BiocGenerics                      * 0.12.1   2014-11-15 Bioconductor                                
+##  BiocParallel                        1.0.0    2014-10-15 Bioconductor                                
+##  biomaRt                             2.22.0   2014-10-15 Bioconductor                                
+##  Biostrings                          2.34.0   2014-10-15 Bioconductor                                
+##  biovizBase                          1.14.0   2014-10-15 Bioconductor                                
+##  bitops                              1.0.6    2013-08-17 CRAN (R 3.1.0)                              
+##  brew                                1.0.6    2011-04-13 CRAN (R 3.1.0)                              
+##  BSgenome                            1.34.0   2014-10-15 Bioconductor                                
+##  bumphunter                          1.6.0    2014-10-15 Bioconductor                                
+##  Cairo                               1.5.6    2014-06-26 CRAN (R 3.1.0)                              
+##  checkmate                           1.5.0    2014-10-19 CRAN (R 3.1.1)                              
+##  cluster                             1.15.3   2014-09-04 CRAN (R 3.1.1)                              
+##  codetools                           0.2.9    2014-08-21 CRAN (R 3.1.1)                              
+##  colorspace                          1.2.4    2013-09-30 CRAN (R 3.1.0)                              
+##  DBI                                 0.3.1    2014-09-24 CRAN (R 3.1.1)                              
+##  derfinder                         * 1.0.10   2014-11-23 Bioconductor                                
+##  derfinderHelper                     1.0.4    2014-11-05 Github (lcolladotor/derfinderHelper@27bcfe6)
+##  derfinderPlot                     * 1.0.3    2014-11-05 Github (lcolladotor/derfinderPlot@8d076e3)  
+##  devtools                            1.6.1    2014-10-07 CRAN (R 3.1.1)                              
+##  dichromat                           2.0.0    2013-01-24 CRAN (R 3.1.0)                              
+##  digest                              0.6.4    2013-12-03 CRAN (R 3.1.0)                              
+##  doRNG                               1.6      2014-03-07 CRAN (R 3.1.0)                              
+##  evaluate                            0.5.5    2014-04-29 CRAN (R 3.1.0)                              
+##  fail                                1.2      2013-09-19 CRAN (R 3.1.0)                              
+##  foreach                             1.4.2    2014-04-11 CRAN (R 3.1.0)                              
+##  foreign                             0.8.61   2014-03-28 CRAN (R 3.1.1)                              
+##  formatR                             1.0      2014-08-25 CRAN (R 3.1.1)                              
+##  Formula                             1.1.2    2014-07-13 CRAN (R 3.1.1)                              
+##  GenomeInfoDb                      * 1.2.3    2014-11-15 Bioconductor                                
+##  GenomicAlignments                   1.2.1    2014-11-05 Bioconductor                                
+##  GenomicFeatures                   * 1.18.2   2014-10-29 Bioconductor                                
+##  GenomicFiles                        1.2.0    2014-10-15 Bioconductor                                
+##  GenomicRanges                     * 1.18.3   2014-11-20 Bioconductor                                
+##  getopt                            * 1.20.0   2013-08-30 CRAN (R 3.1.1)                              
+##  GGally                              0.4.8    2014-08-26 CRAN (R 3.1.1)                              
+##  ggbio                             * 1.14.0   2014-11-04 Bioconductor                                
+##  ggplot2                           * 1.0.0    2014-05-21 CRAN (R 3.1.0)                              
+##  graph                               1.44.0   2014-10-15 Bioconductor                                
+##  gridExtra                           0.9.1    2012-08-09 CRAN (R 3.1.1)                              
+##  gtable                              0.1.2    2012-12-05 CRAN (R 3.1.0)                              
+##  Hmisc                               3.14.6   2014-11-22 CRAN (R 3.1.1)                              
+##  htmltools                           0.2.6    2014-09-08 CRAN (R 3.1.1)                              
+##  IRanges                           * 2.0.0    2014-10-15 Bioconductor                                
+##  iterators                           1.0.7    2014-04-11 CRAN (R 3.1.0)                              
+##  knitr                               1.8      2014-11-11 CRAN (R 3.1.1)                              
+##  knitrBootstrap                    * 1.0.0    2014-11-19 Github (jimhester/knitrBootstrap@76c41f0)   
+##  labeling                            0.3      2014-08-23 CRAN (R 3.1.1)                              
+##  lattice                             0.20.29  2014-04-04 CRAN (R 3.1.1)                              
+##  latticeExtra                        0.6.26   2013-08-15 CRAN (R 3.1.0)                              
+##  locfit                              1.5.9.1  2013-04-20 CRAN (R 3.1.0)                              
+##  markdown                            0.7.4    2014-08-24 CRAN (R 3.1.1)                              
+##  MASS                                7.3.35   2014-09-30 CRAN (R 3.1.1)                              
+##  Matrix                              1.1.4    2014-06-15 CRAN (R 3.1.1)                              
+##  matrixStats                         0.10.3   2014-10-15 CRAN (R 3.1.1)                              
+##  mime                                0.2      2014-09-26 CRAN (R 3.1.1)                              
+##  munsell                             0.4.2    2013-07-11 CRAN (R 3.1.0)                              
+##  nnet                                7.3.8    2014-03-28 CRAN (R 3.1.1)                              
+##  OrganismDbi                         1.8.0    2014-10-15 Bioconductor                                
+##  pkgmaker                            0.22     2014-05-14 CRAN (R 3.1.0)                              
+##  plyr                                1.8.1    2014-02-26 CRAN (R 3.1.0)                              
+##  proto                               0.3.10   2012-12-22 CRAN (R 3.1.0)                              
+##  qvalue                              1.40.0   2014-10-15 Bioconductor                                
+##  RBGL                                1.42.0   2014-10-15 Bioconductor                                
+##  RColorBrewer                        1.0.5    2011-06-17 CRAN (R 3.1.0)                              
+##  Rcpp                                0.11.3   2014-09-29 CRAN (R 3.1.1)                              
+##  RCurl                               1.95.4.3 2014-07-29 CRAN (R 3.1.1)                              
+##  registry                            0.2      2012-01-24 CRAN (R 3.1.0)                              
+##  reshape                             0.8.5    2014-04-23 CRAN (R 3.1.0)                              
+##  reshape2                            1.4      2014-04-23 CRAN (R 3.1.0)                              
+##  rmarkdown                         * 0.3.3    2014-09-17 CRAN (R 3.1.1)                              
+##  R.methodsS3                         1.6.1    2014-01-05 CRAN (R 3.1.0)                              
+##  rngtools                            1.2.4    2014-03-06 CRAN (R 3.1.0)                              
+##  rpart                               4.1.8    2014-03-28 CRAN (R 3.1.1)                              
+##  Rsamtools                           1.18.2   2014-11-12 Bioconductor                                
+##  RSQLite                             1.0.0    2014-10-25 CRAN (R 3.1.1)                              
+##  rstudioapi                          0.1      2014-03-27 CRAN (R 3.1.1)                              
+##  rtracklayer                         1.26.2   2014-11-12 Bioconductor                                
+##  S4Vectors                         * 0.4.0    2014-10-15 Bioconductor                                
+##  scales                              0.2.4    2014-04-22 CRAN (R 3.1.0)                              
+##  sendmailR                           1.2.1    2014-09-21 CRAN (R 3.1.1)                              
+##  stringr                             0.6.2    2012-12-06 CRAN (R 3.1.0)                              
+##  survival                            2.37.7   2014-01-22 CRAN (R 3.1.1)                              
+##  TxDb.Hsapiens.UCSC.hg19.knownGene * 3.0.0    2014-09-29 Bioconductor                                
+##  VariantAnnotation                   1.12.4   2014-11-16 Bioconductor                                
+##  XML                                 3.98.1.1 2013-06-20 CRAN (R 3.1.0)                              
+##  xtable                              1.7.4    2014-09-12 CRAN (R 3.1.1)                              
+##  XVector                           * 0.6.0    2014-10-15 Bioconductor                                
+##  yaml                                2.1.13   2014-06-12 CRAN (R 3.1.1)                              
+##  zlibbioc                            1.12.0   2014-10-15 Bioconductor
+
+ + +
+
+ +
+ + +
+ + diff --git a/index.Rmd b/index.Rmd new file mode 100644 index 0000000..728cf34 --- /dev/null +++ b/index.Rmd @@ -0,0 +1,173 @@ +--- +title: "derfinder counting paper Supplementary Website" +author: "L Collado-Torres" +date: "`r doc_date()`" +output: + BiocStyle::html_document +--- + +```{r citationsSetup, echo=FALSE, message=FALSE, warning=FALSE} +## Track time spent on making the report +startTime <- Sys.time() + +## Bib setup +library('knitcitations') + +## Load knitcitations with a clean bibliography +cleanbib() +cite_options(hyperlink = 'to.doc', citation_format = 'text', style = 'html') +# Note links won't show for now due to the following issue +# https://github.com/cboettig/knitcitations/issues/63 + +bibs <- c("knitcitations" = citation("knitcitations"), + "derfinder" = citation("derfinder")[1], + "regionReport" = citation("regionReport")[1], + "GenomicRanges" = citation("GenomicRanges"), + "DESeq2" = citation("DESeq2"), + "edgeR" = citation("edgeR")[5], + "BiocStyle" = citation("BiocStyle"), + 'rmarkdown' = citation('rmarkdown'), + 'knitr' = citation('knitr')[3], + 'eff' = RefManageR::BibEntry('manual', key = 'eff', title = 'Efficiency analysis of Sun Grid Engine batch jobs', author = 'Alyssa Frazee', year = 2014, url = 'http://dx.doi.org/10.6084/m9.figshare.878000'), + 'zhou2011' = RefManageR::BibEntry('article', key = 'zhou2011', author = "Zhou, Zhifeng and Yuan, Qiaoping and Mash, Deborah C and Goldman, David", title = "Substance-specific and shared transcription and epigenetic changes in the human hippocampus chronically exposed to cocaine and alcohol", journal = "Proceedings of the National Academy of Sciences of the United States of America", year = 2011, volume = "108", number = "16", pages = "6626-6631"), + 'rail' = RefManageR::BibEntry('article', key = 'rail', author = 'Abhinav Nellore and Leonardo Collado-Torres and Andrew E. Jaffe and José Alquicira-Hernández and Jacob Pritt and James Morton and Jeffrey T. 
Leek and Ben Langmead', journal = 'bioRxiv', year = '2015', title = 'Rail-RNA: {Scalable} analysis of {RNA}-seq splicing and coverage'), + 'stringtie' = RefManageR::BibEntry('article', key = 'stringtie', author = ' Mihaela Pertea and Geo M. Pertea and Corina M. Antonescu and Tsung-Cheng Chang and Joshua T. Mendell and Steven L. Salzberg', journal = 'Nature Biotechnology', year = '2015', title = 'StringTie enables improved reconstruction of a transcriptome from RNA-seq reads'), + 'hisat' = RefManageR::BibEntry('article', key = 'hisat', author = 'Daehwan Kim and Ben Langmead and Steven L Salzberg', journal = 'Nature Methods', year = '2015', title = 'HISAT: a fast spliced aligner with low memory requirements'), + 'ballgown' = RefManageR::BibEntry('article', key = 'ballgown', author = 'Alyssa C. Frazee and Geo Pertea and Andrew E. Jaffe and Ben Langmead and Steven L. Salzberg and Jeffrey T. Leek', journal = 'Nature Biotechnology', year = '2015', title = 'Ballgown bridges the gap between transcriptome assembly and expression analysis')) + +write.bibtex(bibs, file = 'index.bib') +bib <- read.bibtex('index.bib') + +## Assign short names +names(bib) <- names(bibs) +``` + +This page describes the supplementary material for the `derfinder` counting paper. All the `bash`, `R` and `R Markdown` source files used to analyze the data for this project as well as generate the HTML reports are available in this website. However, it is easier to view them at [github.com/leekgroup/derCountSupp](https://github.com/leekgroup/derCountSupp). + +# Hippocampus and time-course data sets + +This section of the website describes the code and reports associated with the hippocampus and time-course data sets that are referred to in the paper. + + +## Code to reproduce analyses + + +There are 9 main `bash` scripts named _step1-*_ through _step9-*_. + +1. _fullCoverage_ loads the data from the raw files. 
See [step1-fullCoverage.sh](step1-fullCoverage.sh) and [step1-fullCoverage.R](step1-fullCoverage.R). +1. _makeModels_ creates the models used for the single-level base analysis. See [step2-makeModels.sh](step2-makeModels.sh) and [step2-makeModels.R](step2-makeModels.R). +1. _analyzeChr_ runs the single-base level analysis by chromosome. See [step3-analyzeChr.sh](step3-analyzeChr.sh) and [step3-analyzeChr.R](step3-analyzeChr.R). +1. _mergeResults_ merges the single-base level analysis results for all the chromosomes. See [step4-mergeResults.sh](step4-mergeResults.sh). +1. _derfinderReport_ generates a HTML report for the single-base level DERs. See [step5-derfinderReport.sh](step5-derfinderReport.sh). +1. _regionMatrix_ identifies the expressed regions for the expressed-regions level approach. See [step6-regionMatrix.sh](step6-regionMatrix.sh). +1. _regMatVsDERs_ creates a simple HTML report comparing the single-base and expressed-regions approaches. See [step7-regMatVsDERs.sh](step7-regMatVsDERs.sh) and [step7-regMatVsDERs.Rmd](step7-regMatVsDERs.Rmd). +1. _coverageToExon_ creates an exon count table using known annotation information. See [step8-coverageToExon.sh](step8-coverageToExon.sh) and [step8-coverageToExon.R](step8-coverageToExon.R). +1. _summaryInfo_ creates a HTML report with brief summary information for the given experiment. See [step9-summaryInfo.sh](step9-summaryInfo.sh), [step9-summaryInfo.R](step9-summaryInfo.R), and [step9-summaryInfo.Rmd](step9-summaryInfo.Rmd). + +There are also 3 optional `bash` scripts used when BAM files are available. + +1. _sortSam_ creates sorted by sequence name SAM files. See [optional1-sortSam.sh](optional1-sortSam.sh). +1. _HTSeq_ creates the exon count tables using `HTSeq`. See [optional2-HTSeq.sh](optional2-HTSeq.sh). +1. _summOv_ uses `GenomicRanges` to create the exon count tables. See [optional3-summOv.sh](optional3-summOv.sh) and [optional3-summOv.R](optional3-summOv.R). +1. 
_featureCounts_ + +A final `bash` script, [run-all.sh](run-all.sh), can be used to run the main 9 steps (or a subset of them). + +All scripts show at the beginning the way they were used. Some of them generate intermediate small `bash` scripts, for example one script per chromosome for the _analyzeChr_ step. For some steps, there is a companion `R` or `R Markdown` code file when the code is more involved or an HTML file is generated in the particular step. + + + +The [check-analysis-time.R](check-analysis-time.R) script was useful for checking the progress of the _step3-analyzeChr_ jobs and detect whenever a node in the cluster was presenting problems. + + + +We expect that these scripts will be useful to `derfinder` users who want to automate the single-base level and/or expressed-regions level analyses for several data sets and/or have the jobs run automatically without having to check if each step has finished running. + + +Note that all `bash` scripts are tailored for the cluster we have access to which administer job queues with Sun Grid Engine (SGE). + + +## Single-base level + +### Quick overview HTML report + +This HTML report contains basic information on the `derfinder` `r citep(bib[["derfinder"]])` results from the _Hippo_ data set. The report answers basic questions such as: + +* What is the number of filtered bases? +* What is the number of candidate regions? +* How many candidate regions are significant? + +It also illustrates three clusters of candidate differentially expressed regions (DERs) from the single-base level analysis. You can view the report by following this link: + +* [Hippo](hippo/summaryInfo/run3-v1.0.10/summaryInfo.html) + +### CSV files and annotation comparison + +This HTML report has the code for loading the R data files and generating the CSV files. 
The report also has Venn diagrams showing the number of candidate DERs from the single-base level analysis that overlap known exons, introns and intergenic regions using the UCSC hg19 annotation. It also includes a detailed description of the columns in the CSV files. + +View the [venn](venn/venn.html) report or its `R Markdown` source file [venn.Rmd](venn/venn.Rmd). + + +## Timing and memory information + + +This HTML report has code for reading and processing the time and memory information for each job extracted with [efficiency_analytics](https://github.com/alyssafrazee/efficiency_analytics) `r citep(bib[["eff"]])`. The report contains a detailed description of the analysis steps and tables summarizing the maximum memory and time for each analysis step if all the jobs for that particular step were running simultaneously. Finally, there is an interactive table with the timing results. + +View the [timing](timing/timing.html) report or check the `R Markdown` file [timing.Rmd](timing/timing.Rmd). + + +# Hippo vs previous results + +[compareVsPNAS](hippo/pnas/compareVsPNAS.html) is an HTML report comparing 29 regions that were previously found to be differentially expressed `r citep(bib[["zhou2011"]])` versus the `derfinder` single-base level results. It also has code for identified differentially expressed disjoint exons. The additional script [counts-gene.R](hippo/counts-gene/counts-gene.R) has the code for gene counting with `summarizeOverlaps()`. [compareVsPNAS-gene](hippo/pnas/compareVsPNAS-gene.html) compares the results between `DESeq2` and `edgeR`-robust against `derfinder` at the gene level with 40 total plots: 10 for each case of agreement/disagreement. + +View the [compareVsPNAS](hippo/pnas/compareVsPNAS.html) report or check the `R Markdown` file [compareVsPNAS.Rmd](hippo/pnas/compareVsPNAS.Rmd) run by the [runComparison.sh](hippo/pnas/runComparison.sh) script. 
Also view the [compareVsPNAS-gene](hippo/pnas/compareVsPNAS-gene.html) report and its linked `R Markdown` file [compareVsPNAS-gene.Rmd](hippo/pnas/compareVsPNAS-gene.Rmd). + + + +# Additional analyses + +The following `R` source files have the code for reproducing additional analyses described in the paper + +* [feature_counts.R](additional-analyses/feature_counts.R) Feature counts analysis of Hippo and Snyder data sets. + +These scripts also include other exploratory code. + +# Reproducibility + +Date this page was generated. + +```{r reproducibility1, echo=FALSE} +## Date the report was generated +Sys.time() +``` + +Wallclock time spent generating the report. + +```{r "reproducibility2", echo=FALSE} +## Processing time in seconds +totalTime <- diff(c(startTime, Sys.time())) +round(totalTime, digits=3) +``` + +`R` session information. + +```{r "reproducibility3", echo=FALSE} +## Session info +options(width=120) +devtools::session_info() +``` + +You can view the source `R Markdown` file for this page at [index.Rmd](index.Rmd). + +# Bibliography + +This report was generated using `BiocStyle` `r citep(bib[['BiocStyle']])` +with `knitr` `r citep(bib[['knitr']])` and `rmarkdown` `r citep(bib[['rmarkdown']])` running behind the scenes. + +Citations were made with `knitcitations` `r citep(bib[['knitcitations']])`. Citation file: [index.bib](index.bib). 
+ +```{r vignetteBiblio, results = 'asis', echo = FALSE, warning = FALSE} +## Print bibliography +bibliography() +``` + diff --git a/index.bib b/index.bib new file mode 100644 index 0000000..d9d3e41 --- /dev/null +++ b/index.bib @@ -0,0 +1,126 @@ +@Manual{boettiger2015knitcitations, + title = {knitcitations: Citations for 'Knitr' Markdown Files}, + author = {Carl Boettiger}, + year = {2015}, + note = {R package version 1.0.7}, + url = {http://CRAN.R-project.org/package=knitcitations}, +} + +@Article{colladotorres2015derfinder, + title = {derfinder: Software for annotation-agnostic RNA-seq differential expression analysis}, + author = {Leonardo Collado-Torres and Alyssa C. Frazee and Michael I. Love and Rafael A. Irizarry and Andrew E. Jaffe and Jeffrey T. Leek}, + year = {2015}, + journal = {bioRxiv}, + doi = {10.1101/015370}, + url = {http://www.biorxiv.org/content/early/2015/02/19/015370.abstract}, +} + +@Manual{colladotorres2015regionreport, + title = {regionReport: Generate HTML reports for exploring a set of regions}, + author = {Leonardo Collado-Torres and Andrew E. Jaffe and Jeffrey T. 
Leek}, + year = {2015}, + url = {http://www.bioconductor.org/packages/release/bioc/html/regionReport.html}, + note = {https://github.com/leekgroup/regionReport - R package version 1.4.1}, +} + +@Article{lawrence2013software, + title = {Software for Computing and Annotating Genomic Ranges}, + author = {Michael Lawrence and Wolfgang Huber and Herv\'e Pag\`es and Patrick Aboyoun and Marc Carlson and Robert Gentleman and Martin Morgan and Vincent Carey}, + year = {2013}, + journal = {{PLoS} Computational Biology}, + volume = {9}, + issue = {8}, + doi = {10.1371/journal.pcbi.1003118}, + url = {http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1003118}, +} + +@Article{love2014moderated, + title = {Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2}, + author = {Michael I Love and Wolfgang Huber and Simon Anders}, + year = {2014}, + journal = {Genome Biology}, + doi = {10.1186/s13059-014-0550-8}, + volume = {15}, + issue = {12}, + pages = {550}, +} + +@Article{zhou2014robustly, + title = {Robustly detecting differential expression in RNA sequencing data using observation weights}, + author = {Xiaobei Zhou and Helen Lindsay and Mark D Robinson}, + journal = {Nucleic Acids Research}, + volume = {42}, + pages = {e91}, + year = {2014}, +} + +@Manual{morganbiocstyle, + title = {BiocStyle: Standard styles for vignettes and other Bioconductor documents}, + author = {Martin Morgan and Andrzej Ole{\a's} and Wolfgang Huber}, + note = {R package version 1.8.0}, + url = {https://github.com/Bioconductor/BiocStyle}, +} + +@Manual{allaire2016rmarkdown, + title = {rmarkdown: Dynamic Documents for R}, + author = {JJ Allaire and Joe Cheng and Yihui Xie and Jonathan McPherson and Winston Chang and Jeff Allen and Hadley Wickham and Aron Atkins and Rob Hyndman}, + year = {2016}, + note = {R package version 0.9.2}, + url = {http://CRAN.R-project.org/package=rmarkdown}, +} + +@InCollection{xie2014knitr, + booktitle = {Implementing Reproducible 
Computational Research}, + editor = {Victoria Stodden and Friedrich Leisch and Roger D. Peng}, + title = {knitr: A Comprehensive Tool for Reproducible Research in {R}}, + author = {Yihui Xie}, + publisher = {Chapman and Hall/CRC}, + year = {2014}, + note = {ISBN 978-1466561595}, + url = {http://www.crcpress.com/product/isbn/9781466561595}, +} + +@Manual{eff, + title = {Efficiency analysis of Sun Grid Engine batch jobs}, + author = {Alyssa Frazee}, + year = {2014}, + url = {http://dx.doi.org/10.6084/m9.figshare.878000}, +} + +@Article{zhou2011, + author = {{Zhou} and {Zhifeng} and {Yuan} and {Qiaoping} and {Mash} and Deborah C and {Goldman} and {David}}, + title = {Substance-specific and shared transcription and epigenetic changes in the human hippocampus chronically exposed to cocaine and alcohol}, + journal = {Proceedings of the National Academy of Sciences of the United States of America}, + year = {2011}, + volume = {108}, + number = {16}, + pages = {6626-6631}, +} + +@Article{rail, + author = {Abhinav Nellore and Leonardo Collado-Torres and Andrew E. Jaffe and Jos{\a'e} Alquicira-Hern{\a'a}ndez and Jacob Pritt and James Morton and Jeffrey T. Leek and Ben Langmead}, + journal = {bioRxiv}, + year = {2015}, + title = {Rail-RNA: {Scalable} analysis of {RNA}-seq splicing and coverage}, +} + +@Article{stringtie, + author = { Mihaela Pertea and Geo M. Pertea and Corina M. Antonescu and Tsung-Cheng Chang and Joshua T. Mendell and Steven L. Salzberg}, + journal = {Nature Biotechnology}, + year = {2015}, + title = {StringTie enables improved reconstruction of a transcriptome from RNA-seq reads}, +} + +@Article{hisat, + author = {Daehwan Kim and Ben Langmead and Steven L Salzberg}, + journal = {Nature Methods}, + year = {2015}, + title = {HISAT: a fast spliced aligner with low memory requirements}, +} + +@Article{ballgown, + author = {Alyssa C. Frazee and Geo Pertea and Andrew E. Jaffe and Ben Langmead and Steven L. Salzberg and Jeffrey T. 
Leek}, + journal = {Nature Biotechnology}, + year = {2015}, + title = {Ballgown bridges the gap between transcriptome assembly and expression analysis}, +} diff --git a/index.html b/index.html new file mode 100644 index 0000000..b9fbcef --- /dev/null +++ b/index.html @@ -0,0 +1,231 @@ + + + + + + + + + + + + + + +derfinder counting paper Supplementary Website + + + + + + + + + + + + + + + + + + + + +

Contents

+
+ +
+ +

This page describes the supplementary material for the derfinder counting paper. All the bash, R and R Markdown source files used to analyze the data for this project as well as generate the HTML reports are available in this website. However, it is easier to view them at github.com/leekgroup/derCountSupp.

+
+

1 Hippocampus and time-course data sets

+

This section of the website describes the code and reports associated with the hippocampus and time-course data sets that are referred to in the paper.

+
+

1.1 Code to reproduce analyses

+

There are 9 main bash scripts named _step1-*_ through _step9-*_.

+
    +
  1. fullCoverage loads the data from the raw files. See step1-fullCoverage.sh and step1-fullCoverage.R.
  2. +
  3. makeModels creates the models used for the single-base level analysis. See step2-makeModels.sh and step2-makeModels.R.
  4. +
  5. analyzeChr runs the single-base level analysis by chromosome. See step3-analyzeChr.sh and step3-analyzeChr.R.
  6. +
  7. mergeResults merges the single-base level analysis results for all the chromosomes. See step4-mergeResults.sh.
  8. +
  9. derfinderReport generates an HTML report for the single-base level DERs. See step5-derfinderReport.sh.
  10. +
  11. regionMatrix identifies the expressed regions for the expressed-regions level approach. See step6-regionMatrix.sh.
  12. +
  13. regMatVsDERs creates a simple HTML report comparing the single-base and expressed-regions approaches. See step7-regMatVsDERs.sh and step7-regMatVsDERs.Rmd.
  14. +
  15. coverageToExon creates an exon count table using known annotation information. See step8-coverageToExon.sh and step8-coverageToExon.R.
  16. +
  17. summaryInfo creates an HTML report with brief summary information for the given experiment. See step9-summaryInfo.sh, step9-summaryInfo.R, and step9-summaryInfo.Rmd.
  18. +
+

There are also 3 optional bash scripts used when BAM files are available.

+
    +
  1. sortSam creates SAM files sorted by sequence name. See optional1-sortSam.sh.
  2. +
  3. HTSeq creates the exon count tables using HTSeq. See optional2-HTSeq.sh.
  4. +
  5. summOv uses GenomicRanges to create the exon count tables. See optional3-summOv.sh and optional3-summOv.R.
  6. +
  7. featureCounts
  8. +
+

A final bash script, run-all.sh, can be used to run the main 9 steps (or a subset of them).

+

All scripts show at the beginning the way they were used. Some of them generate intermediate small bash scripts, for example one script per chromosome for the analyzeChr step. For some steps, there is a companion R or R Markdown code file when the code is more involved or an HTML file is generated in the particular step.

+

The check-analysis-time.R script was useful for checking the progress of the step3-analyzeChr jobs and detecting whenever a node in the cluster was presenting problems.

+

We expect that these scripts will be useful to derfinder users who want to automate the single-base level and/or expressed-regions level analyses for several data sets and/or have the jobs run automatically without having to check if each step has finished running.

+

Note that all bash scripts are tailored for the cluster we have access to, which administers job queues with Sun Grid Engine (SGE).

+
+
+

1.2 Single-base level

+
+

1.2.1 Quick overview HTML report

+

This HTML report contains basic information on the derfinder (Collado-Torres, Frazee, Love, Irizarry, et al., 2015) results from the Hippo data set. The report answers basic questions such as:

+
    +
  • What is the number of filtered bases?
  • +
  • What is the number of candidate regions?
  • +
  • How many candidate regions are significant?
  • +
+

It also illustrates three clusters of candidate differentially expressed regions (DERs) from the single-base level analysis. You can view the report by following this link:

+ +
+
+

1.2.2 CSV files and annotation comparison

+

This HTML report has the code for loading the R data files and generating the CSV files. The report also has Venn diagrams showing the number of candidate DERs from the single-base level analysis that overlap known exons, introns and intergenic regions using the UCSC hg19 annotation. It also includes a detailed description of the columns in the CSV files.

+

View the venn report or its R Markdown source file venn.Rmd.

+
+
+
+

1.3 Timing and memory information

+

This HTML report has code for reading and processing the time and memory information for each job extracted with efficiency_analytics (Frazee, 2014). The report contains a detailed description of the analysis steps and tables summarizing the maximum memory and time for each analysis step if all the jobs for that particular step were running simultaneously. Finally, there is an interactive table with the timing results.

+

View the timing report or check the R Markdown file timing.Rmd.

+
+
+
+

2 Hippo vs previous results

+

compareVsPNAS is an HTML report comparing 29 regions that were previously found to be differentially expressed (Zhou, Yuan, Mash, and Goldman, 2011) versus the derfinder single-base level results. It also has code for identifying differentially expressed disjoint exons. The additional script counts-gene.R has the code for gene counting with summarizeOverlaps(). compareVsPNAS-gene compares the results between DESeq2 and edgeR-robust against derfinder at the gene level with 40 total plots: 10 for each case of agreement/disagreement.

+

View the compareVsPNAS report or check the R Markdown file compareVsPNAS.Rmd run by the runComparison.sh script. Also view the compareVsPNAS-gene report and its linked R Markdown file compareVsPNAS-gene.Rmd.

+
+
+

3 Additional analyses

+

The following R source files have the code for reproducing additional analyses described in the paper:

+ +

These scripts also include other exploratory code.

+
+
+

4 Reproducibility

+

Date this page was generated.

+
## [1] "2016-02-20 10:55:44 EST"
+

Wallclock time spent generating the report.

+
## Time difference of 1.217 secs
+

R session information.

+
## Session info -----------------------------------------------------------------------------------------------------------
+
##  setting  value                       
+##  version  R version 3.2.2 (2015-08-14)
+##  system   x86_64, darwin13.4.0        
+##  ui       X11                         
+##  language (EN)                        
+##  collate  en_US.UTF-8                 
+##  tz       America/New_York            
+##  date     2016-02-20
+
## Packages ---------------------------------------------------------------------------------------------------------------
+
##  package       * version  date       source        
+##  bibtex          0.4.0    2014-12-31 CRAN (R 3.2.0)
+##  BiocStyle     * 1.8.0    2015-10-14 Bioconductor  
+##  bitops          1.0-6    2013-08-17 CRAN (R 3.2.0)
+##  devtools        1.10.0   2016-01-23 CRAN (R 3.2.3)
+##  digest          0.6.9    2016-01-08 CRAN (R 3.2.3)
+##  evaluate        0.8      2015-09-18 CRAN (R 3.2.0)
+##  formatR         1.2.1    2015-09-18 CRAN (R 3.2.0)
+##  htmltools       0.3      2015-12-29 CRAN (R 3.2.3)
+##  httr            1.1.0    2016-01-28 CRAN (R 3.2.3)
+##  knitcitations * 1.0.7    2015-10-28 CRAN (R 3.2.0)
+##  knitr           1.12.3   2016-01-22 CRAN (R 3.2.3)
+##  lubridate       1.5.0    2015-12-03 CRAN (R 3.2.3)
+##  magrittr        1.5      2014-11-22 CRAN (R 3.2.0)
+##  memoise         1.0.0    2016-01-29 CRAN (R 3.2.3)
+##  plyr            1.8.3    2015-06-12 CRAN (R 3.2.1)
+##  R6              2.1.2    2016-01-26 CRAN (R 3.2.3)
+##  Rcpp            0.12.3   2016-01-10 CRAN (R 3.2.3)
+##  RCurl           1.95-4.7 2015-06-30 CRAN (R 3.2.1)
+##  RefManageR      0.10.5   2016-01-02 CRAN (R 3.2.3)
+##  RJSONIO         1.3-0    2014-07-28 CRAN (R 3.2.0)
+##  rmarkdown     * 0.9.2    2016-01-01 CRAN (R 3.2.3)
+##  stringi         1.0-1    2015-10-22 CRAN (R 3.2.0)
+##  stringr         1.0.0    2015-04-30 CRAN (R 3.2.0)
+##  XML             3.98-1.3 2015-06-30 CRAN (R 3.2.0)
+##  yaml            2.1.13   2014-06-12 CRAN (R 3.2.0)
+

You can view the source R Markdown file for this page at index.Rmd.

+
+
+

5 Bibliography

+

This report was generated using BiocStyle (Morgan, Oleś, and Huber, 2016) with knitr (Xie, 2014) and rmarkdown (Allaire, Cheng, Xie, McPherson, et al., 2016) running behind the scenes.

+

Citations were made with knitcitations (Boettiger, 2015). Citation file: index.bib.

+

+[1] J. Allaire, J. Cheng, Y. Xie, J. McPherson, et al. rmarkdown: Dynamic Documents for R. R package version 0.9.2. 2016. URL: http://CRAN.R-project.org/package=rmarkdown. +

+

+[2] C. Boettiger. knitcitations: Citations for ‘Knitr’ Markdown Files. R package version 1.0.7. 2015. URL: http://CRAN.R-project.org/package=knitcitations. +

+

+[3] L. Collado-Torres, A. C. Frazee, M. I. Love, R. A. Irizarry, et al. “derfinder: Software for annotation-agnostic RNA-seq differential expression analysis”. In: bioRxiv (2015). DOI: 10.1101/015370. URL: http://www.biorxiv.org/content/early/2015/02/19/015370.abstract. +

+

+[4] A. Frazee. Efficiency analysis of Sun Grid Engine batch jobs. 2014. URL: http://dx.doi.org/10.6084/m9.figshare.878000. +

+

+[5] M. Morgan, A. Oleś and W. Huber. BiocStyle: Standard styles for vignettes and other Bioconductor documents. R package version 1.8.0. 2016. URL: https://github.com/Bioconductor/BiocStyle. +

+

+[6] Y. Xie. “knitr: A Comprehensive Tool for Reproducible Research in R”. In: Implementing Reproducible Computational Research. Ed. by V. Stodden, F. Leisch and R. D. Peng. ISBN 978-1466561595. Chapman and Hall/CRC, 2014. URL: http://www.crcpress.com/product/isbn/9781466561595. +

+

+[7] Z. Zhou, Q. Yuan, D. C. Mash and D. Goldman. “Substance-specific and shared transcription and epigenetic changes in the human hippocampus chronically exposed to cocaine and alcohol”. In: Proceedings of the National Academy of Sciences of the United States of America 108.16 (2011), pp. 6626-6631. +

+
+ + + + + + + + diff --git a/optional1-sortSam.sh b/optional1-sortSam.sh new file mode 100755 index 0000000..b0d4626 --- /dev/null +++ b/optional1-sortSam.sh @@ -0,0 +1,60 @@ +#!/bin/sh + +## Usage +# sh optional1-sortSam.sh snyder +# sh optional1-sortSam.sh hippo + +# Define variables +EXPERIMENT=$1 +SHORT="sortSam-${EXPERIMENT}" + +# Directories +ROOTDIR=/dcs01/ajaffe/Brain/derRuns/derCountSupp +MAINDIR=${ROOTDIR}/${EXPERIMENT} +WDIR=${MAINDIR}/sortSam + +if [[ "${EXPERIMENT}" == "snyder" ]] +then + SAMFILES='/dcs01/ajaffe/Snyder/RNAseq/TopHat/*out/accepted_hits.bam' +elif [[ "${EXPERIMENT}" == "hippo" ]] +then + SAMFILES='/dcs01/ajaffe/Hippo/TopHat/*out/accepted_hits.bam' +else + echo "Specify a valid experiment: snyder or hippo" +fi + + +# Construct shell files +sname="${SHORT}" +echo "Creating script ${sname}" +cat > ${ROOTDIR}/.${sname}.sh < ${file/.bam/_sorted.sam} +done + +## Move log files into the logs directory +mv ${ROOTDIR}/${sname}.* ${WDIR}/logs/ + +echo "**** Job ends ****" +date +EOF + +call="qsub .${sname}.sh" +echo $call +$call diff --git a/optional2-HTSeq.sh b/optional2-HTSeq.sh new file mode 100755 index 0000000..996f1cb --- /dev/null +++ b/optional2-HTSeq.sh @@ -0,0 +1,83 @@ +#!/bin/sh + +## Usage +# sh optional2-HTSeq.sh snyder +# sh optional2-HTSeq.sh hippo + +# Define variables +EXPERIMENT=$1 +SHORT="HTSeq-${EXPERIMENT}" + +# Directories +ROOTDIR=/dcs01/ajaffe/Brain/derRuns/derCountSupp +MAINDIR=${ROOTDIR}/${EXPERIMENT} +WDIR=${MAINDIR}/HTSeq + +if [[ "${EXPERIMENT}" == "snyder" ]] +then + SAMFILES='/dcs01/ajaffe/Snyder/RNAseq/TopHat/*out' +elif [[ "${EXPERIMENT}" == "hippo" ]] +then + SAMFILES='/dcs01/ajaffe/Hippo/TopHat/*out' +else + echo "Specify a valid experiment: snyder or hippo" +fi + + +# Construct shell files +for sam in $SAMFILES +do + current=${sam##*/} + sname="${SHORT}.${current}" + echo "Creating script ${sname}" + cat > ${ROOTDIR}/.${sname}.sh < htseq_output.txt + +echo "**** HTSeq finishes ****" +date + +# Copy back the 
results +mv htseq_output.txt ${sam}/ + +## Move log files into the logs directory +mv ${ROOTDIR}/${sname}.* ${WDIR}/logs/ + +echo "**** Job ends ****" +date +EOF + + call="qsub .${sname}.sh" + echo $call + $call +done diff --git a/optional3-summOv.R b/optional3-summOv.R new file mode 100644 index 0000000..7c697e1 --- /dev/null +++ b/optional3-summOv.R @@ -0,0 +1,32 @@ +## Following http://bioconductor.org/packages/release/data/experiment/vignettes/parathyroidSE/inst/doc/parathyroidSE.pdf + +## Setup +library("Rsamtools") +library("derfinder") +library("GenomicRanges") + +## Load pre-computed exonic parts from Ensembl annotation +# load("/dcs01/ajaffe/Brain/derRuns/derMisc/ensemblExons/exonicParts.Rdata") + +## For comparability, use the exons from genomic state +load("/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.ensembl.GRCh37.p11.rda") +exonicParts <- GenomicState.Hsapiens.ensembl.GRCh37.p11$fullGenome[ GenomicState.Hsapiens.ensembl.GRCh37.p11$fullGenome$theRegion == "exon"] + +## Make bamFileList +files <- rawFiles(datadir=datadir, samplepatt="out$", + fileterm="accepted_hits.bam") +names(files) <- gsub('_out', '', names(files)) +bai <- paste0(files, ".bai") +bList <- BamFileList(files, bai) + +## Compute the overlaps +message(paste(Sys.time(), "summarizeOverlaps: Running summarizeOverlaps()")) +summOverlaps <- summarizeOverlaps(exonicParts, bList, mode="Union", + singleEnd=TRUE, ignore.strand=TRUE, inter.feature=FALSE, mc.cores=cores) + +## Finish +message(paste(Sys.time(), "summarizeOverlaps: Saving summOverlaps")) +save(summOverlaps, file="summOverlaps.Rdata") + +proc.time() +sessionInfo() diff --git a/optional3-summOv.sh b/optional3-summOv.sh new file mode 100755 index 0000000..214af8d --- /dev/null +++ b/optional3-summOv.sh @@ -0,0 +1,59 @@ +#!/bin/sh + +## Usage +# sh optional3-summOv.sh snyder +# sh optional3-summOv.sh hippo + +# Define variables +EXPERIMENT=$1 +SHORT="summOv-${EXPERIMENT}" + +# Directories 
+ROOTDIR=/dcs01/ajaffe/Brain/derRuns/derCountSupp +MAINDIR=${ROOTDIR}/${EXPERIMENT} +WDIR=${MAINDIR}/summOv + +if [[ "${EXPERIMENT}" == "snyder" ]] +then + CORES=10 + DATADIR=/dcs01/ajaffe/Snyder/RNAseq/TopHat +elif [[ "${EXPERIMENT}" == "hippo" ]] +then + CORES=24 + DATADIR=/dcs01/ajaffe/Hippo/TopHat +else + echo "Specify a valid experiment: snyder or hippo" +fi + + +# Construct shell files +sname="${SHORT}" +echo "Creating script ${sname}" +cat > ${ROOTDIR}/.${sname}.sh < ${ROOTDIR}/.${SHORT}.sh < ${ROOTDIR}/.${sname}.sh <= 0', + 'chr', 'c', 1, 'character', 'Chromosome under analysis', + 'mcores', 'm', 1, 'integer', 'Number of cores', + 'help' , 'h', 0, 'logical', 'Display help' +), byrow=TRUE, ncol=5) +opt <- getopt(spec) + + +## if help was asked for print a friendly message +## and exit with a non-zero error code +if (!is.null(opt$help)) { + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +## Check experiment input +stopifnot(opt$experiment %in% c('snyder', 'hippo')) + +## Format chromosome name appropriately +opt$chr <- mapSeqlevels(opt$chr, 'UCSC') + + +message('Loading Rdata file with the output from loadCoverage()') +load(opt$CovFile) + +## Make it easy to use the name later. 
Here I'm assuming the names were generated using output='auto' in loadCoverage() +eval(parse(text=paste0('covData <- ', opt$chr, 'CovInfo'))) +eval(parse(text=paste0('rm(', opt$chr, 'CovInfo)'))) + + +## Load the models +load('models.Rdata') + +## Load group information +load('groupInfo.Rdata') + +if(file.exists('colsubset.Rdata')) { + load('colsubset.Rdata') +} else { + colsubset <- NULL +} + + +## Run the analysis +if (opt$experiment == 'snyder') { + analyzeChr(chr = opt$chr, coverageInfo = covData, models = models, + cutoffFstat = 1e-05, colsubset = colsubset, + nPermute = 100, seeds = seq_len(100) + 20131212, maxClusterGap = 3000, + groupInfo = groupInfo, mc.cores = opt$mcores, + lowMemDir = file.path(tempdir(), opt$chr, 'chunksDir')) +} else if (opt$experiment == 'hippo') { + analyzeChr(chr = opt$chr, coverageInfo = covData, models = models, + cutoffFstat = 1e-04, colsubset = colsubset, cutoffPre = 3, + nPermute = 100, seeds = seq_len(100) + 20131212, maxClusterGap = 3000, + groupInfo = groupInfo, mc.cores = opt$mcores, + lowMemDir = file.path(tempdir(), opt$chr, 'chunksDir')) +} + + +## Done +proc.time() +options(width = 120) +session_info() diff --git a/step3-analyzeChr.sh b/step3-analyzeChr.sh new file mode 100755 index 0000000..501a5d2 --- /dev/null +++ b/step3-analyzeChr.sh @@ -0,0 +1,68 @@ +#!/bin/sh + +## Usage +# sh step3-analyzeChr.sh snyder run3-v1.0.10 +# sh step3-analyzeChr.sh hippo run3-v1.0.10 + +# Define variables +EXPERIMENT=$1 +SHORT="derA-${EXPERIMENT}" +PREFIX=$2 + +# Directories +ROOTDIR=/dcs01/ajaffe/Brain/derRuns/derCountSupp +MAINDIR=${ROOTDIR}/${EXPERIMENT} +WDIR=${MAINDIR}/derAnalysis +DATADIR=${MAINDIR}/CoverageInfo + +# Construct shell files +CHRNUMS="22 21 Y 20 19 18 17 16 15 14 13 12 11 10 9 8 X 7 6 5 4 3 2 1" + +for chrnum in ${CHRNUMS} +do + echo "Creating script for chromosome ${chrnum}" + + if [[ "${EXPERIMENT}" == "snyder" ]] + then + CORES=4 + elif [[ "${EXPERIMENT}" == "hippo" ]] + then + CORES=2 + else + echo "Specify a 
valid experiment: snyder or hippo" + fi + + chr="chr${chrnum}" + outdir="${PREFIX}/${chr}" + sname="${SHORT}.${PREFIX}.${chr}" + cat > ${ROOTDIR}/.${sname}.sh < ${ROOTDIR}/.${sname}.sh < ${ROOTDIR}/.${sname}.sh < ${ROOTDIR}/.${sname}.sh <= 3] +regs <- regs[width(regs) >= 3] + +## Save data used +save(ders, file = file.path(analysisPath, 'ders.Rdata')) +save(regs, file = file.path(analysisPath, 'regs.Rdata')) +``` + +Construct logical indexes for DERs and regionMatrix regions. + +```{r 'buildIndex'} +## Construct logical Rle indexes for bases with some region +build_index <- function(gr) { + res <- lapply(names(seqlengths(gr)), function(chr) { + chr.len <- seqlengths(gr)[chr] + ir <- sort(ranges(gr[seqnames(gr) == chr])) + log <- c(rep(c(FALSE, TRUE), length(ir)), FALSE) + + starts <- ends <- rep(NA, length(ir) * 2) + i <- rep(c(TRUE, FALSE), length(ir)) + starts[i] <- start(ir) + ends[i] <- end(ir) + + starts[!i] <- ends[i] + 1 + + if(max(ends, na.rm = TRUE) < chr.len) { + ends[!i] <- c(starts[i] - 1, chr.len)[-1] + } else { + ends[!i] <- c(starts[i] - 1, NULL)[-1] + starts <- starts[- length(starts)] + log <- log[- length(log)] + } + + if(starts[1] != 1) { + ends <- c(starts[1] - 1, ends) + starts <- c(1, starts) + } else { + log <- log[-1] + } + + widths <- mapply(function(s, e) { e - s + 1}, starts, ends) + + Rle(log, widths) + }) + names(res) <- names(seqlengths(gr)) + return(res) +} +index.ders <- build_index(ders) +index.regs <- build_index(regs) + +## Add info for chrs where there are no regs +miss <- !paste0('chr', c(1:22, 'X', 'Y')) %in% names(index.regs) +names(miss) <- paste0('chr', c(1:22, 'X', 'Y')) +if(any(miss)) { + miss.add <- lapply(names(miss)[miss], function(x) { + Rle(FALSE, seqlengths(hg19Ideogram)[x]) + }) + names(miss.add) <- names(miss)[miss] + index.regs <- c(index.regs, miss.add) + index.regs <- index.regs[match(names(miss), names(index.regs))] +} + +## Add info for chrs where there are no DERs +miss <- !names(index.regs) %in% 
names(index.ders) +if(any(miss)) { + miss.add <- lapply(names(index.regs)[miss], function(x) { + Rle(FALSE, length(index.regs[[x]])) + }) + names(miss.add) <- names(index.regs)[miss] + index.ders <- c(index.ders, miss.add) + index.ders <- index.ders[match(names(index.regs), names(index.ders))] +} +``` + +# Compare + +## Visually explore + +```{r 'epivizr', eval = FALSE} +library('epivizr') +mgr <- startEpiviz() +ders_dev <- mgr$addDevice(ders[!as.logical(ders$significantFWER)], "DERs no sig FWER") +ders_sig_dev <- mgr$addDevice(ders[as.logical(ders$significantFWER)], "DERs sig FWER") +regs_dev <- mgr$addDevice(regs, "Region Matrix") + +## SOX11 +mgr$navigate("chr2", 5810000, 5850000) + +## MEX3A +mgr$navigate("chr1", 156040000, 156090000) + +## VASH2 +mgr$navigate("chr1", 213120000, 213170000) + +## TG: +mgr$navigate("chr8", 134040000, 134120000) + +## IGF2BP2 +mgr$navigate("chr3", 185350000, 185410000) + +## FBN3 +mgr$navigate("chr19", 8130000, 8180000) + +## End +mgr$stopServer() +``` + + +## Basic comparison + +Number of regions + +```{r 'basic1'} +## Number of regions +c('ders' = length(ders), 'regs' = length(regs)) +``` + +Summary on width of regions + +```{r 'basic2'} +## Size of regions +c('ders' = summary(width(ders)), 'regs' = summary(width(regs))) +``` + + +## Compare indexes + +### Base-pairs + +Number of base-pairs in each index. Summary first, then overall info for the genome (in number of bases, then in percent of the genome), and finally results in interactive table. 
+ +```{r 'index-num'} +## Merge all the indexes +index.all <- mapply(function(der, reg) { + both <- der & reg + only.der <- der & !reg + only.reg <- !der & reg + none <- !der & !reg + + res <- list('both' = both, 'only.der' = only.der, 'only.reg' = only.reg, + 'none' = none, 'all.der' = der, 'all.reg' = reg) + return(list(res)) +}, index.ders, index.regs) + +## Find number of base-pairs in each index +index.num <- data.frame(do.call(rbind, lapply(index.all, function(x) { sapply(x, sum)}))) +index.num$chrLen <- seqlengths(ders) +index.num$chr <- rownames(index.num) +rownames(index.num) <- NULL + +## Print info +summary(index.num) + +## Overall info +overallInfo <- colSums(index.num[, -ncol(index.num)]) +overallInfo + +## Overall info in percent +overallInfo / sum(as.numeric(index.num$chrLen)) * 100 +``` + + + + +```{r 'print-index-num', results = 'asis'} +d1 <- data.table(data.frame(row = seq_len(nrow(index.num)), index.num, check.names=FALSE)) +t1 <- dTable(d1, sPaginationType= 'full_numbers', iDisplayLength=25, + sScrollX='100%') +t1$print("bases", cdn=TRUE) +``` + +### Segments per index + +Number of segments per index. First summary, then results for genome, and finally an interactive table. + + +```{r 'index-seg'} +## Find number of segments in each index +index.seg <- data.frame(do.call(rbind, lapply(index.all, function(x) { + sapply(x, function(y) { + sum(runValue(y)) + }) +}))) +index.seg$chr <- rownames(index.seg) +rownames(index.seg) <- NULL + +## Print info +summary(index.seg) + +## Overall info +colSums(index.seg[, -ncol(index.seg)]) +``` + + +```{r 'print-index-seg', results = 'asis'} +d2 <- data.table(data.frame(row = seq_len(nrow(index.seg)), index.seg, check.names=FALSE)) +t2 <- dTable(d2, sPaginationType= 'full_numbers', iDisplayLength=25, + sScrollX='100%') +t2$print("segments") +``` + + +### Segments width + +Summary of the segment widths for each index. First the overall summary, then the results for each index. 
+ +```{r 'index-width'} +## Get an idea of the width of the segments in each index +index.width <- data.frame(do.call(rbind, lapply(index.all, function(x) { + tmp <- data.frame(do.call(rbind, lapply(x, function(y) { + summary(runLength(y)[runValue(y)]) + })), check.names = FALSE) + tmp$index <- names(x) + rownames(tmp) <- NULL + return(tmp) +})), check.names = FALSE) +index.width$chr <- rep(names(seqlengths(ders)), each = 6) +rownames(index.width) <- NULL + +## Print info +summary(index.width) +``` + +```{r 'print-index-width', results = 'asis'} +d3 <- data.table(data.frame(row = seq_len(nrow(index.width)), index.width, check.names=FALSE)) +t3 <- dTable(d3, sPaginationType= 'full_numbers', iDisplayLength=25, + sScrollX='100%') +t3$print("widths") +``` + + +## Overlaps + +### Minimum 20 bp + +```{r 'overlaps20', bootstrap.show.message = FALSE} +ov20 <- findOverlaps(ders, regs, minoverlap = 20L) + +counts <- list() +for(type in c("any", "within", "equal")) { + ct.ders <- countOverlaps(ders, regs, minoverlap = 20L, type = type) + plot(log10(table(ct.ders)), main = paste("DERs in regs for type", type)) + ct.regs <- countOverlaps(regs, ders, minoverlap = 20L, type = type) + plot(log10(table(ct.regs)), main = paste("Regs in DERs for type", type)) + counts <- c(counts, list(table(ct.ders), table(ct.regs))) +} +cts <- as.integer(unique(unlist(lapply(counts, names)))) + +nOverlap20 <- do.call(rbind, lapply(counts, function(x) { + df <- data.frame(nOverlap = cts, freq = x[match(cts, names(x))], + row.names = seq_len(length(cts))) + df$observed <- !is.na(df$freq) + df$freq[is.na(df$freq)] <- 0 + df$cumFreq <- cumsum(df$freq) + df$cumPerc <- df$cumFreq / max(df$cumFreq) * 100 + return(df) +})) +nOverlap20$type <- factor(rep(c('any', 'within', 'equal'), each = length(cts) * 2), levels = c('any', 'within', 'equal')) +nOverlap20$match <- rep(rep(c('DERs-in-regs', 'regs-in-DERs'), each = length(cts)), 3) +#nOverlap <- nOverlap[complete.cases(nOverlap), ] +rownames(nOverlap20) <- 
NULL + +nOverlap20$alpha <- ifelse(nOverlap20$observed, 1, 1/3) +``` + +Summary plots showing cumulative frequency and cumulative percent. + +```{r 'gplot20'} +## Make a nice plot +ggplot(data = nOverlap20, aes(x = nOverlap, y = cumFreq, colour = match, alpha = alpha)) + geom_point() + facet_grid( . ~ type )# + geom_smooth(se=FALSE) + +## Show cumulative percents +ggplot(data = nOverlap20, aes(x = nOverlap, y = cumPerc, colour = match, linetype = match)) + geom_line(lwd=1) + facet_grid( . ~ type ) +``` + + +Some important numbers: percent of regions with width < 20 bp, base level agreement, region level agreement (min overlap 20 bp). + +```{r} +## Percent with widths < 20L +small <- c('ders' = sum(width(ders) < 20) / length(ders), 'regs' = sum(width(regs) < 20) / length(regs)) * 100 +data.frame('under-20' = small, '20-and-above' = 100 - small, check.names = FALSE) + +## Base level agreement +c('regs' = overallInfo['both'] / (overallInfo['both'] + overallInfo['only.reg']) * 100, 'ders' = overallInfo['both'] / (overallInfo['both'] + overallInfo['only.der']) * 100) + +## Overlap (min 20) agreement +c('regs' = 100 - subset(nOverlap20, match == 'regs-in-DERs' & nOverlap == 0 & type == 'any')$cumPerc, 'ders' = 100 - subset(nOverlap20, match == 'DERs-in-regs' & nOverlap == 0 & type == 'any')$cumPerc) +``` + +### Minimum 1 bp + +```{r 'overlaps1', bootstrap.show.message = FALSE} +ov1 <- findOverlaps(ders, regs, minoverlap = 1L) + +counts <- list() +for(type in c("any", "within", "equal")) { + ct.ders <- countOverlaps(ders, regs, minoverlap = 1L, type = type) + plot(log10(table(ct.ders)), main = paste("DERs in regs for type", type)) + ct.regs <- countOverlaps(regs, ders, minoverlap = 1L, type = type) + plot(log10(table(ct.regs)), main = paste("Regs in DERs for type", type)) + counts <- c(counts, list(table(ct.ders), table(ct.regs))) +} +cts <- as.integer(unique(unlist(lapply(counts, names)))) + +nOverlap1 <- do.call(rbind, lapply(counts, function(x) { + df <- 
data.frame(nOverlap = cts, freq = x[match(cts, names(x))], + row.names = seq_len(length(cts))) + df$observed <- !is.na(df$freq) + df$freq[is.na(df$freq)] <- 0 + df$cumFreq <- cumsum(df$freq) + df$cumPerc <- df$cumFreq / max(df$cumFreq) * 100 + return(df) +})) +nOverlap1$type <- factor(rep(c('any', 'within', 'equal'), each = length(cts) * 2), levels = c('any', 'within', 'equal')) +nOverlap1$match <- rep(rep(c('DERs-in-regs', 'regs-in-DERs'), each = length(cts)), 3) +#nOverlap <- nOverlap[complete.cases(nOverlap), ] +rownames(nOverlap1) <- NULL + +nOverlap1$alpha <- ifelse(nOverlap1$observed, 1, 1/3) + +## Overlap (min 1bp) agreement +c('regs' = 100 - subset(nOverlap1, match == 'regs-in-DERs' & nOverlap == 0 & type == 'any')$cumPerc, 'ders' = 100 - subset(nOverlap1, match == 'DERs-in-regs' & nOverlap == 0 & type == 'any')$cumPerc) +``` + +Summary plots showing cumulative frequency and cumulative percent. + +```{r 'gplot1'} +## Make a nice plot +ggplot(data = nOverlap1, aes(x = nOverlap, y = cumFreq, colour = match, alpha = alpha)) + geom_point() + facet_grid( . ~ type )# + geom_smooth(se=FALSE) + +## Show cumulative percents +ggplot(data = nOverlap1, aes(x = nOverlap, y = cumPerc, colour = match, linetype = match)) + geom_line(lwd=1) + facet_grid( . ~ type ) +``` + +# Save results + +```{r 'save'} +save(index.all, index.num, index.seg, index.width, nOverlap20, ov20, nOverlap1, ov1, overallInfo, file = file.path(analysisPath, "comparison-results.Rdata")) +``` + + + + + +# Reproducibility + +Analysis path: `r analysisPath` + +Re-make the report + +```{r 'remake', eval = FALSE} +# Load fullRegions.Rdata and regionMat.Rdata before this step +library('rmarkdown') +library('knitrBootstrap') +render('step7-regMatVsDERs.Rmd') +``` + + +Date the report was generated. + +```{r reproducibility1, echo=FALSE} +## Date the report was generated +Sys.time() +``` + +`R` session information. 
+ +```{r reproducibility3, echo=FALSE} +## Session info +options(width = 120) +session_info() +``` diff --git a/step7-regMatVsDERs.sh b/step7-regMatVsDERs.sh new file mode 100755 index 0000000..88d0182 --- /dev/null +++ b/step7-regMatVsDERs.sh @@ -0,0 +1,59 @@ +## Usage +# sh step7-regMatVsDERs.sh snyder run3-v1.0.10 +# sh step7-regMatVsDERs.sh hippo run3-v1.0.10 + +# Define variables +EXPERIMENT=$1 +PREFIX=$2 +SHORT="regVsDERs-${EXPERIMENT}" +ncore=5 +cores="${ncore}cores" + +# Directories +ROOTDIR=/dcs01/ajaffe/Brain/derRuns/derCountSupp +MAINDIR=${ROOTDIR}/${EXPERIMENT} + +# Construct shell files +sname="${SHORT}.${PREFIX}" +echo "Creating script ${sname}" + +if [[ "${EXPERIMENT}" == "snyder" ]] +then + CUTOFF=5 +elif [[ "${EXPERIMENT}" == "hippo" ]] +then + CUTOFF=3 +else + echo "Specify a valid experiment: snyder or hippo" +fi + +WDIR=${MAINDIR}/regionMatrix-vs-DERs/cut${CUTOFF}-vs-${PREFIX} + +cat > ${ROOTDIR}/.${sname}.sh < ${ROOTDIR}/.${sname}.sh < ${ROOTDIR}/.${sname}.sh <= 20] <- '20+' +all$coresGroups <- factor(all$coresGroups, levels = c(1, 2, 4, 5, 8, 10, '20+')) + +## Types of analysis +all$analysis <- factor(ifelse(all$step %in% c('derMod', 'derA', 'derM'), 'Single-base DER', ifelse(all$step %in% c('toSam', 'htseq', 'summOv', 'covToEx'), 'Exon count', ifelse(all$step == 'regMat', 'Expressed-region DER', ifelse(all$step == 'fullCov', 'Load data', ifelse(all$step == 'derR', 'HTML report', 'misc')))))) + +## Show only information for the data sets described in this website +all <- subset(all, experiment %in% c('hippo', 'snyder')) +``` + + +## Adjusting by number of cores + +The following plots show the wall time and memory used by each job while taking into account the number of cores used by each job. Note that doing so is a crude approximation of how much time and memory each job would have needed had it ran on a single node. + +Points are colored by which analysis type they belong to. 
Note that the loading data step is required for the single-level and expressed-regions DER approaches as well as exon counting (with derfinder). + +```{r edaAnalysis, fig.width=10, bootstrap.show.code=FALSE} +## Walltime and memory adjusted by number of cores (it's an approximation) +ggplot(all, aes(x=timeByCore, y=memByCore, colour=analysis, shape=software)) + geom_point(size = 3) + facet_grid(~ experiment) + xlab("Wall time (hrs) multiplied by the number of cores") + ylab("Memory (GB) divided by the number of cores") + scale_colour_brewer(palette="Dark2") + theme_bw(base_size = 18) + theme(legend.position=c(.5, .75), legend.box = 'horizontal') +ggplot(all, aes(x=log2(timeByCore), y=memByCore, colour=analysis, shape=software)) + geom_point(size = 3) + facet_grid(~ experiment) + xlab("Wall time (hrs) multiplied by the number of cores (log2)") + ylab("Memory (GB) divided by the number of cores") + scale_colour_brewer(palette="Dark2") + theme_bw(base_size = 18) + theme(legend.position=c(.5, .75), legend.box = 'horizontal') + +## For supp text +time <- ggplot(subset(all, !software %in% c('TopHat', 'regionReport') & analysis != 'misc'), aes(x=log2(timeByCore), y=log2(memByCore), colour=analysis, shape=software)) + geom_point(size = 3) + facet_grid(~ experiment) + xlab("Wall time (hrs) multiplied by the number of cores (log2)") + ylab("GB memory divided by number of cores (log2)") + scale_colour_brewer(palette="Set1") + theme_bw(base_size = 18) + theme(legend.position=c(.55, .15), legend.box = 'horizontal') +time +pdf(file = 'time.pdf', width = 10) +time +dev.off() +#system('open time.pdf') +``` + +## Resources by step for each analysis + +```{r 'analysisSummary', bootstrap.show.code=FALSE} +getInfo <- function(df, sumTime = FALSE, peakCores = FALSE) { + memByCore <- max(df$memByCore) + walltime <- ifelse(sumTime, sum(df$walltime), max(df$walltime)) + memG <- max(df$memG) + peakCores <- ifelse(peakCores, max(df$peakCores), sum(df$cores)) + res <- c(memByCore = 
memByCore, walltime = walltime, memG = memG, peakCores = peakCores) + return(res) +} + +analysisInfo <- list('Single-base DER' = c('Load data', 'Single-base DER'), + 'Expressed-region DER' = c('Load data', 'Expressed-region DER'), + 'HTML report' = 'HTML report', + 'Exon count - derfinder' = 'Load data' +) +analysisInfo <- lapply(analysisInfo, function(x) { which(all$analysis %in% x)}) +analysisInfo[[4]] <- c(analysisInfo[[4]], which(all$step == 'covToEx')) +analysisInfo$"Exon count - HTSeq" <- which(all$step %in% c('toSam', 'htseq')) +analysisInfo$"Exon count - GenomicRanges" <- which(all$step == 'summOv') + +## Summarize the information for each step of each analysis +analysisSummary <- lapply(names(analysisInfo), function(analysis) { + current <- all[analysisInfo[[analysis]], ] + res_analysis <- lapply(exps, function(exp) { + use <- subset(current, experiment == exp) + if(nrow(use) == 0) return(NULL) + res_exp <- lapply(unique(use$step), function(step) { + res_step <- as.data.frame(t(getInfo(use[use$step == step, ]))) + res_step$step <- step + res_step$experiment <- exp + res_step$analysis <- analysis + return(res_step) + }) + res_exp <- do.call(rbind, res_exp) + return(res_exp) + }) + res_analysis <- do.call(rbind, res_analysis) + return(res_analysis) +}) +analysisSummary <- do.call(rbind, analysisSummary) +``` + +The table shown below shows per analysis the maximum memory used by a job and maximum wall time for that step. This is assuming that all jobs for a given step ran simultaneously. For example, that all jobs running `derfinder::analyzeChr()` were running at the same time. Note that for some analyses relied on the same steps, like loading the data (_fullCov_). This table can be useful to find the peak number of cores (the sum of cores for all jobs running simultaneously) for a given analysis step. 
+ +```{r 'analysisSumTab', results = 'asis', bootstrap.show.code=FALSE} +kable(analysisSummary, format = 'html', digits = c(2, 4, 2)) +``` + +## Resources for each analysis + +```{r 'peakSummary', bootstrap.show.code=FALSE} +## Summary the information for each analysis +peaks <- lapply(names(analysisInfo), function(analysis) { + res_analysis <- lapply(exps, function(exp) { + current <- analysisSummary[analysisSummary$analysis == analysis & analysisSummary$experiment == exp, ] + if(nrow(current) == 0) return(NULL) + res_exp <- as.data.frame(t(getInfo(current, sumTime = TRUE, peakCores = TRUE))) + res_exp$experiment <- exp + res_exp$analysis <- analysis + return(res_exp) + }) + res_analysis <- do.call(rbind, res_analysis) + return(res_analysis) +}) +peaks <- do.call(rbind, peaks) + +save(peaks, file = 'peaks.Rdata') +``` + +We can further summarize the resources used by each analysis by identified the maximum memory used in the steps required for a particular analysis and the total wall time for running all the steps when all the jobs of a particular step are running simultaneously. Thus giving us the total actual wall time to run a specific analysis and the maximum memory required. + +The table below shows the final summary. Note that in some analyses, the peak memory is from the _fullCov_ step. We did not focus on reducing the memory load of this step as we sacrificed memory for speed. We know that much lower memory limits can be achieved using 1 core instead of the 10 cores used. + +```{r 'peakSumTab', bootstrap.show.code=FALSE, results = 'asis'} +kable(peaks, format = 'html', digits = c(2, 3, 2)) +``` + +Regarding the high memory load for the HTML report, this could be significantly lowered by only loading the required coverage data used for the plots instead of the full output from the _fullCov_ step. That is, using the _which_ argument from `fullCoverage()` to create a much smaller _fullCov_ object, which would also reduce the memory used when plotting. 
+ +__Note__: since these analyses were done, we have found other ways to run `derfinder::regionMatrix()` that require less memory. In particular, if you have BigWig files (as those generated by `Rail-RNA` `r citep(bib[['rail']])`), we recommend using `railMatrix()`. + + +## Comparing methods for gene count table generation + +The previous table can also be used to compare the sum of the time and peak memory used by the different steps to obtain the exon count table with the following software options. + +* `derfinder`: includes resources used for reading coverage data in `R` and then running creating a feature count matrix. We did so for + * UCSC hg19 knownGene annotation + * Ensembl GRCh37 p11 annotation. +* `HTSeq`: includes resources used for generating sorted SAM files and then running HTSeq. +* `summOv`: resources used for running `GenomicRanges::summarizeOverlaps()` directly on the BAM files. + + + +# Details + +The following table shows the details of the resources used by the different jobs. It shows the experiment (_experiment_), the analysis step (_step_), wall time used (shown in hours, _walltime_), number of cores used (_cores_), memory in GB used (_memG_), software used (_software_), analysis for which the step is used (_analysis_), and the job name (_jobib_). Furthermore, it shows two simple approximations: + +* _timeByCore_ is the wall time (in hours) multiplied by the number of cores used. It is a very simple approximation for the wall time used had the job been ran on a single node. This approximation is known to be false, but it gives a basic idea. +* _memByCore_ is the memory (in GB) divided by the number of cores used. It is an approximation for the memory used had the job been ran on a single node. + +These are the following analysis steps: + +1. __fullCov__ Extract coverage information from raw files (BAM or BigWig) by chromosome, then filter it, and save it in Rdata files. +1. 
__derMod__ Calculate the sample depth adjustments and build models appropriate for the experiment. +1. __derA__ Run single-base level analysis by chromosome. +1. __derM__ Merge derfinder analysis results from the different chromosomes, calculate p-values and q-values. +1. __derR__ Generate HTML report with `regionReport`. +1. __regMat__ Run expressed-regions level analysis with `regionMatrix()`. +1. __regsVsDers__ Compare expressed-regions vs single-base level approaches for the derfinder software paper. +1. __toSam__ Transform BAM files to sorted (by name) SAM files for running HTSeq. +1. __htseq__ Run HTSeq to generate exon count table. +1. __summOv__ Run `GenomicRanges::summarizeOverlaps()` to generate exon count table. +1. __covToExon__ Generate exon table using `derfinder::coverageToExon()` for UCSC hg19 knownGene or GRCh37 p11 Ensembl annotation table. +1. __PNAS__ (Only for _Hippo_) Generate an HTML report comparing the derfinder results vs previously published results (PNAS paper). +1. __summInfo__ Summarize results to then use then in the derfinder software paper. + + + + +```{r tables, results="asis", bootstrap.show.code=FALSE} +library("rCharts") +library("data.table") + +## Print whole table +d <- data.table(all[, c("experiment", "step", "walltime", "cores", "memG", "timeByCore", "memByCore", "software", "analysis", "jobid")]) +t1 <- dTable(d, sPaginationType='full_numbers', iDisplayLength=50, sScrollX='100%') +t1$print("timing", cdn=TRUE) +``` +
+ +Table made using `rCharts` `r citep(bib[["rCharts"]])`. + +# Reproducibility + +Date the report was generated. + +```{r reproducibility1, echo=FALSE, bootstrap.show.code=FALSE} +## Date the report was generated +Sys.time() +``` + +Wallclock time spent generating the report. + +```{r "reproducibility2", echo=FALSE, bootstrap.show.code=FALSE} +## Processing time in seconds +totalTime <- diff(c(startTime, Sys.time())) +round(totalTime, digits=3) +``` + +`R` session information. + +```{r "reproducibility3", echo=FALSE, bootstrap.show.code=FALSE, bootstrap.show.message=FALSE} +## Session info +options(width=120) +devtools::session_info() +``` + +# Bibliography + +This report was generated using `knitrBootstrap` `r citep(bib[['knitrBootstrap']])` +with `knitr` `r citep(bib[['knitr']])` and `rmarkdown` `r citep(bib[['rmarkdown']])` running behind the scenes. Timing information extracted from the SGE reports using `efficiency analytics` `r citep(bib[["eff"]])`. Figures and citations were made using `ggplot2` `r citep(bib[["ggplot2"]])` and `knitcitations` `r citep(bib[['knitcitations']])` respectively. + +Citation file: [timing.bib](timing.bib) + +```{r vignetteBiblio, results = 'asis', echo = FALSE, warning = FALSE} +## Print bibliography +bibliography() +``` diff --git a/timing/timing.bib b/timing/timing.bib new file mode 100644 index 0000000..49ad4ea --- /dev/null +++ b/timing/timing.bib @@ -0,0 +1,106 @@ +@Manual{boettiger2015knitcitations, + title = {knitcitations: Citations for 'Knitr' Markdown Files}, + author = {Carl Boettiger}, + year = {2015}, + note = {R package version 1.0.7}, + url = {http://CRAN.R-project.org/package=knitcitations}, +} + +@Article{colladotorres2015derfinder, + title = {derfinder: Software for annotation-agnostic RNA-seq differential expression analysis}, + author = {Leonardo Collado-Torres and Alyssa C. Frazee and Michael I. Love and Rafael A. Irizarry and Andrew E. Jaffe and Jeffrey T. 
Leek}, + year = {2015}, + journal = {bioRxiv}, + doi = {10.1101/015370}, + url = {http://www.biorxiv.org/content/early/2015/02/19/015370.abstract}, +} + +@Article{frazee2014differential, + title = {Differential expression analysis of RNA-seq data at single-base resolution}, + author = {Alyssa C. Frazee and Sarven Sabunciyan and Kasper D. Hansen and Rafael A. Irizarry and Jeffrey T. Leek}, + year = {2014}, + journal = {Biostatistics}, + volume = {15 (3)}, + pages = {413-426}, + doi = {10.1093/biostatistics/kxt053}, + url = {http://biostatistics.oxfordjournals.org/content/15/3/413.long}, +} + +@Article{lawrence2013software, + title = {Software for Computing and Annotating Genomic Ranges}, + author = {Michael Lawrence and Wolfgang Huber and Herv\'e Pag\`es and Patrick Aboyoun and Marc Carlson and Robert Gentleman and Martin Morgan and Vincent Carey}, + year = {2013}, + journal = {{PLoS} Computational Biology}, + volume = {9}, + issue = {8}, + doi = {10.1371/journal.pcbi.1003118}, + url = {http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1003118}, +} + +@Article{anders2010differential, + title = {Differential expression analysis for sequence count data}, + author = {Simon Anders and Wolfgang Huber}, + year = {2010}, + journal = {Genome Biology}, + volume = {11}, + pages = {R106}, + doi = {10.1186/gb-2010-11-10-r106}, + url = {http://genomebiology.com/2010/11/10/R106/}, +} + +@Manual{vaidyanathan2013rcharts, + title = {rCharts: Interactive Charts using Javascript Visualization Libraries}, + author = {Ramnath Vaidyanathan}, + year = {2013}, + note = {R package version 0.4.5}, +} + +@Book{wickham2009ggplot2, + author = {Hadley Wickham}, + title = {ggplot2: Elegant Graphics for Data Analysis}, + publisher = {Springer-Verlag New York}, + year = {2009}, + isbn = {978-0-387-98140-6}, + url = {http://had.co.nz/ggplot2/book}, +} + +@Manual{hester2014knitrbootstrap, + title = {knitrBootstrap: Knitr Bootstrap framework.}, + author = {Jim Hester}, + year = 
{2014}, + note = {R package version 1.0.0}, + url = {https://github.com/jimhester/}, +} + +@Manual{allaire2016rmarkdown, + title = {rmarkdown: Dynamic Documents for R}, + author = {JJ Allaire and Joe Cheng and Yihui Xie and Jonathan McPherson and Winston Chang and Jeff Allen and Hadley Wickham and Aron Atkins and Rob Hyndman}, + year = {2016}, + note = {R package version 0.9.2}, + url = {http://CRAN.R-project.org/package=rmarkdown}, +} + +@InCollection{xie2014knitr, + booktitle = {Implementing Reproducible Computational Research}, + editor = {Victoria Stodden and Friedrich Leisch and Roger D. Peng}, + title = {knitr: A Comprehensive Tool for Reproducible Research in {R}}, + author = {Yihui Xie}, + publisher = {Chapman and Hall/CRC}, + year = {2014}, + note = {ISBN 978-1466561595}, + url = {http://www.crcpress.com/product/isbn/9781466561595}, +} + +@Manual{eff, + title = {Efficiency analysis of Sun Grid Engine batch jobs}, + author = {Alyssa Frazee}, + year = {2014}, + url = {http://dx.doi.org/10.6084/m9.figshare.878000}, +} + +@Article{rail, + author = {Abhinav Nellore and Leonardo Collado-Torres and Andrew E. Jaffe and Jos{\a'e} Alquicira-Hern{\a'a}ndez and Jacob Pritt and James Morton and Jeffrey T. Leek and Ben Langmead}, + journal = {bioRxiv}, + year = {2015}, + title = {Rail-RNA: {Scalable} analysis of {RNA}-seq splicing and coverage}, +} diff --git a/timing/timing.html b/timing/timing.html new file mode 100644 index 0000000..64ba3df --- /dev/null +++ b/timing/timing.html @@ -0,0 +1,3862 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+

Timing information

+

This report shows the time and memory used to run derfinder for single base resolution differential expression analysis. It also shows the same information for going from BAM files to getting ready to run DESeq (Anders and Huber, 2010) by using samtools (, 2016) to convert to SAM format and HTSeq (, 2014) to make the count tables. Furthermore, this process was compared to using the summarizeOverlaps() function from the GenomicRanges (Lawrence, Huber, Pagès, Aboyoun, et al., 2013) package as well as using the coverageToExon() function included in the derfinder package [requires the output from the fullCov step].

+

Results

+
+ +
## Extract information from Gmail (shells out to the efficiency_analytics scripts)
+system('cp ../../efficiency_analytics/client_secrets .') # copy the client_secrets file into the current directory before running the script
+system('python ../../efficiency_analytics/analyze_efficiency.py --email fellgernon@gmail.com --folder "Cluster/derSoftware" --outfile timing-derSoftware.txt') # --outfile is the table loaded by read.table() in the setup code
+
+
+ +
## Load libraries
+library("ggplot2")
+library("knitr")
+
+
+ +
## Setup: load the per-job timing table and derive step, memory (GB), chromosome and experiment columns
+
+## Experiment names to look for (matched against the lower-cased job ids below)
+exps <- c('brainspan', 'simulation', 'hippo', 'snyder', 'stem')
+
+## Read data and process it
+all <- read.table('timing-derSoftware.txt', header = TRUE, stringsAsFactors = FALSE)
+all <- all[!grepl('brainspan.*run3', all$jobid), ] # remove older info
+all$step <- gsub('.*th', 'TopHat', sapply(strsplit(all$jobid, "-"), function(x) x[1])) # step = job id text before the first "-"; everything up to the last "th" in it is replaced by 'TopHat'
+all$memG <- all$memory
+all$memG[all$memunit == "M"] <- all$memG[all$memunit == "M"] / 1024 # rows with memunit "M" are divided by 1024 (presumably MB -> GB; confirm against the report format)
+all$chr <- gsub('.*chr', 'chr', all$jobid) # keep the job id from its last 'chr' onwards
+all$chr[ !grepl('chr', all$chr) ] <- NA # job ids without 'chr' have no chromosome
+
+## Experiment info
+all$experiment <- NA
+for(exp in exps) {
+    all$experiment[ grepl(exp, tolower(all$jobid)) ] <- exp # if several names match, the one later in exps wins
+}
+all$experiment[ all$step %in% c('TopHat', 'bigwig') ] <- 'simulation' # these steps are always assigned to the simulation
+all$experiment[ all$jobid == 'makeBai-Sim' ] <- 'simulation' # 'Sim' does not contain 'simulation', so it is set by hand
+
+## Cores info: number of cores per job, hard-coded by step, experiment and (for brainspan derA) chromosome
+all$cores <- mapply(function(chr, exp, step) {
+    if(step == 'fullCov') {
+        return(10L)
+    } else if(step == 'derA') {
+        if(exp == 'brainspan') { # brainspan derA jobs: core count depends on the chromosome, 20 by default
+            return(ifelse(chr == 'chrY', 2L, ifelse(chr == 'chr1', 40L, ifelse(chr == 'chr2', 32L, ifelse(chr == 'chr3', 27L, ifelse(chr == 'chr19', 29L, 20L))))))
+        } else if (exp == 'simulation'){
+            return(1L)
+        } else if (exp == 'hippo'){
+            return(2L)
+        } else if (exp == 'snyder'){
+            return(4L)
+        } else if (exp == 'stem'){
+            return(8L)
+        } # NOTE(review): no final else -- a 'derA' job with any other experiment yields NULL, and an NA experiment makes the comparisons above fail
+    } else if(step == 'regMat') {
+        return(5L)
+    } else if(step == 'TopHat') {
+        return(4L)
+    } else if(step == 'summOv') {
+        return(ifelse(exp == 'hippo', 24L, 10L))
+    } else {
+        return(1L) # every other step is counted as a single core
+    }
+}, all$chr, all$experiment, all$step)
+all$timeByCore <- all$walltime * all$cores # wall time multiplied by the number of cores
+all$memByCore <- all$memG / all$cores # memory (GB) divided by the number of cores
+
+
+## Add software labels (steps not listed in any of the conditions are labelled 'derfinder')
+all$software <- factor(ifelse(all$step %in% c('toSam', 'htseq'), 'HTSeq', ifelse(all$step == 'summOv', 'GenomicRanges', ifelse(all$step == 'TopHat', 'TopHat', ifelse(all$step %in% c('makeBai', 'regVsDERs', 'PNAS', 'summInfo'), 'misc', ifelse(all$step == 'derR', 'regionReport', 'derfinder'))))))
+
+## Experiment and cores groups info
+all$experiment <- factor(all$experiment, levels = exps)
+all$coresGroups <- all$cores
+all$coresGroups[ all$cores >= 20] <- '20+' # collapse all jobs with 20 or more cores into one group
+all$coresGroups <- factor(all$coresGroups, levels = c(1, 2, 4, 5, 8, 10, '20+'))
+
+## Types of analysis (steps not listed in any of the conditions are labelled 'misc')
+all$analysis <- factor(ifelse(all$step %in% c('derMod', 'derA', 'derM'), 'Single-base DER', ifelse(all$step %in% c('toSam', 'htseq', 'summOv', 'covToEx'), 'Exon count', ifelse(all$step == 'regMat', 'Expressed-region DER', ifelse(all$step == 'fullCov', 'Load data', ifelse(all$step == 'derR', 'HTML report', 'misc'))))))
+
+## Show only information for the data sets described in this website
+all <- subset(all, experiment %in% c('hippo', 'snyder')) # rows whose experiment is NA are dropped as well
+
+

Adjusting by number of cores

+

The following plots show the wall time and memory used by each job while taking into account the number of cores used by each job. Note that doing so is a crude approximation of how much time and memory each job would have needed had it run on a single node.

+

Points are colored by which analysis type they belong to. Note that the loading data step is required for the single-base level and expressed-regions DER approaches as well as exon counting (with derfinder).

+
+ +
## Walltime and memory adjusted by number of cores (it's an approximation); one panel per experiment, raw scale on both axes
+ggplot(all, aes(x=timeByCore, y=memByCore, colour=analysis, shape=software)) + geom_point(size = 3) + facet_grid(~ experiment) + xlab("Wall time (hrs) multiplied by the number of cores") + ylab("Memory (GB) divided by the number of cores") + scale_colour_brewer(palette="Dark2") + theme_bw(base_size = 18) + theme(legend.position=c(.5, .75), legend.box = 'horizontal')
+
+
+ +
+
+ +
ggplot(all, aes(x=log2(timeByCore), y=memByCore, colour=analysis, shape=software)) + geom_point(size = 3) + facet_grid(~ experiment) + xlab("Wall time (hrs) multiplied by the number of cores (log2)") + ylab("Memory (GB) divided by the number of cores") + scale_colour_brewer(palette="Dark2") + theme_bw(base_size = 18) + theme(legend.position=c(.5, .75), legend.box = 'horizontal') # same plot as above with the x axis on the log2 scale
+
+
+ +
+
+ +
## For supp text: excludes TopHat, regionReport and 'misc' jobs; both axes on the log2 scale
+time <- ggplot(subset(all, !software %in% c('TopHat', 'regionReport') & analysis != 'misc'), aes(x=log2(timeByCore), y=log2(memByCore), colour=analysis, shape=software)) + geom_point(size = 3) + facet_grid(~ experiment) + xlab("Wall time (hrs) multiplied by the number of cores (log2)") + ylab("GB memory divided by number of cores (log2)") + scale_colour_brewer(palette="Set1") + theme_bw(base_size = 18) + theme(legend.position=c(.55, .15), legend.box = 'horizontal')
+time # print the plot in the report
+
+
+ +
+
+ +
pdf(file = 'time.pdf', width = 10) # open a PDF device so the same plot is also saved to time.pdf
+time
+dev.off() # close the device, which finishes writing the file
+ +
## quartz_off_screen 
+##                 2
+
+ +
#system('open time.pdf')
+
+

Resources by step for each analysis

+
+ +
getInfo <- function(df, sumTime = FALSE, peakCores = FALSE) { # Summarize the resources of the jobs in df as a named numeric vector
+    memByCore <- max(df$memByCore) # largest memory-per-core among the rows
+    walltime <- ifelse(sumTime, sum(df$walltime), max(df$walltime)) # sumTime = TRUE: total wall time; FALSE: longest single row
+    memG <- max(df$memG) # largest memory (GB) among the rows
+    peakCores <- ifelse(peakCores, max(df$peakCores), sum(df$cores)) # peakCores = TRUE: max of an existing df$peakCores column; FALSE: sum of df$cores
+    res <- c(memByCore = memByCore, walltime = walltime, memG = memG, peakCores = peakCores)
+    return(res) # names: memByCore, walltime, memG, peakCores
+}
+
+analysisInfo <- list('Single-base DER' = c('Load data', 'Single-base DER'), # analysis name -> values of all$analysis that belong to it
+    'Expressed-region DER' = c('Load data', 'Expressed-region DER'),
+    'HTML report' = 'HTML report',
+    'Exon count - derfinder' = 'Load data'
+)
+analysisInfo <- lapply(analysisInfo, function(x) { which(all$analysis %in% x)}) # replace the labels by the matching row indices of 'all'
+analysisInfo[[4]] <- c(analysisInfo[[4]], which(all$step == 'covToEx')) # element 4 is 'Exon count - derfinder': add its covToEx jobs
+analysisInfo$"Exon count - HTSeq" <- which(all$step %in% c('toSam', 'htseq')) # the remaining exon-count analyses are selected by step
+analysisInfo$"Exon count - GenomicRanges" <- which(all$step == 'summOv')
+
+## Summarize the information for each step of each analysis (one row per analysis, experiment and step)
+analysisSummary <- lapply(names(analysisInfo), function(analysis) {
+    current <- all[analysisInfo[[analysis]], ] # jobs that belong to this analysis
+    res_analysis <- lapply(exps, function(exp) {
+        use <- subset(current, experiment == exp)
+        if(nrow(use) == 0) return(NULL) # no jobs for this experiment; rbind() below drops the NULL
+        res_exp <- lapply(unique(use$step), function(step) {
+            res_step <- as.data.frame(t(getInfo(use[use$step == step, ]))) # getInfo() defaults: max wall time, sum of cores
+            res_step$step <- step
+            res_step$experiment <- exp
+            res_step$analysis <- analysis
+            return(res_step)
+        })
+        res_exp <- do.call(rbind, res_exp) # stack the per-step rows
+        return(res_exp)
+    })
+    res_analysis <- do.call(rbind, res_analysis) # stack the per-experiment tables
+    return(res_analysis)
+})
+analysisSummary <- do.call(rbind, analysisSummary) # single data frame across all analyses
+
+

The table below shows, per analysis, the maximum memory used by a job and the maximum wall time for each step. This assumes that all jobs for a given step ran simultaneously; for example, that all jobs running derfinder::analyzeChr() were running at the same time. Note that some analyses relied on the same steps, like loading the data (fullCov). This table can be useful to find the peak number of cores (the sum of cores for all jobs running simultaneously) for a given analysis step.

+
+ +
kable(analysisSummary, format = 'html', digits = c(2, 4, 2))
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+memByCore + +walltime + +memG + +peakCores + +step + +experiment + +analysis +
+1.32 + +0.0492 + +1.32 + +1 + +derM + +hippo + +Single-base DER +
+3.90 + +0.9697 + +7.80 + +48 + +derA + +hippo + +Single-base DER +
+3.25 + +0.0222 + +3.25 + +1 + +derMod + +hippo + +Single-base DER +
+1.29 + +0.1967 + +12.91 + +10 + +fullCov + +hippo + +Single-base DER +
+4.39 + +1.2494 + +4.39 + +1 + +derM + +snyder + +Single-base DER +
+5.14 + +2.3453 + +20.55 + +96 + +derA + +snyder + +Single-base DER +
+7.02 + +0.0558 + +7.02 + +2 + +derMod + +snyder + +Single-base DER +
+2.71 + +1.2539 + +27.10 + +10 + +fullCov + +snyder + +Single-base DER +
+2.07 + +0.2442 + +10.33 + +5 + +regMat + +hippo + +Expressed-region DER +
+1.29 + +0.1967 + +12.91 + +10 + +fullCov + +hippo + +Expressed-region DER +
+5.32 + +1.1131 + +26.62 + +5 + +regMat + +snyder + +Expressed-region DER +
+2.71 + +1.2539 + +27.10 + +10 + +fullCov + +snyder + +Expressed-region DER +
+36.46 + +0.8094 + +36.46 + +1 + +derR + +hippo + +HTML report +
+37.20 + +0.4836 + +37.20 + +1 + +derR + +snyder + +HTML report +
+1.29 + +0.1967 + +12.91 + +10 + +fullCov + +hippo + +Exon count - derfinder +
+11.16 + +0.6286 + +11.16 + +2 + +covToEx + +hippo + +Exon count - derfinder +
+2.71 + +1.2539 + +27.10 + +10 + +fullCov + +snyder + +Exon count - derfinder +
+16.20 + +0.7375 + +16.20 + +2 + +covToEx + +snyder + +Exon count - derfinder +
+0.38 + +0.5672 + +0.38 + +31 + +htseq + +hippo + +Exon count - HTSeq +
+1.73 + +3.7153 + +1.73 + +1 + +toSam + +hippo + +Exon count - HTSeq +
+0.38 + +7.8933 + +0.38 + +20 + +htseq + +snyder + +Exon count - HTSeq +
+1.44 + +42.0253 + +1.44 + +1 + +toSam + +snyder + +Exon count - HTSeq +
+1.80 + +0.2967 + +43.24 + +24 + +summOv + +hippo + +Exon count - GenomicRanges +
+6.32 + +2.6850 + +63.24 + +10 + +summOv + +snyder + +Exon count - GenomicRanges +
+
+

Resources for each analysis

+
+ +
## Summarize the information for each analysis (one row per analysis and experiment)
+peaks <- lapply(names(analysisInfo), function(analysis) {
+    res_analysis <- lapply(exps, function(exp) {
+        current <- analysisSummary[analysisSummary$analysis == analysis & analysisSummary$experiment == exp, ] # per-step rows of this analysis and experiment
+        if(nrow(current) == 0) return(NULL) # nothing to summarize; rbind() below drops the NULL
+        res_exp <- as.data.frame(t(getInfo(current, sumTime = TRUE, peakCores = TRUE))) # add the wall time over steps; take the largest per-step core count
+        res_exp$experiment <- exp
+        res_exp$analysis <- analysis
+        return(res_exp)
+    })
+    res_analysis <- do.call(rbind, res_analysis)
+    return(res_analysis)
+})
+peaks <- do.call(rbind, peaks)
+
+save(peaks, file = 'peaks.Rdata') # write the summary table to peaks.Rdata
+
+

We can further summarize the resources used by each analysis by identifying the maximum memory used in the steps required for a particular analysis and the total wall time for running all the steps when all the jobs of a particular step are running simultaneously. This gives us the total actual wall time to run a specific analysis and the maximum memory required.

+

The table below shows the final summary. Note that in some analyses, the peak memory is from the fullCov step. We did not focus on reducing the memory load of this step as we sacrificed memory for speed. We know that much lower memory limits can be achieved using 1 core instead of the 10 cores used.

+
+ +
kable(peaks, format = 'html', digits = c(2, 3, 2))
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+memByCore + +walltime + +memG + +peakCores + +experiment + +analysis +
+3.90 + +1.238 + +12.91 + +48 + +hippo + +Single-base DER +
+7.02 + +4.904 + +27.10 + +96 + +snyder + +Single-base DER +
+2.07 + +0.441 + +12.91 + +10 + +hippo + +Expressed-region DER +
+5.32 + +2.367 + +27.10 + +10 + +snyder + +Expressed-region DER +
+36.46 + +0.809 + +36.46 + +1 + +hippo + +HTML report +
+37.20 + +0.484 + +37.20 + +1 + +snyder + +HTML report +
+11.16 + +0.825 + +12.91 + +10 + +hippo + +Exon count - derfinder +
+16.20 + +1.991 + +27.10 + +10 + +snyder + +Exon count - derfinder +
+1.73 + +4.283 + +1.73 + +31 + +hippo + +Exon count - HTSeq +
+1.44 + +49.919 + +1.44 + +20 + +snyder + +Exon count - HTSeq +
+1.80 + +0.297 + +43.24 + +24 + +hippo + +Exon count - GenomicRanges +
+6.32 + +2.685 + +63.24 + +10 + +snyder + +Exon count - GenomicRanges +
+
+

Regarding the high memory load for the HTML report, this could be significantly lowered by only loading the required coverage data used for the plots instead of the full output from the fullCov step. That is, using the which argument from fullCoverage() to create a much smaller fullCov object, which would also reduce the memory used when plotting.

+

Note: since these analyses were done, we have found other ways to run derfinder::regionMatrix() that require less memory. In particular, if you have BigWig files (as those generated by Rail-RNA (Nellore, Collado-Torres, Jaffe, Alquicira-Hernández, et al., 2015)), we recommend using railMatrix().

+

Comparing methods for gene count table generation

+

The previous table can also be used to compare the sum of the time and peak memory used by the different steps to obtain the exon count table with the following software options.

+
    +
  • derfinder: includes resources used for reading coverage data in R and then creating a feature count matrix. We did so for +
      +
    • UCSC hg19 knownGene annotation
    • +
    • Ensembl GRCh37 p11 annotation.
    • +
  • +
  • HTSeq: includes resources used for generating sorted SAM files and then running HTSeq.
  • +
  • summOv: resources used for running GenomicRanges::summarizeOverlaps() directly on the BAM files.
  • +
+

Details

+

The following table shows the details of the resources used by the different jobs. It shows the experiment (experiment), the analysis step (step), wall time used (shown in hours, walltime), number of cores used (cores), memory in GB used (memG), software used (software), analysis for which the step is used (analysis), and the job name (jobid). Furthermore, it shows two simple approximations:

+
    +
  • timeByCore is the wall time (in hours) multiplied by the number of cores used. It is a very simple approximation for the wall time used had the job been run on a single node. This approximation is known to be false, but it gives a basic idea.
  • +
  • memByCore is the memory (in GB) divided by the number of cores used. It is an approximation for the memory used had the job been run on a single node.
  • +
+

These are the following analysis steps:

+
    +
  1. fullCov Extract coverage information from raw files (BAM or BigWig) by chromosome, then filter it, and save it in Rdata files.
  2. +
  3. derMod Calculate the sample depth adjustments and build models appropriate for the experiment.
  4. +
  5. derA Run single-base level analysis by chromosome.
  6. +
  7. derM Merge derfinder analysis results from the different chromosomes, calculate p-values and q-values.
  8. +
  9. derR Generate HTML report with regionReport.
  10. +
  11. regMat Run expressed-regions level analysis with regionMatrix().
  12. +
  13. regVsDERs Compare expressed-regions vs single-base level approaches for the derfinder software paper.
  14. +
  15. toSam Transform BAM files to sorted (by name) SAM files for running HTSeq.
  16. +
  17. htseq Run HTSeq to generate exon count table.
  18. +
  19. summOv Run GenomicRanges::summarizeOverlaps() to generate exon count table.
  20. +
  21. covToEx Generate exon table using derfinder::coverageToExon() for UCSC hg19 knownGene or GRCh37 p11 Ensembl annotation table.
  22. +
  23. PNAS (Only for Hippo) Generate an HTML report comparing the derfinder results vs previously published results (PNAS paper).
  24. +
  25. summInfo Summarize results to then use them in the derfinder software paper.
  26. +
+ + +
+ +
library("rCharts") # attached here for dTable()
+library("data.table")
+
+## Print whole table (one row per job, with the columns described in the text above)
+d <- data.table(all[, c("experiment", "step", "walltime", "cores", "memG", "timeByCore", "memByCore", "software", "analysis", "jobid")])
+t1 <- dTable(d, sPaginationType='full_numbers', iDisplayLength=50, sScrollX='100%') # presumably 50 rows per page with horizontal scrolling -- confirm against the rCharts/DataTables options
+t1$print("timing", cdn=TRUE)
+ +
+ +
+


+

Table made using rCharts (Vaidyanathan, 2013).

+

Reproducibility

+

Date the report was generated.

+
+ +
## [1] "2016-02-20 10:54:24 EST"
+
+
+

Wallclock time spent generating the report.

+
+ +
## Time difference of 5.3 secs
+
+
+

R session information.

+
+ +
## Session info -----------------------------------------------------------------------------------------------------------
+
+ +
##  setting  value                       
+##  version  R version 3.2.2 (2015-08-14)
+##  system   x86_64, darwin13.4.0        
+##  ui       X11                         
+##  language (EN)                        
+##  collate  en_US.UTF-8                 
+##  tz       America/New_York            
+##  date     2016-02-20
+
+ +
## Packages ---------------------------------------------------------------------------------------------------------------
+
+ +
##  package        * version  date       source                                   
+##  bibtex           0.4.0    2014-12-31 CRAN (R 3.2.0)                           
+##  bitops           1.0-6    2013-08-17 CRAN (R 3.2.0)                           
+##  chron            2.3-47   2015-06-24 CRAN (R 3.2.1)                           
+##  colorspace       1.2-6    2015-03-11 CRAN (R 3.2.0)                           
+##  curl             0.9.5    2016-01-23 CRAN (R 3.2.3)                           
+##  data.table     * 1.9.6    2015-09-19 CRAN (R 3.2.0)                           
+##  devtools         1.10.0   2016-01-23 CRAN (R 3.2.3)                           
+##  digest           0.6.9    2016-01-08 CRAN (R 3.2.3)                           
+##  evaluate         0.8      2015-09-18 CRAN (R 3.2.0)                           
+##  formatR          1.2.1    2015-09-18 CRAN (R 3.2.0)                           
+##  ggplot2        * 2.0.0    2015-12-18 CRAN (R 3.2.3)                           
+##  gtable           0.1.2    2012-12-05 CRAN (R 3.2.0)                           
+##  highr            0.5.1    2015-09-18 CRAN (R 3.2.0)                           
+##  htmltools        0.3      2015-12-29 CRAN (R 3.2.3)                           
+##  httr             1.1.0    2016-01-28 CRAN (R 3.2.3)                           
+##  knitcitations  * 1.0.7    2015-10-28 CRAN (R 3.2.0)                           
+##  knitr          * 1.12.3   2016-01-22 CRAN (R 3.2.3)                           
+##  knitrBootstrap   1.0.0    2015-05-19 Github (jimhester/knitrBootstrap@76c41f0)
+##  labeling         0.3      2014-08-23 CRAN (R 3.2.0)                           
+##  lattice          0.20-33  2015-07-14 CRAN (R 3.2.2)                           
+##  lubridate        1.5.0    2015-12-03 CRAN (R 3.2.3)                           
+##  magrittr         1.5      2014-11-22 CRAN (R 3.2.0)                           
+##  markdown         0.7.7    2015-04-22 CRAN (R 3.2.0)                           
+##  memoise          1.0.0    2016-01-29 CRAN (R 3.2.3)                           
+##  mime             0.4      2015-09-03 CRAN (R 3.2.0)                           
+##  munsell          0.4.2    2013-07-11 CRAN (R 3.2.0)                           
+##  plyr             1.8.3    2015-06-12 CRAN (R 3.2.1)                           
+##  R6               2.1.2    2016-01-26 CRAN (R 3.2.3)                           
+##  rCharts        * 0.4.5    2015-05-19 Github (ramnathv/rCharts@389e214)        
+##  RColorBrewer     1.1-2    2014-12-07 CRAN (R 3.2.0)                           
+##  Rcpp             0.12.3   2016-01-10 CRAN (R 3.2.3)                           
+##  RCurl            1.95-4.7 2015-06-30 CRAN (R 3.2.1)                           
+##  RefManageR       0.10.5   2016-01-02 CRAN (R 3.2.3)                           
+##  reshape2         1.4.1    2014-12-06 CRAN (R 3.2.0)                           
+##  RJSONIO          1.3-0    2014-07-28 CRAN (R 3.2.0)                           
+##  rmarkdown      * 0.9.2    2016-01-01 CRAN (R 3.2.3)                           
+##  scales           0.3.0    2015-08-25 CRAN (R 3.2.0)                           
+##  stringi          1.0-1    2015-10-22 CRAN (R 3.2.0)                           
+##  stringr          1.0.0    2015-04-30 CRAN (R 3.2.0)                           
+##  whisker          0.3-2    2013-04-28 CRAN (R 3.2.0)                           
+##  XML              3.98-1.3 2015-06-30 CRAN (R 3.2.0)                           
+##  yaml             2.1.13   2014-06-12 CRAN (R 3.2.0)
+
+
+

Bibliography

+

This report was generated using knitrBootstrap (Hester, 2014) with knitr (Xie, 2014) and rmarkdown (Allaire, Cheng, Xie, McPherson, et al., 2016) running behind the scenes. Timing information was extracted from the SGE reports using efficiency analytics (Frazee, 2014). Figures and citations were made using ggplot2 (Wickham, 2009) and knitcitations (Boettiger, 2015) respectively.

+

Citation file: timing.bib

+
+

+[1] HTSeq: Analysing high-throughput sequencing data with Python — HTSeq 0.6.1p2 documentation. http://www-huber.embl.de/users/anders/HTSeq/doc/overview.html. 2014. URL: http://www-huber.embl.de/users/anders/HTSeq/doc/overview.html. +

+

+[2] SAMtools. http://samtools.sourceforge.net/. 2016. URL: http://samtools.sourceforge.net/. +

+

+[3] J. Allaire, J. Cheng, Y. Xie, J. McPherson, et al. rmarkdown: Dynamic Documents for R. R package version 0.9.2. 2016. URL: http://CRAN.R-project.org/package=rmarkdown. +

+

+[4] S. Anders and W. Huber. “Differential expression analysis for sequence count data”. In: Genome Biology 11 (2010), p. R106. DOI: 10.1186/gb-2010-11-10-r106. URL: http://genomebiology.com/2010/11/10/R106/. +

+

+[5] C. Boettiger. knitcitations: Citations for 'Knitr' Markdown Files. R package version 1.0.7. 2015. URL: http://CRAN.R-project.org/package=knitcitations. +

+

+[6] A. Frazee. Efficiency analysis of Sun Grid Engine batch jobs. 2014. URL: http://dx.doi.org/10.6084/m9.figshare.878000. +

+

+[7] J. Hester. knitrBootstrap: Knitr Bootstrap framework. R package version 1.0.0. 2014. URL: https://github.com/jimhester/. +

+

+[8] M. Lawrence, W. Huber, H. Pagès, P. Aboyoun, et al. “Software for Computing and Annotating Genomic Ranges”. In: PLoS Computational Biology 9 (8 2013). DOI: 10.1371/journal.pcbi.1003118. URL: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1003118. +

+ +

+

+[9] A. Nellore, L. Collado-Torres, A. E. Jaffe, J. Alquicira-Hernández, et al. “Rail-RNA: Scalable analysis of RNA-seq splicing and coverage”. In: bioRxiv (2015). +

+

+[10] R. Vaidyanathan. rCharts: Interactive Charts using Javascript Visualization Libraries. R package version 0.4.5. 2013. +

+

+[11] H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2009. ISBN: 978-0-387-98140-6. URL: http://had.co.nz/ggplot2/book. +

+

+[12] Y. Xie. “knitr: A Comprehensive Tool for Reproducible Research in R”. In: Implementing Reproducible Computational Research. Ed. by V. Stodden, F. Leisch and R. D. Peng. ISBN 978-1466561595. Chapman and Hall/CRC, 2014. URL: http://www.crcpress.com/product/isbn/9781466561595. +

+
+
+
+ + +
+
+ +
+ + +
+ + diff --git a/venn/venn.Rmd b/venn/venn.Rmd new file mode 100644 index 0000000..8141679 --- /dev/null +++ b/venn/venn.Rmd @@ -0,0 +1,216 @@ +--- +title: "Table of results and venn diagrams" +author: "L Collado-Torres" +date: "`r doc_date()`" +output: + BiocStyle::html_document +--- + + +```{r citationsSetup, echo=FALSE, message=FALSE, warning=FALSE} +## Track time spent on making the report +startTime <- Sys.time() + +## Bib setup +library('knitcitations') + +## Load knitcitations with a clean bibliography +cleanbib() +cite_options(hyperlink = 'to.doc', citation_format = 'text', style = 'html') +# Note links won't show for now due to the following issue +# https://github.com/cboettig/knitcitations/issues/63 + +bibs <- c("knitcitations" = citation("knitcitations"), + "derfinder" = citation("derfinder")[1], + "derfinderPlot" = citation("derfinderPlot")[1], + "GenomicRanges" = citation("GenomicRanges"), + "bumphunter" = citation("bumphunter"), + "BiocStyle" = citation("BiocStyle"), + "qvalue" = citation("qvalue"), + 'knitr' = citation('knitr')[3], + 'rmarkdown' = citation('rmarkdown')) + +write.bibtex(bibs, file = 'venn.bib') +bib <- read.bibtex('venn.bib') + +## Assign short names +names(bib) <- names(bibs) +``` + + +This report creates the CSV files with the candidate differentially expressed regions (DERs) found using `derfinder` `r citep(bib[["derfinder"]])` via the single-base level approach on the _Snyder_ and _Hippo_ data sets. It also has Venn diagrams illustrating the overlap with known annotation. + +# Results + + +## Generate CSV files + +The following code shows how to generate the CSV files from the `R` objects. 
+ +```{r process, warning=FALSE} +## Setup +suppressMessages(library("GenomicRanges")) +suppressMessages(library("derfinder")) +suppressMessages(library("derfinderPlot")) + +## Experiments +exps <- c('snyder', 'hippo') + +## Find the latest run from each experiment +run <- sapply(exps, function(exp) { + runs <- dir(file.path('..', exp, 'derAnalysis'), pattern = 'run') + runs[length(runs)] +}) + +## Load regions and annotation data +regions <- lapply(exps, function(exp) { + load(file.path('..', exp, 'derAnalysis', run[exp], 'fullRegions.Rdata')) + res <- fullRegions + names(res) <- seq_len(length(res)) + return(res) +}) + +anno <- lapply(exps, function(exp) { + load(file.path('..', exp, 'derAnalysis', run[exp], + 'fullAnnotatedRegions.Rdata')) + + ## Fix region labels: its intergenic, not intragenic + if('intragenic' %in% names(fullAnnotatedRegions$countTable)) { + ## Fix countTable + names(fullAnnotatedRegions$countTable)[names(fullAnnotatedRegions$countTable) == 'intragenic'] <- 'intergenic' + } + res <- fullAnnotatedRegions + return(res) +}) + +## Fix names +names(regions) <- names(anno) <- exps + +## Perform check and create csv files +check <- vector("list", length(exps)) +names(check) <- exps +for(exp in exps) { + + ## Peform check + check[[exp]] <- identical(nrow(anno[[exp]]$countTable), + length(regions[[exp]])) + + ## Export regions information into plain text + write.csv(as.data.frame(regions[[exp]]), file = paste0("supplementaryFile", + which(exp == exps), ".csv"), quote = FALSE, row.names = FALSE) + + ## You can later read them in R using: + # read.csv("supplementaryFile1.csv") + # read.csv("supplementaryFile2.csv") + + ## Compress + system(paste0('gzip supplementaryFile', which(exp == exps), '.csv')) +} + +## Check that the rows match +unlist(check) +``` + +CSV files correspond to experiments `r paste(exps, collapse = ', ')` from runs `r paste(run, collapse = ', ')` respectively. 
+ +## Venn diagrams + +The following Venn diagrams show how many candidate differentially expressed regions (DERs) overlap with an exon, intron, or intergenic regions of the UCSC hg19 knownGene annotation. By default, a minimum overlap of 20 base pairs is required to say that a candidate DER overlaps any of feature. The annotation overlap was done using the `mergeResults()` function from `derfinder` `r citep(bib[["derfinder"]])` while the Venn diagrams were made using `derfinderPlot` `r citep(bib[["derfinderPlot"]])`. + +A maximum of three Venn diagrams are shown per data set. The first one uses all the candidate DERs, the second one uses only the candidate DERs that had a significant q-value (by default less than 0.10), and the third uses only the candidate DERs that had a FWER adjusted p-value less than 0.05. + +```{r venn, dev='CairoPNG'} +for(exp in exps) { + ## Using all candidate DERs + vennRegions(anno[[exp]], main=paste("\n", exp, "using UCSC.hg19.knownGene"), + counts.col="blue") + + ## Using candidate DERs with a significant q-value + if(sum(regions[[exp]]$significantQval == "TRUE") > 0) + vennRegions(anno[[exp]], regions[[exp]]$significantQval == "TRUE", + main=paste("\n\n", exp, "using UCSC.hg19.knownGene\nRestricted to significant q-value candidate DERs"), + counts.col = "blue") + + + ## Using candidate DERs with a significant FWER adjusted p-value + if(sum(regions[[exp]]$significantFWER == "TRUE") > 0) + vennRegions(anno[[exp]], regions[[exp]]$significantFWER == "TRUE", + main=paste("\n\n", exp, "using UCSC.hg19.knownGene\nRestricted to significant FWER adjusted p-value candidate DERs"), + counts.col="blue") +} +``` + +# Details + +The supplementary files 1 and 2 contain the information for the candidate DERs found in [CSV](http://en.wikipedia.org/wiki/Comma-separated_values) format. The CSV files contain the following columns: + +1. __seqnames__ The chromosome name. +1. __start__ The position of the chromosome where the candidate DER begins. +1. 
__end__ The position of the chromosome where the candidate DER ends. +1. __width__ The width of the candidate DER. +1. __strand__ The strand of the candidate DER. +1. __value__ The mean of the F-statistics for the candidate DER. +1. __area__ The sum of the F-statistics for the candidate DER. +1. __indexStart__ Among the bases from the chromosome that passed the filtering step, the position where the candidate DER begins. +1. __indexEnd__ Among the bases from the chromosome that passed the filtering step, the position where the candidate DER ends. +1. __cluster__ The cluster number to which the candidate DER belongs to. Clusters were defined by chromosome and two candidate DERs belong to the same cluster if they are less than 3000 bp apart. +1. __clusterL__ The length of the cluster to which the candidate DER belongs to. +1. __meanCoverage__ The mean coverage among all samples for the candidate DER. +1. *mean__G__* The mean coverage among the samples of group __G__ for the candidate DER. +1. *log2FoldChange__G1__.vs__G2__* The log2 fold change between the samples in __G1__ and the samples in __G2__ for the candidate DER. +1. __pvalues__ The p-value for the candidate DER. It is calculated empirically using the null candidate DERs (obtained via permutations) from all chromosomes. +1. __significant__ Whether the p-value is less than 0.05. +1. __qvalues__ The p-value adjusted to control the FDR by using the `qvalue` `r citep(bib[["qvalue"]])` function from the package with the same name. +1. __significantQval__ Whether the q-value is less than 0.10. +1. __name__ This and the following fields are computed using `annotateNearest()` from the `bumphunter` `r citep(bib[["bumphunter"]])` package. They were calculated using the UCSC hg19 annotation. __name__ refers to the nearest gene. +1. __annotation__ RefSeq ID. Taken from the help page of `bumphunter::annotateNearest()`. +1. 
__description__ A factor with levels _upstream_, _promoter_, _overlaps 5'_, _inside intron_, _inside exon_, _covers exon(s)_, _overlaps exon upstream_, _overlaps exon downstream_, _overlaps two exons_, _overlaps 3'_, _close to 3'_, _downstream_, _covers_. Taken from the help page of `bumphunter::annotateNearest()`. +1. __region__ A factor with levels _upstream_, _promoter_, _overlaps 5'_, _inside_, _overlaps 3'_, _close to 3'_, _downstream_, _covers_. Taken from the help page of `bumphunter::annotateNearest()`. +1. __subregion__ A factor with levels _inside intron_, _inside exon_, _covers exon(s)_, _overlaps exon upstream_, _overlaps exon downstream_, _overlaps two exons_. Taken from the help page of `bumphunter::annotateNearest()`. +1. __insidedistance__ Distance past 5 prime end of gene. Taken from the help page of `bumphunter::annotateNearest()`. +1. __exonnumber__ Which exon. Taken from the help page of `bumphunter::annotateNearest()`. +1. __nexons__ Number of exons. Taken from the help page of `bumphunter::annotateNearest()`. +1. __UTR__ A factor with levels _inside transcription region_, _5' UTR_, _overlaps 5' UTR_, _3' UTR_, _overlaps 3' UTR_, _covers transcription region_. Taken from the help page of `bumphunter::annotateNearest()`. +1. __annoStrand__ + or -. Taken from the help page of `bumphunter::annotateNearest()`. +1. __geneL__ The gene length. Taken from the help page of `bumphunter::annotateNearest()`. +1. __codingL__ The coding length. Taken from the help page of `bumphunter::annotateNearest()`. +1. __fwer__ The FWER adjusted p-value for the region. +1. __significantFWER__ Whether the FWER adjusted p-value is less than 0.05. + + +# Reproducibility + +Date the report was generated. + +```{r reproducibility1, echo=FALSE, bootstrap.show.code=FALSE} +## Date the report was generated +Sys.time() +``` + +Wallclock time spent generating the report. 
+ +```{r "reproducibility2", echo=FALSE, bootstrap.show.code=FALSE} +## Processing time in seconds +totalTime <- diff(c(startTime, Sys.time())) +round(totalTime, digits=3) +``` + +`R` session information. + +```{r "reproducibility3", echo=FALSE, bootstrap.show.code=FALSE, bootstrap.show.message=FALSE} +## Session info +options(width=120) +devtools::session_info() +``` + +# Bibliography + +This report was generated using `BiocStyle` `r citep(bib[['BiocStyle']])` +with `knitr` `r citep(bib[['knitr']])` and `rmarkdown` `r citep(bib[['rmarkdown']])` running behind the scenes. + +Citations made with `knitcitations` `r citep(bib[['knitcitations']])`. Citation file: [venn.bib](venn.bib). + +```{r vignetteBiblio, results = 'asis', echo = FALSE, warning = FALSE} +## Print bibliography +bibliography() +``` diff --git a/venn/venn.bib b/venn/venn.bib new file mode 100644 index 0000000..dfad1d8 --- /dev/null +++ b/venn/venn.bib @@ -0,0 +1,93 @@ +@Manual{boettiger2015knitcitations, + title = {knitcitations: Citations for 'Knitr' Markdown Files}, + author = {Carl Boettiger}, + year = {2015}, + note = {R package version 1.0.7}, + url = {http://CRAN.R-project.org/package=knitcitations}, +} + +@Article{colladotorres2015derfinder, + title = {derfinder: Software for annotation-agnostic RNA-seq differential expression analysis}, + author = {Leonardo Collado-Torres and Alyssa C. Frazee and Michael I. Love and Rafael A. Irizarry and Andrew E. Jaffe and Jeffrey T. Leek}, + year = {2015}, + journal = {bioRxiv}, + doi = {10.1101/015370}, + url = {http://www.biorxiv.org/content/early/2015/02/19/015370.abstract}, +} + +@Manual{colladotorres2015derfinderplot, + title = {derfinderPlot: Plotting functions for derfinder}, + author = {Leonardo Collado-Torres and Andrew E. Jaffe and Jeffrey T. 
Leek}, + year = {2015}, + url = {http://www.bioconductor.org/packages/release/bioc/html/derfinderPlot.html}, + note = {https://github.com/leekgroup/derfinderPlot - R package version 1.4.1}, +} + +@Article{lawrence2013software, + title = {Software for Computing and Annotating Genomic Ranges}, + author = {Michael Lawrence and Wolfgang Huber and Herv\'e Pag\`es and Patrick Aboyoun and Marc Carlson and Robert Gentleman and Martin Morgan and Vincent Carey}, + year = {2013}, + journal = {{PLoS} Computational Biology}, + volume = {9}, + issue = {8}, + doi = {10.1371/journal.pcbi.1003118}, + url = {http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1003118}, +} + +@Article{bumphunter, + author = {Andrew E. Jaffe and Peter Murakami and Hwajin Lee and Jeffrey T. Leek and Daniele M. Fallin and Andrew P. Feinberg and Rafael A. Irizarry}, + title = {Bump hunting to identify differentially methylated regions in epigenetic epidemiology studies}, + journal = {International journal of epidemiology}, + year = {2012}, + volume = {41}, + number = {1}, + pages = {200--209}, + doi = {10.1093/ije/dyr238}, + pubmed = {22422453}, +} + +@Article{minfi, + title = {Minfi: A flexible and comprehensive Bioconductor package for the analysis of Infinium DNA Methylation microarrays}, + author = {Martin J. Aryee and Andrew E. Jaffe and Hector Corrada-Bravo and Christine Ladd-Acosta and Andrew P. Feinberg and Kasper D. Hansen and Rafael A. 
Irizarry}, + year = {2014}, + journal = {Bioinformatics}, + volume = {30}, + number = {10}, + pages = {1363--1369}, + doi = {10.1093/bioinformatics/btu049}, + pubmed = {24478339}, +} + +@Manual{morganbiocstyle, + title = {BiocStyle: Standard styles for vignettes and other Bioconductor documents}, + author = {Martin Morgan and Andrzej Ole{\a's} and Wolfgang Huber}, + note = {R package version 1.8.0}, + url = {https://github.com/Bioconductor/BiocStyle}, +} + +@Manual{bass2015qvalue, + title = {qvalue: Q-value estimation for false discovery rate control}, + author = {John D. Storey with contributions from Andrew J. Bass and Alan Dabney and David Robinson}, + year = {2015}, + note = {R package version 2.2.2}, + url = {http://github.com/jdstorey/qvalue}, +} + +@InCollection{xie2014knitr, + booktitle = {Implementing Reproducible Computational Research}, + editor = {Victoria Stodden and Friedrich Leisch and Roger D. Peng}, + title = {knitr: A Comprehensive Tool for Reproducible Research in {R}}, + author = {Yihui Xie}, + publisher = {Chapman and Hall/CRC}, + year = {2014}, + note = {ISBN 978-1466561595}, + url = {http://www.crcpress.com/product/isbn/9781466561595}, +} + +@Manual{allaire2016rmarkdown, + title = {rmarkdown: Dynamic Documents for R}, + author = {JJ Allaire and Joe Cheng and Yihui Xie and Jonathan McPherson and Winston Chang and Jeff Allen and Hadley Wickham and Aron Atkins and Rob Hyndman}, + year = {2016}, + note = {R package version 0.9.2}, + url = {http://CRAN.R-project.org/package=rmarkdown}, +} diff --git a/venn/venn.html b/venn/venn.html new file mode 100644 index 0000000..86b3fd0 --- /dev/null +++ b/venn/venn.html @@ -0,0 +1,358 @@ + + + + + + + + + + + + + + +Table of results and venn diagrams + + + + + + + + + + + + + + + + + + + + +

Contents

+ + +

This report creates the CSV files with the candidate differentially expressed regions (DERs) found using derfinder (Collado-Torres, Frazee, Love, Irizarry, et al., 2015) via the single-base level approach on the Snyder and Hippo data sets. It also has Venn diagrams illustrating the overlap with known annotation.

+
+

1 Results

+
+

1.1 Generate CSV files

+

The following code shows how to generate the CSV files from the R objects.

+
## Setup: attach the packages used below, suppressing their startup messages
+suppressMessages(library("GenomicRanges"))
+suppressMessages(library("derfinder"))
+suppressMessages(library("derfinderPlot"))
+
+## Experiments: directory names of the data sets; also used to name the lists built below
+exps <- c('snyder', 'hippo')
+
+## Find the latest run from each experiment: the last entry dir() lists among those matching 'run'
+run <- sapply(exps, function(exp) {
+    runs <- dir(file.path('..', exp, 'derAnalysis'), pattern = 'run')
+    runs[length(runs)]
+})
+
+## Load regions and annotation data; `run` is indexed by experiment name (run[exp])
+regions <- lapply(exps, function(exp) {
+    load(file.path('..', exp, 'derAnalysis', run[exp], 'fullRegions.Rdata')) ## supplies `fullRegions`
+    res <- fullRegions
+    names(res) <- seq_len(length(res)) ## name the regions 1..n
+    return(res)
+})
+
+anno <- lapply(exps, function(exp) {
+    load(file.path('..', exp, 'derAnalysis', run[exp],
+        'fullAnnotatedRegions.Rdata'))
+    
+    ## Fix region labels: it's intergenic, not intragenic
+    if('intragenic' %in% names(fullAnnotatedRegions$countTable)) {
+        ## Rename the mislabeled countTable entry in place
+        names(fullAnnotatedRegions$countTable)[names(fullAnnotatedRegions$countTable) == 'intragenic'] <- 'intergenic'
+    }
+    res <- fullAnnotatedRegions
+    return(res)
+})
+
+## Name both lists by experiment so they can be indexed with [[exp]] below
+names(regions) <- names(anno) <- exps
+
+## Perform check and create csv files (one supplementaryFile<i>.csv per experiment, i = position in exps)
+check <- vector("list", length(exps))
+names(check) <- exps
+for(exp in exps) {
+    
+    ## Perform check: the annotation count table must have one row per region
+    check[[exp]] <- identical(nrow(anno[[exp]]$countTable),
+        length(regions[[exp]]))
+    
+    ## Export regions information into plain text (unquoted fields, no row names)
+    write.csv(as.data.frame(regions[[exp]]), file = paste0("supplementaryFile", 
+        which(exp == exps), ".csv"), quote = FALSE, row.names = FALSE)
+    
+    ## You can later read them in R using (the files are gzip-compressed right below):
+    # read.csv("supplementaryFile1.csv.gz")
+    # read.csv("supplementaryFile2.csv.gz")
+    
+    ## Compress with the system gzip, which by default replaces the .csv with a .csv.gz
+    system(paste0('gzip supplementaryFile', which(exp == exps), '.csv'))
+}
+
+## Check that the rows match: prints one TRUE/FALSE per experiment
+unlist(check)
+
## snyder  hippo 
+##   TRUE   TRUE
+

CSV files correspond to experiments snyder, hippo from runs run3-v1.0.10, run3-v1.0.10 respectively.

+
+
+

1.2 Venn diagrams

+

The following Venn diagrams show how many candidate differentially expressed regions (DERs) overlap with exons, introns, or intergenic regions of the UCSC hg19 knownGene annotation. By default, a minimum overlap of 20 base pairs is required to say that a candidate DER overlaps any of these features. The annotation overlap was done using the mergeResults() function from derfinder (Collado-Torres, Frazee, Love, Irizarry, et al., 2015) while the Venn diagrams were made using derfinderPlot (Collado-Torres, Jaffe, and Leek, 2015).

+

A maximum of three Venn diagrams are shown per data set. The first one uses all the candidate DERs, the second one uses only the candidate DERs that had a significant q-value (by default less than 0.10), and the third uses only the candidate DERs that had a FWER adjusted p-value less than 0.05.

+
for(exp in exps) { ## Draw up to three Venn diagrams for each experiment
+    ## Using all candidate DERs
+    vennRegions(anno[[exp]], main=paste("\n", exp, "using UCSC.hg19.knownGene"),
+        counts.col="blue")
+    
+    ## Using candidate DERs with a significant q-value; skipped when none qualify (flag is compared to the string "TRUE" -- presumably stored as factor/character, TODO confirm)
+    if(sum(regions[[exp]]$significantQval == "TRUE") > 0)
+    vennRegions(anno[[exp]], regions[[exp]]$significantQval == "TRUE", 
+        main=paste("\n\n", exp, "using UCSC.hg19.knownGene\nRestricted to significant q-value candidate DERs"),
+        counts.col = "blue")
+        
+    
+    ## Using candidate DERs with a significant FWER adjusted p-value; skipped when none qualify
+    if(sum(regions[[exp]]$significantFWER == "TRUE") > 0)
+    vennRegions(anno[[exp]], regions[[exp]]$significantFWER == "TRUE",
+        main=paste("\n\n", exp, "using UCSC.hg19.knownGene\nRestricted to significant FWER adjusted p-value candidate DERs"),
+        counts.col="blue")
+}
+

+
+
+
+

2 Details

+

The supplementary files 1 and 2 contain the information for the candidate DERs found in CSV format. The CSV files contain the following columns:

+
    +
  1. seqnames The chromosome name.
  2. +
  3. start The position of the chromosome where the candidate DER begins.
  4. +
  5. end The position of the chromosome where the candidate DER ends.
  6. +
  7. width The width of the candidate DER.
  8. +
  9. strand The strand of the candidate DER.
  10. +
  11. value The mean of the F-statistics for the candidate DER.
  12. +
  13. area The sum of the F-statistics for the candidate DER.
  14. +
  15. indexStart Among the bases from the chromosome that passed the filtering step, the position where the candidate DER begins.
  16. +
  17. indexEnd Among the bases from the chromosome that passed the filtering step, the position where the candidate DER ends.
  18. +
  19. cluster The cluster number to which the candidate DER belongs. Clusters were defined by chromosome, and two candidate DERs belong to the same cluster if they are less than 3000 bp apart.
  20. +
  21. clusterL The length of the cluster to which the candidate DER belongs.
  22. +
  23. meanCoverage The mean coverage among all samples for the candidate DER.
  24. +
  25. meanG The mean coverage among the samples of group G for the candidate DER.
  26. +
  27. log2FoldChangeG1.vsG2 The log2 fold change between the samples in G1 and the samples in G2 for the candidate DER.
  28. +
  29. pvalues The p-value for the candidate DER. It is calculated empirically using the null candidate DERs (obtained via permutations) from all chromosomes.
  30. +
  31. significant Whether the p-value is less than 0.05.
  32. +
  33. qvalues The p-value adjusted to control the FDR by using the qvalue (Storey, Bass, Dabney, and Robinson, 2015) function from the package with the same name.
  34. +
  35. significantQval Whether the q-value is less than 0.10.
  36. +
  37. name This and the following fields are computed using annotateNearest() from the bumphunter package. They were calculated using the UCSC hg19 annotation. name refers to the nearest gene.
  38. +
  39. annotation RefSeq ID. Taken from the help page of bumphunter::annotateNearest().
  40. +
  41. description A factor with levels upstream, promoter, overlaps 5’, inside intron, inside exon, covers exon(s), overlaps exon upstream, overlaps exon downstream, overlaps two exons, overlaps 3’, close to 3’, downstream, covers. Taken from the help page of bumphunter::annotateNearest().
  42. +
  43. region A factor with levels upstream, promoter, overlaps 5’, inside, overlaps 3’, close to 3’, downstream, covers. Taken from the help page of bumphunter::annotateNearest().
  44. +
  45. subregion A factor with levels inside intron, inside exon, covers exon(s), overlaps exon upstream, overlaps exon downstream, overlaps two exons. Taken from the help page of bumphunter::annotateNearest().
  46. +
  47. insidedistance Distance past 5 prime end of gene. Taken from the help page of bumphunter::annotateNearest().
  48. +
  49. exonnumber Which exon. Taken from the help page of bumphunter::annotateNearest().
  50. +
  51. nexons Number of exons. Taken from the help page of bumphunter::annotateNearest().
  52. +
  53. UTR A factor with levels inside transcription region, 5’ UTR, overlaps 5’ UTR, 3’ UTR, overlaps 3’ UTR, covers transcription region. Taken from the help page of bumphunter::annotateNearest().
  54. +
  55. annoStrand + or -. Taken from the help page of bumphunter::annotateNearest().
  56. +
  57. geneL The gene length. Taken from the help page of bumphunter::annotateNearest().
  58. +
  59. codingL The coding length. Taken from the help page of bumphunter::annotateNearest().
  60. +
  61. fwer The FWER adjusted p-value for the region.
  62. +
  63. significantFWER Whether the FWER adjusted p-value is less than 0.05.
  64. +
+
+
+

3 Reproducibility

+

Date the report was generated.

+
## [1] "2016-02-20 10:27:43 EST"
+

Wallclock time spent generating the report.

+
## Time difference of 21.416 secs
+

R session information.

+
## Session info -----------------------------------------------------------------------------------------------------------
+
##  setting  value                       
+##  version  R version 3.2.2 (2015-08-14)
+##  system   x86_64, darwin13.4.0        
+##  ui       X11                         
+##  language (EN)                        
+##  collate  en_US.UTF-8                 
+##  tz       America/New_York            
+##  date     2016-02-20
+
## Packages ---------------------------------------------------------------------------------------------------------------
+
##  package              * version  date       source        
+##  acepack                1.3-3.3  2014-11-24 CRAN (R 3.2.0)
+##  AnnotationDbi          1.32.3   2015-12-24 Bioconductor  
+##  bibtex                 0.4.0    2014-12-31 CRAN (R 3.2.0)
+##  Biobase                2.30.0   2015-10-14 Bioconductor  
+##  BiocGenerics         * 0.16.1   2015-11-06 Bioconductor  
+##  BiocInstaller          1.20.1   2015-11-18 Bioconductor  
+##  BiocParallel           1.4.3    2015-12-16 Bioconductor  
+##  BiocStyle            * 1.8.0    2015-10-14 Bioconductor  
+##  biomaRt                2.26.1   2015-11-23 Bioconductor  
+##  Biostrings             2.38.3   2016-01-02 Bioconductor  
+##  biovizBase             1.18.0   2015-10-14 Bioconductor  
+##  bitops                 1.0-6    2013-08-17 CRAN (R 3.2.0)
+##  BSgenome               1.38.0   2015-10-14 Bioconductor  
+##  bumphunter             1.10.0   2015-10-14 Bioconductor  
+##  Cairo                  1.5-9    2015-09-26 CRAN (R 3.2.0)
+##  cluster                2.0.3    2015-07-21 CRAN (R 3.2.2)
+##  codetools              0.2-14   2015-07-15 CRAN (R 3.2.2)
+##  colorspace             1.2-6    2015-03-11 CRAN (R 3.2.0)
+##  DBI                    0.3.1    2014-09-24 CRAN (R 3.2.0)
+##  derfinder            * 1.5.19   2015-12-15 Bioconductor  
+##  derfinderHelper        1.4.1    2015-11-03 Bioconductor  
+##  derfinderPlot        * 1.4.1    2015-11-03 Bioconductor  
+##  devtools               1.10.0   2016-01-23 CRAN (R 3.2.3)
+##  dichromat              2.0-0    2013-01-24 CRAN (R 3.2.0)
+##  digest                 0.6.9    2016-01-08 CRAN (R 3.2.3)
+##  doRNG                  1.6      2014-03-07 CRAN (R 3.2.0)
+##  evaluate               0.8      2015-09-18 CRAN (R 3.2.0)
+##  foreach                1.4.3    2015-10-13 CRAN (R 3.2.0)
+##  foreign                0.8-66   2015-08-19 CRAN (R 3.2.0)
+##  formatR                1.2.1    2015-09-18 CRAN (R 3.2.0)
+##  Formula                1.2-1    2015-04-07 CRAN (R 3.2.0)
+##  futile.logger          1.4.1    2015-04-20 CRAN (R 3.2.0)
+##  futile.options         1.0.0    2010-04-06 CRAN (R 3.2.0)
+##  GenomeInfoDb         * 1.6.3    2016-01-26 Bioconductor  
+##  GenomicAlignments      1.6.3    2016-01-06 Bioconductor  
+##  GenomicFeatures        1.22.12  2016-01-28 Bioconductor  
+##  GenomicFiles           1.6.2    2015-12-30 Bioconductor  
+##  GenomicRanges        * 1.22.4   2016-01-30 Bioconductor  
+##  GGally                 1.0.1    2016-01-14 CRAN (R 3.2.3)
+##  ggbio                  1.18.3   2016-01-13 Bioconductor  
+##  ggplot2                2.0.0    2015-12-18 CRAN (R 3.2.3)
+##  graph                  1.48.0   2015-10-14 Bioconductor  
+##  gridExtra              2.0.0    2015-07-14 CRAN (R 3.2.0)
+##  gtable                 0.1.2    2012-12-05 CRAN (R 3.2.0)
+##  Hmisc                  3.17-1   2015-12-18 CRAN (R 3.2.3)
+##  htmltools              0.3      2015-12-29 CRAN (R 3.2.3)
+##  httr                   1.1.0    2016-01-28 CRAN (R 3.2.3)
+##  IRanges              * 2.4.6    2015-12-12 Bioconductor  
+##  iterators              1.0.8    2015-10-13 CRAN (R 3.2.0)
+##  knitcitations        * 1.0.7    2015-10-28 CRAN (R 3.2.0)
+##  knitr                  1.12.3   2016-01-22 CRAN (R 3.2.3)
+##  lambda.r               1.1.7    2015-03-20 CRAN (R 3.2.0)
+##  lattice                0.20-33  2015-07-14 CRAN (R 3.2.2)
+##  latticeExtra           0.6-26   2013-08-15 CRAN (R 3.2.0)
+##  limma                  3.26.7   2016-01-28 Bioconductor  
+##  locfit                 1.5-9.1  2013-04-20 CRAN (R 3.2.0)
+##  lubridate              1.5.0    2015-12-03 CRAN (R 3.2.3)
+##  magrittr               1.5      2014-11-22 CRAN (R 3.2.0)
+##  Matrix                 1.2-3    2015-11-28 CRAN (R 3.2.2)
+##  matrixStats            0.50.1   2015-12-15 CRAN (R 3.2.3)
+##  memoise                1.0.0    2016-01-29 CRAN (R 3.2.3)
+##  munsell                0.4.2    2013-07-11 CRAN (R 3.2.0)
+##  nnet                   7.3-11   2015-08-30 CRAN (R 3.2.0)
+##  OrganismDbi            1.12.1   2015-12-23 Bioconductor  
+##  pkgmaker               0.22     2014-05-14 CRAN (R 3.2.0)
+##  plyr                   1.8.3    2015-06-12 CRAN (R 3.2.1)
+##  qvalue                 2.2.2    2016-01-08 Bioconductor  
+##  R6                     2.1.2    2016-01-26 CRAN (R 3.2.3)
+##  RBGL                   1.46.0   2015-10-14 Bioconductor  
+##  RColorBrewer           1.1-2    2014-12-07 CRAN (R 3.2.0)
+##  Rcpp                   0.12.3   2016-01-10 CRAN (R 3.2.3)
+##  RCurl                  1.95-4.7 2015-06-30 CRAN (R 3.2.1)
+##  RefManageR             0.10.5   2016-01-02 CRAN (R 3.2.3)
+##  registry               0.3      2015-07-08 CRAN (R 3.2.1)
+##  reshape                0.8.5    2014-04-23 CRAN (R 3.2.0)
+##  reshape2               1.4.1    2014-12-06 CRAN (R 3.2.0)
+##  RJSONIO                1.3-0    2014-07-28 CRAN (R 3.2.0)
+##  rmarkdown            * 0.9.2    2016-01-01 CRAN (R 3.2.3)
+##  rngtools               1.2.4    2014-03-06 CRAN (R 3.2.0)
+##  rpart                  4.1-10   2015-06-29 CRAN (R 3.2.2)
+##  Rsamtools              1.22.0   2015-10-14 Bioconductor  
+##  RSQLite                1.0.0    2014-10-25 CRAN (R 3.2.0)
+##  rtracklayer            1.30.1   2015-10-22 Bioconductor  
+##  S4Vectors            * 0.8.11   2016-01-29 Bioconductor  
+##  scales                 0.3.0    2015-08-25 CRAN (R 3.2.0)
+##  stringi                1.0-1    2015-10-22 CRAN (R 3.2.0)
+##  stringr                1.0.0    2015-04-30 CRAN (R 3.2.0)
+##  SummarizedExperiment   1.0.2    2016-01-01 Bioconductor  
+##  survival               2.38-3   2015-07-02 CRAN (R 3.2.2)
+##  VariantAnnotation      1.16.4   2015-12-09 Bioconductor  
+##  XML                    3.98-1.3 2015-06-30 CRAN (R 3.2.0)
+##  xtable                 1.8-0    2015-11-02 CRAN (R 3.2.0)
+##  XVector                0.10.0   2015-10-14 Bioconductor  
+##  yaml                   2.1.13   2014-06-12 CRAN (R 3.2.0)
+##  zlibbioc               1.16.0   2015-10-14 Bioconductor
+
+
+

4 Bibliography

+

This report was generated using BiocStyle (Morgan, Oleś, and Huber, 2016) with knitr (Xie, 2014) and rmarkdown (Allaire, Cheng, Xie, McPherson, et al., 2016) running behind the scenes.

+

Citations made with knitcitations (Boettiger, 2015). Citation file: venn.bib.

+

+[1] J. Allaire, J. Cheng, Y. Xie, J. McPherson, et al. rmarkdown: Dynamic Documents for R. R package version 0.9.2. 2016. URL: http://CRAN.R-project.org/package=rmarkdown. +

+

+[2] J. D. Storey, with contributions from A. J. Bass, A. Dabney and D. Robinson. qvalue: Q-value estimation for false discovery rate control. R package version 2.2.2. 2015. URL: http://github.com/jdstorey/qvalue. +

+

+[3] C. Boettiger. knitcitations: Citations for ‘Knitr’ Markdown Files. R package version 1.0.7. 2015. URL: http://CRAN.R-project.org/package=knitcitations. +

+

+[4] L. Collado-Torres, A. C. Frazee, M. I. Love, R. A. Irizarry, et al. “derfinder: Software for annotation-agnostic RNA-seq differential expression analysis”. In: bioRxiv (2015). DOI: 10.1101/015370. URL: http://www.biorxiv.org/content/early/2015/02/19/015370.abstract. +

+

+[5] L. Collado-Torres, A. E. Jaffe and J. T. Leek. derfinderPlot: Plotting functions for derfinder. https://github.com/leekgroup/derfinderPlot - R package version 1.4.1. 2015. URL: http://www.bioconductor.org/packages/release/bioc/html/derfinderPlot.html. +

+

+[6] M. Morgan, A. Oleś and W. Huber. BiocStyle: Standard styles for vignettes and other Bioconductor documents. R package version 1.8.0. 2016. URL: https://github.com/Bioconductor/BiocStyle. +

+

+[7] Y. Xie. “knitr: A Comprehensive Tool for Reproducible Research in R”. In: Implementing Reproducible Computational Research. Ed. by V. Stodden, F. Leisch and R. D. Peng. ISBN 978-1466561595. Chapman and Hall/CRC, 2014. URL: http://www.crcpress.com/product/isbn/9781466561595. +

+
+ + + + + + + +