Completed migration of Hippo and Snyder scripts

Moved files from https://github.com/leekgroup/derSupplement Updated timing report (less plots, better plots) Updated venn report to show information for both Snyder and Hippo data sets Updated index to reflect the files this repo describes
leekgroup · Feb 20, 2016 · 1342860 · 1342860
1 parent 5d3caa0
commit 1342860
Show file tree

Hide file tree

Showing 49 changed files with 16,241 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+GenomicState.Hsapiens.UCSC.hg19.knownGene.rda
+figure-hippo-reanalysis
diff --git a/.nojekyll b/.nojekyll
diff --git a/additional-analyses/feature_counts.R b/additional-analyses/feature_counts.R
@@ -0,0 +1,54 @@
+###
+# qsub -V -pe local 24 -l jabba,mf=80G,h_vmem=16G,h_stack=256M -cwd -b y R CMD BATCH --no-save feature_counts.R
+source("/home/epi/ajaffe/Lieber/lieber_functions_aj.R") 
+
+library(Rsubread)
+
+##  stem
+xx=load("/home/epi/ajaffe/Lieber/Projects/RNAseq/UCSD_samples/UCSD_stemcell_pheno.rda")
+pdStem = pd
+pdStem$bamFile = paste0("/dcs01/ajaffe/UCSC_Epigenome/RNAseq/TopHat/",	
+	pdStem$sample, "_out/accepted_hits.bam")
+
+## hippo
+load("/home/epi/ajaffe/Lieber/Projects/RNAseq/HippoPublic/sra_phenotype_file.rda")
+pdHippo = sra
+pdHippo = pdHippo[-grep("ED", pdHippo$SampleID),]
+
+## snyder
+bam = list.files("/dcs01/ajaffe/Snyder/RNAseq/TopHat",
+		pattern="accepted_hits.bam$", recur=TRUE,full.names=TRUE)
+names(bam) = list.files("/dcs01/ajaffe/Snyder/RNAseq/TopHat",
+		pattern="accepted_hits.bam$", recur=TRUE)
+
+pheno  = data.frame(sampleName = c(pdStem$run, pdHippo$SampleID, ss(names(bam), "_")), 
+	Study = c(rep("Stem", nrow(pdStem)), rep("Hippo", nrow(pdHippo)),
+		rep("Snyder", length(bam))),
+	bamFile = c(pdStem$bamFile, pdHippo$bamFile, bam),
+	stringsAsFactors=FALSE)
+libSize = getTotalMapped(pheno$bamFile,mc.cores=12)
+pheno$totalMapped = libSize$totalMapped
+pheno$mitoMapped = libSize$mitoMapped
+
+## count genes
+geneCounts = featureCounts(pheno$bamFile, annot.inbuilt="hg19",
+	useMetaFeatures = TRUE,	nthreads=24)
+exonCounts = featureCounts(pheno$bamFile, annot.inbuilt="hg19", 
+	useMetaFeatures = FALSE,nthreads=24)
+save(geneCounts,exonCounts, pheno, file="featureCounts_output.rda")
+
+#####
+load("featureCounts_output.rda")
+
+pheno$geneAlign = as.numeric(geneCounts$stat[1,-1] / colSums(geneCounts$stat[,-1]))
+pheno$geneAmbig = as.numeric(geneCounts$stat[2,-1] / colSums(geneCounts$stat[,-1]))
+pheno$exonAlign = as.numeric(exonCounts$stat[1,-1] / colSums(exonCounts$stat[,-1]))
+pheno$exonAmbig = as.numeric(exonCounts$stat[2,-1] / colSums(exonCounts$stat[,-1]))
+
+signif(tapply(pheno$geneAlign, pheno$Study, mean),3)
+signif(tapply(pheno$geneAmbig, pheno$Study, mean),3)
+signif(tapply(pheno$exonAlign, pheno$Study, mean),3)
+signif(tapply(pheno$exonAmbig, pheno$Study, mean),3)
+
+### mito map
+signif(tapply(pheno$mitoMapped/(pheno$mitoMapped+pheno$totalMapped), pheno$Study, mean),3)
diff --git a/check-analysis-time.R b/check-analysis-time.R
@@ -0,0 +1,92 @@
+library('ggplot2')
+library('getopt')
+
+## Specify parameters
+spec <- matrix(c(
+	'experiment', 'e', 1, 'character', 'Experiment',
+	'run', 'r', 1, 'character', 'run name',
+	'help' , 'h', 0, 'logical', 'Display help'
+), byrow=TRUE, ncol=5)
+opt <- getopt(spec)
+
+## if help was asked for print a friendly message
+## and exit with a non-zero error code
+if (!is.null(opt$help)) {
+	cat(getopt(spec, usage=TRUE))
+	q(status=1)
+}
+
+## Check experiment input
+stopifnot(opt$experiment %in% c('snyder', 'hippo'))
+
+chrs <- paste0('chr', c(1:22, 'X', 'Y'))
+study <- opt$experiment
+run <- opt$run
+
+timediff <- lapply(chrs, function(chr) {
+    info <- tryCatch(system(paste0('grep permutation *', study, '*', run, '*', chr, '.e*'), intern = TRUE), warning = function(w) { 'no data'})
+    if(info[1] == 'no data') {
+        info <- tryCatch(system(paste0('grep permutation ', file.path(study, 'derAnalysis', run, chr, 'logs'), '/*', chr, '.e*'), intern = TRUE), warning = function(w) { 'no data'})
+    }
+    if(info[1] == 'no data') return(NULL)
+
+    time <- strptime(gsub('([[:space:]]*calculate.*$)', '', info),
+        format = '%Y-%m-%d %H:%M:%S')
+
+    idx <- seq_len(length(info) - 1)
+    difftime(time[idx + 1], time[idx], units = 'mins')
+})
+names(timediff) <- chrs
+
+
+## Organize time information
+chrnum <- gsub('chr', '', chrs)
+df <- data.frame(chr = factor(chrnum, levels = chrnum), mean = sapply(timediff, mean), sd = sapply(timediff, sd))
+
+## Group by number of rounds per permutation given the number of chunks & cores used
+if(!file.exists(file.path(study, 'derAnalysis', run, 'nChunks.Rdata'))) {
+    nChunks <- sapply(chrs, function(chr) { 
+        if(!file.exists(file.path(study, 'derAnalysis', run, chr, 'coveragePrep.Rdata')))
+            return(NA)
+        load(file.path(study, 'derAnalysis', run, chr, 'coveragePrep.Rdata'))
+        max(prep$mclapply) 
+    })
+    save(nChunks, file = file.path(study, 'derAnalysis', run, 'nChunks.Rdata'))
+} else {
+    load(file.path(study, 'derAnalysis', run, 'nChunks.Rdata'))
+}
+
+if (study == 'brainspan') {
+    nCores <- c(40, 32, 27, rep(20, 15), 29, rep(20, 4), 2)
+} else if (study == 'snyder') {
+    nCores <- rep(4, 24)
+} else if (study == 'hippo') {
+    nCores <- rep(2, 24)
+}
+names(nCores) <- chrs
+
+df$n <- sapply(timediff, length)
+df$se <- df$sd / sqrt(df$n)
+df$nChunks <- nChunks
+df$nCores <- nCores
+df$nRound <- factor(ceiling(nChunks / nCores))
+
+
+## Print info
+rownames(df) <- NULL
+print(df)
+
+
+## Make plot
+pdf(file.path(study, 'derAnalysis', run, paste0('permuteTime-', study, '-', run, '.pdf')))
+ggplot(df, aes(x = chr, y = mean, color = nRound)) + geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.1) + geom_line() + geom_point() + ylab('Time per permutation (minutes)\nMean +- SE') + xlab('Chromosome') + ggtitle(paste('Time info for', study, run)) + scale_y_continuous(breaks=seq(0, ceiling(max(df$mean + df$se, na.rm = TRUE)), 1))
+dev.off()
+
+print('Expected total number of days per chr and days remaining')
+days <- data.frame(chr = chrnum, total = round(df$mean * 1001 / 60 / 24, 1), remaining = round(df$mean * (1001 - df$n - 2 ) / 60 / 24, 1))
+rownames(days) <- NULL
+print(days)
+
+
+
+
diff --git a/gff/createGFF.R b/gff/createGFF.R
@@ -0,0 +1,17 @@
+# Setup
+library("GenomicRanges")
+library("rtracklayer")
+
+load('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.ensembl.GRCh37.p11.rda')
+load('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')
+
+makeGFF <- function(exonicParts, file) {
+	message(paste(Sys.time(), "makeGFF: Saving", file))
+	export.gff2(exonicParts, file)
+}
+
+makeGFF(GenomicState.Hsapiens.ensembl.GRCh37.p11$fullGenome[ GenomicState.Hsapiens.ensembl.GRCh37.p11$fullGenome$theRegion == "exon"], "GenomicState.Hsapiens.ensembl.GRCh37.p11.exons.gff")
+makeGFF(GenomicState.Hsapiens.UCSC.hg19.knownGene$fullGenome[ GenomicState.Hsapiens.UCSC.hg19.knownGene$fullGenome$theRegion == "exon"], "GenomicState.Hsapiens.UCSC.hg19.knownGene.exons.gff")
+
+proc.time()
+sessionInfo()
diff --git a/gff/runGFF.sh b/gff/runGFF.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+#$ -cwd
+#$ -l jabba,mem_free=10G,h_vmem=50G,h_fsize=40G 
+#$ -N exonsGFF
+#$ -m e
+
+echo "**** Job starts ****"
+date
+
+## Create GFF files
+Rscript createGFF.R
+
+### Done
+echo "**** Job ends ****"
+date
diff --git a/hippo/.gitignore b/hippo/.gitignore
@@ -0,0 +1,2 @@
+derAnalysis
+coverageToExon
diff --git a/hippo/counts-gene/.gitignore b/hippo/counts-gene/.gitignore
@@ -0,0 +1,3 @@
+logs
+*Rdata
+
diff --git a/hippo/counts-gene/counts-gene.R b/hippo/counts-gene/counts-gene.R
@@ -0,0 +1,30 @@
+## Setup
+library('TxDb.Hsapiens.UCSC.hg19.knownGene')
+library('derfinder')
+library('Rsamtools')
+library('GenomicAlignments')
+library('parallel')
+options(mc.cores=24)
+
+## Exons by gene
+ex <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, by = 'gene')
+
+## Make bamFileList
+files <- rawFiles(datadir='/dcs01/ajaffe/Hippo/TopHat', samplepatt="out$",
+    fileterm="accepted_hits.bam")
+names(files) <- gsub('_out', '', names(files))
+bai <- paste0(files, ".bai")
+bList <- BamFileList(files, bai)
+
+## Compute the overlaps
+message(paste(Sys.time(), "summarizeOverlaps: Running summarizeOverlaps()"))
+summOv <- summarizeOverlaps(ex, bList, mode="Union",
+    singleEnd=TRUE, ignore.strand=TRUE)
+
+## Finish
+message(paste(Sys.time(), "summarizeOverlaps: Saving summOverlaps"))
+save(summOv, file="summOv.Rdata")
+
+proc.time()
+options(width = 120)
+devtools::session_info()
diff --git a/hippo/counts-gene/run.sh b/hippo/counts-gene/run.sh
@@ -0,0 +1,22 @@
+#!/bin/bash	
+#$ -cwd
+#$ -m e
+#$ -l mem_free=3G,h_vmem=15G,h_fsize=30G
+#$ -pe local 24
+#$ -N summOv-hippo-rerun
+
+echo "**** Job starts ****"
+date
+
+mkdir -p /dcs01/ajaffe/Brain/derRuns/derSoftware/hippo/counts-gene/logs
+
+## Summarize overlaps
+module load R/3.2.x
+Rscript counts-gene.R
+
+# Move log files into the logs directory
+mv /dcs01/ajaffe/Brain/derRuns/derSoftware/hippo/counts-gene/summOv-hippo-rerun.* /dcs01/ajaffe/Brain/derRuns/derSoftware/hippo/counts-gene/logs/
+
+### Done
+echo "**** Job ends ****"
+date
diff --git a/hippo/pnas/.gitignore b/hippo/pnas/.gitignore
@@ -0,0 +1,5 @@
+*.e*
+*.o*
+*Rdata
+*pdf
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		GenomicState.Hsapiens.UCSC.hg19.knownGene.rda
		figure-hippo-reanalysis
-Original file line number
+Diff line change
@@ -0,0 +1,5 @@
+    *.e*
+    *.o*
+    *Rdata
+    *pdf