Skip to content

Commit

Permalink
Completed migration of Hippo and Snyder scripts
Browse files Browse the repository at this point in the history
Moved files from https://github.com/leekgroup/derSupplement
Updated timing report (less plots, better plots)
Updated venn report to show information for both Snyder and Hippo data sets
Updated index to reflect the files this repo describes
  • Loading branch information
lcolladotor committed Feb 20, 2016
1 parent 5d3caa0 commit 1342860
Show file tree
Hide file tree
Showing 49 changed files with 16,241 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
GenomicState.Hsapiens.UCSC.hg19.knownGene.rda
figure-hippo-reanalysis
Empty file added .nojekyll
Empty file.
54 changes: 54 additions & 0 deletions additional-analyses/feature_counts.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
###
# qsub -V -pe local 24 -l jabba,mf=80G,h_vmem=16G,h_stack=256M -cwd -b y R CMD BATCH --no-save feature_counts.R
source("/home/epi/ajaffe/Lieber/lieber_functions_aj.R")

library(Rsubread)

## stem
xx=load("/home/epi/ajaffe/Lieber/Projects/RNAseq/UCSD_samples/UCSD_stemcell_pheno.rda")
pdStem = pd
pdStem$bamFile = paste0("/dcs01/ajaffe/UCSC_Epigenome/RNAseq/TopHat/",
pdStem$sample, "_out/accepted_hits.bam")

## hippo
load("/home/epi/ajaffe/Lieber/Projects/RNAseq/HippoPublic/sra_phenotype_file.rda")
pdHippo = sra
pdHippo = pdHippo[-grep("ED", pdHippo$SampleID),]

## snyder
bam = list.files("/dcs01/ajaffe/Snyder/RNAseq/TopHat",
pattern="accepted_hits.bam$", recur=TRUE,full.names=TRUE)
names(bam) = list.files("/dcs01/ajaffe/Snyder/RNAseq/TopHat",
pattern="accepted_hits.bam$", recur=TRUE)

pheno = data.frame(sampleName = c(pdStem$run, pdHippo$SampleID, ss(names(bam), "_")),
Study = c(rep("Stem", nrow(pdStem)), rep("Hippo", nrow(pdHippo)),
rep("Snyder", length(bam))),
bamFile = c(pdStem$bamFile, pdHippo$bamFile, bam),
stringsAsFactors=FALSE)
libSize = getTotalMapped(pheno$bamFile,mc.cores=12)
pheno$totalMapped = libSize$totalMapped
pheno$mitoMapped = libSize$mitoMapped

## count genes
geneCounts = featureCounts(pheno$bamFile, annot.inbuilt="hg19",
useMetaFeatures = TRUE, nthreads=24)
exonCounts = featureCounts(pheno$bamFile, annot.inbuilt="hg19",
useMetaFeatures = FALSE,nthreads=24)
save(geneCounts,exonCounts, pheno, file="featureCounts_output.rda")

#####
load("featureCounts_output.rda")

pheno$geneAlign = as.numeric(geneCounts$stat[1,-1] / colSums(geneCounts$stat[,-1]))
pheno$geneAmbig = as.numeric(geneCounts$stat[2,-1] / colSums(geneCounts$stat[,-1]))
pheno$exonAlign = as.numeric(exonCounts$stat[1,-1] / colSums(exonCounts$stat[,-1]))
pheno$exonAmbig = as.numeric(exonCounts$stat[2,-1] / colSums(exonCounts$stat[,-1]))

signif(tapply(pheno$geneAlign, pheno$Study, mean),3)
signif(tapply(pheno$geneAmbig, pheno$Study, mean),3)
signif(tapply(pheno$exonAlign, pheno$Study, mean),3)
signif(tapply(pheno$exonAmbig, pheno$Study, mean),3)

### mito map
signif(tapply(pheno$mitoMapped/(pheno$mitoMapped+pheno$totalMapped), pheno$Study, mean),3)
92 changes: 92 additions & 0 deletions check-analysis-time.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
library('ggplot2')
library('getopt')

## Specify parameters
spec <- matrix(c(
'experiment', 'e', 1, 'character', 'Experiment',
'run', 'r', 1, 'character', 'run name',
'help' , 'h', 0, 'logical', 'Display help'
), byrow=TRUE, ncol=5)
opt <- getopt(spec)

## if help was asked for print a friendly message
## and exit with a non-zero error code
if (!is.null(opt$help)) {
cat(getopt(spec, usage=TRUE))
q(status=1)
}

## Check experiment input
stopifnot(opt$experiment %in% c('snyder', 'hippo'))

chrs <- paste0('chr', c(1:22, 'X', 'Y'))
study <- opt$experiment
run <- opt$run

timediff <- lapply(chrs, function(chr) {
info <- tryCatch(system(paste0('grep permutation *', study, '*', run, '*', chr, '.e*'), intern = TRUE), warning = function(w) { 'no data'})
if(info[1] == 'no data') {
info <- tryCatch(system(paste0('grep permutation ', file.path(study, 'derAnalysis', run, chr, 'logs'), '/*', chr, '.e*'), intern = TRUE), warning = function(w) { 'no data'})
}
if(info[1] == 'no data') return(NULL)

time <- strptime(gsub('([[:space:]]*calculate.*$)', '', info),
format = '%Y-%m-%d %H:%M:%S')

idx <- seq_len(length(info) - 1)
difftime(time[idx + 1], time[idx], units = 'mins')
})
names(timediff) <- chrs


## Organize time information
chrnum <- gsub('chr', '', chrs)
df <- data.frame(chr = factor(chrnum, levels = chrnum), mean = sapply(timediff, mean), sd = sapply(timediff, sd))

## Group by number of rounds per permutation given the number of chunks & cores used
if(!file.exists(file.path(study, 'derAnalysis', run, 'nChunks.Rdata'))) {
nChunks <- sapply(chrs, function(chr) {
if(!file.exists(file.path(study, 'derAnalysis', run, chr, 'coveragePrep.Rdata')))
return(NA)
load(file.path(study, 'derAnalysis', run, chr, 'coveragePrep.Rdata'))
max(prep$mclapply)
})
save(nChunks, file = file.path(study, 'derAnalysis', run, 'nChunks.Rdata'))
} else {
load(file.path(study, 'derAnalysis', run, 'nChunks.Rdata'))
}

if (study == 'brainspan') {
nCores <- c(40, 32, 27, rep(20, 15), 29, rep(20, 4), 2)
} else if (study == 'snyder') {
nCores <- rep(4, 24)
} else if (study == 'hippo') {
nCores <- rep(2, 24)
}
names(nCores) <- chrs

df$n <- sapply(timediff, length)
df$se <- df$sd / sqrt(df$n)
df$nChunks <- nChunks
df$nCores <- nCores
df$nRound <- factor(ceiling(nChunks / nCores))


## Print info
rownames(df) <- NULL
print(df)


## Make plot
pdf(file.path(study, 'derAnalysis', run, paste0('permuteTime-', study, '-', run, '.pdf')))
ggplot(df, aes(x = chr, y = mean, color = nRound)) + geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.1) + geom_line() + geom_point() + ylab('Time per permutation (minutes)\nMean +- SE') + xlab('Chromosome') + ggtitle(paste('Time info for', study, run)) + scale_y_continuous(breaks=seq(0, ceiling(max(df$mean + df$se, na.rm = TRUE)), 1))
dev.off()

print('Expected total number of days per chr and days remaining')
days <- data.frame(chr = chrnum, total = round(df$mean * 1001 / 60 / 24, 1), remaining = round(df$mean * (1001 - df$n - 2 ) / 60 / 24, 1))
rownames(days) <- NULL
print(days)




17 changes: 17 additions & 0 deletions gff/createGFF.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Setup
library("GenomicRanges")
library("rtracklayer")

load('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.ensembl.GRCh37.p11.rda')
load('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')

makeGFF <- function(exonicParts, file) {
message(paste(Sys.time(), "makeGFF: Saving", file))
export.gff2(exonicParts, file)
}

makeGFF(GenomicState.Hsapiens.ensembl.GRCh37.p11$fullGenome[ GenomicState.Hsapiens.ensembl.GRCh37.p11$fullGenome$theRegion == "exon"], "GenomicState.Hsapiens.ensembl.GRCh37.p11.exons.gff")
makeGFF(GenomicState.Hsapiens.UCSC.hg19.knownGene$fullGenome[ GenomicState.Hsapiens.UCSC.hg19.knownGene$fullGenome$theRegion == "exon"], "GenomicState.Hsapiens.UCSC.hg19.knownGene.exons.gff")

proc.time()
sessionInfo()
16 changes: 16 additions & 0 deletions gff/runGFF.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash

#$ -cwd
#$ -l jabba,mem_free=10G,h_vmem=50G,h_fsize=40G
#$ -N exonsGFF
#$ -m e

echo "**** Job starts ****"
date

## Create GFF files
Rscript createGFF.R

### Done
echo "**** Job ends ****"
date
2 changes: 2 additions & 0 deletions hippo/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
derAnalysis
coverageToExon
3 changes: 3 additions & 0 deletions hippo/counts-gene/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
logs
*Rdata

30 changes: 30 additions & 0 deletions hippo/counts-gene/counts-gene.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
## Setup
library('TxDb.Hsapiens.UCSC.hg19.knownGene')
library('derfinder')
library('Rsamtools')
library('GenomicAlignments')
library('parallel')
options(mc.cores=24)

## Exons by gene
ex <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, by = 'gene')

## Make bamFileList
files <- rawFiles(datadir='/dcs01/ajaffe/Hippo/TopHat', samplepatt="out$",
fileterm="accepted_hits.bam")
names(files) <- gsub('_out', '', names(files))
bai <- paste0(files, ".bai")
bList <- BamFileList(files, bai)

## Compute the overlaps
message(paste(Sys.time(), "summarizeOverlaps: Running summarizeOverlaps()"))
summOv <- summarizeOverlaps(ex, bList, mode="Union",
singleEnd=TRUE, ignore.strand=TRUE)

## Finish
message(paste(Sys.time(), "summarizeOverlaps: Saving summOverlaps"))
save(summOv, file="summOv.Rdata")

proc.time()
options(width = 120)
devtools::session_info()
22 changes: 22 additions & 0 deletions hippo/counts-gene/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
#$ -cwd
#$ -m e
#$ -l mem_free=3G,h_vmem=15G,h_fsize=30G
#$ -pe local 24
#$ -N summOv-hippo-rerun

echo "**** Job starts ****"
date

mkdir -p /dcs01/ajaffe/Brain/derRuns/derSoftware/hippo/counts-gene/logs

## Summarize overlaps
module load R/3.2.x
Rscript counts-gene.R

# Move log files into the logs directory
mv /dcs01/ajaffe/Brain/derRuns/derSoftware/hippo/counts-gene/summOv-hippo-rerun.* /dcs01/ajaffe/Brain/derRuns/derSoftware/hippo/counts-gene/logs/

### Done
echo "**** Job ends ****"
date
5 changes: 5 additions & 0 deletions hippo/pnas/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
*.e*
*.o*
*Rdata
*pdf

Loading

0 comments on commit 1342860

Please sign in to comment.