-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update processing methods for new datasets
- Loading branch information
Helen Miller
committed
Apr 6, 2021
1 parent
bf7f96e
commit 609ad52
Showing
73 changed files
with
286,996 additions
and
429 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
# Data is saved in subdirectories under DATA_DIR | ||
DATA_DIR=/fh/fast/gottardo_r/ytian_working/covid19_datasets | ||
# Location of reference dataset, downloaded from https://atlas.fredhutch.org/data/nygc/multimodal/pbmc_multimodal.h5seurat | ||
REFERENCE_DATASET=/fh/fast/gottardo_r/ytian_working/covid19_datasets/data/seurat/pbmc_multimodal.h5seurat | ||
REFERENCE_DATASET=/fh/scratch/delete10/gottardo_r/hmiller/pbmc_multimodal.h5seurat | ||
# Directory to save intermediate objects for debugging | ||
DEBUG_DIR=/fh/scratch/delete10/gottardo_r/hmiller |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
*Rproj* | ||
.DS_Store | ||
.Rhistory | ||
.RData | ||
.Ruserdata | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
renv::load("..") |
152 changes: 152 additions & 0 deletions
152
differential_expression/differential_expression_combined.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
# Get differentially expressed marker genes for L1, L2, and L3 cell types for the combined dataset | ||
|
||
message(getwd()) | ||
library(Seurat) | ||
library(SeuratDisk) | ||
library(SingleCellExperiment) | ||
library(HDF5Array) | ||
library(MAST) | ||
library(slurmR) | ||
|
||
debugDir <- file.path(Sys.getenv("DEBUG_DIR"), "covid_DE", "combined") | ||
h5seuratDir <- file.path(Sys.getenv("DATA_DIR"), "h5seurat_reprocessed") | ||
h5SCEDir <- file.path(h5seuratDir, "combined_sce") | ||
|
||
message(">>> debugDir: ", debugDir) | ||
message(">>> h5seuratDir: ", h5seuratDir) | ||
message(">>>h5SCEDir: ", h5SCEDir) | ||
|
||
# h5SCEDir <- "/fh/scratch/delete30/gottardo_r/hmiller/example_datasets/combined_sce" | ||
# h5seuratDir <- "/fh/scratch/delete30/gottardo_r/hmiller/example_datasets" | ||
|
||
message(">>> Getting cell types from ", file.path(h5seuratDir, "covid19_datasets.h5seurat")) | ||
seu <- Connect(file.path(h5seuratDir, "covid19_datasets.h5seurat")) | ||
celltypesL1 <- table(seu[["meta.data"]][["predicted.celltype.l1"]][]) | ||
celltypesL1 <- names(celltypesL1)[celltypesL1 >5] | ||
celltypesL2 <- table(seu[["meta.data"]][["predicted.celltype.l2"]][]) | ||
celltypesL2 <- names(celltypesL2)[celltypesL2 > 5] | ||
celltypesL3 <- table(seu[["meta.data"]][["predicted.celltype.l3"]][]) | ||
celltypesL3 <- names(celltypesL3)[celltypesL3 > 5] | ||
seu$close_all() | ||
|
||
slurmR_tmp_dir <- file.path(Sys.getenv("DEBUG_DIR"), "covid_DE", "slurmr") | ||
if (!dir.exists(slurmR_tmp_dir)) dir.create(slurmR_tmp_dir) | ||
|
||
message(">>> slurmR_tmp_dir: ", slurmR_tmp_dir) | ||
|
||
FindMarkersForCelltype <- function(celltype, | ||
inpath, | ||
label) { | ||
sce <- loadHDF5SummarizedExperiment(inpath, prefix = "combined_") | ||
|
||
cells.1 <- Cells(sce)[which(sce[[label]] == celltype)] | ||
cells.2 <- Cells(sce)[which(sce[[label]] != celltype)] | ||
fc.results <- Seurat::FoldChange(logcounts(sce), | ||
cells.1 = cells.1, | ||
cells.2 = cells.2, | ||
fc.name = "avg_log2FC", | ||
mean.fxn = function(x) { | ||
return(log(x = rowMeans(x = expm1(x = x)) + 1, base = 2)) | ||
}) | ||
markers <- Seurat::FindMarkers(logcounts(sce), | ||
cells.1 = cells.1, | ||
cells.2 = cells.2, | ||
only.pos = TRUE, | ||
min.pct = 0.25, | ||
logfc.threshold = 0.25, | ||
test.use = "MAST", | ||
fc.results = fc.results) | ||
markers$gene <- rownames(markers) | ||
markers$cluster <- celltype | ||
return(markers) | ||
} | ||
|
||
l1_job <- Slurm_lapply(celltypesL1, FindMarkersForCelltype, | ||
inpath = h5SCEDir, | ||
label = "predicted.celltype.l1", | ||
|
||
njobs = length(celltypesL1), | ||
mc.cores = 1, | ||
job_name = paste0("DE_l1_combined_", as.character(Sys.Date())), | ||
tmp_path = slurmR_tmp_dir, | ||
sbatch_opt = list("constraint" = "gizmok", | ||
"cpus-per-task" = "16"), | ||
seeds = 1:length(celltypesL1), | ||
plan = "submit", | ||
overwrite = TRUE) | ||
|
||
|
||
l2_job <- Slurm_lapply(celltypesL2, FindMarkersForCelltype, | ||
inpath = h5SCEDir, | ||
label = "predicted.celltype.l2", | ||
|
||
njobs = length(celltypesL2), | ||
mc.cores = 1, | ||
job_name = paste0("DE_l2_combined_", as.character(Sys.Date())), | ||
tmp_path = slurmR_tmp_dir, | ||
sbatch_opt = list("constraint" = "gizmok", | ||
"cpus-per-task" = "16"), | ||
seeds = 1:length(celltypesL2), | ||
plan = "submit", | ||
overwrite = TRUE) | ||
|
||
l3_job <- Slurm_lapply(celltypesL3, FindMarkersForCelltype, | ||
inpath = h5SCEDir, | ||
label = "predicted.celltype.l3", | ||
|
||
njobs = length(celltypesL3), | ||
mc.cores = 1, | ||
job_name = paste0("DE_l3_combined_", as.character(Sys.Date())), | ||
tmp_path = slurmR_tmp_dir, | ||
sbatch_opt = list("constraint" = "gizmok", | ||
"cpus-per-task" = "16"), | ||
seeds = 1:length(celltypesL3), | ||
plan = "submit", | ||
overwrite = TRUE) | ||
|
||
collectResults <- function(de_job) { | ||
still_running <- ( length(status(de_job)$running) + length(status(de_job)$pending) )> 0 | ||
while(still_running) { | ||
Sys.sleep(1) | ||
still_running <- (length(status(de_job)$running) + length(status(de_job)$pending)) > 0 | ||
} | ||
|
||
markerList <- Slurm_collect(de_job) | ||
markerdf <- rbindlist(markerList) | ||
setDT(markerdf) | ||
setorder(markerdf, p_val_adj, -avg_log2FC) | ||
|
||
# Return the top 30 markers with p-value < 0.05 | ||
markerdf[p_val_adj < 0.05, head(.SD, 30), cluster] | ||
} | ||
|
||
Sys.sleep(5) | ||
|
||
message(">>> Waiting for results for l1...") | ||
de_markers_l1 <- collectResults(l1_job) | ||
message(">>> Collected results for l1") | ||
message(">>> Writing l1 results to ", file.path(debugDir, "de_markers_l1.csv")) | ||
fwrite(de_markers_l1, file.path(debugDir, "de_markers_l1.csv")) | ||
|
||
message(">>> Waiting for results for l2...") | ||
de_markers_l2 <- collectResults(l2_job) | ||
message(">>> Collected results for l2") | ||
message(">>> Writing l2 results to ", file.path(debugDir, "de_markers_l2.csv")) | ||
write.csv(de_markers_l2, file.path(debugDir, "de_markers_l2.csv")) | ||
|
||
message(">>> Waiting for results for l3...") | ||
de_markers_l3 <- collectResults(l3_job) | ||
message(">>> Collected results for l3") | ||
message(">>> Writing l3 results to ", file.path(debugDir, "de_markers_l3.csv")) | ||
write.csv(de_markers_l3, file.path(debugDir, "de_markers_l3.csv")) | ||
|
||
|
||
# Save results to hdf5 file in @misc slot | ||
h5seu <- Connect(file.path(h5seuratDir, "covid19_datasets.h5seurat"), mode = "r+") | ||
|
||
h5seu[["misc"]]$create_group("de_results") | ||
h5seu[["misc"]][["de_results"]][["de_markers_L1"]] <- de_markers_l1 | ||
h5seu[["misc"]][["de_results"]][["de_markers_L2"]] <- de_markers_l2 | ||
h5seu[["misc"]][["de_results"]][["de_markers_L3"]] <- de_markers_l3 | ||
|
||
h5seu$close_all() |
173 changes: 173 additions & 0 deletions
173
differential_expression/differential_expression_parallel.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,173 @@ | ||
# Performs differential expression analysis for each of the L3 cell types | ||
# based on Seurat PBMC reference using the MAST test. | ||
# Results will be saved in misc slot in h5seurat file. | ||
# | ||
# Uses slurmR to launch separate slurm jobs for each cluster. | ||
# | ||
# ----- Parse options ------ | ||
message(getwd()) | ||
library(optparse) | ||
option_list = list( | ||
make_option(c("-D", "--dataset"), | ||
help = "name of dataset", | ||
default = "silvin_2020"), | ||
make_option(c("-i", "--indir"), | ||
help = "directory with preprocessed h5seurat datasets", | ||
default = file.path(Sys.getenv("DATA_DIR"), "h5seurat_reprocessed")), | ||
make_option(c("-d", "--debugdir"), | ||
help = "directory to store final processed dataset", | ||
default = file.path(Sys.getenv("DEBUG_DIR"), "covid_DE")), | ||
make_option(c("-c", "--corespertask"), | ||
help = "number of cores to allocate to each sbatch job", | ||
default = "4") | ||
) | ||
|
||
opt_parser = OptionParser(option_list=option_list) | ||
opt = parse_args(opt_parser) | ||
message(paste0(capture.output(opt), collaps = "\n")) | ||
# check args | ||
inpath <- file.path(opt$indir, paste0(opt$dataset, "_processed.h5seurat")) | ||
if (!file.exists(inpath)) { | ||
stop("could not find ", inpath) | ||
} | ||
debugDir <- file.path(opt$debugdir, opt$dataset) | ||
if (!dir.exists(opt$debugdir)) { | ||
stop("could not find ", opt$debugdir) | ||
} | ||
if (!dir.exists(debugDir)) dir.create(debugDir) | ||
dataset <- opt$dataset | ||
# ----- START ----- | ||
# https://satijalab.org/seurat/articles/de_vignette.html | ||
|
||
library(Seurat) | ||
library(SeuratDisk) | ||
library(slurmR) | ||
library(data.table) | ||
|
||
seu <- Connect(inpath) | ||
celltypesL1 <- table(seu[["meta.data"]][["predicted.celltype.l1"]][]) | ||
celltypesL1 <- names(celltypesL1)[celltypesL1 >5] | ||
celltypesL2 <- table(seu[["meta.data"]][["predicted.celltype.l2"]][]) | ||
celltypesL2 <- names(celltypesL2)[celltypesL2 > 5] | ||
celltypesL3 <- table(seu[["meta.data"]][["predicted.celltype.l3"]][]) | ||
celltypesL3 <- names(celltypesL3)[celltypesL3 > 5] | ||
assay <- ifelse("SCT" %in% names(seu[["assays"]]), "SCT", "RNA") | ||
seu$close_all() | ||
|
||
slurmR_tmp_dir <- file.path(opt$debugdir, "slurmr") | ||
if (!dir.exists(slurmR_tmp_dir)) dir.create(slurmR_tmp_dir) | ||
|
||
FindMarkersForCelltype <- function(celltype, | ||
inpath, | ||
label, | ||
assay) { | ||
seu <- LoadH5Seurat(inpath, | ||
assays = "SCT") | ||
Idents(seu) <- label | ||
|
||
markers <- FindMarkers(object = seu, | ||
assay = assay, | ||
slot = "data", | ||
ident.1 = celltype, | ||
only.pos = TRUE, | ||
min.pct = 0.25, | ||
logfc.threshold = 0.25, | ||
test.use = "MAST") | ||
markers$gene <- rownames(markers) | ||
markers$cluster <- celltype | ||
return(markers) | ||
} | ||
|
||
|
||
l1_job <- Slurm_lapply(celltypesL1, FindMarkersForCelltype, | ||
inpath = inpath, | ||
label = "predicted.celltype.l1", | ||
assay = assay, | ||
|
||
njobs = length(celltypesL1), | ||
mc.cores = 1, | ||
job_name = paste0("DE_l1_", opt$dataset, "_", as.character(Sys.Date())), | ||
tmp_path = slurmR_tmp_dir, | ||
sbatch_opt = list("constraint" = "gizmok", | ||
"cpus-per-task" = opt$corespertask), | ||
seeds = 1:length(celltypesL1), | ||
plan = "submit", | ||
overwrite = TRUE) | ||
|
||
|
||
l2_job <- Slurm_lapply(celltypesL2, FindMarkersForCelltype, | ||
inpath = inpath, | ||
label = "predicted.celltype.l2", | ||
assay = assay, | ||
|
||
njobs = length(celltypesL2), | ||
mc.cores = 1, | ||
job_name = paste0("DE_l2_", opt$dataset, "_", as.character(Sys.Date())), | ||
tmp_path = slurmR_tmp_dir, | ||
sbatch_opt = list("constraint" = "gizmok", | ||
"cpus-per-task" = opt$corespertask), | ||
seeds = 1:length(celltypesL2), | ||
plan = "submit", | ||
overwrite = TRUE) | ||
|
||
l3_job <- Slurm_lapply(celltypesL3, FindMarkersForCelltype, | ||
inpath = inpath, | ||
label = "predicted.celltype.l3", | ||
assay = assay, | ||
|
||
njobs = length(celltypesL3), | ||
mc.cores = 1, | ||
job_name = paste0("DE_l3_", opt$dataset, "_", as.character(Sys.Date())), | ||
tmp_path = slurmR_tmp_dir, | ||
sbatch_opt = list("constraint" = "gizmok", | ||
"cpus-per-task" = opt$corespertask), | ||
seeds = 1:length(celltypesL3), | ||
plan = "submit", | ||
overwrite = TRUE) | ||
|
||
collectResults <- function(de_job) { | ||
still_running <- ( length(status(de_job)$running) + length(status(de_job)$pending) )> 0 | ||
while(still_running) { | ||
Sys.sleep(1) | ||
still_running <- (length(status(de_job)$running) + length(status(de_job)$pending)) > 0 | ||
} | ||
|
||
markerList <- Slurm_collect(de_job) | ||
markerdf <- rbindlist(markerList) | ||
setDT(markerdf) | ||
setorder(markerdf, p_val_adj, -avg_log2FC) | ||
|
||
# Return the top 30 markers with p-value < 0.05 | ||
markerdf[p_val_adj < 0.05, head(.SD, 30), cluster] | ||
} | ||
|
||
Sys.sleep(5) | ||
|
||
message(">>> Waiting for results for l1...") | ||
de_markers_l1 <- collectResults(l1_job) | ||
message(">>> Collected results for l1") | ||
message(">>> Writing l1 results to ", file.path(debugDir, "de_markers_l1.csv")) | ||
fwrite(de_markers_l1, file.path(debugDir, "de_markers_l1.csv")) | ||
|
||
message(">>> Waiting for results for l2...") | ||
de_markers_l2 <- collectResults(l2_job) | ||
message(">>> Collected results for l2") | ||
message(">>> Writing l2 results to ", file.path(debugDir, "de_markers_l2.csv")) | ||
write.csv(de_markers_l2, file.path(debugDir, "de_markers_l2.csv")) | ||
|
||
message(">>> Waiting for results for l3...") | ||
de_markers_l3 <- collectResults(l3_job) | ||
message(">>> Collected results for l3") | ||
message(">>> Writing l3 results to ", file.path(debugDir, "de_markers_l3.csv")) | ||
write.csv(de_markers_l3, file.path(debugDir, "de_markers_l3.csv")) | ||
|
||
|
||
# Save results to hdf5 file in @misc slot | ||
h5seu <- Connect(inpath, mode = "r+") | ||
|
||
h5seu[["misc"]]$create_group("de_results") | ||
h5seu[["misc"]][["de_results"]][["de_markers_L1"]] <- de_markers_l1 | ||
h5seu[["misc"]][["de_results"]][["de_markers_L2"]] <- de_markers_l2 | ||
h5seu[["misc"]][["de_results"]][["de_markers_L3"]] <- de_markers_l3 | ||
|
||
h5seu$close_all() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#!/bin/bash | ||
|
||
ml fhR/4.0.3-foss-2020bml fhR/4.0.3-foss-2020b | ||
cd $wd/differential_expression | ||
Rscript differential_expression/differential_expression_parallel.R -D $1 ${2-4} |
Oops, something went wrong.