Skip to content

Commit

Permalink
Update processing methods for new datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
Helen Miller committed Apr 6, 2021
1 parent bf7f96e commit 609ad52
Show file tree
Hide file tree
Showing 73 changed files with 286,996 additions and 429 deletions.
2 changes: 1 addition & 1 deletion .Renviron
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Data is saved in subdirectories under DATA_DIR
DATA_DIR=/fh/fast/gottardo_r/ytian_working/covid19_datasets
# Location of reference dataset, downloaded from https://atlas.fredhutch.org/data/nygc/multimodal/pbmc_multimodal.h5seurat
REFERENCE_DATASET=/fh/fast/gottardo_r/ytian_working/covid19_datasets/data/seurat/pbmc_multimodal.h5seurat
REFERENCE_DATASET=/fh/scratch/delete10/gottardo_r/hmiller/pbmc_multimodal.h5seurat
# Directory to save intermediate objects for debugging
DEBUG_DIR=/fh/scratch/delete10/gottardo_r/hmiller
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
*Rproj*
.DS_Store
.Rhistory
.RData
.Ruserdata
Expand Down
119 changes: 64 additions & 55 deletions README.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions differential_expression/.Rprofile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
renv::load("..")
152 changes: 152 additions & 0 deletions differential_expression/differential_expression_combined.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# Get differentially expressed marker genes for L1, L2, and L3 cell types for the combined dataset

message(getwd())
library(Seurat)
library(SeuratDisk)
library(SingleCellExperiment)
library(HDF5Array)
library(MAST)
library(slurmR)

debugDir <- file.path(Sys.getenv("DEBUG_DIR"), "covid_DE", "combined")
h5seuratDir <- file.path(Sys.getenv("DATA_DIR"), "h5seurat_reprocessed")
h5SCEDir <- file.path(h5seuratDir, "combined_sce")

message(">>> debugDir: ", debugDir)
message(">>> h5seuratDir: ", h5seuratDir)
message(">>>h5SCEDir: ", h5SCEDir)

# h5SCEDir <- "/fh/scratch/delete30/gottardo_r/hmiller/example_datasets/combined_sce"
# h5seuratDir <- "/fh/scratch/delete30/gottardo_r/hmiller/example_datasets"

message(">>> Getting cell types from ", file.path(h5seuratDir, "covid19_datasets.h5seurat"))
seu <- Connect(file.path(h5seuratDir, "covid19_datasets.h5seurat"))
celltypesL1 <- table(seu[["meta.data"]][["predicted.celltype.l1"]][])
celltypesL1 <- names(celltypesL1)[celltypesL1 >5]
celltypesL2 <- table(seu[["meta.data"]][["predicted.celltype.l2"]][])
celltypesL2 <- names(celltypesL2)[celltypesL2 > 5]
celltypesL3 <- table(seu[["meta.data"]][["predicted.celltype.l3"]][])
celltypesL3 <- names(celltypesL3)[celltypesL3 > 5]
seu$close_all()

slurmR_tmp_dir <- file.path(Sys.getenv("DEBUG_DIR"), "covid_DE", "slurmr")
if (!dir.exists(slurmR_tmp_dir)) dir.create(slurmR_tmp_dir)

message(">>> slurmR_tmp_dir: ", slurmR_tmp_dir)

FindMarkersForCelltype <- function(celltype,
inpath,
label) {
sce <- loadHDF5SummarizedExperiment(inpath, prefix = "combined_")

cells.1 <- Cells(sce)[which(sce[[label]] == celltype)]
cells.2 <- Cells(sce)[which(sce[[label]] != celltype)]
fc.results <- Seurat::FoldChange(logcounts(sce),
cells.1 = cells.1,
cells.2 = cells.2,
fc.name = "avg_log2FC",
mean.fxn = function(x) {
return(log(x = rowMeans(x = expm1(x = x)) + 1, base = 2))
})
markers <- Seurat::FindMarkers(logcounts(sce),
cells.1 = cells.1,
cells.2 = cells.2,
only.pos = TRUE,
min.pct = 0.25,
logfc.threshold = 0.25,
test.use = "MAST",
fc.results = fc.results)
markers$gene <- rownames(markers)
markers$cluster <- celltype
return(markers)
}

l1_job <- Slurm_lapply(celltypesL1, FindMarkersForCelltype,
inpath = h5SCEDir,
label = "predicted.celltype.l1",

njobs = length(celltypesL1),
mc.cores = 1,
job_name = paste0("DE_l1_combined_", as.character(Sys.Date())),
tmp_path = slurmR_tmp_dir,
sbatch_opt = list("constraint" = "gizmok",
"cpus-per-task" = "16"),
seeds = 1:length(celltypesL1),
plan = "submit",
overwrite = TRUE)


l2_job <- Slurm_lapply(celltypesL2, FindMarkersForCelltype,
inpath = h5SCEDir,
label = "predicted.celltype.l2",

njobs = length(celltypesL2),
mc.cores = 1,
job_name = paste0("DE_l2_combined_", as.character(Sys.Date())),
tmp_path = slurmR_tmp_dir,
sbatch_opt = list("constraint" = "gizmok",
"cpus-per-task" = "16"),
seeds = 1:length(celltypesL2),
plan = "submit",
overwrite = TRUE)

l3_job <- Slurm_lapply(celltypesL3, FindMarkersForCelltype,
inpath = h5SCEDir,
label = "predicted.celltype.l3",

njobs = length(celltypesL3),
mc.cores = 1,
job_name = paste0("DE_l3_combined_", as.character(Sys.Date())),
tmp_path = slurmR_tmp_dir,
sbatch_opt = list("constraint" = "gizmok",
"cpus-per-task" = "16"),
seeds = 1:length(celltypesL3),
plan = "submit",
overwrite = TRUE)

collectResults <- function(de_job) {
still_running <- ( length(status(de_job)$running) + length(status(de_job)$pending) )> 0
while(still_running) {
Sys.sleep(1)
still_running <- (length(status(de_job)$running) + length(status(de_job)$pending)) > 0
}

markerList <- Slurm_collect(de_job)
markerdf <- rbindlist(markerList)
setDT(markerdf)
setorder(markerdf, p_val_adj, -avg_log2FC)

# Return the top 30 markers with p-value < 0.05
markerdf[p_val_adj < 0.05, head(.SD, 30), cluster]
}

Sys.sleep(5)

message(">>> Waiting for results for l1...")
de_markers_l1 <- collectResults(l1_job)
message(">>> Collected results for l1")
message(">>> Writing l1 results to ", file.path(debugDir, "de_markers_l1.csv"))
fwrite(de_markers_l1, file.path(debugDir, "de_markers_l1.csv"))

message(">>> Waiting for results for l2...")
de_markers_l2 <- collectResults(l2_job)
message(">>> Collected results for l2")
message(">>> Writing l2 results to ", file.path(debugDir, "de_markers_l2.csv"))
write.csv(de_markers_l2, file.path(debugDir, "de_markers_l2.csv"))

message(">>> Waiting for results for l3...")
de_markers_l3 <- collectResults(l3_job)
message(">>> Collected results for l3")
message(">>> Writing l3 results to ", file.path(debugDir, "de_markers_l3.csv"))
write.csv(de_markers_l3, file.path(debugDir, "de_markers_l3.csv"))


# Save results to hdf5 file in @misc slot
h5seu <- Connect(file.path(h5seuratDir, "covid19_datasets.h5seurat"), mode = "r+")

h5seu[["misc"]]$create_group("de_results")
h5seu[["misc"]][["de_results"]][["de_markers_L1"]] <- de_markers_l1
h5seu[["misc"]][["de_results"]][["de_markers_L2"]] <- de_markers_l2
h5seu[["misc"]][["de_results"]][["de_markers_L3"]] <- de_markers_l3

h5seu$close_all()
173 changes: 173 additions & 0 deletions differential_expression/differential_expression_parallel.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
# Performs differential expression analysis for each of the L3 cell types
# based on Seurat PBMC reference using the MAST test.
# Results will be saved in misc slot in h5seurat file.
#
# Uses slurmR to launch separate slurm jobs for each cluster.
#
# ----- Parse options ------
message(getwd())
library(optparse)
option_list = list(
make_option(c("-D", "--dataset"),
help = "name of dataset",
default = "silvin_2020"),
make_option(c("-i", "--indir"),
help = "directory with preprocessed h5seurat datasets",
default = file.path(Sys.getenv("DATA_DIR"), "h5seurat_reprocessed")),
make_option(c("-d", "--debugdir"),
help = "directory to store final processed dataset",
default = file.path(Sys.getenv("DEBUG_DIR"), "covid_DE")),
make_option(c("-c", "--corespertask"),
help = "number of cores to allocate to each sbatch job",
default = "4")
)

opt_parser = OptionParser(option_list=option_list)
opt = parse_args(opt_parser)
message(paste0(capture.output(opt), collaps = "\n"))
# check args
inpath <- file.path(opt$indir, paste0(opt$dataset, "_processed.h5seurat"))
if (!file.exists(inpath)) {
stop("could not find ", inpath)
}
debugDir <- file.path(opt$debugdir, opt$dataset)
if (!dir.exists(opt$debugdir)) {
stop("could not find ", opt$debugdir)
}
if (!dir.exists(debugDir)) dir.create(debugDir)
dataset <- opt$dataset
# ----- START -----
# https://satijalab.org/seurat/articles/de_vignette.html

library(Seurat)
library(SeuratDisk)
library(slurmR)
library(data.table)

seu <- Connect(inpath)
celltypesL1 <- table(seu[["meta.data"]][["predicted.celltype.l1"]][])
celltypesL1 <- names(celltypesL1)[celltypesL1 >5]
celltypesL2 <- table(seu[["meta.data"]][["predicted.celltype.l2"]][])
celltypesL2 <- names(celltypesL2)[celltypesL2 > 5]
celltypesL3 <- table(seu[["meta.data"]][["predicted.celltype.l3"]][])
celltypesL3 <- names(celltypesL3)[celltypesL3 > 5]
assay <- ifelse("SCT" %in% names(seu[["assays"]]), "SCT", "RNA")
seu$close_all()

slurmR_tmp_dir <- file.path(opt$debugdir, "slurmr")
if (!dir.exists(slurmR_tmp_dir)) dir.create(slurmR_tmp_dir)

FindMarkersForCelltype <- function(celltype,
inpath,
label,
assay) {
seu <- LoadH5Seurat(inpath,
assays = "SCT")
Idents(seu) <- label

markers <- FindMarkers(object = seu,
assay = assay,
slot = "data",
ident.1 = celltype,
only.pos = TRUE,
min.pct = 0.25,
logfc.threshold = 0.25,
test.use = "MAST")
markers$gene <- rownames(markers)
markers$cluster <- celltype
return(markers)
}


l1_job <- Slurm_lapply(celltypesL1, FindMarkersForCelltype,
inpath = inpath,
label = "predicted.celltype.l1",
assay = assay,

njobs = length(celltypesL1),
mc.cores = 1,
job_name = paste0("DE_l1_", opt$dataset, "_", as.character(Sys.Date())),
tmp_path = slurmR_tmp_dir,
sbatch_opt = list("constraint" = "gizmok",
"cpus-per-task" = opt$corespertask),
seeds = 1:length(celltypesL1),
plan = "submit",
overwrite = TRUE)


l2_job <- Slurm_lapply(celltypesL2, FindMarkersForCelltype,
inpath = inpath,
label = "predicted.celltype.l2",
assay = assay,

njobs = length(celltypesL2),
mc.cores = 1,
job_name = paste0("DE_l2_", opt$dataset, "_", as.character(Sys.Date())),
tmp_path = slurmR_tmp_dir,
sbatch_opt = list("constraint" = "gizmok",
"cpus-per-task" = opt$corespertask),
seeds = 1:length(celltypesL2),
plan = "submit",
overwrite = TRUE)

l3_job <- Slurm_lapply(celltypesL3, FindMarkersForCelltype,
inpath = inpath,
label = "predicted.celltype.l3",
assay = assay,

njobs = length(celltypesL3),
mc.cores = 1,
job_name = paste0("DE_l3_", opt$dataset, "_", as.character(Sys.Date())),
tmp_path = slurmR_tmp_dir,
sbatch_opt = list("constraint" = "gizmok",
"cpus-per-task" = opt$corespertask),
seeds = 1:length(celltypesL3),
plan = "submit",
overwrite = TRUE)

collectResults <- function(de_job) {
still_running <- ( length(status(de_job)$running) + length(status(de_job)$pending) )> 0
while(still_running) {
Sys.sleep(1)
still_running <- (length(status(de_job)$running) + length(status(de_job)$pending)) > 0
}

markerList <- Slurm_collect(de_job)
markerdf <- rbindlist(markerList)
setDT(markerdf)
setorder(markerdf, p_val_adj, -avg_log2FC)

# Return the top 30 markers with p-value < 0.05
markerdf[p_val_adj < 0.05, head(.SD, 30), cluster]
}

Sys.sleep(5)

message(">>> Waiting for results for l1...")
de_markers_l1 <- collectResults(l1_job)
message(">>> Collected results for l1")
message(">>> Writing l1 results to ", file.path(debugDir, "de_markers_l1.csv"))
fwrite(de_markers_l1, file.path(debugDir, "de_markers_l1.csv"))

message(">>> Waiting for results for l2...")
de_markers_l2 <- collectResults(l2_job)
message(">>> Collected results for l2")
message(">>> Writing l2 results to ", file.path(debugDir, "de_markers_l2.csv"))
write.csv(de_markers_l2, file.path(debugDir, "de_markers_l2.csv"))

message(">>> Waiting for results for l3...")
de_markers_l3 <- collectResults(l3_job)
message(">>> Collected results for l3")
message(">>> Writing l3 results to ", file.path(debugDir, "de_markers_l3.csv"))
write.csv(de_markers_l3, file.path(debugDir, "de_markers_l3.csv"))


# Save results to hdf5 file in @misc slot
h5seu <- Connect(inpath, mode = "r+")

h5seu[["misc"]]$create_group("de_results")
h5seu[["misc"]][["de_results"]][["de_markers_L1"]] <- de_markers_l1
h5seu[["misc"]][["de_results"]][["de_markers_L2"]] <- de_markers_l2
h5seu[["misc"]][["de_results"]][["de_markers_L3"]] <- de_markers_l3

h5seu$close_all()
5 changes: 5 additions & 0 deletions differential_expression/differential_expression_parallel.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash

ml fhR/4.0.3-foss-2020bml fhR/4.0.3-foss-2020b
cd $wd/differential_expression
Rscript differential_expression/differential_expression_parallel.R -D $1 ${2-4}
Loading

0 comments on commit 609ad52

Please sign in to comment.