-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathexpressed_regions.R
132 lines (122 loc) · 4.69 KB
/
expressed_regions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#' Identify expressed regions from the mean coverage for a given SRA project
#'
#' This function uses the pre-computed mean coverage for a given SRA project to
#' identify the expressed regions (ERs) for a given chromosome. It returns a
#' [GRanges-class][GenomicRanges::GRanges-class] object with the expressed regions as
#' defined by [findRegions][derfinder::findRegions].
#'
#' @param project A character vector with one SRA study id.
#' @param chr A character vector with the name of the chromosome.
#' @param cutoff The base-pair level cutoff to use.
#' @param outdir The destination directory for the downloaded file(s) that were
#' previously downloaded with [download_study]. If the files are missing,
#' but `outdir` is specified, they will get downloaded first. By default
#' `outdir` is set to `NULL` which will use the data from the web.
#' We only recommend downloading the full data if you will use it several times.
#' @param maxClusterGap This determines the maximum gap between candidate ERs.
#' @param chrlen The chromosome length in base pairs. If it's `NULL`, the
#' chromosome length is extracted from the Rail-RNA runs GitHub repository.
#' Alternatively check the `SciServer` section on the vignette to see
#' how to access all the recount data via a R Jupyter Notebook.
#' @param verbose If `TRUE` basic status updates will be printed along the
#' way.
#' @param ... Additional arguments passed to [download_study] when
#' `outdir` is specified but the required files are missing.
#'
#' @return A [GRanges-class][GenomicRanges::GRanges-class] object as created by
#' [findRegions][derfinder::findRegions].
#'
#' @author Leonardo Collado-Torres
#' @export
#'
#' @import derfinder GenomicRanges RCurl S4Vectors GenomeInfoDb
#' @importMethodsFrom rtracklayer import import.bw
#' @importFrom RCurl url.exists
#'
#' @seealso [download_study], [findRegions][derfinder::findRegions],
#' [railMatrix][derfinder::railMatrix]
#'
#' @examples
#' ## Define expressed regions for study SRP002001, chrY
#'
#' ## Workaround for https://github.com/lawremi/rtracklayer/issues/83
#' download_study("SRP002001", type = "mean")
#'
#' regions <- expressed_regions("SRP002001", "chrY",
#' cutoff = 5L,
#' maxClusterGap = 3000L,
#' outdir = "SRP002001"
#' )
#'
#' \dontrun{
#' ## Define the regions for multiple chrs
#' regs <- sapply(chrs, expressed_regions, project = "SRP002001", cutoff = 5L)
#'
#' ## You can then combine them into a single GRanges object if you want to
#' library("GenomicRanges")
#' single <- unlist(GRangesList(regs))
#' }
#'
expressed_regions <- function(
project, chr, cutoff, outdir = NULL,
maxClusterGap = 300L, chrlen = NULL, verbose = TRUE, ...) {
## Check inputs
stopifnot(is.character(project) & length(project) == 1)
stopifnot(is.character(chr) & length(chr) == 1)
## Use table from the package
url_table <- recount::recount_url
## Subset url data
url_table <- url_table[url_table$project == project, ]
if (nrow(url_table) == 0) {
stop("Invalid 'project' argument. There's no such 'project' in the recount_url data.frame.")
}
## Find chromosome length if absent
if (is.null(chrlen)) {
chrinfo <- read.table("https://raw.githubusercontent.com/nellore/runs/master/gtex/hg38.sizes",
col.names = c("chr", "size"), colClasses = c(
"character",
"integer"
)
)
chrlen <- chrinfo$size[chrinfo$chr == chr]
stopifnot(length(chrlen) == 1)
}
## Check if data is present, otherwise download it
if (!is.null(outdir)) {
## Check mean file
meanFile <- file.path(outdir, "bw", url_table$file_name[grep(
"mean",
url_table$file_name
)])
if (!file.exists(meanFile)) {
download_study(
project = project, type = "mean", outdir = outdir,
download = TRUE, ...
)
}
} else {
meanFile <- download_study(
project = project, type = "mean",
download = FALSE
)
}
## Load coverage
meanCov <- derfinder::loadCoverage(
files = meanFile, chr = chr,
chrlen = chrlen
)
## Find regions
regs <- derfinder::findRegions(
position = S4Vectors::Rle(TRUE, length(meanCov$coverage[[1]])),
fstats = meanCov$coverage[[1]], chr = chr,
maxClusterGap = maxClusterGap, cutoff = cutoff, verbose = verbose
)
## If there are no regions, return NULL
if (is.null(regs)) regs <- GenomicRanges::GRanges()
## Format appropriately
names(regs) <- seq_len(length(regs))
## Set the length
GenomeInfoDb::seqlengths(regs) <- length(meanCov$coverage[[1]])
## Finish
return(regs)
}