-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathsnaptron_query.R
157 lines (145 loc) · 5.96 KB
/
snaptron_query.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#' Query Snaptron to get data from exon-exon junctions present in Intropolis
#'
#' This function uses the Snaptron API to query specific exon-exon junctions
#' that are available via Intropolis as described in the vignette.
#'
#' @param junctions A [GRanges-class][GenomicRanges::GRanges-class] object with the
#' exon-exon junctions of interest. The chromosome names should be in UCSC
#' format, such as 'chr1'. The strand information is ignored in the query.
#' @param version Either `srav1`, `srav2`, `gtex` or
#' `tcga`. SRA Version 1 of Intropolis has the
#' exon-exon junctions from about 20 thousand RNA-seq samples in hg19
#' coordinates. SRA Version 2 has the data from about 50 thousand RNA-seq
#' samples aligned to hg38. GTEx has about 30 million junctions from about 10
#' thousand samples from the GTEx consortium on hg38 coordinates. Finally,
#' TCGA has about 36 million junctions from about 11 thousand samples
#' from the TCGA consortium on hg38 coordinates.
#' @param verbose If `TRUE` status updates will be printed.
#' @param async Defaults to `TRUE` but in some situations it might be
#' preferrable to set it to `FALSE`. This argument gets passed to
#' [getURL][RCurl::getURL]. Check <https://github.com/ChristopherWilks/snaptron/issues/11>
#' for more details.
#'
#' @return A [GRanges-class][GenomicRanges::GRanges-class] object with the results from
#' the Snaptron query. For information on the different columns please see
#' <http://snaptron.cs.jhu.edu>.
#'
#' @references Please cite <http://snaptron.cs.jhu.edu>
#' if you use this function as Snaptron is a separate project from recount.
#' Thank you!
#'
#' @author Leonardo Collado-Torres
#' @export
#'
#' @import GenomicRanges
#' @import RCurl
#'
#' @examples
#'
#' library("GenomicRanges")
#' ## Define some exon-exon junctions (hg19 coordinates)
#' junctions <- GRanges(seqnames = "chr2", IRanges(
#' start = c(28971710:28971712, 29555081:29555083, 29754982:29754984),
#' end = c(29462417:29462419, 29923338:29923340, 29917714:29917716)
#' ))
#'
#' ## Check against Snaptron SRA version 1 (hg19 coordinates)
#' snaptron_query(junctions)
#' \dontrun{
#' ## Check another set of junctions against SRA version 2 (more data, hg38
#' ## coordinates)
#' junctions_v2 <- GRanges(seqnames = "chr2", IRanges(
#' start = 29532116:29532118, end = 29694848:29694850
#' ))
#' snaptron_query(junctions_v2, version = "srav2")
#'
#' ## Check these junctions in GTEx and TCGA data
#' snaptron_query(junctions_v2, version = "gtex")
#' snaptron_query(junctions_v2, version = "tcga")
#' }
#'
snaptron_query <- function(junctions, version = "srav1", verbose = TRUE, async = TRUE) {
## Check input
stopifnot(is(junctions, "GRanges"))
stopifnot(all(grepl("chr", seqlevels(junctions))))
version <- tolower(version)
stopifnot(version %in% c("srav1", "srav2", "gtex", "tcga"))
## Build query URLs
urls <- paste0(
"http://snaptron.cs.jhu.edu/", version,
"/snaptron?regions=", seqnames(junctions), ":", start(junctions), "-",
end(junctions), "&exact=1&header=0"
)
## Get results
if (verbose) message(paste(Sys.time(), "querying Snaptron"))
query_res <- getURL(urls, async = async)
## Split by line
if (verbose) message(paste(Sys.time(), "processing results"))
query_split <- strsplit(query_res, "\n")
names(query_split) <- NULL
## Are there any valid ones?
valid <- which(elementNROWS(query_split) > 0)
if (length(valid) == 0) {
message(paste(
Sys.time(),
"found no exon-exon junctions in Intropolis version", version,
"matching your query: this version uses",
ifelse(version == "srav1", "hg19", "hg38"), "coordinates."
))
return(NULL)
}
## Extract information
if (verbose) message(paste(Sys.time(), "extracting information"))
info <- lapply(query_split[valid], function(jxs) {
matrix(strsplit(jxs, "\t")[[1]], ncol = 18)
})
## Build result
res <- do.call(rbind, info)
colnames(res) <- c(
"type", "snaptron_id", "chromosome", "start", "end",
"length", "strand", "annotated", "left_motif", "right_motif",
"left_annotated", "right_annotated", "samples",
"samples_count", "coverage_sum", "coverage_avg", "coverage_median",
"source_dataset_id"
)
## Helper function for some special variables
to_chr_list <- function(x) {
r <- strsplit(x, ",")
i <- which(sapply(r, function(y) {
y[[1]] == "0"
}))
if (length(i) > 0) r[i] <- NA
return(CharacterList(r))
}
## Build into a GRanges object
result <- GRanges(
seqnames = res[, "chromosome"],
IRanges(as.numeric(res[, "start"]), as.numeric(res[, "end"])),
strand = res[, "strand"]
)
## Add other variables
result$type <- as.factor(res[, "type"])
result$snaptron_id <- as.integer(res[, "snaptron_id"])
result$annotated <- to_chr_list(res[, "annotated"])
result$left_motif <- res[, "left_motif"]
result$right_motif <- res[, "right_motif"]
result$left_annotated <- to_chr_list(res[, "left_annotated"])
result$right_annotated <- to_chr_list(res[, "right_annotated"])
## Remove leading comma
res[, "samples"] <- gsub("^,", "", res[, "samples"])
## Split sample ids from the read coverage by sample
sample_info <- IntegerList(strsplit(res[, "samples"], ":|,"))
sample_idx <- LogicalList(lapply(elementNROWS(sample_info), function(y) {
rep(c(TRUE, FALSE), y / 2)
}))
result$samples <- sample_info[sample_idx]
result$read_coverage_by_sample <- sample_info[!sample_idx]
## Continue with other variables
result$samples_count <- as.integer(res[, "samples_count"])
result$coverage_sum <- as.integer(res[, "coverage_sum"])
result$coverage_avg <- as.numeric(res[, "coverage_avg"])
result$coverage_median <- as.numeric(res[, "coverage_median"])
result$source_dataset_id <- as.integer(res[, "source_dataset_id"])
## Finish
return(result)
}