diff --git a/.Rbuildignore b/.Rbuildignore index 9a43b8b..5dccde6 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -2,3 +2,10 @@ .DS_Store ^.*\.Rproj$ ^\.Rproj\.user$ +dev +^\.github$ +^_pkgdown\.yml$ +^docs$ +^pkgdown$ +^README\.Rmd$ +^codecov\.yml$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..b36903f --- /dev/null +++ b/.github/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity and +orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, +and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall +community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or +advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email +address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a +professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards +of acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies +when an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at [INSERT CONTACT +METHOD]. All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. 
Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, +available at https://www.contributor-covenant.org/version/2/0/ +code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at https:// +www.contributor-covenant.org/translations. diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 0000000..7fcd044 --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,47 @@ +# Contributing to recount + +This outlines how to propose a change to recount. +For more detailed info about contributing to this, and other tidyverse packages, please see the +[**development contributing guide**](https://rstd.io/tidy-contrib). + +## Fixing typos + +You can fix typos, spelling mistakes, or grammatical errors in the documentation directly using the GitHub web interface, as long as the changes are made in the _source_ file. +This generally means you'll need to edit [roxygen2 comments](https://roxygen2.r-lib.org/articles/roxygen2.html) in an `.R`, not a `.Rd` file. +You can find the `.R` file that generates the `.Rd` by reading the comment in the first line. + +## Bigger changes + +If you want to make a bigger change, it's a good idea to first file an issue and make sure someone from the team agrees that it’s needed. 
+If you’ve found a bug, please file an issue that illustrates the bug with a minimal +[reprex](https://www.tidyverse.org/help/#reprex) (this will also help you write a unit test, if needed). + +### Pull request process + +* Fork the package and clone onto your computer. If you haven't done this before, we recommend using `usethis::create_from_github("leekgroup/recount", fork = TRUE)`. + +* Install all development dependences with `devtools::install_dev_deps()`, and then make sure the package passes R CMD check by running `devtools::check()`. + If R CMD check doesn't pass cleanly, it's a good idea to ask for help before continuing. +* Create a Git branch for your pull request (PR). We recommend using `usethis::pr_init("brief-description-of-change")`. + +* Make your changes, commit to git, and then create a PR by running `usethis::pr_push()`, and following the prompts in your browser. + The title of your PR should briefly describe the change. + The body of your PR should contain `Fixes #issue-number`. + +* For user-facing changes, add a bullet to the top of `NEWS.md` (i.e. just below the first header). Follow the style described in . + +### Code style + +* New code should follow the tidyverse [style guide](https://style.tidyverse.org). + You can use the [styler](https://CRAN.R-project.org/package=styler) package to apply these styles, but please don't restyle code that has nothing to do with your PR. + +* We use [roxygen2](https://cran.r-project.org/package=roxygen2), with [Markdown syntax](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd-formatting.html), for documentation. + +* We use [testthat](https://cran.r-project.org/package=testthat) for unit tests. + Contributions with test cases included are easier to accept. + +## Code of Conduct + +Please note that the recount project is released with a +[Contributor Code of Conduct](CODE_OF_CONDUCT.md). By contributing to this +project you agree to abide by its terms. 
diff --git a/.github/ISSUE_TEMPLATE/issue_template.md b/.github/ISSUE_TEMPLATE/issue_template.md new file mode 100644 index 0000000..6e857c6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/issue_template.md @@ -0,0 +1,56 @@ +--- +name: Bug report or feature request +about: Describe a bug you've seen or make a case for a new feature +title: "[BUG] Your bug or feature request" +labels: '' +assignees: '' +--- + +Please briefly describe your problem and what output you expect. If you have a question, please don't use this form. Instead, ask on using the appropriate tag(s) including one for this package. + +## Context + +Provide some context for your bug report or feature request. This could be the: + +* link to raw code, example: https://github.com/lcolladotor/osca_LIIGH_UNAM_2020/blob/master/00-template.Rmd#L24-L28 +* link to a commit, example: https://github.com/lcolladotor/osca_LIIGH_UNAM_2020/commit/6aa30b22eda614d932c12997ba611ba582c435d7 +* link to a line of code inside a commit, example: https://github.com/lcolladotor/osca_LIIGH_UNAM_2020/commit/6aa30b22eda614d932c12997ba611ba582c435d7#diff-e265269fe4f17929940e81341b92b116R17 +* link to code from an R package, example: https://github.com/LieberInstitute/spatialLIBD/blob/master/R/run_app.R#L51-L55 + +## Code + +Include the code you ran and comments + +```R +## prompt an error +stop('hola') + +## check the error trace +traceback() +``` + +## Small reproducible example + +If you copy the lines of code that lead to your error, you can then run [`reprex::reprex()`](https://reprex.tidyverse.org/reference/reprex.html) which will create a small website with code you can then easily copy-paste here in a way that will be easy to work with later on. + +```R +## prompt an error +stop('hola') +#> Error in eval(expr, envir, enclos): hola + +## check the error trace +traceback() +#> No traceback available +``` + + +## R session information + +Remember to include your full R session information. 
+ +```R +options(width = 120) +sessioninfo::session_info() +``` + +The output of `sessioninfo::session_info()` includes relevant GitHub installation information and other details that are missed by `sessionInfo()`. diff --git a/.github/SUPPORT.md b/.github/SUPPORT.md new file mode 100644 index 0000000..a4fd888 --- /dev/null +++ b/.github/SUPPORT.md @@ -0,0 +1,35 @@ +# Getting help with recount + +Thanks for using recount! +Before filing an issue, there are a few places to explore and pieces to put together to make the process as smooth as possible. + +## Make a reprex + +Start by making a minimal **repr**oducible **ex**ample using the [reprex](https://reprex.tidyverse.org/) package. +If you haven't heard of or used reprex before, you're in for a treat! +Seriously, reprex will make all of your R-question-asking endeavors easier (which is a pretty insane ROI for the five to ten minutes it'll take you to learn what it's all about). +For additional reprex pointers, check out the [Get help!](https://www.tidyverse.org/help/) section of the tidyverse site. + +## Where to ask? + +Armed with your reprex, the next step is to figure out [where to ask](https://www.tidyverse.org/help/#where-to-ask). + +* If it's a question: start with the [Bioconductor Support Website](https://support.bioconductor.org/) using the appropriate package tag. There are more people there to answer questions. + +* If it's a bug: you're in the right place, [file an issue](https://github.com/leekgroup/recount/issues/new). + +* If you're not sure: let the community help you figure it out! + If your problem _is_ a bug or a feature request, you can easily return here and report it. + +Before opening a new issue, be sure to [search issues and pull requests](https://github.com/leekgroup/recount/issues) to make sure the bug hasn't been reported and/or already fixed in the development version. +By default, the search will be pre-populated with `is:issue is:open`. 
+You can [edit the qualifiers](https://help.github.com/articles/searching-issues-and-pull-requests/) (e.g. `is:pr`, `is:closed`) as needed. +For example, you'd simply remove `is:open` to search _all_ issues in the repo, open or closed. + +## What happens next? + +To be as efficient as possible, development of tidyverse packages tends to be very bursty, so you shouldn't worry if you don't get an immediate response. +Typically we don't look at a repo until a sufficient quantity of issues accumulates, then there’s a burst of intense activity as we focus our efforts. +That makes development more efficient because it avoids expensive context switching between problems, at the cost of taking longer to get back to you. +This process makes a good reprex particularly important because it might be multiple months between your initial report and when we start working on it. +If we can’t reproduce the bug, we can’t fix it! diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml new file mode 100644 index 0000000..0343da1 --- /dev/null +++ b/.github/workflows/check-bioc.yml @@ -0,0 +1,358 @@ +## Read more about GitHub actions at +## https://www.tidyverse.org/blog/2020/04/usethis-1-6-0/ +## which will lead you to +## https://github.com/r-lib/actions/tree/master/examples. 
+## Also check the reference manual at +## https://help.github.com/en/actions +## I also found this work in progress book +## https://ropenscilabs.github.io/actions_sandbox/ +## as well as these two other GitHub actions config files +## https://github.com/seandavi/BiocActions/blob/master/.github/workflows/main.yml +## https://github.com/csoneson/dreval/blob/master/.github/workflows/R-CMD-check.yaml +## See also this blog post +## https://seandavi.github.io/post/learning-github-actions/ +## +## The development of this GHA workflow and history is documented at +## https://github.com/r-lib/actions/issues/84 + +on: + push: + branches: + - master + - 'RELEASE_*' + pull_request: + branches: + - master + - 'RELEASE_*' + +name: R-CMD-check-bioc + +env: + has_testthat: 'true' + run_covr: 'true' + run_pkgdown: 'true' + +jobs: + define-docker-info: + runs-on: ubuntu-latest + outputs: + imagename: ${{ steps.findinfo.outputs.imagename }} + biocversion: ${{ steps.findinfo.outputs.biocversion }} + steps: + - id: findinfo + run: | + ## Find what branch we are working on + if echo "$GITHUB_REF" | grep -q "master"; then + biocversion="devel" + elif echo "$GITHUB_REF" | grep -q "RELEASE_"; then + biocversion="$(basename -- $GITHUB_REF | tr '[:upper:]' '[:lower:]')" + fi + ## Define the image name and print the info + imagename="bioconductor/bioconductor_docker:${biocversion}" + echo $imagename + echo $biocversion + + ## Save the info for the next job + echo "::set-output name=imagename::${imagename}" + echo "::set-output name=biocversion::${biocversion}" + + R-CMD-check-bioc: + runs-on: ubuntu-latest + needs: define-docker-info + + name: ubuntu-latest (r-biocdocker bioc-${{ needs.define-docker-info.outputs.biocversion }}) + + outputs: + rversion: ${{ steps.findrversion.outputs.rversion }} + biocversionnum: ${{ steps.findrversion.outputs.biocversionnum }} + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + TZ: UTC + NOT_CRAN: true + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + 
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + container: + image: ${{ needs.define-docker-info.outputs.imagename }} + volumes: + - /home/runner/work/_temp/Library:/usr/local/lib/R/host-site-library + + steps: + - uses: actions/checkout@v2 + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + shell: Rscript {0} + + - name: Cache R packages + if: "!contains(github.event.head_commit.message, '/nocache')" + uses: actions/cache@v1 + with: + path: /home/runner/work/_temp/Library + key: ${{ runner.os }}-r-biocdocker-bioc-${{ needs.define-docker-info.outputs.biocversion }}-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-r-biocdocker-bioc-${{ needs.define-docker-info.outputs.biocversion }}- + + - name: Install system dependencies + if: runner.os == 'Linux' + env: + RHUB_PLATFORM: linux-x86_64-ubuntu-gcc + run: | + Rscript -e "remotes::install_github('r-hub/sysreqs')" + sysreqs=$(Rscript -e "cat(sysreqs::sysreq_commands('DESCRIPTION'))") + sudo -s eval "$sysreqs" + + - name: Install dependencies + run: | + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("rcmdcheck") + remotes::install_cran("BiocManager") + BiocManager::install("BiocCheck") + + ## Copy all the installed packages to a location where BiocCheck + ## will find them later. This is needed when running biocdocker + ## with the shared volume. 
+ libs <- .libPaths() + message(paste(Sys.time(), 'current R library paths:')) + print(libs) + if(length(libs) > 1) { + sapply(dir(libs[1], full.names = TRUE), file.copy, to = libs[2], recursive = TRUE) + } + shell: Rscript {0} + + - name: Check + env: + _R_CHECK_CRAN_INCOMING_REMOTE_: false + run: | + rcmdcheck::rcmdcheck( + args = c("--no-build-vignettes", "--no-manual", "--timings"), + build_args = c("--no-manual", "--no-resave-data"), + error_on = "warning", + check_dir = "check" + ) + shell: Rscript {0} + + - name: Reveal testthat details + if: env.has_testthat == 'true' + run: find . -name testthat.Rout -exec cat '{}' ';' + + - name: BiocCheck + run: | + R CMD BiocCheck --no-check-R-ver --no-check-bioc-help check/*.tar.gz + ## For more options check http://bioconductor.org/packages/release/bioc/vignettes/BiocCheck/inst/doc/BiocCheck.html + + - name: Install covr + if: github.ref == 'refs/heads/master' && env.run_covr == 'true' + run: | + remotes::install_cran("covr") + shell: Rscript {0} + + - name: Test coverage + if: github.ref == 'refs/heads/master' && env.run_covr == 'true' + run: | + covr::codecov() + shell: Rscript {0} + + - name: Install pkgdown + if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' + run: | + remotes::install_cran("pkgdown") + shell: Rscript {0} + + - name: Install package + if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' + run: R CMD INSTALL . + + - name: Deploy package + if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' + run: | + git config --local user.email "action@github.com" + git config --local user.name "GitHub Action" + Rscript -e "pkgdown::deploy_to_branch(new_process = FALSE)" + shell: bash {0} + ## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE) + ## at least one locally before this will work. This creates the gh-pages + ## branch (erasing anything you haven't version controlled!) and + ## makes the git history recognizable by pkgdown. 
+ + - id: findrversion + run: | + ## Find what branch we are working on + if echo "$GITHUB_REF" | grep -q "master"; then + biocversion="devel" + elif echo "$GITHUB_REF" | grep -q "RELEASE_"; then + biocversion="release" + fi + + ## Define the R and Bioconductor version numbers + biocversionnum=$(Rscript -e "info <- BiocManager:::.version_map_get_online('https://bioconductor.org/config.yaml'); res <- subset(info, BiocStatus == '${biocversion}')[, 'Bioc']; cat(as.character(res))") + rversion=$(Rscript -e "info <- BiocManager:::.version_map_get_online('https://bioconductor.org/config.yaml'); res <- subset(info, BiocStatus == '${biocversion}')[, 'R']; cat(as.character(res))") + + ## I might need this for now until R 4.0 is out (try without it first) + if echo "$rversion" | grep -q "4.0"; then + rversion="devel" + fi + + ## Print the results + echo $biocversion + echo $biocversionnum + echo $rversion + + ## Save the info for the next job + echo "::set-output name=rversion::${rversion}" + echo "::set-output name=biocversionnum::${biocversionnum}" + shell: + bash {0} + + - name: Upload check results + if: failure() + uses: actions/upload-artifact@master + with: + name: ${{ runner.os }}-r-biocdocker-bioc-${{ needs.define-docker-info.outputs.biocversion }}-results + path: check + + R-CMD-check-r-lib: + runs-on: ${{ matrix.config.os }} + needs: [define-docker-info, R-CMD-check-bioc] + + name: ${{ matrix.config.os }} (r-${{ needs.R-CMD-check-bioc.outputs.rversion }} bioc-${{ needs.define-docker-info.outputs.biocversion }}) + + strategy: + fail-fast: false + matrix: + config: + # ## Un-comment in case you also want to run other versions + - {os: windows-latest} + - {os: macOS-latest} + # - {os: ubuntu-16.04, rspm: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"} + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + RSPM: ${{ matrix.config.rspm }} + BIOCVERSIONNUM: ${{ needs.R-CMD-check-bioc.outputs.biocversionnum }} + + steps: + - uses: actions/checkout@v2 + + - name: 
Setup R from r-lib + uses: r-lib/actions/setup-r@master + with: + r-version: ${{ needs.R-CMD-check-bioc.outputs.rversion }} + + - name: Setup pandoc from r-lib + uses: r-lib/actions/setup-pandoc@master + + - name: Query dependencies + run: | + if (!requireNamespace('remotes', quietly = TRUE)) install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + shell: Rscript {0} + + - name: Cache R packages + if: "!contains(github.event.head_commit.message, '/nocache')" + uses: actions/cache@v1 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-r-${{ needs.R-CMD-check-bioc.outputs.rversion }}-bioc-${{ needs.define-docker-info.outputs.biocversion }}-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-r-${{ needs.R-CMD-check-bioc.outputs.rversion }}-bioc-${{ needs.define-docker-info.outputs.biocversion }}- + + - name: Install Linux system dependencies + if: runner.os == 'Linux' + env: + RHUB_PLATFORM: linux-x86_64-ubuntu-gcc + run: | + Rscript -e "remotes::install_github('r-hub/sysreqs')" + sysreqs=$(Rscript -e "cat(sysreqs::sysreq_commands('DESCRIPTION'))") + sudo -s eval "$sysreqs" + + - name: Install macOS system dependencies + if: matrix.config.os == 'macOS-latest' && needs.R-CMD-check-bioc.outputs.rversion == 'devel' + run: | + ## Enable installing XML from source if needed + brew install libxml2 + echo "::set-env name=XML_CONFIG::/usr/local/opt/libxml2/bin/xml2-config" + + brew install imagemagick@6 + ## Won't be needed once the current xml2 dev is available on CRAN + Rscript -e "remotes::install_github('r-lib/xml2')" + + - name: Install Windows system dependencies + if: runner.os == 'Windows' + run: | + if (!requireNamespace('RCurl', quietly = TRUE)) install.packages('RCurl', type = 'binary') + shell: Rscript {0} + + - name: Install BiocManager + run: | + remotes::install_cran("BiocManager") + shell: Rscript {0} + + - name: Set BiocVersion + run: | + 
BiocManager::install(version = Sys.getenv('BIOCVERSIONNUM')) + shell: Rscript {0} + + - name: Install dependencies - Windows R-devel hack + if: runner.os == 'Windows' && needs.R-CMD-check-bioc.outputs.rversion == 'devel' + run: | + ## Need GenomeInfoDbData first + BiocManager::install('GenomeInfoDbData') + + ## Hack my way through Windows for R-devel (4.1) ... + ## I only need this now to avoid the compilation issues on Windows + ## such as with Rhtslib and other packages. + ## At the end of April, once R 4.0 is actually released, then this + ## won't be an issue =) + ## Note that officially Bioconductor as of today 2020-04-20 does + ## not support R-devel! Hence the lack of binaries on R 4.1 + contrib <- BiocManager::repositories() + is_cran <- names(contrib) == 'CRAN' + is_ann <- names(contrib) %in% c('BioCann', 'BioCexp', 'BioCworkflows') + contrib[!is_cran & !is_ann] <- paste0(contrib[!is_cran], '/bin/windows/contrib/4.0/') + contrib[is_ann] <- paste0(contrib[is_ann], '/src/contrib/') + contrib[is_cran] <- paste0(contrib[is_cran], '/bin/windows/contrib/4.1/') + contrib + + remotes::install_deps(dependencies = TRUE, contriburl = contrib) + remotes::install_cran("rcmdcheck", contriburl = contrib) + if (!requireNamespace("BiocCheck", quietly = TRUE)) BiocManager::install("BiocCheck", contriburl = contrib) + shell: Rscript {0} + + - name: Install dependencies + run: | + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("rcmdcheck") + if (!requireNamespace("BiocCheck", quietly = TRUE)) BiocManager::install("BiocCheck") + shell: Rscript {0} + + - name: Check + env: + _R_CHECK_CRAN_INCOMING_REMOTE_: false + run: | + rcmdcheck::rcmdcheck( + args = c("--no-build-vignettes", "--no-manual", "--timings"), + build_args = c("--no-manual", "--no-resave-data"), + error_on = "warning", + check_dir = "check" + ) + shell: Rscript {0} + + - name: Reveal testthat details + if: env.has_testthat == 'true' + run: find . 
-name testthat.Rout -exec cat '{}' ';' + + - name: BiocCheck + run: | + ## This syntax works on Windows as well as other OS + BiocCheck::BiocCheck(dir('check', 'tar.gz$', full.names = TRUE), `no-check-R-ver` = TRUE, `no-check-bioc-help` = TRUE) + ## For more options check http://bioconductor.org/packages/release/bioc/vignettes/BiocCheck/inst/doc/BiocCheck.html + shell: Rscript {0} + + - name: Upload check results + if: failure() + uses: actions/upload-artifact@master + with: + name: ${{ runner.os }}-r-${{ needs.R-CMD-check-bioc.outputs.rversion }}-bioc-${{ needs.define-docker-info.outputs.biocversion }}-results + path: check diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index cb7ace4..0000000 --- a/.travis.yml +++ /dev/null @@ -1,28 +0,0 @@ -language: r - -r: bioc-devel -warnings_are_errors: false -sudo: false -cache: packages - -r_packages: - - covr - - DT - - pheatmap - -r_build_args: "--no-manual --no-resave-data" -r_check_args: "--no-build-vignettes --no-manual --timings" - -after_success: - - Rscript -e 'library(covr); codecov()' - -notifications: - email: - on_success: change - on_failure: change - slack: - secure: AoSIHGx0+IQCoCeuCc3skINDMOoa2fDVUTdNNK4CjIkWfrWqMMXSOUc0txZdflWhwzijDF3ZzcMSJl2mHpK+yRTGT7GeaQ7djvmy2rypr4YnIttWm5APo3DXeKWFq+mfB/iXoA16D/Wu8gPgBijtd3BJntp2DeDGTYwAVtzGf52tZjbUZK29K3VdYeeLTq7h7HOkF104Z/yZ+ptTA+OWFSRsv/3vUK2gtzotM4ZkwdAOI7lQ1wktosVzMSjl6Fv8OJQVwqxpswY973C9e2dBzse2SUbF2vX1pM7R7UkQyAbFSTH3Yrcmz72SXM7U2uXhC57N5e9ZdhugUXagLK28oyREYWfnXDigmBkrBpbEThq0pGclShixl8HW5I/bYAeqF5w2TTD0H9gXpZJM7oq6TKQe3Lp6OtoF/GKACE+Bf8e14dcDsGg3GhtJoSPOdY7/zh1GeS5E7J1AmrxHlVdfodLeFtIN0RiMIg2VHHzW9/Bsks8JEzLafLPOuNumWFXF8ePR1SbTjETxxq8oET10F4wcQ2M5ewzlGTvKpjQtXIkBXzkggLJ5El51rMNRi8g5iixIhnQEx5wg9pB4DMq0n7e4lOZpCUoqrLeuHYvTE/epoetHNgj7g/EiEYBr3MkQ/WcuTvsDVT6IFjWD1b8UC1jbjDBowXYytiWjHcwCJJQ= - -env: - global: - - _R_CHECK_TIMINGS_="0" diff --git a/DESCRIPTION b/DESCRIPTION index 549f80b..35bb6fb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 
+1,7 @@ Package: recount Title: Explore and download data from the recount project -Version: 1.13.1 -Date: 2019-11-06 +Version: 1.13.2 +Date: 2020-04-21 Authors@R: c(person("Leonardo", "Collado-Torres", role = c("aut", "cre"), email = "lcolladotor@gmail.com", comment = c(ORCID = "0000-0003-2140-308X")), person("Abhinav", "Nellore", role = "ctb", @@ -51,7 +51,8 @@ Suggests: RefManageR, regionReport (>= 1.9.4), rmarkdown (>= 0.9.5), - testthat (>= 2.1.0) + testthat (>= 2.1.0), + covr VignetteBuilder: knitr Description: Explore and download data from the recount project available at https://jhubiostatistics.shinyapps.io/recount/. Using the recount package you can @@ -70,5 +71,5 @@ URL: https://github.com/leekgroup/recount BugReports: https://support.bioconductor.org/t/recount/ biocViews: Coverage, DifferentialExpression, GeneExpression, RNASeq, Sequencing, Software, DataImport, ImmunoOncology -RoxygenNote: 6.1.1 +RoxygenNote: 7.1.0 Roxygen: list(markdown = TRUE) diff --git a/NEWS.md b/NEWS.md index 0c6c0bf..a46d6ec 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# recount 1.13.2 + +SIGNIFICANT USER-VISIBLE CHANGES + +* Documentation website is now available at +http://leekgroup.github.io/recount/. It gets updated with every +commit on the master branch (bioc-devel) using GitHub Actions and pkgdown. + # recount 1.13.1 * Mention in `reproduce_ranges()` the link to diff --git a/R/abstract_search.R b/R/abstract_search.R index 5b8b863..0001bc2 100644 --- a/R/abstract_search.R +++ b/R/abstract_search.R @@ -26,26 +26,25 @@ #' #' @examples #' ## Find the Geuvadis consortium project -#' project_info <- abstract_search('Geuvadis consortium') -#' +#' project_info <- abstract_search("Geuvadis consortium") +#' #' ## See some summary information for this project #' project_info - abstract_search <- function(query, id_only = FALSE, ...) 
{ ## Check input stopifnot(is.character(query)) stopifnot(is.logical(id_only)) stopifnot(length(id_only) == 1) - + ## Use table from the package abstract_table <- recount::recount_abstract - + ## Get abstracts abstracts <- tolower(abstract_table$abstract) query <- tolower(query) i <- grep(query, abstracts, ...) - - if(id_only) { + + if (id_only) { return(abstract_table$project[i]) } else { return(abstract_table[i, ]) diff --git a/R/add_metadata.R b/R/add_metadata.R index 86f3754..8961f9a 100644 --- a/R/add_metadata.R +++ b/R/add_metadata.R @@ -47,7 +47,7 @@ #' @examples #' #' ## Add the sample metadata to an example rse_gene object -#' rse_gene <- add_metadata(rse_gene_SRP009615, 'recount_brain_v2') +#' rse_gene <- add_metadata(rse_gene_SRP009615, "recount_brain_v2") #' #' ## Explore the metadata #' colData(rse_gene) @@ -60,21 +60,19 @@ #' #' ## Obtain all the recount_brain_v2 metadata if you want to #' ## explore the metadata manually -#' recount_brain_v2 <- add_metadata(source = 'recount_brain_v2') -#' - -add_metadata <- function(rse, source = 'recount_brain_v2', is_tcga = FALSE, +#' recount_brain_v2 <- add_metadata(source = "recount_brain_v2") +add_metadata <- function(rse, source = "recount_brain_v2", is_tcga = FALSE, verbose = TRUE) { - stopifnot(length(source) == 1) ## For a NOTE in R CMD check valid_sources <- data.frame( - name = c('recount_brain_v1', 'recount_brain_v2'), + name = c("recount_brain_v1", "recount_brain_v2"), url = c( - 'https://github.com/LieberInstitute/recount-brain/blob/master/merged_metadata/recount_brain_v1.Rdata?raw=true', 'https://github.com/LieberInstitute/recount-brain/blob/master/cross_studies_metadata/recount_brain_v2.Rdata?raw=true'), - object = c('recount_brain', 'recount_brain'), - sample_id = c('run_s', 'run_s'), + "https://github.com/LieberInstitute/recount-brain/blob/master/merged_metadata/recount_brain_v1.Rdata?raw=true", 
"https://github.com/LieberInstitute/recount-brain/blob/master/cross_studies_metadata/recount_brain_v2.Rdata?raw=true" + ), + object = c("recount_brain", "recount_brain"), + sample_id = c("run_s", "run_s"), stringsAsFactors = FALSE ) @@ -82,14 +80,14 @@ add_metadata <- function(rse, source = 'recount_brain_v2', is_tcga = FALSE, to_use <- valid_sources[tolower(valid_sources$name) == tolower(source), ] - destfile <- file.path(tempdir(), paste0(to_use$name, '.Rdata')) + destfile <- file.path(tempdir(), paste0(to_use$name, ".Rdata")) - if(verbose) message(paste(Sys.time(), 'downloading the', to_use$object, 'metadata to', destfile)) + if (verbose) message(paste(Sys.time(), "downloading the", to_use$object, "metadata to", destfile)) download_retry( url = to_use$url, destfile = destfile, - mode = 'wb' + mode = "wb" ) load_meta <- function() { load(destfile, verbose = verbose) @@ -97,26 +95,30 @@ add_metadata <- function(rse, source = 'recount_brain_v2', is_tcga = FALSE, } new_meta <- load_meta() - if(missing(rse)) return(new_meta) + if (missing(rse)) { + return(new_meta) + } - if(is_tcga) { + if (is_tcga) { map <- match(colData(rse)$gdc_file_id, new_meta[, to_use$sample_id]) } else { map <- match(colData(rse)$run, new_meta[, to_use$sample_id]) } - if(verbose) { - message(paste(Sys.time(), 'found', sum(!is.na(map)), 'out of', length(map), 'samples in the', to_use$object, 'metadata')) + if (verbose) { + message(paste(Sys.time(), "found", sum(!is.na(map)), "out of", length(map), "samples in the", to_use$object, "metadata")) } ## Make a dummy table with the new metadata to be added - dummy <- as.data.frame(matrix(NA, nrow = ncol(rse), - ncol = ncol(new_meta) - 1)) + dummy <- as.data.frame(matrix(NA, + nrow = ncol(rse), + ncol = ncol(new_meta) - 1 + )) cols_to_drop <- which(colnames(new_meta) == to_use$sample_id) - colnames(dummy) <- colnames(new_meta)[- cols_to_drop] + colnames(dummy) <- colnames(new_meta)[-cols_to_drop] ## In case new data is present - if(any(!is.na(map))){ - 
dummy[!is.na(map), ] <- new_meta[map[!is.na(map)], - cols_to_drop] + if (any(!is.na(map))) { + dummy[!is.na(map), ] <- new_meta[map[!is.na(map)], -cols_to_drop] } rownames(dummy) <- NULL diff --git a/R/add_predictions.R b/R/add_predictions.R index df90770..6e59c70 100644 --- a/R/add_predictions.R +++ b/R/add_predictions.R @@ -65,43 +65,51 @@ #' #' ## Download all the latest predictions #' PredictedPhenotypes <- add_predictions() -#' - -add_predictions <- function(rse, is_tcga = FALSE, version = 'latest', +add_predictions <- function(rse, is_tcga = FALSE, version = "latest", verbose = TRUE) { ## For a NOTE in R CMD check PredictedPhenotypes <- NULL - if(version == 'latest') version <- tryCatch(suppressWarnings(readLines( - 'https://raw.githubusercontent.com/leekgroup/recount-website/master/predictions/latestVersion.txt'))[1] - , error = function(e) { + if (version == "latest") { + version <- tryCatch(suppressWarnings(readLines( + "https://raw.githubusercontent.com/leekgroup/recount-website/master/predictions/latestVersion.txt" + ))[1], + error = function(e) { print(e) - v <- '0.0.05' - message(paste(Sys.time(), 'Failed to check the latest version, using version', v)) - return(v) }) + v <- "0.0.05" + message(paste(Sys.time(), "Failed to check the latest version, using version", v)) + return(v) + } + ) + } ## Download file - predfile <- paste0('PredictedPhenotypes_v', version, '.rda') + predfile <- paste0("PredictedPhenotypes_v", version, ".rda") url <- paste0( - 'https://github.com/leekgroup/recount-website/blob/master/predictions/', - predfile, '?raw=true') + "https://github.com/leekgroup/recount-website/blob/master/predictions/", + predfile, "?raw=true" + ) destfile <- file.path(tempdir(), predfile) - if(verbose) message(paste(Sys.time(), 'downloading the predictions to', destfile)) + if (verbose) message(paste(Sys.time(), "downloading the predictions to", destfile)) download_retry( url = url, destfile = destfile, - mode = 'wb' + mode = "wb" ) load(destfile, 
verbose = verbose) - if(missing(rse)) return(PredictedPhenotypes) + if (missing(rse)) { + return(PredictedPhenotypes) + } - if(is_tcga) { + if (is_tcga) { map <- match(colData(rse)$gdc_file_id, PredictedPhenotypes$sample_id) } else { map <- match(colData(rse)$run, PredictedPhenotypes$sample_id) } - colData(rse) <- cbind(colData(rse), PredictedPhenotypes[map, - !colnames(PredictedPhenotypes) %in% c('sample_id', 'dataset')]) + colData(rse) <- cbind(colData(rse), PredictedPhenotypes[ + map, + !colnames(PredictedPhenotypes) %in% c("sample_id", "dataset") + ]) return(rse) } diff --git a/R/all_metadata.R b/R/all_metadata.R index 541ca8b..c03b1ec 100644 --- a/R/all_metadata.R +++ b/R/all_metadata.R @@ -19,7 +19,6 @@ #' @examples #' #' metadata <- all_metadata() -#' #' @details Note that for `subset = 'gtex'`, there are more variables than #' the ones we have for 'sra'. This information corresponds to file #' GTEx_Data_V6_Annotations_SampleAttributesDS.txt available at @@ -33,27 +32,28 @@ #' For more information, check . 
#' -all_metadata <- function(subset = 'sra', verbose = TRUE) { +all_metadata <- function(subset = "sra", verbose = TRUE) { ## For R CMD check metadata_clean <- NULL ## check inputs subset <- tolower(subset) - stopifnot(subset %in% c('sra', 'gtex', 'tcga')) + stopifnot(subset %in% c("sra", "gtex", "tcga")) stopifnot(length(subset) == 1) ## Download file - metafile <- paste0('metadata_clean_', subset, '.Rdata') + metafile <- paste0("metadata_clean_", subset, ".Rdata") url <- paste0( - 'https://github.com/leekgroup/recount-website/blob/master/metadata/', - metafile, '?raw=true') + "https://github.com/leekgroup/recount-website/blob/master/metadata/", + metafile, "?raw=true" + ) destfile <- file.path(tempdir(), metafile) - if(verbose) message(paste(Sys.time(), 'downloading the metadata to', destfile)) + if (verbose) message(paste(Sys.time(), "downloading the metadata to", destfile)) download_retry( url = url, destfile = destfile, - mode = 'wb' + mode = "wb" ) load(destfile) return(metadata_clean) diff --git a/R/browse_study.R b/R/browse_study.R index 71548b5..b816b5d 100644 --- a/R/browse_study.R +++ b/R/browse_study.R @@ -15,24 +15,23 @@ #' #' @examples #' ## Find the Geuvadis consortium project -#' id <- abstract_search('Geuvadis consortium', id_only = TRUE) +#' id <- abstract_search("Geuvadis consortium", id_only = TRUE) #' id #' #' ## Explore the Geuvadis consortium project #' url <- browse_study(id) -#' +#' #' ## See the actual URL #' url - browse_study <- function(project, browse = interactive()) { ## Check inputs stopifnot(is.logical(browse) & length(browse == 1)) stopifnot(is.character(project)) - + ## Construct url - url <- paste0('https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=', project) - + url <- paste0("https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=", project) + ## Finish - if(browse) sapply(url, browseURL) + if (browse) sapply(url, browseURL) return(invisible(url)) } diff --git a/R/coverage_matrix.R b/R/coverage_matrix.R index 67d3227..e1e98cc 100644 
--- a/R/coverage_matrix.R +++ b/R/coverage_matrix.R @@ -10,23 +10,23 @@ #' for `chr` for which to calculate the coverage matrix. #' @param chunksize A single integer vector defining the chunksize to use for #' computing the coverage matrix. Regions will be split into different chunks -#' which can be useful when using a parallel instance as defined by +#' which can be useful when using a parallel instance as defined by #' `bpparam`. #' @param bpparam A [BiocParallelParam-class][BiocParallel::BiocParallelParam-class] instance which -#' will be used to calculate the coverage matrix in parallel. By default, +#' will be used to calculate the coverage matrix in parallel. By default, #' [SerialParam-class][BiocParallel::SerialParam-class] will be used. #' @param verboseLoad If `TRUE` basic status updates for loading the data #' will be printed. -#' @param scale If `TRUE`, the coverage counts will be scaled to read -#' counts based on a library size of 40 million reads. Set `scale` to +#' @param scale If `TRUE`, the coverage counts will be scaled to read +#' counts based on a library size of 40 million reads. Set `scale` to #' `FALSE` if you want the raw coverage counts. The scaling method is by #' AUC, as in the default option of [scale_counts]. -#' @param round If `TRUE`, the counts are rounded to integers. Set to +#' @param round If `TRUE`, the counts are rounded to integers. Set to #' `TRUE` if you want to match the defaults of [scale_counts]. -#' +#' #' #' @return A [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] -#' object with the counts stored in the assays slot. +#' object with the counts stored in the assays slot. #' #' @details When using `outdir = NULL` the information will be accessed #' from the web on the fly. 
If you encounter internet access problems, it might @@ -54,136 +54,152 @@ #' #' @examples #' -#' if(.Platform$OS.type != 'windows') { -#' ## Reading BigWig files is not supported by rtracklayer on Windows +#' if (.Platform$OS.type != "windows") { +#' ## Reading BigWig files is not supported by rtracklayer on Windows #' ## Define expressed regions for study DRP002835, chrY -#' regions <- expressed_regions('DRP002835', 'chrY', cutoff = 5L, -#' maxClusterGap = 3000L) +#' regions <- expressed_regions("DRP002835", "chrY", +#' cutoff = 5L, +#' maxClusterGap = 3000L +#' ) #' #' ## Now calculate the coverage matrix for this study -#' rse <- coverage_matrix('DRP002835', 'chrY', regions) +#' rse <- coverage_matrix("DRP002835", "chrY", regions) #' #' ## One row per region #' identical(length(regions), nrow(rse)) #' } -#' - coverage_matrix <- function(project, chr, regions, chunksize = 1000, bpparam = NULL, outdir = NULL, chrlen = NULL, verbose = TRUE, - verboseLoad = verbose, scale = TRUE, round = FALSE, ...) { + verboseLoad = verbose, scale = TRUE, round = FALSE, ...) { ## Check inputs stopifnot(is.character(project) & length(project) == 1) stopifnot(is.character(chr) & length(chr) == 1) stopifnot((is.numeric(chunksize) | is.integer(chunksize)) & length(chunksize) == 1) - + ## Windows-specific info - if(.Platform$OS.type == 'windows') { - warning('rtracklayer does not support importing BigWig files on Windows, so this function might not work') + if (.Platform$OS.type == "windows") { + warning("rtracklayer does not support importing BigWig files on Windows, so this function might not work") } - + ## Use table from the package url_table <- recount::recount_url - + ## Subset url data url_table <- url_table[url_table$project == project, ] - if(nrow(url_table) == 0) { + if (nrow(url_table) == 0) { stop("Invalid 'project' argument. 
There's no such 'project' in the recount_url data.frame.") } - + ## Find chromosome length if absent - if(is.null(chrlen)) { - chrinfo <- read.table('https://raw.githubusercontent.com/nellore/runs/master/gtex/hg38.sizes', - col.names = c('chr', 'size'), colClasses = c('character', - 'integer')) + if (is.null(chrlen)) { + chrinfo <- read.table("https://raw.githubusercontent.com/nellore/runs/master/gtex/hg38.sizes", + col.names = c("chr", "size"), colClasses = c( + "character", + "integer" + ) + ) chrlen <- chrinfo$size[chrinfo$chr == chr] stopifnot(length(chrlen) == 1) } - - samples_i <- which(grepl('[.]bw$', url_table$file_name) & !grepl('mean', - url_table$file_name)) + + samples_i <- which(grepl("[.]bw$", url_table$file_name) & !grepl( + "mean", + url_table$file_name + )) ## Check if data is present, otherwise download it - if(!is.null(outdir)) { + if (!is.null(outdir)) { ## Check sample files sampleFiles <- sapply(samples_i, function(i) { - file.path(outdir, 'bw', url_table$file_name[i]) + file.path(outdir, "bw", url_table$file_name[i]) }) - if(any(!file.exists(sampleFiles))) { - download_study(project = project, type = 'samples', outdir = outdir, - download = TRUE, ...) + if (any(!file.exists(sampleFiles))) { + download_study( + project = project, type = "samples", outdir = outdir, + download = TRUE, ... + ) } - + ## Check phenotype data - phenoFile <- file.path(outdir, paste0(project, '.tsv')) - if(!file.exists(phenoFile)) { - download_study(project = project, type = 'phenotype', - outdir = outdir, download = TRUE, ...) + phenoFile <- file.path(outdir, paste0(project, ".tsv")) + if (!file.exists(phenoFile)) { + download_study( + project = project, type = "phenotype", + outdir = outdir, download = TRUE, ... 
+ ) } } else { - sampleFiles <- download_study(project = project, type = 'samples', - download = FALSE) - phenoFile <- download_study(project = project, type = 'phenotype', - download = FALSE) + sampleFiles <- download_study( + project = project, type = "samples", + download = FALSE + ) + phenoFile <- download_study( + project = project, type = "phenotype", + download = FALSE + ) } - + ## Read pheno data pheno <- .read_pheno(phenoFile, project) - + ## Get sample names m <- match(url_table$file_name[samples_i], pheno$bigwig_file) ## Match the pheno with the samples pheno <- pheno[m, ] - if(project != 'TCGA') { + if (project != "TCGA") { names(sampleFiles) <- pheno$run } else { names(sampleFiles) <- pheno$gdc_file_id } - + ## Define library size normalization factor - if(scale) { + if (scale) { targetSize <- 40e6 totalMapped <- pheno$auc[m] mappedPerXM <- totalMapped / targetSize } else { mappedPerXM <- rep(1, nrow(pheno)) } - - + + ## Keep only regions from the chr in question regions <- regions[seqnames(regions) == chr] - + ## Split regions into chunks nChunks <- length(regions) %/% chunksize - if(length(regions) %% chunksize > 0) nChunks <- nChunks + 1 + if (length(regions) %% chunksize > 0) nChunks <- nChunks + 1 names(regions) <- seq_len(length(regions)) - + ## Split regions into chunks - if(nChunks == 1) { + if (nChunks == 1) { regs_split <- list(regions) - names(regs_split) <- '1' + names(regs_split) <- "1" } else { regs_split <- split(regions, cut(seq_len(length(regions)), - breaks = nChunks, labels = FALSE)) + breaks = nChunks, labels = FALSE + )) } - + ## Define bpparam - if(is.null(bpparam)) bpparam <- BiocParallel::SerialParam() - + if (is.null(bpparam)) bpparam <- BiocParallel::SerialParam() + ## Load coverage data resChunks <- lapply(regs_split, derfinder:::.railMatChrRegion, sampleFiles = sampleFiles, chr = chr, mappedPerXM = mappedPerXM, L = 1, verbose = verbose, BPPARAM.railChr = bpparam, - verboseLoad = verboseLoad, chrlen = chrlen) - + verboseLoad 
= verboseLoad, chrlen = chrlen + ) + ## Group results from chunks - coverageMatrix <- do.call(rbind, lapply(resChunks, '[[', 'coverageMatrix')) - - if(round) coverageMatrix <- round(coverageMatrix, 0) - + coverageMatrix <- do.call(rbind, lapply(resChunks, "[[", "coverageMatrix")) + + if (round) coverageMatrix <- round(coverageMatrix, 0) + ## Build a RSE object rse <- SummarizedExperiment::SummarizedExperiment( - assays = list('counts' = coverageMatrix), - colData = DataFrame(pheno), rowRanges = regions) - + assays = list("counts" = coverageMatrix), + colData = DataFrame(pheno), rowRanges = regions + ) + ## Finish return(rse) } @@ -191,14 +207,18 @@ coverage_matrix <- function(project, chr, regions, chunksize = 1000, ## Helper function for reading the phenotype files .read_pheno <- function(phenoFile, project) { - if(project %in% c('SRP012682', 'TCGA')) { - subsets <- c('SRP012682' = 'gtex', 'TCGA' = 'tcga') + if (project %in% c("SRP012682", "TCGA")) { + subsets <- c("SRP012682" = "gtex", "TCGA" = "tcga") res <- all_metadata(subsets[project], verbose = FALSE) - } else { + } else { info <- readLines(phenoFile) - res <- read.table(text = info[grepl(paste0('^project|^', project), - info)], header = TRUE, stringsAsFactors = FALSE, sep = '\t', - comment.char = '', quote = '') + res <- read.table( + text = info[grepl( + paste0("^project|^", project), + info + )], header = TRUE, stringsAsFactors = FALSE, sep = "\t", + comment.char = "", quote = "" + ) } return(res) } diff --git a/R/download_retry.R b/R/download_retry.R index f3065df..f8ecdf7 100644 --- a/R/download_retry.R +++ b/R/download_retry.R @@ -24,10 +24,9 @@ #' #' ## Download the first files_info.tsv file (version 1) #' download_retry( -#' recount_url$url[which(recount_url$file_name == 'files_info.tsv')[1]] +#' recount_url$url[which(recount_url$file_name == "files_info.tsv")[1]] #' ) -#' -download_retry <- function(url, destfile = basename(url), mode = 'wb', +download_retry <- function(url, destfile = 
basename(url), mode = "wb", N.TRIES = 3L, ...) { ## Based on http://bioconductor.org/developers/how-to/web-query/ ## and downloader::download() @@ -38,18 +37,22 @@ download_retry <- function(url, destfile = basename(url), mode = 'wb', while (N.TRIES > 0L) { result <- tryCatch(downloader::download( - url = url, destfile = destfile, mode = mode, ...), error=identity) - if (!inherits(result, "error")) - break + url = url, destfile = destfile, mode = mode, ... + ), error = identity) + if (!inherits(result, "error")) { + break + } ## Wait between 0 and 2 seconds between retries Sys.sleep(runif(n = 1, min = 2, max = 5)) N.TRIES <- N.TRIES - 1L } if (N.TRIES == 0L) { - stop("'download_retry()' failed:", - "\n URL: ", url, - "\n error: ", conditionMessage(result)) + stop( + "'download_retry()' failed:", + "\n URL: ", url, + "\n error: ", conditionMessage(result) + ) } invisible(result) diff --git a/R/download_study.R b/R/download_study.R index 8bc2660..53f8a4e 100644 --- a/R/download_study.R +++ b/R/download_study.R @@ -81,36 +81,34 @@ #' @examples #' ## Find the URL to download the RangedSummarizedExperiment for the #' ## Geuvadis consortium study. -#' url <- download_study('ERP001942', download = FALSE) +#' url <- download_study("ERP001942", download = FALSE) #' #' ## See the actual URL #' url -#' #' \dontrun{ #' ## Download the example data included in the package for study SRP009615 #' -#' url2 <- download_study('SRP009615') +#' url2 <- download_study("SRP009615") #' url2 #' #' ## Load the data -#' load(file.path('SRP009615', 'rse_gene.Rdata')) +#' load(file.path("SRP009615", "rse_gene.Rdata")) #' #' ## Compare the data -#' library('testthat') +#' library("testthat") #' expect_equivalent(rse_gene, rse_gene_SRP009615) -#' #' } #' - -download_study <- function(project, type = 'rse-gene', outdir = project, +download_study <- function(project, type = "rse-gene", outdir = project, download = TRUE, version = 2, ...) 
{ ## Check inputs stopifnot(is.character(project) & length(project) == 1) stopifnot(version %in% c(1, 2)) stopifnot(type %in% c( - 'rse-gene', 'rse-exon', 'rse-jx', 'rse-tx', - 'counts-gene', 'counts-exon', 'counts-jx', - 'phenotype', 'files-info', 'samples', 'mean', 'rse-fc', 'all')) + "rse-gene", "rse-exon", "rse-jx", "rse-tx", + "counts-gene", "counts-exon", "counts-jx", + "phenotype", "files-info", "samples", "mean", "rse-fc", "all" + )) stopifnot(length(type) == 1) stopifnot(is.logical(download) & length(download) == 1) @@ -118,77 +116,88 @@ download_study <- function(project, type = 'rse-gene', outdir = project, url_table <- recount::recount_url ## URLs default to version 2 (disjoint exons). - if(version == 1) url_table$url <- gsub('v2/', '', url_table$url) + if (version == 1) url_table$url <- gsub("v2/", "", url_table$url) ## Subset url data url_table <- url_table[url_table$project == project, ] - if(nrow(url_table) == 0) { + if (nrow(url_table) == 0) { stop("Invalid 'project' argument. There's no such 'project' in the recount_url data.frame.") } ## If all, download each type individually - if(type == 'all') { + if (type == "all") { urls <- sapply(c( - 'rse-gene', 'rse-exon', 'rse-jx', 'rse-tx', - 'counts-gene', 'counts-exon', 'counts-jx', - 'phenotype', 'files-info', 'samples', 'mean'), function(file_type) { + "rse-gene", "rse-exon", "rse-jx", "rse-tx", + "counts-gene", "counts-exon", "counts-jx", + "phenotype", "files-info", "samples", "mean" + ), function(file_type) { Sys.sleep(runif(n = 1, min = 2, max = 5)) - download_study(project = project, type = file_type, - outdir = outdir, download = download, version = version, ...) + download_study( + project = project, type = file_type, + outdir = outdir, download = download, version = version, ... 
+ ) }) return(invisible(urls)) } ## Create output directory if needed - if(download) { + if (download) { dir.create(outdir, showWarnings = FALSE) - if(type %in% c('samples', 'mean')) { + if (type %in% c("samples", "mean")) { ## Save bigwigs on their own folder - outdir <- file.path(outdir, 'bw') + outdir <- file.path(outdir, "bw") dir.create(outdir, showWarnings = FALSE) } } ## Select files to download and download them - if(type != 'samples') { + if (type != "samples") { filename <- switch(type, - 'rse-gene' = 'rse_gene.Rdata', - 'rse-exon' = 'rse_exon.Rdata', - 'rse-jx' = 'rse_jx.Rdata', - 'rse-tx' = 'rse_tx.RData', - 'counts-gene' = 'counts_gene.tsv.gz', - 'counts-exon' = 'counts_exon.tsv.gz', - 'counts-jx' = 'counts_jx.tsv.gz', - phenotype = paste0(project, '.tsv'), - 'files-info' = 'files_info.tsv', - mean = paste0('mean_', project, '.bw'), - 'rse-fc' = paste0('rse_fc_', project, '.Rdata') + "rse-gene" = "rse_gene.Rdata", + "rse-exon" = "rse_exon.Rdata", + "rse-jx" = "rse_jx.Rdata", + "rse-tx" = "rse_tx.RData", + "counts-gene" = "counts_gene.tsv.gz", + "counts-exon" = "counts_exon.tsv.gz", + "counts-jx" = "counts_jx.tsv.gz", + phenotype = paste0(project, ".tsv"), + "files-info" = "files_info.tsv", + mean = paste0("mean_", project, ".bw"), + "rse-fc" = paste0("rse_fc_", project, ".Rdata") ) url <- url_table$url[url_table$file_name == filename] - if(length(url) == 0) return(NULL) - if(download) { - message(paste(Sys.time(), 'downloading file', filename, 'to', - outdir)) + if (length(url) == 0) { + return(NULL) + } + if (download) { + message(paste( + Sys.time(), "downloading file", filename, "to", + outdir + )) xx <- download_retry( url = url, destfile = file.path(outdir, filename), - mode = 'wb', + mode = "wb", ... 
) } - } else if(type == 'samples') { - url_table <- url_table[url_table$file_name != paste0('mean_', project, - '.bw'), ] - sample_urls <- url_table[grep('[.]bw$', url_table$file_name), ] + } else if (type == "samples") { + url_table <- url_table[url_table$file_name != paste0( + "mean_", project, + ".bw" + ), ] + sample_urls <- url_table[grep("[.]bw$", url_table$file_name), ] url <- sample_urls$url - if(download) { + if (download) { xx <- sapply(seq_len(nrow(sample_urls)), function(i, ...) { - message(paste(Sys.time(), 'downloading file', - sample_urls$file_name[i], 'to', outdir)) + message(paste( + Sys.time(), "downloading file", + sample_urls$file_name[i], "to", outdir + )) download_retry( url = sample_urls$url[i], destfile = file.path(outdir, sample_urls$file_name[i]), - mode = 'wb', + mode = "wb", ... ) }, ...) diff --git a/R/expressed_regions.R b/R/expressed_regions.R index 67f0483..750223a 100644 --- a/R/expressed_regions.R +++ b/R/expressed_regions.R @@ -9,16 +9,16 @@ #' @param chr A character vector with the name of the chromosome. #' @param cutoff The base-pair level cutoff to use. #' @param outdir The destination directory for the downloaded file(s) that were -#' previously downloaded with [download_study]. If the files are missing, +#' previously downloaded with [download_study]. If the files are missing, #' but `outdir` is specified, they will get downloaded first. By default #' `outdir` is set to `NULL` which will use the data from the web. #' We only recommend downloading the full data if you will use it several times. #' @param maxClusterGap This determines the maximum gap between candidate ERs. -#' @param chrlen The chromosome length in base pairs. If it's `NULL`, the +#' @param chrlen The chromosome length in base pairs. If it's `NULL`, the #' chromosome length is extracted from the Rail-RNA runs GitHub repository. #' Alternatively check the `SciServer` section on the vignette to see #' how to access all the recount data via a R Jupyter Notebook. 
-#' @param verbose If `TRUE` basic status updates will be printed along the +#' @param verbose If `TRUE` basic status updates will be printed along the #' way. #' @param ... Additional arguments passed to [download_study] when #' `outdir` is specified but the required files are missing. @@ -38,84 +38,96 @@ #' #' @examples #' ## Define expressed regions for study SRP009615, chrY -#' if(.Platform$OS.type != 'windows') { -#' ## Reading BigWig files is not supported by rtracklayer on Windows -#' regions <- expressed_regions('SRP009615', 'chrY', cutoff = 5L, -#' maxClusterGap = 3000L) +#' if (.Platform$OS.type != "windows") { +#' ## Reading BigWig files is not supported by rtracklayer on Windows +#' regions <- expressed_regions("SRP009615", "chrY", +#' cutoff = 5L, +#' maxClusterGap = 3000L +#' ) #' } -#' #' \dontrun{ #' ## Define the regions for multiple chrs -#' regs <- sapply(chrs, expressed_regions, project = 'SRP009615', cutoff = 5L) +#' regs <- sapply(chrs, expressed_regions, project = "SRP009615", cutoff = 5L) #' #' ## You can then combine them into a single GRanges object if you want to -#' library('GenomicRanges') +#' library("GenomicRanges") #' single <- unlist(GRangesList(regs)) #' } #' - expressed_regions <- function(project, chr, cutoff, outdir = NULL, maxClusterGap = 300L, chrlen = NULL, verbose = TRUE, ...) 
{ ## Check inputs stopifnot(is.character(project) & length(project) == 1) stopifnot(is.character(chr) & length(chr) == 1) - + ## Windows-specific info - if(.Platform$OS.type == 'windows') { - warning('rtracklayer does not support importing BigWig files on Windows, so this function might not work') + if (.Platform$OS.type == "windows") { + warning("rtracklayer does not support importing BigWig files on Windows, so this function might not work") } - + ## Use table from the package url_table <- recount::recount_url - + ## Subset url data url_table <- url_table[url_table$project == project, ] - if(nrow(url_table) == 0) { + if (nrow(url_table) == 0) { stop("Invalid 'project' argument. There's no such 'project' in the recount_url data.frame.") } - + ## Find chromosome length if absent - if(is.null(chrlen)) { - chrinfo <- read.table('https://raw.githubusercontent.com/nellore/runs/master/gtex/hg38.sizes', - col.names = c('chr', 'size'), colClasses = c('character', - 'integer')) + if (is.null(chrlen)) { + chrinfo <- read.table("https://raw.githubusercontent.com/nellore/runs/master/gtex/hg38.sizes", + col.names = c("chr", "size"), colClasses = c( + "character", + "integer" + ) + ) chrlen <- chrinfo$size[chrinfo$chr == chr] stopifnot(length(chrlen) == 1) } - + ## Check if data is present, otherwise download it - if(!is.null(outdir)) { + if (!is.null(outdir)) { ## Check mean file - meanFile <- file.path(outdir, 'bw', url_table$file_name[grep('mean', - url_table$file_name)]) - if(!file.exists(meanFile)) { - download_study(project = project, type = 'mean', outdir = outdir, - download = TRUE, ...) + meanFile <- file.path(outdir, "bw", url_table$file_name[grep( + "mean", + url_table$file_name + )]) + if (!file.exists(meanFile)) { + download_study( + project = project, type = "mean", outdir = outdir, + download = TRUE, ... 
+ ) } } else { - meanFile <- download_study(project = project, type = 'mean', - download = FALSE) + meanFile <- download_study( + project = project, type = "mean", + download = FALSE + ) } - + ## Load coverage - meanCov <- derfinder::loadCoverage(files = meanFile, chr = chr, - chrlen = chrlen) - + meanCov <- derfinder::loadCoverage( + files = meanFile, chr = chr, + chrlen = chrlen + ) + ## Find regions regs <- derfinder::findRegions( position = S4Vectors::Rle(TRUE, length(meanCov$coverage[[1]])), fstats = meanCov$coverage[[1]], chr = chr, - maxClusterGap = maxClusterGap, cutoff = cutoff, verbose = verbose) - + maxClusterGap = maxClusterGap, cutoff = cutoff, verbose = verbose + ) + ## If there are no regions, return NULL - if(is.null(regs)) regs <- GenomicRanges::GRanges() - + if (is.null(regs)) regs <- GenomicRanges::GRanges() + ## Format appropriately names(regs) <- seq_len(length(regs)) - + ## Set the length GenomeInfoDb::seqlengths(regs) <- length(meanCov$coverage[[1]]) - + ## Finish return(regs) } diff --git a/R/find_geo.R b/R/find_geo.R index ce999db..db5cc9b 100644 --- a/R/find_geo.R +++ b/R/find_geo.R @@ -26,37 +26,41 @@ #' #' @examples #' ## Find the GEO accession id for for SRX110461 -#' find_geo('SRX110461') -#' - -find_geo <- function(run, verbose = FALSE, sleep = 1/2) { +#' find_geo("SRX110461") +find_geo <- function(run, verbose = FALSE, sleep = 1 / 2) { ## Check inputs stopifnot(is.character(run) & length(run) == 1) - if(run == '') return(NA) - - if(verbose) message(paste(Sys.time(), 'finding GEO accession id for SRA run', run)) + if (run == "") { + return(NA) + } + + if (verbose) message(paste(Sys.time(), "finding GEO accession id for SRA run", run)) Sys.sleep(sleep) - + ## Find uid first - uid <- rentrez::entrez_search('sra', term = run) - if(length(uid$ids) == 0) return(NA) - + uid <- rentrez::entrez_search("sra", term = run) + if (length(uid$ids) == 0) { + return(NA) + } + ## Find linking ids - linking <- rentrez::entrez_link('sra', id = uid$ids, 
db = 'gds') - if(length(linking$links$sra_gds) == 0) return(NA) - + linking <- rentrez::entrez_link("sra", id = uid$ids, db = "gds") + if (length(linking$links$sra_gds) == 0) { + return(NA) + } + ## Find GSM foundGSM <- FALSE - for(i in linking$links$sra_gds) { - gsm <- rentrez::entrez_summary(db = 'gds', i)$accession - if(grepl('GSM', gsm)) { + for (i in linking$links$sra_gds) { + gsm <- rentrez::entrez_summary(db = "gds", i)$accession + if (grepl("GSM", gsm)) { foundGSM <- TRUE break } } - + ## Finish - if(foundGSM) { + if (foundGSM) { return(gsm) } else { return(NA) diff --git a/R/geo_characteristics.R b/R/geo_characteristics.R index e6ae7d0..4ed51c1 100644 --- a/R/geo_characteristics.R +++ b/R/geo_characteristics.R @@ -21,7 +21,7 @@ #' @examples #' #' ## Load required library -#' library('SummarizedExperiment') +#' library("SummarizedExperiment") #' #' ## Get the GEO accession ids #' # geoids <- sapply(colData(rse_gene_SRP009615)$run[1:2], find_geo) @@ -30,8 +30,13 @@ #' ## This code makes sure that the example code runs. 
#' geoids <- tryCatch( #' sapply(colData(rse_gene_SRP009615)$run[1:2], find_geo), -#' error = function(e) c('SRR387777' = 'GSM836270', -#' 'SRR387778' = 'GSM836271')) +#' error = function(e) { +#' c( +#' "SRR387777" = "GSM836270", +#' "SRR387778" = "GSM836271" +#' ) +#' } +#' ) #' #' ## Get the data from GEO #' geodata <- do.call(rbind, sapply(geoids, geo_info)) @@ -41,14 +46,12 @@ #' #' ## Explore the original characteristics and the result from #' ## geo_characteristics() -#' geodata[, c('characteristics', 'cells', 'shrna.expression', 'treatment')] -#' - +#' geodata[, c("characteristics", "cells", "shrna.expression", "treatment")] geo_characteristics <- function(pheno) { ## Check inputs - stopifnot('characteristics' %in% colnames(pheno)) + stopifnot("characteristics" %in% colnames(pheno)) - if(is.character(pheno$characteristics)) { + if (is.character(pheno$characteristics)) { ## Solves https://support.bioconductor.org/p/116480/ pheno$characteristics <- IRanges::CharacterList( lapply(lapply(pheno$characteristics, str2lang), eval) @@ -57,24 +60,25 @@ geo_characteristics <- function(pheno) { ## Separate information result <- lapply(pheno$characteristics, function(sampleinfo) { - ## Check if there are colons - if(any(!as.vector(sapply(sampleinfo, grepl, pattern = ':')))) { + ## Check if there are colons + if (any(!as.vector(sapply(sampleinfo, grepl, pattern = ":")))) { res <- data.frame( - 'characteristics' = paste(unlist(sampleinfo), - collapse = ', '), + "characteristics" = paste(unlist(sampleinfo), + collapse = ", " + ), stringsAsFactors = FALSE ) return(res) } - info <- strsplit(sampleinfo, ': ') + info <- strsplit(sampleinfo, ": ") ## Get variable names - varNames <- sapply(info, '[[', 1) + varNames <- sapply(info, "[[", 1) varNames <- make.names(tolower(varNames)) ## Construct result - res <- matrix(sapply(info, '[[', 2), nrow = 1) + res <- matrix(sapply(info, "[[", 2), nrow = 1) colnames(res) <- varNames res <- data.frame(res, stringsAsFactors = FALSE) diff 
--git a/R/geo_info.R b/R/geo_info.R index 1e83765..74d45a5 100644 --- a/R/geo_info.R +++ b/R/geo_info.R @@ -24,84 +24,101 @@ #' @import GEOquery IRanges S4Vectors #' #' @examples -#' geo_info('GSM836270') -#' - -geo_info <- function(geoid, verbose = FALSE, sleep = 1/2, getGPL = FALSE, +#' geo_info("GSM836270") +geo_info <- function(geoid, verbose = FALSE, sleep = 1 / 2, getGPL = FALSE, destdir = tempdir(), ...) { - if(is.na(geoid)) return(NULL) + if (is.na(geoid)) { + return(NULL) + } ## Check inputs stopifnot(is.character(geoid) & length(geoid) == 1) - if(verbose) message(paste(Sys.time(), - 'finding GEO information for GEO accession id', geoid)) + if (verbose) { + message(paste( + Sys.time(), + "finding GEO information for GEO accession id", geoid + )) + } - if(!file.exists(file.path(destdir, paste0(geoid, '.soft')))) { + if (!file.exists(file.path(destdir, paste0(geoid, ".soft")))) { Sys.sleep(sleep) } ## Get data from GEO, with 3 retries, waiting between 0 and 2 seconds in ## between retries N.TRIES <- 3L - while(N.TRIES > 0L) { + while (N.TRIES > 0L) { geo <- tryCatch( GEOquery::getGEO(geoid, getGPL = getGPL, destdir = destdir, ...), error = function(e) { - soft <- paste0(geoid, '.soft') + soft <- paste0(geoid, ".soft") soft_file <- file.path(destdir, soft) - if(any(grepl('private', readLines(soft_file)))) { - message(paste(geoid, 'is currently private')) + if (any(grepl("private", readLines(soft_file)))) { + message(paste(geoid, "is currently private")) return(NA) - } else if (any(grepl('blocked', readLines(soft_file)))) { - warning(paste('It seems like your IP access is blocked. Please check the file', soft_file, 'for more details.')) + } else if (any(grepl("blocked", readLines(soft_file)))) { + warning(paste("It seems like your IP access is blocked. 
Please check the file", soft_file, "for more details.")) return(NA) } else { return(e) } } ) - if(!inherits(geo, 'error')) - break + if (!inherits(geo, "error")) { + break + } Sys.sleep(runif(n = 1, min = 2, max = 5)) N.TRIES <- N.TRIES - 1L } ## Return and empty DataFrame if there was an issue with getGEO() - if(!is(geo, 'GSM')) return(S4Vectors::DataFrame()) + if (!is(geo, "GSM")) { + return(S4Vectors::DataFrame()) + } ## Extract the header information - result <- geo@header + result <- geo@header ## Function for cleaning clean_geo <- function(pattern, varname, res) { - charIndex <- grep(pattern, names(res)) - if(length(charIndex) > 0) { - res <- c(res, - IRanges::CharacterList(unlist(unname(result[charIndex])))) + charIndex <- grep(pattern, names(res)) + if (length(charIndex) > 0) { + res <- c( + res, + IRanges::CharacterList(unlist(unname(result[charIndex]))) + ) names(res)[length(res)] <- varname - res <- res[-charIndex] - } + res <- res[-charIndex] + } return(res) } ## Clean up the header information df <- data.frame( - pattern = c('characteristics_ch1', 'data_processing', 'contact_', - 'extract_', 'library_', 'relation', 'series_', - 'supplementary_file_'), - varname = c('characteristics', 'data_processing', 'contact', 'extract', - 'library', 'relation', 'series', 'supplementary_file'), + pattern = c( + "characteristics_ch1", "data_processing", "contact_", + "extract_", "library_", "relation", "series_", + "supplementary_file_" + ), + varname = c( + "characteristics", "data_processing", "contact", "extract", + "library", "relation", "series", "supplementary_file" + ), stringsAsFactors = FALSE ) - for(i in seq_len(nrow(df))) result <- clean_geo(df$pattern[i], - df$varname[i], result) + for (i in seq_len(nrow(df))) { + result <- clean_geo( + df$pattern[i], + df$varname[i], result + ) + } ## Make sure they are all length 1 - if(any(S4Vectors::elementNROWS(result) > 1)) { - for(i in which(S4Vectors::elementNROWS(result) > 1)) result[i] <- 
IRanges::CharacterList(unlist(unname(result[i]))) + if (any(S4Vectors::elementNROWS(result) > 1)) { + for (i in which(S4Vectors::elementNROWS(result) > 1)) result[i] <- IRanges::CharacterList(unlist(unname(result[i]))) } ## Finish - return(S4Vectors::DataFrame(result)) + return(S4Vectors::DataFrame(result)) } diff --git a/R/getRPKM.R b/R/getRPKM.R index 86d8a6f..4336f95 100644 --- a/R/getRPKM.R +++ b/R/getRPKM.R @@ -36,20 +36,18 @@ #' ## You can also get an RPKM matrix after running scale_counts() #' ## with similar RPKM values #' rpkm2 <- getRPKM(scale_counts(rse_gene_SRP009615)) -#' rpkm3 <- getRPKM(scale_counts(rse_gene_SRP009615, by = 'mapped_reads')) +#' rpkm3 <- getRPKM(scale_counts(rse_gene_SRP009615, by = "mapped_reads")) #' #' summary(rpkm - rpkm2) #' summary(rpkm - rpkm3) #' summary(rpkm2 - rpkm3) -#' - -getRPKM <- function(rse, length_var = 'bp_length', mapped_var = NULL) { - mapped <- if(!is.null(mapped_var)) colData(rse)[, mapped_var] else colSums(assays(rse)$counts) +getRPKM <- function(rse, length_var = "bp_length", mapped_var = NULL) { + mapped <- if (!is.null(mapped_var)) colData(rse)[, mapped_var] else colSums(assays(rse)$counts) bg <- matrix(mapped, ncol = ncol(rse), nrow = nrow(rse), byrow = TRUE) - len <- if(!is.null(length_var)) rowData(rse)[, length_var] else width(rowRanges(rse)) + len <- if (!is.null(length_var)) rowData(rse)[, length_var] else width(rowRanges(rse)) wid <- matrix(len, nrow = nrow(rse), ncol = ncol(rse), byrow = FALSE) - assays(rse)$counts / (wid/1000) / (bg/1e6) + assays(rse)$counts / (wid / 1000) / (bg / 1e6) } diff --git a/R/getTPM.R b/R/getTPM.R index 67625f8..0426484 100644 --- a/R/getTPM.R +++ b/R/getTPM.R @@ -39,10 +39,10 @@ #' #' ## Compute the TPM matrix from the raw gene-level base-pair counts. 
#' tpm <- getTPM(rse_gene_SRP009615) -#' - -getTPM <- function(rse, length_var = 'bp_length', mapped_var = NULL) { +getTPM <- function(rse, length_var = "bp_length", mapped_var = NULL) { # From https://support.bioconductor.org/p/124265/ rpkm <- getRPKM(rse, length_var = length_var, mapped_var = mapped_var) - apply(rpkm, 2, function(x) { (x / sum(x)) * 10^6 }) + apply(rpkm, 2, function(x) { + (x / sum(x)) * 10^6 + }) } diff --git a/R/read_counts.R b/R/read_counts.R index f9e21ef..4c9b857 100644 --- a/R/read_counts.R +++ b/R/read_counts.R @@ -40,30 +40,27 @@ #' #' ## Difference between read counts and reads downloaded by Rail-RNA #' colSums(assays(read_counts(rse_gene_SRP009615))$counts) / 1e6 - -#' colData(rse_gene_SRP009615)$reads_downloaded /1e6 +#' colData(rse_gene_SRP009615)$reads_downloaded / 1e6 #' #' ## Paired-end read counts vs fragment counts (single-end counts) -#' download_study('DRP000499') -#' load('DRP000499/rse_gene.Rdata') +#' download_study("DRP000499") +#' load("DRP000499/rse_gene.Rdata") #' colSums(assays(read_counts(rse_gene, use_paired_end = FALSE))$counts) / #' colSums(assays(read_counts(rse_gene))$counts) #' #' ## Difference between paired-end read counts vs paired-end reads downloaded #' colSums(assays(read_counts(rse_gene))$counts) / 1e6 - -#' colData(rse_gene)$reads_downloaded / 1e6 /2 -#' - +#' colData(rse_gene)$reads_downloaded / 1e6 / 2 read_counts <- function(rse, use_paired_end = TRUE, round = FALSE) { - - if(use_paired_end) { + if (use_paired_end) { counts <- t(t(assays(rse)$counts) / (colData(rse)$auc / (colData(rse)$mapped_read_count / ifelse(colData(rse)$paired_end, 2, 1)))) } else { counts <- t(t(assays(rse)$counts) / (colData(rse)$auc / - colData(rse)$mapped_read_count )) + colData(rse)$mapped_read_count)) } - if(round) counts <- round(counts, 0) + if (round) counts <- round(counts, 0) assays(rse)$counts <- counts return(rse) } diff --git a/R/recount-package.R b/R/recount-package.R index 44a07f3..b30bbda 100644 --- 
a/R/recount-package.R +++ b/R/recount-package.R @@ -1,22 +1,8 @@ -#' Explore and download data from the recount project. -#' -#' Explore and download data from the recount project available at -#' . Using the recount package -#' you can download -#' [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] -#' objects at the gene or exon level, the raw counts, the phenotype metadata -#' used, the urls to the sample coverage bigWig files or the mean coverage -#' bigWig file for a particular study. The -#' [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] objects can be -#' used by different packages for performing differential expression analysis. -#' Using you can perform -#' annotation-agnostic differential expression analyses with the data from the -#' recount project. -#' -#' @name recount-package -#' @aliases recount-package -#' @docType package -#' @author Leonardo Collado-Torres -#' -#' @keywords package +#' @keywords internal +"_PACKAGE" + +# The following block is used by usethis to automatically manage +# roxygen namespace tags. Modify with care! +## usethis namespace: start +## usethis namespace: end NULL diff --git a/R/recount_exons-data.R b/R/recount_exons-data.R index 108140d..9493c0a 100644 --- a/R/recount_exons-data.R +++ b/R/recount_exons-data.R @@ -5,7 +5,7 @@ #' #' @name recount_exons #' @docType data -#' @format A [GRangesList-class][GenomicRanges::GRangesList-class] with one element per +#' @format A [GRangesList-class][GenomicRanges::GRangesList-class] with one element per #' gene. The names match the gene Gencode v25 ids. #' #' @keywords datasets diff --git a/R/recount_genes-data.R b/R/recount_genes-data.R index f35cc6c..0a04258 100644 --- a/R/recount_genes-data.R +++ b/R/recount_genes-data.R @@ -4,17 +4,17 @@ #' Data extraced on October 12th, 2017. The symbols were updated compared #' to the version from January 17th, 2017. 
#' It includes the sum of the width of the disjoint exons which can be -#' used for normalizing the counts provided in the +#' used for normalizing the counts provided in the #' [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] objects. #' #' @name recount_genes #' @docType data -#' @format A [GRanges-class][GenomicRanges::GRanges-class] with one range per gene. The +#' @format A [GRanges-class][GenomicRanges::GRanges-class] with one range per gene. The #' names match their Gencode v25 ids. The [GRanges-class][GenomicRanges::GRanges-class] #' has three metadata columns. #' \describe{ #' \item{gene_id }{ the Gencode v25 ids, identical to the names.} -#' \item{bp_length }{ the sum of the width of the disjoint exons for that +#' \item{bp_length }{ the sum of the width of the disjoint exons for that #' given gene.} #' \item{symbol }{ a CharacterList with the corresponding gene symbols.} #' } diff --git a/R/recount_url-data.R b/R/recount_url-data.R index 5d76ef6..3c84e59 100644 --- a/R/recount_url-data.R +++ b/R/recount_url-data.R @@ -21,4 +21,4 @@ #' @keywords datasets #' @seealso [download_study] #' @references -NULL +NULL diff --git a/R/reproduce_ranges.R b/R/reproduce_ranges.R index bafa63d..2104c36 100644 --- a/R/reproduce_ranges.R +++ b/R/reproduce_ranges.R @@ -54,22 +54,22 @@ #' length(recount_genes) #' } #' - -reproduce_ranges <- function(level = 'gene', db = 'Gencode.v25') { +reproduce_ranges <- function(level = "gene", db = "Gencode.v25") { ## Check input level <- tolower(level) - stopifnot(level %in% c('gene', 'exon', 'both')) - stopifnot(db %in% c('Gencode.v25', 'EnsDb.Hsapiens.v79')) + stopifnot(level %in% c("gene", "exon", "both")) + stopifnot(db %in% c("Gencode.v25", "EnsDb.Hsapiens.v79")) ## Load required packages - .load_check(c('GenomicFeatures', 'org.Hs.eg.db')) - if (db == 'Gencode.v25') { - txdb <- 
GenomicFeatures::makeTxDbFromGFF('ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_25/gencode.v25.annotation.gff3.gz', - format = 'gff3', organism = 'Homo sapiens') - } else if(db == 'EnsDb.Hsapiens.v79') { - .load_check('EnsDb.Hsapiens.v79') + .load_check(c("GenomicFeatures", "org.Hs.eg.db")) + if (db == "Gencode.v25") { + txdb <- GenomicFeatures::makeTxDbFromGFF("ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_25/gencode.v25.annotation.gff3.gz", + format = "gff3", organism = "Homo sapiens" + ) + } else if (db == "EnsDb.Hsapiens.v79") { + .load_check("EnsDb.Hsapiens.v79") txdb <- EnsDb.Hsapiens.v79::EnsDb.Hsapiens.v79 } @@ -77,38 +77,42 @@ reproduce_ranges <- function(level = 'gene', db = 'Gencode.v25') { genes <- GenomicFeatures::genes(txdb) ## Get Exons - exons <- GenomicFeatures::exonsBy(txdb, by = 'gene') + exons <- GenomicFeatures::exonsBy(txdb, by = "gene") ## Keep only exons for gene ids that we selected previously - if(!all(names(exons) %in% names(genes))) { - warning('Dropping exons with gene ids not present in the gene list') + if (!all(names(exons) %in% names(genes))) { + warning("Dropping exons with gene ids not present in the gene list") exons <- exons[names(exons) %in% names(genes)] } ## Disjoin exons by gene so the exons won't be overlapping each other inside a gene exons <- GenomicRanges::disjoin(exons) - if(level == 'exon') return(exons) + if (level == "exon") { + return(exons) + } ## For 'gene' or 'both', continue by: ## * adding length of disjoint exons by gene genes$bp_length <- sum(GenomicRanges::width(exons)) ## * adding gene symbol - if(db == 'Gencode.v25') { + if (db == "Gencode.v25") { gene_info <- AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, - gsub('\\..*', '', names(genes)), 'SYMBOL', 'ENSEMBL', - multiVals = 'CharacterList') - } else if(db == 'EnsDb.Hsapiens.v79') { + gsub("\\..*", "", names(genes)), "SYMBOL", "ENSEMBL", + multiVals = "CharacterList" + ) + } else if (db == "EnsDb.Hsapiens.v79") { 
gene_info <- AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, - names(genes), 'SYMBOL', 'ENSEMBL', multiVals = 'CharacterList') + names(genes), "SYMBOL", "ENSEMBL", + multiVals = "CharacterList" + ) } genes$symbol <- gene_info - if(level == 'gene') { + if (level == "gene") { return(genes) - } else if (level == 'both') { - return(list('exon' = exons, 'gene' = genes)) + } else if (level == "both") { + return(list("exon" = exons, "gene" = genes)) } } - diff --git a/R/rse_gene_SRP009615-data.R b/R/rse_gene_SRP009615-data.R index e55c59d..c70870c 100644 --- a/R/rse_gene_SRP009615-data.R +++ b/R/rse_gene_SRP009615-data.R @@ -1,6 +1,6 @@ #' RangedSummarizedExperiment at the gene level for study SRP009615 #' -#' [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] at the gene +#' [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] at the gene #' level for study SRP009615. Used as an example in [scale_counts]. #' Matches the version2 file #' (details at ) under the @@ -8,8 +8,8 @@ #' #' @name rse_gene_SRP009615 #' @docType data -#' @format A [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] as -#' created by the recount project for study with SRA id (accession number) +#' @format A [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] as +#' created by the recount project for study with SRA id (accession number) #' SRP009615. #' #' @keywords datasets diff --git a/R/scale_counts.R b/R/scale_counts.R index 54e242b..49387e6 100644 --- a/R/scale_counts.R +++ b/R/scale_counts.R @@ -8,9 +8,9 @@ #' into account the gene or exon lengths. If you prefer to calculate read counts #' without scaling check the function [read_counts]. 
#' -#' @param rse A [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] +#' @param rse A [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] #' object as downloaded with [download_study]. -#' @param by Either `auc` or `mapped_reads`. If set to `auc` it +#' @param by Either `auc` or `mapped_reads`. If set to `auc` it #' will scale the counts by the total coverage of the sample. That is, the area #' under the curve (AUC) of the coverage. If set to `mapped_reads` it will #' scale the counts by the number of mapped reads, whether the library was @@ -19,8 +19,8 @@ #' @param L The target read length. Only used when `by = 'mapped_reads'` #' since it cancels out in the calculation when using `by = 'auc'`. #' @param factor_only Whether to only return the numeric scaling factor or -#' to return a [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] -#' object with the counts scaled. If set to `TRUE`, you have to multiply +#' to return a [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] +#' object with the counts scaled. If set to `TRUE`, you have to multiply #' the sample counts by this scaling factor. #' @param round Whether to round the counts to integers or not. 
#' @@ -52,7 +52,7 @@ #' rse <- scale_counts(rse_gene) #' #' ## Find the project used as an example -#' project_info <- abstract_search('GSE32465') +#' project_info <- abstract_search("GSE32465") #' #' ## See some summary information for this project #' project_info @@ -61,62 +61,63 @@ #' \dontrun{ #' ## Download #' download_study(project_info$project) -#' +#' #' ## Load file -#' load(file.path(project_info$project, 'rse_gene.Rdata')) +#' load(file.path(project_info$project, "rse_gene.Rdata")) #' identical(rse_gene, rse_gene_SRP009615) #' } #' - - -scale_counts <- function(rse, by = 'auc', targetSize = 4e7, L = 100, - factor_only = FALSE, round = TRUE) { +scale_counts <- function(rse, by = "auc", targetSize = 4e7, L = 100, + factor_only = FALSE, round = TRUE) { ## Check inputs - stopifnot(is(rse, 'RangedSummarizedExperiment')) + stopifnot(is(rse, "RangedSummarizedExperiment")) stopifnot(length(targetSize) == 1) stopifnot(is.numeric(targetSize) | is.integer(targetSize)) stopifnot(length(L) == 1) stopifnot(is.numeric(L) | is.integer(L)) stopifnot(is.character(by)) - stopifnot(by %in% c('auc', 'mapped_reads')) + stopifnot(by %in% c("auc", "mapped_reads")) stopifnot(is.logical(factor_only)) stopifnot(length(factor_only) == 1) stopifnot(is.logical(round)) stopifnot(length(round) == 1) - + ## Check RSE details counts <- SummarizedExperiment::assay(rse, 1) stopifnot(ncol(counts) == nrow(SummarizedExperiment::colData(rse))) - if(by == 'auc'){ - stopifnot('auc' %in% colnames(SummarizedExperiment::colData(rse))) - } else if (by == 'mapped_reads') { - stopifnot(all(c('avg_read_length', 'mapped_read_count', - 'paired_end') %in% colnames(SummarizedExperiment::colData(rse)))) + if (by == "auc") { + stopifnot("auc" %in% colnames(SummarizedExperiment::colData(rse))) + } else if (by == "mapped_reads") { + stopifnot(all(c( + "avg_read_length", "mapped_read_count", + "paired_end" + ) %in% colnames(SummarizedExperiment::colData(rse)))) } - + ## Scale counts - if(by == 'auc') { + if (by 
== "auc") { # L cancels out: # have to multiply by L to get the desired library size, # but then divide by L to take into account the read length since the # raw counts are the sum of base-level coverage. scaleFactor <- targetSize / SummarizedExperiment::colData(rse)$auc - } else if (by == 'mapped_reads') { + } else if (by == "mapped_reads") { scaleFactor <- targetSize * L * ifelse(SummarizedExperiment::colData(rse)$paired_end, 2, 1)^2 / (SummarizedExperiment::colData(rse)$mapped_read_count * - SummarizedExperiment::colData(rse)$avg_read_length^2) + SummarizedExperiment::colData(rse)$avg_read_length^2) } - - if(factor_only) { + + if (factor_only) { names(scaleFactor) <- rownames(SummarizedExperiment::colData(rse)) return(scaleFactor) } else { scaleMat <- matrix(rep(scaleFactor, each = nrow(counts)), - ncol = ncol(counts)) + ncol = ncol(counts) + ) scaledCounts <- counts * scaleMat - if(round) scaledCounts <- round(scaledCounts, 0) + if (round) scaledCounts <- round(scaledCounts, 0) SummarizedExperiment::assay(rse, 1) <- scaledCounts return(rse) } diff --git a/R/snaptron_query.R b/R/snaptron_query.R index 9bd0793..5d26715 100644 --- a/R/snaptron_query.R +++ b/R/snaptron_query.R @@ -6,10 +6,10 @@ #' @param junctions A [GRanges-class][GenomicRanges::GRanges-class] object with the #' exon-exon junctions of interest. The chromosome names should be in UCSC #' format, such as 'chr1'. The strand information is ignored in the query. -#' @param version Either `srav1`, `srav2`, `gtex` or +#' @param version Either `srav1`, `srav2`, `gtex` or #' `tcga`. SRA Version 1 of Intropolis has the -#' exon-exon junctions from about 20 thousand RNA-seq samples in hg19 -#' coordinates. SRA Version 2 has the data from about 50 thousand RNA-seq +#' exon-exon junctions from about 20 thousand RNA-seq samples in hg19 +#' coordinates. SRA Version 2 has the data from about 50 thousand RNA-seq #' samples aligned to hg38. 
GTEx has about 30 million junctions from about 10 #' thousand samples from the GTEx consortium on hg38 coordinates. Finally, #' TCGA has about 36 million junctions from about 11 thousand samples @@ -35,111 +35,123 @@ #' @import RCurl #' #' @examples -#' -#' library('GenomicRanges') +#' +#' library("GenomicRanges") #' ## Define some exon-exon junctions (hg19 coordinates) -#' junctions <- GRanges(seqnames = 'chr2', IRanges( +#' junctions <- GRanges(seqnames = "chr2", IRanges( #' start = c(28971710:28971712, 29555081:29555083, 29754982:29754984), -#' end = c(29462417:29462419, 29923338:29923340, 29917714:29917716))) +#' end = c(29462417:29462419, 29923338:29923340, 29917714:29917716) +#' )) #' #' ## Check against Snaptron SRA version 1 (hg19 coordinates) #' snaptron_query(junctions) -#' #' \dontrun{ -#' ## Check another set of junctions against SRA version 2 (more data, hg38 +#' ## Check another set of junctions against SRA version 2 (more data, hg38 #' ## coordinates) -#' junctions_v2 <- GRanges(seqnames = 'chr2', IRanges( -#' start = 29532116:29532118, end = 29694848:29694850)) -#' snaptron_query(junctions_v2, version = 'srav2') +#' junctions_v2 <- GRanges(seqnames = "chr2", IRanges( +#' start = 29532116:29532118, end = 29694848:29694850 +#' )) +#' snaptron_query(junctions_v2, version = "srav2") #' #' ## Check these junctions in GTEx and TCGA data -#' snaptron_query(junctions_v2, version = 'gtex') -#' snaptron_query(junctions_v2, version = 'tcga') +#' snaptron_query(junctions_v2, version = "gtex") +#' snaptron_query(junctions_v2, version = "tcga") #' } - -snaptron_query <- function(junctions, version = 'srav1', verbose = TRUE, async = TRUE) { +#' +snaptron_query <- function(junctions, version = "srav1", verbose = TRUE, async = TRUE) { ## Check input - stopifnot(is(junctions, 'GRanges')) - stopifnot(all(grepl('chr', seqlevels(junctions)))) + stopifnot(is(junctions, "GRanges")) + stopifnot(all(grepl("chr", seqlevels(junctions)))) version <- tolower(version) - 
stopifnot(version %in% c('srav1', 'srav2', 'gtex', 'tcga')) - + stopifnot(version %in% c("srav1", "srav2", "gtex", "tcga")) + ## Build query URLs - urls <- paste0('http://snaptron.cs.jhu.edu/', version, - '/snaptron?regions=', seqnames(junctions), ':', start(junctions), '-', - end(junctions), '&exact=1&header=0') - + urls <- paste0( + "http://snaptron.cs.jhu.edu/", version, + "/snaptron?regions=", seqnames(junctions), ":", start(junctions), "-", + end(junctions), "&exact=1&header=0" + ) + ## Get results - if(verbose) message(paste(Sys.time(), 'querying Snaptron')) + if (verbose) message(paste(Sys.time(), "querying Snaptron")) query_res <- getURL(urls, async = async) - + ## Split by line - if(verbose) message(paste(Sys.time(), 'processing results')) - query_split <- strsplit(query_res, '\n') + if (verbose) message(paste(Sys.time(), "processing results")) + query_split <- strsplit(query_res, "\n") names(query_split) <- NULL - + ## Are there any valid ones? valid <- which(elementNROWS(query_split) > 0) - if(length(valid) == 0) { - message(paste(Sys.time(), - 'found no exon-exon junctions in Intropolis version', version, - 'matching your query: this version uses', - ifelse(version == 'srav1', 'hg19', 'hg38'), 'coordinates.')) + if (length(valid) == 0) { + message(paste( + Sys.time(), + "found no exon-exon junctions in Intropolis version", version, + "matching your query: this version uses", + ifelse(version == "srav1", "hg19", "hg38"), "coordinates." 
+ )) return(NULL) } - + ## Extract information - if(verbose) message(paste(Sys.time(), 'extracting information')) - info <- lapply(query_split[valid], function(jxs) { - matrix(strsplit(jxs, '\t')[[1]], ncol = 18) + if (verbose) message(paste(Sys.time(), "extracting information")) + info <- lapply(query_split[valid], function(jxs) { + matrix(strsplit(jxs, "\t")[[1]], ncol = 18) }) - + ## Build result res <- do.call(rbind, info) - colnames(res) <- c('type', 'snaptron_id', 'chromosome', 'start', 'end', - 'length', 'strand', 'annotated', 'left_motif', 'right_motif', - 'left_annotated', 'right_annotated', 'samples', - 'samples_count', 'coverage_sum', 'coverage_avg', 'coverage_median', - 'source_dataset_id') - + colnames(res) <- c( + "type", "snaptron_id", "chromosome", "start", "end", + "length", "strand", "annotated", "left_motif", "right_motif", + "left_annotated", "right_annotated", "samples", + "samples_count", "coverage_sum", "coverage_avg", "coverage_median", + "source_dataset_id" + ) + ## Helper function for some special variables to_chr_list <- function(x) { - r <- strsplit(x, ',') - i <- which(sapply(r, function(y) { y[[1]] == '0' })) - if(length(i) > 0) r[i] <- NA + r <- strsplit(x, ",") + i <- which(sapply(r, function(y) { + y[[1]] == "0" + })) + if (length(i) > 0) r[i] <- NA return(CharacterList(r)) } - + ## Build into a GRanges object - result <- GRanges(seqnames = res[, 'chromosome'], - IRanges(as.numeric(res[, 'start']), as.numeric(res[, 'end'])), - strand = res[, 'strand']) - + result <- GRanges( + seqnames = res[, "chromosome"], + IRanges(as.numeric(res[, "start"]), as.numeric(res[, "end"])), + strand = res[, "strand"] + ) + ## Add other variables - result$type <- as.factor(res[, 'type']) - result$snaptron_id <- as.integer(res[, 'snaptron_id']) - result$annotated <- to_chr_list(res[, 'annotated']) - result$left_motif <- res[, 'left_motif'] - result$right_motif <- res[, 'right_motif'] - result$left_annotated <- to_chr_list(res[, 'left_annotated']) - 
result$right_annotated <- to_chr_list(res[, 'right_annotated']) - + result$type <- as.factor(res[, "type"]) + result$snaptron_id <- as.integer(res[, "snaptron_id"]) + result$annotated <- to_chr_list(res[, "annotated"]) + result$left_motif <- res[, "left_motif"] + result$right_motif <- res[, "right_motif"] + result$left_annotated <- to_chr_list(res[, "left_annotated"]) + result$right_annotated <- to_chr_list(res[, "right_annotated"]) + ## Remove leading comma - res[, 'samples'] <- gsub('^,', '', res[, 'samples']) + res[, "samples"] <- gsub("^,", "", res[, "samples"]) ## Split sample ids from the read coverage by sample - sample_info <- IntegerList(strsplit(res[, 'samples'], ':|,')) + sample_info <- IntegerList(strsplit(res[, "samples"], ":|,")) sample_idx <- LogicalList(lapply(elementNROWS(sample_info), function(y) { - rep(c(TRUE, FALSE), y / 2)})) + rep(c(TRUE, FALSE), y / 2) + })) result$samples <- sample_info[sample_idx] result$read_coverage_by_sample <- sample_info[!sample_idx] - + ## Continue with other variables - result$samples_count <- as.integer(res[, 'samples_count']) - result$coverage_sum <- as.integer(res[, 'coverage_sum']) - result$coverage_avg <- as.numeric(res[, 'coverage_avg']) - result$coverage_median <- as.numeric(res[, 'coverage_median']) - result$source_dataset_id <- as.integer(res[, 'source_dataset_id']) - + result$samples_count <- as.integer(res[, "samples_count"]) + result$coverage_sum <- as.integer(res[, "coverage_sum"]) + result$coverage_avg <- as.numeric(res[, "coverage_avg"]) + result$coverage_median <- as.numeric(res[, "coverage_median"]) + result$source_dataset_id <- as.integer(res[, "source_dataset_id"]) + ## Finish return(result) } diff --git a/R/utils.R b/R/utils.R index bd47b81..b97f5f0 100644 --- a/R/utils.R +++ b/R/utils.R @@ -21,24 +21,24 @@ works <- sapply(pkg, requireNamespace, quietly = TRUE) ## If some failed, then give a useful error before quitting - if(any(!works)) { + if (any(!works)) { x <- pkg[!works] stop(paste0( 
Sys.time(), - ' Package', - ifelse(length(x) > 1, 's', ''), - ' ', - paste(x, collapse = ', '), - ' ', - ifelse(length(x) > 1, 'are', 'is'), - ' missing. Install ', - ifelse(length(x) > 1, 'them', 'it'), - ' with BiocManager::install(', - ifelse(length(x) > 1, 'c(', ''), + " Package", + ifelse(length(x) > 1, "s", ""), + " ", + paste(x, collapse = ", "), + " ", + ifelse(length(x) > 1, "are", "is"), + " missing. Install ", + ifelse(length(x) > 1, "them", "it"), + " with BiocManager::install(", + ifelse(length(x) > 1, "c(", ""), '"', paste(x, collapse = '", "'), '")', - ifelse(length(x) > 1, ')', '') + ifelse(length(x) > 1, ")", "") )) } return(invisible(NULL)) diff --git a/README.Rmd b/README.Rmd new file mode 100644 index 0000000..592af6f --- /dev/null +++ b/README.Rmd @@ -0,0 +1,91 @@ +--- +output: github_document +--- + + + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.path = "man/figures/README-", + out.width = "100%" +) +``` + +# recount + + +[![Lifecycle: stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://www.tidyverse.org/lifecycle/#stable) +[![BioC status](http://www.bioconductor.org/shields/build/release/bioc/recount.svg)](https://bioconductor.org/checkResults/release/bioc-LATEST/recount) +[![Codecov test coverage](https://codecov.io/gh/leekgroup/recount/branch/master/graph/badge.svg)](https://codecov.io/gh/leekgroup/recount?branch=master) +[![R build status](https://github.com/leekgroup/recount/workflows/R-CMD-check-bioc/badge.svg)](https://github.com/leekgroup/recount/actions) + + +Explore and download data from the recount project available at the [recount2 website](https://jhubiostatistics.shinyapps.io/recount/). 
Using the `recount` package you can download _RangedSummarizedExperiment_ objects at the gene, exon or exon-exon junctions level, the raw counts, the phenotype metadata used, the urls to the sample coverage bigWig files or the mean coverage bigWig file for a particular study. The _RangedSummarizedExperiment_ objects can be used by different packages for performing differential expression analysis. Using [derfinder](http://bioconductor.org/packages/derfinder) you can perform annotation-agnostic differential expression analyses with the data from the recount project. + +## Documentation + +For more information about `recount` check the vignettes [through Bioconductor](http://bioconductor.org/packages/recount) or at the [documentation website](http://leekgroup.github.io/recount). + +## Installation instructions + +Get the latest stable `R` release from [CRAN](http://cran.r-project.org/). Then install `recount` using from [Bioconductor](http://bioconductor.org/) the following code: + +```{r 'install', eval = FALSE} +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + +BiocManager::install("recount") +``` + +## Citation + +Below is the citation output from using `citation('recount')` in R. Please +run this yourself to check for any updates on how to cite __recount__. + +```{r 'citation', eval = requireNamespace('recount')} +print(citation('recount'), bibtex = TRUE) +``` + +Please note that the `recount` was only made possible thanks to many other R and bioinformatics software authors, which are cited either in the vignettes and/or the paper(s) describing this package. + +## Code of Conduct + +Please note that the recount project is released with a [Contributor Code of Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms. 
+ +## Development tools + +* Continuous code testing is possible thanks to [GitHub actions](https://www.tidyverse.org/blog/2020/04/usethis-1-6-0/) through `r BiocStyle::CRANpkg('usethis')`, `r BiocStyle::CRANpkg('remotes')`, `r BiocStyle::Githubpkg('r-hub/sysreqs')` and `r BiocStyle::CRANpkg('rcmdcheck')` customized to use [Bioconductor's docker containers](https://www.bioconductor.org/help/docker/) and `r BiocStyle::Biocpkg('BiocCheck')`. +* Code coverage assessment is possible thanks to [codecov](https://codecov.io/gh) and `r BiocStyle::CRANpkg('covr')`. +* The [documentation website](http://leekgroup.github.io/recount) is automatically updated thanks to `r BiocStyle::CRANpkg('pkgdown')`. +* The code is styled automatically thanks to `r BiocStyle::CRANpkg('styler')`. +* The documentation is formatted thanks to `r BiocStyle::CRANpkg('devtools')` and `r BiocStyle::CRANpkg('roxygen2')`. + +For more details, check the `dev` directory. + + +## Teams involved + +* [Jeff Leek's lab at JHBSPH Biostatistics Department](http://jtleek.com/), +* [Ben Langmead's lab at JHU Computer Science](http://www.langmead-lab.org/), +* [Kasper Daniel Hansen's lab at JHBSPH Biostatistics Department](https://www.hansenlab.org/), +* [Leonardo Collado-Torres](http://lcolladotor.github.io/) and [Andrew E. Jaffe](http://aejaffe.com/) from [LIBD](https://www.libd.org/), +* [Abhinav Nellore's lab at OHSU](http://nellore.bio/), +* Data hosted by [SciServer at JHU](https://www.sciserver.org/). + + +| | | | | +| --- | --- | --- | --- | +| | | | | + + + + + diff --git a/README.md b/README.md index 1a765c5..5e34fa8 100644 --- a/README.md +++ b/README.md @@ -1,58 +1,245 @@ - -Status: Travis CI [![Build Status](https://travis-ci.org/leekgroup/recount.svg?branch=master)](https://travis-ci.org/leekgroup/recount), -Bioc-release , -Bioc-devel . 
+ -Bioc-release , Bioc-devel , Codecov [![codecov.io](https://codecov.io/github/leekgroup/recount/coverage.svg?branch=master)](https://codecov.io/github/leekgroup/recount?branch=master) +# recount -recount -======= + -Explore and download data from the recount project available at the [recount2 website](https://jhubiostatistics.shinyapps.io/recount/). Using the `recount` package you can download _RangedSummarizedExperiment_ objects at the gene, exon or exon-exon junctions level, the raw counts, the phenotype metadata used, the urls to the sample coverage bigWig files or the mean coverage bigWig file for a particular study. The _RangedSummarizedExperiment_ objects can be used by different packages for performing differential expression analysis. Using [derfinder](http://bioconductor.org/packages/derfinder) you can perform annotation-agnostic differential expression analyses with the data from the recount project. +[![Lifecycle: +stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://www.tidyverse.org/lifecycle/#stable) +[![BioC +status](http://www.bioconductor.org/shields/build/release/bioc/recount.svg)](https://bioconductor.org/checkResults/release/bioc-LATEST/recount) +[![Codecov test +coverage](https://codecov.io/gh/leekgroup/recount/branch/master/graph/badge.svg)](https://codecov.io/gh/leekgroup/recount?branch=master) +[![R build +status](https://github.com/leekgroup/recount/workflows/R-CMD-check-bioc/badge.svg)](https://github.com/leekgroup/recount/actions) + -For more information about `recount` check the vignettes. +Explore and download data from the recount project available at the +[recount2 website](https://jhubiostatistics.shinyapps.io/recount/). +Using the `recount` package you can download +*RangedSummarizedExperiment* objects at the gene, exon or exon-exon +junctions level, the raw counts, the phenotype metadata used, the urls +to the sample coverage bigWig files or the mean coverage bigWig file for +a particular study. 
The *RangedSummarizedExperiment* objects can be used +by different packages for performing differential expression analysis. +Using [derfinder](http://bioconductor.org/packages/derfinder) you can +perform annotation-agnostic differential expression analyses with the +data from the recount project. -# Installation instructions +## Documentation -Get R 3.5.x from [CRAN](http://cran.r-project.org/). +For more information about `recount` check the vignettes [through +Bioconductor](http://bioconductor.org/packages/recount) or at the +[documentation website](http://leekgroup.github.io/recount). -```R -## From Bioconductor +## Installation instructions + +Get the latest stable `R` release from +[CRAN](http://cran.r-project.org/). Then install `recount` using from +[Bioconductor](http://bioconductor.org/) the following code: + +``` r if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("recount") - -## Suggested: -BiocManager::install(c('derfinder', 'DESeq2')) ``` -# Vignettes - -The vignettes for this package can be viewed via [Bioconductor's website](http://www.bioconductor.org/packages/recount). - - -# Citation - -Below is the citation output from using `citation('recount')` in R. Please -run this yourself to check for any updates on how to cite __recount__. - -To cite the __recount__ package in publications use: - -Collado-Torres L, Nellore A, Kammers K, Ellis SE, Taub MA, Hansen KD, Jaffe AE, Langmead B and Leek JT (2017). “Reproducible RNA-seq analysis using _recount2_.” _Nature Biotechnology_. doi: 10.1038/nbt.3838 (URL: http://doi.org/10.1038/nbt.3838), . - -A BibTeX entry for LaTeX users is - -@Article{, - title = {Reproducible RNA-seq analysis using recount2}, - author = {Leonardo Collado-Torres and Abhinav Nellore and Kai Kammers and Shannon E. Ellis and Margaret A. Taub and Kasper D. Hansen and and Andrew E. Jaffe and Ben Langmead and Jeffrey T. 
Leek}, - year = {2017}, - journal = {Nature Biotechnology}, - doi = {10.1038/nbt.3838}, - url = {http://www.nature.com/nbt/journal/v35/n4/full/nbt.3838.html}, -} - -# Testing +## Citation + +Below is the citation output from using `citation('recount')` in R. +Please run this yourself to check for any updates on how to cite +**recount**. + +``` r +print(citation('recount'), bibtex = TRUE) +#> +#> Collado-Torres L, Nellore A, Kammers K, Ellis SE, Taub MA, Hansen KD, +#> Jaffe AE, Langmead B, Leek JT (2017). "Reproducible RNA-seq analysis +#> using recount2." _Nature Biotechnology_. doi: 10.1038/nbt.3838 (URL: +#> https://doi.org/10.1038/nbt.3838), http://www.nature.com/nbt/journal/v35/n4/full/nbt.3838.html>. +#> +#> A BibTeX entry for LaTeX users is +#> +#> @Article{, +#> title = {Reproducible RNA-seq analysis using recount2}, +#> author = {Leonardo Collado-Torres and Abhinav Nellore and Kai Kammers and Shannon E. Ellis and Margaret A. Taub and Kasper D. Hansen and Andrew E. Jaffe and Ben Langmead and Jeffrey T. Leek}, +#> year = {2017}, +#> journal = {Nature Biotechnology}, +#> doi = {10.1038/nbt.3838}, +#> url = {http://www.nature.com/nbt/journal/v35/n4/full/nbt.3838.html}, +#> } +#> +#> Collado-Torres L, Nellore A, Jaffe AE (2017). "recount workflow: +#> Accessing over 70,000 human RNA-seq samples with Bioconductor [version +#> 1; referees: 1 approved, 2 approved with reservations]." +#> _F1000Research_. doi: 10.12688/f1000research.12223.1 (URL: +#> https://doi.org/10.12688/f1000research.12223.1), https://f1000research.com/articles/6-1558/v1>. +#> +#> A BibTeX entry for LaTeX users is +#> +#> @Article{, +#> title = {recount workflow: Accessing over 70,000 human RNA-seq samples with Bioconductor [version 1; referees: 1 approved, 2 approved with reservations]}, +#> author = {Leonardo Collado-Torres and Abhinav Nellore and Andrew E. 
Jaffe}, +#> year = {2017}, +#> journal = {F1000Research}, +#> doi = {10.12688/f1000research.12223.1}, +#> url = {https://f1000research.com/articles/6-1558/v1}, +#> } +#> +#> Ellis SE, Collado-Torres L, Jaffe AE, Leek JT (2018). "Improving the +#> value of public RNA-seq expression data by phenotype prediction." +#> _Nucl. Acids Res._. doi: 10.1093/nar/gky102 (URL: +#> https://doi.org/10.1093/nar/gky102), https://doi.org/10.1093/nar/gky102>. +#> +#> A BibTeX entry for LaTeX users is +#> +#> @Article{, +#> title = {Improving the value of public RNA-seq expression data by phenotype prediction}, +#> author = {Shannon E. Ellis and Leonardo Collado-Torres and Andrew E. Jaffe and Jeffrey T. Leek}, +#> year = {2018}, +#> journal = {Nucl. Acids Res.}, +#> doi = {10.1093/nar/gky102}, +#> url = {https://doi.org/10.1093/nar/gky102}, +#> } +#> +#> Collado-Torres L, Nellore A, Kammers K, Ellis SE, Taub MA, Hansen KD, +#> Jaffe AE, Langmead B, Leek JT (2020). _Explore and download data from +#> the recount project_. doi: 10.18129/B9.bioc.recount (URL: +#> https://doi.org/10.18129/B9.bioc.recount), +#> https://github.com/leekgroup/recount - R package version 1.13.2, http://www.bioconductor.org/packages/recount>. +#> +#> A BibTeX entry for LaTeX users is +#> +#> @Manual{, +#> title = {Explore and download data from the recount project}, +#> author = {Leonardo Collado-Torres and Abhinav Nellore and Kai Kammers and Shannon E. Ellis and Margaret A. Taub and Kasper D. Hansen and Andrew E. Jaffe and Ben Langmead and Jeffrey T. Leek}, +#> year = {2020}, +#> url = {http://www.bioconductor.org/packages/recount}, +#> note = {https://github.com/leekgroup/recount - R package version 1.13.2}, +#> doi = {10.18129/B9.bioc.recount}, +#> } +#> +#> Frazee AC, Langmead B, Leek JT (2011). "ReCount: A multi-experiment +#> resource of analysis-ready RNA-seq gene count datasets." _BMC +#> Bioinformatics_. 
doi: 10.1186/1471-2105-12-449 (URL: +#> https://doi.org/10.1186/1471-2105-12-449), https://doi.org/10.1186/1471-2105-12-449>. +#> +#> A BibTeX entry for LaTeX users is +#> +#> @Article{, +#> title = {ReCount: A multi-experiment resource of analysis-ready RNA-seq gene count datasets}, +#> author = {Alyssa C. Frazee and Ben Langmead and Jeffrey T. Leek}, +#> year = {2011}, +#> journal = {BMC Bioinformatics}, +#> doi = {10.1186/1471-2105-12-449}, +#> url = {https://doi.org/10.1186/1471-2105-12-449}, +#> } +#> +#> Razmara A, Ellis SE, Sokolowski DJ, Davis S, Wilson MD, Leek JT, Jaffe +#> AE, Collado-Torres L (2019). "recount-brain: a curated repository of +#> human brain RNA-seq datasets metadata." _bioRxiv_. doi: 10.1101/618025 +#> (URL: https://doi.org/10.1101/618025), https://doi.org/10.1101/618025>. +#> +#> A BibTeX entry for LaTeX users is +#> +#> @Article{, +#> title = {recount-brain: a curated repository of human brain RNA-seq datasets metadata}, +#> author = {Ashkaun Razmara and Shannon E. Ellis and Dustin J. Sokolowski and Sean Davis and Michael D. Wilson and Jeffrey T. Leek and Andrew E. Jaffe and Leonardo Collado-Torres}, +#> year = {2019}, +#> journal = {bioRxiv}, +#> doi = {10.1101/618025}, +#> url = {https://doi.org/10.1101/618025}, +#> } +#> +#> Imada E, Sanchez DF, Collado-Torres L, Wilks C, Matam T, Dinalankara W, +#> Stupnikov A, Lobo-Pereira F, Yip C, Yasuzawa K, Kondo N, Itoh M, Suzuki +#> H, Kasukawa T, Hon CC, de Hoon MJ, Shin JW, Carninci P, Jaffe AE, Leek +#> JT, Favorov A, Franco GR, Langmead B, Marchionni L (2020). "Recounting +#> the FANTOM CAGE–Associated Transcriptome." _Genome Research_. doi: +#> 10.1101/gr.254656.119 (URL: https://doi.org/10.1101/gr.254656.119), +#> . 
+#> +#> A BibTeX entry for LaTeX users is +#> +#> @Article{, +#> title = {Recounting the FANTOM CAGE–Associated Transcriptome}, +#> author = {Eddie-Luidy Imada and Diego Fernando Sanchez and Leonardo Collado-Torres and Christopher Wilks and Tejasvi Matam and Wikum Dinalankara and Aleksey Stupnikov and Francisco Lobo-Pereira and Chi-Wai Yip and Kayoko Yasuzawa and Naoto Kondo and Masayoshi Itoh and Harukazu Suzuki and Takeya Kasukawa and Chung Chau Hon and Michiel JL {de Hoon} and Jay W Shin and Piero Carninci and Andrew E. Jaffe and Jeffrey T. Leek and Alexander Favorov and Glória R Franco and Ben Langmead and Luigi Marchionni}, +#> year = {2020}, +#> journal = {Genome Research}, +#> doi = {10.1101/gr.254656.119}, +#> url = {https://doi.org/10.1101/gr.254656.119}, +#> } +``` -Testing on Bioc-devel is feasible thanks to [R Travis](http://docs.travis-ci.com/user/languages/r/) as well as Bioconductor's nightly build. +Please note that the `recount` project was only made possible thanks to many +other R and bioinformatics software authors, who are cited either in +the vignettes and/or the paper(s) describing this package. + +## Code of Conduct + +Please note that the recount project is released with a [Contributor +Code of +Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). +By contributing to this project, you agree to abide by its terms. + +## Development tools + + - Continuous code testing is possible thanks to [GitHub + actions](https://www.tidyverse.org/blog/2020/04/usethis-1-6-0/) + through *[usethis](https://CRAN.R-project.org/package=usethis)*, + *[remotes](https://CRAN.R-project.org/package=remotes)*, + *[sysreqs](https://github.com/r-hub/sysreqs)* and + *[rcmdcheck](https://CRAN.R-project.org/package=rcmdcheck)* + customized to use [Bioconductor’s docker + containers](https://www.bioconductor.org/help/docker/) and + *[BiocCheck](https://bioconductor.org/packages/3.11/BiocCheck)*. 
+ - Code coverage assessment is possible thanks to + [codecov](https://codecov.io/gh) and + *[covr](https://CRAN.R-project.org/package=covr)*. + - The [documentation website](http://leekgroup.github.io/recount) is + automatically updated thanks to + *[pkgdown](https://CRAN.R-project.org/package=pkgdown)*. + - The code is styled automatically thanks to + *[styler](https://CRAN.R-project.org/package=styler)*. + - The documentation is formatted thanks to + *[devtools](https://CRAN.R-project.org/package=devtools)* and + *[roxygen2](https://CRAN.R-project.org/package=roxygen2)*. + +For more details, check the `dev` directory. + +## Teams involved + + - [Jeff Leek’s lab at JHBSPH Biostatistics + Department](http://jtleek.com/), + - [Ben Langmead’s lab at JHU Computer + Science](http://www.langmead-lab.org/), + - [Kasper Daniel Hansen’s lab at JHBSPH Biostatistics + Department](https://www.hansenlab.org/), + - [Leonardo Collado-Torres](http://lcolladotor.github.io/) and [Andrew + E. Jaffe](http://aejaffe.com/) from [LIBD](https://www.libd.org/), + - [Abhinav Nellore’s lab at OHSU](http://nellore.bio/), + - Data hosted by [SciServer at JHU](https://www.sciserver.org/). 
+ +| | | | | +| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | +| | | | | + + + + + + diff --git a/_pkgdown.yml b/_pkgdown.yml new file mode 100644 index 0000000..6ef5f5a --- /dev/null +++ b/_pkgdown.yml @@ -0,0 +1 @@ +destination: docs diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..04c5585 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,14 @@ +comment: false + +coverage: + status: + project: + default: + target: auto + threshold: 1% + informational: true + patch: + default: + target: auto + threshold: 1% + informational: true diff --git a/dev/01_setup.R b/dev/01_setup.R new file mode 100644 index 0000000..175b1f7 --- /dev/null +++ b/dev/01_setup.R @@ -0,0 +1,73 @@ +## Create the dev directory +dir.create("dev") +file.create("dev/01_setup.R") +rstudioapi::navigateToFile("dev/01_setup.R") + +## Ignore the dev directory +system("echo dev >> .Rbuildignore") + +## Remove travis +unlink(".travis.yml") + +## Update package doc +dir.create("R", showWarnings = FALSE) +unlink(dir("R", "-package.R$", full.names = TRUE)) +usethis::use_package_doc() + +## GitHub tests +## Original one +# usethis::use_github_action('check-standard') +## Modified one: +usethis::use_github_action("check-bioc", + url = "https://raw.githubusercontent.com/leekgroup/derfinderPlot/master/.github/workflows/check-bioc.yml" +) +## See also: +# * https://github.com/csoneson/dreval/blob/master/.github/workflows/R-CMD-check.yaml +# * 
https://github.com/seandavi/BiocActions/blob/master/.github/workflows/main.yml + +## pkgdown setup +usethis::use_pkgdown() + +## Support files +usethis::use_readme_rmd() ## Edit with original README.md contents +# usethis::use_news_md() +usethis::use_tidy_coc() +usethis::use_tidy_contributing() + +## Customize the support message to mention the Bioconductor Support Website +usethis::use_tidy_support() +support <- readLines(".github/SUPPORT.md") +support <- + gsub( + "\\[community.rstudio.com\\]\\(https://community.rstudio.com/\\), and/or StackOverflow", + "the [Bioconductor Support Website](https://support.bioconductor.org/) using the appropriate package tag", + support + ) +writeLines(support, ".github/SUPPORT.md") + +## Use my own ISSUE template, modified from https://github.com/lcolladotor/osca_LIIGH_UNAM_2020 +contents <- xfun::read_utf8("https://raw.githubusercontent.com/leekgroup/derfinderPlot/master/.github/ISSUE_TEMPLATE/issue_template.md") +save_as <- file.path(".github", "ISSUE_TEMPLATE", "issue_template.md") +usethis:::create_directory(dirname(save_as)) +usethis::write_over(usethis::proj_path(save_as), contents) + +## Add some badges +usethis::use_lifecycle_badge("Stable") +usethis::use_bioc_badge() + +## Tests +usethis::use_coverage() + +## GitHub badges +usethis::use_github_actions_badge("R-CMD-check-bioc") + +## Deploy with pkgdown at least once locally such that the automatic updates +## from GitHub actions will work +pkgdown::deploy_to_branch(new_process = FALSE) + +## Update style for this document +styler::style_file("dev/01_setup.R", transformers = styler::tidyverse_style(indent_by = 4)) + +## Update prior to committing +file.create("dev/02_update.R") +rstudioapi::navigateToFile("dev/02_update.R") diff --git a/dev/02_update.R b/dev/02_update.R new file mode 100644 index 0000000..df756b1 --- /dev/null +++ b/dev/02_update.R @@ -0,0 +1,23 @@ +## Style as close as possible to BioC's style using styler +bioc_style <- styler::tidyverse_style(indent_by = 4) 
+bioc_style$indention$update_indention_ref_fun_dec <- function(pd_nested) { + if (pd_nested$token[1] == "FUNCTION") { + seq <- rlang::seq2(3, nrow(pd_nested) - 2) + pd_nested$indention_ref_pos_id[seq] <- pd_nested$pos_id[nrow(pd_nested)] + pd_nested$indent[seq] <- pd_nested$indent[seq] + 4 + } + pd_nested +} + +styler::style_pkg(transformers = bioc_style) +## Style vignettes too +sapply(dir("vignettes", "Rmd$", full.names = TRUE), styler::style_file, transformers = bioc_style) + +## This messes up quotes +# formatR::tidy_dir("R", indent = 4, width.cutoff = 70) + +## Update docs +devtools::document() + +## Update style for this document +styler::style_file("dev/02_update.R", transformers = styler::tidyverse_style(indent_by = 4)) diff --git a/inst/CITATION b/inst/CITATION index 8336f7c..3c46ed5 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -96,7 +96,7 @@ c( url = "https://doi.org/10.1101/618025" ), bibentry(bibtype="article", - title = "Recounting the FANTOM Cage Associated Transcriptome", + title = "Recounting the FANTOM CAGE–Associated Transcriptome", author = personList( as.person("Eddie-Luidy Imada"), as.person("Diego Fernando Sanchez"), @@ -113,20 +113,19 @@ c( as.person("Harukazu Suzuki"), as.person("Takeya Kasukawa"), as.person("Chung Chau Hon"), - as.person("Michiel de Hoon"), + as.person("Michiel JL de Hoon"), as.person("Jay W Shin"), as.person("Piero Carninci"), - as.person("FANTOM-consortium"), as.person("Andrew E. Jaffe"), as.person("Jeffrey T. 
Leek"), as.person("Alexander Favorov"), - as.person("Gloria R Franco"), + as.person("Glória R Franco"), as.person("Ben Langmead"), as.person("Luigi Marchionni") ), - year = 2019, - journal = "bioRxiv", - doi = "10.1101/659490", - url = "https://doi.org/10.1101/659490" + year = 2020, + journal = "Genome Research", + doi = "10.1101/gr.254656.119", + url = "https://doi.org/10.1101/gr.254656.119" ) -) \ No newline at end of file +) diff --git a/man/abstract_search.Rd b/man/abstract_search.Rd index 6b54d3a..b2a3ae8 100644 --- a/man/abstract_search.Rd +++ b/man/abstract_search.Rd @@ -33,7 +33,7 @@ For a more powerful search use the recount project website at } \examples{ ## Find the Geuvadis consortium project -project_info <- abstract_search('Geuvadis consortium') +project_info <- abstract_search("Geuvadis consortium") ## See some summary information for this project project_info diff --git a/man/add_metadata.Rd b/man/add_metadata.Rd index ac19c04..389bff1 100644 --- a/man/add_metadata.Rd +++ b/man/add_metadata.Rd @@ -4,8 +4,7 @@ \alias{add_metadata} \title{Add additional curated metadata to a recount rse object} \usage{ -add_metadata(rse, source = "recount_brain_v2", is_tcga = FALSE, - verbose = TRUE) +add_metadata(rse, source = "recount_brain_v2", is_tcga = FALSE, verbose = TRUE) } \arguments{ \item{rse}{A \link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment-class} @@ -48,7 +47,7 @@ A bib file is available via citation('recount'). 
\examples{ ## Add the sample metadata to an example rse_gene object -rse_gene <- add_metadata(rse_gene_SRP009615, 'recount_brain_v2') +rse_gene <- add_metadata(rse_gene_SRP009615, "recount_brain_v2") ## Explore the metadata colData(rse_gene) @@ -61,8 +60,7 @@ colData(rse_gene) ## Obtain all the recount_brain_v2 metadata if you want to ## explore the metadata manually -recount_brain_v2 <- add_metadata(source = 'recount_brain_v2') - +recount_brain_v2 <- add_metadata(source = "recount_brain_v2") } \references{ Razmara et al, bioRxiv, 2019. diff --git a/man/add_predictions.Rd b/man/add_predictions.Rd index 1959756..bb5d7cb 100644 --- a/man/add_predictions.Rd +++ b/man/add_predictions.Rd @@ -4,8 +4,7 @@ \alias{add_predictions} \title{Add predicted phenotypes to a recount rse object} \usage{ -add_predictions(rse, is_tcga = FALSE, version = "latest", - verbose = TRUE) +add_predictions(rse, is_tcga = FALSE, version = "latest", verbose = TRUE) } \arguments{ \item{rse}{A \link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment-class} @@ -69,7 +68,6 @@ colData(rse_gene) ## Download all the latest predictions PredictedPhenotypes <- add_predictions() - } \references{ Ellis et al, bioRxiv, 2017. diff --git a/man/all_metadata.Rd b/man/all_metadata.Rd index 6596c70..4615a85 100644 --- a/man/all_metadata.Rd +++ b/man/all_metadata.Rd @@ -39,7 +39,6 @@ For more information, check \url{https://github.com/leekgroup/recount-website/tr \examples{ metadata <- all_metadata() - } \author{ Leonardo Collado-Torres diff --git a/man/browse_study.Rd b/man/browse_study.Rd index 360ac01..8ef592f 100644 --- a/man/browse_study.Rd +++ b/man/browse_study.Rd @@ -19,7 +19,7 @@ Given a SRA study id get the url to browse the study using the SRA website. 
} \examples{ ## Find the Geuvadis consortium project -id <- abstract_search('Geuvadis consortium', id_only = TRUE) +id <- abstract_search("Geuvadis consortium", id_only = TRUE) id ## Explore the Geuvadis consortium project diff --git a/man/coverage_matrix.Rd b/man/coverage_matrix.Rd index 6e4814d..2bcbb0c 100644 --- a/man/coverage_matrix.Rd +++ b/man/coverage_matrix.Rd @@ -5,9 +5,20 @@ \title{Given a set of regions for a chromosome, compute the coverage matrix for a given SRA study.} \usage{ -coverage_matrix(project, chr, regions, chunksize = 1000, - bpparam = NULL, outdir = NULL, chrlen = NULL, verbose = TRUE, - verboseLoad = verbose, scale = TRUE, round = FALSE, ...) +coverage_matrix( + project, + chr, + regions, + chunksize = 1000, + bpparam = NULL, + outdir = NULL, + chrlen = NULL, + verbose = TRUE, + verboseLoad = verbose, + scale = TRUE, + round = FALSE, + ... +) } \arguments{ \item{project}{A character vector with one SRA study id.} @@ -80,19 +91,20 @@ Note that you will need to run \link{scale_counts} after running } \examples{ -if(.Platform$OS.type != 'windows') { -## Reading BigWig files is not supported by rtracklayer on Windows +if (.Platform$OS.type != "windows") { + ## Reading BigWig files is not supported by rtracklayer on Windows ## Define expressed regions for study DRP002835, chrY - regions <- expressed_regions('DRP002835', 'chrY', cutoff = 5L, - maxClusterGap = 3000L) + regions <- expressed_regions("DRP002835", "chrY", + cutoff = 5L, + maxClusterGap = 3000L + ) ## Now calculate the coverage matrix for this study - rse <- coverage_matrix('DRP002835', 'chrY', regions) + rse <- coverage_matrix("DRP002835", "chrY", regions) ## One row per region identical(length(regions), nrow(rse)) } - } \seealso{ \link{download_study}, \link[derfinder:findRegions]{findRegions}, diff --git a/man/download_retry.Rd b/man/download_retry.Rd index 42570e4..10eaaf4 100644 --- a/man/download_retry.Rd +++ b/man/download_retry.Rd @@ -4,8 +4,7 @@ \alias{download_retry} 
\title{Retry multiple times to download a file} \usage{ -download_retry(url, destfile = basename(url), mode = "wb", - N.TRIES = 3L, ...) +download_retry(url, destfile = basename(url), mode = "wb", N.TRIES = 3L, ...) } \arguments{ \item{url}{The URL to download. Passed to \link[downloader:download]{download}.} @@ -37,7 +36,6 @@ build errors due to the occassional errors from our data hosting server. ## Download the first files_info.tsv file (version 1) download_retry( - recount_url$url[which(recount_url$file_name == 'files_info.tsv')[1]] + recount_url$url[which(recount_url$file_name == "files_info.tsv")[1]] ) - } diff --git a/man/download_study.Rd b/man/download_study.Rd index 6340a3c..b2fe993 100644 --- a/man/download_study.Rd +++ b/man/download_study.Rd @@ -4,8 +4,14 @@ \alias{download_study} \title{Download data for a given SRA study id from the recount project} \usage{ -download_study(project, type = "rse-gene", outdir = project, - download = TRUE, version = 2, ...) +download_study( + project, + type = "rse-gene", + outdir = project, + download = TRUE, + version = 2, + ... +) } \arguments{ \item{project}{A character vector with one SRA study id.} @@ -90,24 +96,22 @@ Sanchez, et al., bioRxiv, 2019. \examples{ ## Find the URL to download the RangedSummarizedExperiment for the ## Geuvadis consortium study. 
-url <- download_study('ERP001942', download = FALSE) +url <- download_study("ERP001942", download = FALSE) ## See the actual URL url - \dontrun{ ## Download the example data included in the package for study SRP009615 -url2 <- download_study('SRP009615') +url2 <- download_study("SRP009615") url2 ## Load the data -load(file.path('SRP009615', 'rse_gene.Rdata')) +load(file.path("SRP009615", "rse_gene.Rdata")) ## Compare the data -library('testthat') +library("testthat") expect_equivalent(rse_gene, rse_gene_SRP009615) - } } diff --git a/man/expressed_regions.Rd b/man/expressed_regions.Rd index 7d0a42e..5ee22e6 100644 --- a/man/expressed_regions.Rd +++ b/man/expressed_regions.Rd @@ -4,8 +4,16 @@ \alias{expressed_regions} \title{Identify expressed regions from the mean coverage for a given SRA project} \usage{ -expressed_regions(project, chr, cutoff, outdir = NULL, - maxClusterGap = 300L, chrlen = NULL, verbose = TRUE, ...) +expressed_regions( + project, + chr, + cutoff, + outdir = NULL, + maxClusterGap = 300L, + chrlen = NULL, + verbose = TRUE, + ... +) } \arguments{ \item{project}{A character vector with one SRA study id.} @@ -45,18 +53,19 @@ defined by \link[derfinder:findRegions]{findRegions}. 
} \examples{ ## Define expressed regions for study SRP009615, chrY -if(.Platform$OS.type != 'windows') { -## Reading BigWig files is not supported by rtracklayer on Windows - regions <- expressed_regions('SRP009615', 'chrY', cutoff = 5L, - maxClusterGap = 3000L) +if (.Platform$OS.type != "windows") { + ## Reading BigWig files is not supported by rtracklayer on Windows + regions <- expressed_regions("SRP009615", "chrY", + cutoff = 5L, + maxClusterGap = 3000L + ) } - \dontrun{ ## Define the regions for multiple chrs -regs <- sapply(chrs, expressed_regions, project = 'SRP009615', cutoff = 5L) +regs <- sapply(chrs, expressed_regions, project = "SRP009615", cutoff = 5L) ## You can then combine them into a single GRanges object if you want to -library('GenomicRanges') +library("GenomicRanges") single <- unlist(GRangesList(regs)) } diff --git a/man/figures/logo.png b/man/figures/logo.png new file mode 100644 index 0000000..2748a90 Binary files /dev/null and b/man/figures/logo.png differ diff --git a/man/find_geo.Rd b/man/find_geo.Rd index f7660aa..dcc5391 100644 --- a/man/find_geo.Rd +++ b/man/find_geo.Rd @@ -33,8 +33,7 @@ the information using \link{geo_info}. } \examples{ ## Find the GEO accession id for for SRX110461 -find_geo('SRX110461') - +find_geo("SRX110461") } \author{ Leonardo Collado-Torres diff --git a/man/geo_characteristics.Rd b/man/geo_characteristics.Rd index 3b8d428..d6a8c46 100644 --- a/man/geo_characteristics.Rd +++ b/man/geo_characteristics.Rd @@ -25,7 +25,7 @@ samples as shown in the examples section. \examples{ ## Load required library -library('SummarizedExperiment') +library("SummarizedExperiment") ## Get the GEO accession ids # geoids <- sapply(colData(rse_gene_SRP009615)$run[1:2], find_geo) @@ -34,8 +34,13 @@ library('SummarizedExperiment') ## This code makes sure that the example code runs. 
geoids <- tryCatch( sapply(colData(rse_gene_SRP009615)$run[1:2], find_geo), - error = function(e) c('SRR387777' = 'GSM836270', - 'SRR387778' = 'GSM836271')) + error = function(e) { + c( + "SRR387777" = "GSM836270", + "SRR387778" = "GSM836271" + ) + } +) ## Get the data from GEO geodata <- do.call(rbind, sapply(geoids, geo_info)) @@ -45,8 +50,7 @@ geodata <- cbind(geodata, geo_characteristics(geodata)) ## Explore the original characteristics and the result from ## geo_characteristics() -geodata[, c('characteristics', 'cells', 'shrna.expression', 'treatment')] - +geodata[, c("characteristics", "cells", "shrna.expression", "treatment")] } \author{ Leonardo Collado-Torres diff --git a/man/geo_info.Rd b/man/geo_info.Rd index ae972db..c0cdd8c 100644 --- a/man/geo_info.Rd +++ b/man/geo_info.Rd @@ -4,8 +4,14 @@ \alias{geo_info} \title{Extract information from GEO for a given sample} \usage{ -geo_info(geoid, verbose = FALSE, sleep = 1/2, getGPL = FALSE, - destdir = tempdir(), ...) +geo_info( + geoid, + verbose = FALSE, + sleep = 1/2, + getGPL = FALSE, + destdir = tempdir(), + ... +) } \arguments{ \item{geoid}{A character vector of length 1 with the GEO accession id for @@ -34,8 +40,7 @@ This function uses GEOquery to extract information for a given sample. The GEO accession ids for the sample can be found in the study phenotype table. 
} \examples{ -geo_info('GSM836270') - +geo_info("GSM836270") } \author{ Leonardo Collado-Torres, Andrew Jaffe diff --git a/man/getRPKM.Rd b/man/getRPKM.Rd index e27afce..1e95d61 100644 --- a/man/getRPKM.Rd +++ b/man/getRPKM.Rd @@ -42,12 +42,11 @@ rpkm <- getRPKM(rse_gene_SRP009615) ## You can also get an RPKM matrix after running scale_counts() ## with similar RPKM values rpkm2 <- getRPKM(scale_counts(rse_gene_SRP009615)) -rpkm3 <- getRPKM(scale_counts(rse_gene_SRP009615, by = 'mapped_reads')) +rpkm3 <- getRPKM(scale_counts(rse_gene_SRP009615, by = "mapped_reads")) summary(rpkm - rpkm2) summary(rpkm - rpkm3) summary(rpkm2 - rpkm3) - } \seealso{ \link{scale_counts} diff --git a/man/getTPM.Rd b/man/getTPM.Rd index 22b583e..f31c219 100644 --- a/man/getTPM.Rd +++ b/man/getTPM.Rd @@ -42,7 +42,7 @@ TPM = FPKM / (sum of FPKM over all genes/transcripts) * 10^6 Arora et al mention in their code that the formula comes from https://doi.org/10.1093/bioinformatics/btp692; specifically -\code{1.1.1 Comparison to RPKM estimation} where they mention an important +\verb{1.1.1 Comparison to RPKM estimation} where they mention an important assumption: Under the assumption of uniformly distributed reads, we note that RPKM measures are estimates of ... @@ -54,7 +54,6 @@ https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-e ## Compute the TPM matrix from the raw gene-level base-pair counts. tpm <- getTPM(rse_gene_SRP009615) - } \references{ https://www.biorxiv.org/content/10.1101/445601v2 diff --git a/man/read_counts.Rd b/man/read_counts.Rd index 6469d26..099ae58 100644 --- a/man/read_counts.Rd +++ b/man/read_counts.Rd @@ -39,18 +39,17 @@ will return non-sensical results. 
## Difference between read counts and reads downloaded by Rail-RNA colSums(assays(read_counts(rse_gene_SRP009615))$counts) / 1e6 - - colData(rse_gene_SRP009615)$reads_downloaded /1e6 + colData(rse_gene_SRP009615)$reads_downloaded / 1e6 ## Paired-end read counts vs fragment counts (single-end counts) -download_study('DRP000499') -load('DRP000499/rse_gene.Rdata') +download_study("DRP000499") +load("DRP000499/rse_gene.Rdata") colSums(assays(read_counts(rse_gene, use_paired_end = FALSE))$counts) / colSums(assays(read_counts(rse_gene))$counts) ## Difference between paired-end read counts vs paired-end reads downloaded colSums(assays(read_counts(rse_gene))$counts) / 1e6 - - colData(rse_gene)$reads_downloaded / 1e6 /2 - + colData(rse_gene)$reads_downloaded / 1e6 / 2 } \references{ Collado-Torres L, Nellore A and Jaffe AE. recount workflow: Accessing over diff --git a/man/recount-package.Rd b/man/recount-package.Rd index 6ad04e1..28a4739 100644 --- a/man/recount-package.Rd +++ b/man/recount-package.Rd @@ -2,23 +2,47 @@ % Please edit documentation in R/recount-package.R \docType{package} \name{recount-package} +\alias{recount} \alias{recount-package} -\title{Explore and download data from the recount project.} +\title{recount: Explore and download data from the recount project} \description{ Explore and download data from the recount project available at -\url{https://jhubiostatistics.shinyapps.io/recount/}. Using the recount package -you can download -\link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment-class} -objects at the gene or exon level, the raw counts, the phenotype metadata -used, the urls to the sample coverage bigWig files or the mean coverage -bigWig file for a particular study. The -\link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment-class} objects can be -used by different packages for performing differential expression analysis. 
-Using \url{http://bioconductor.org/packages/derfinder} you can perform -annotation-agnostic differential expression analyses with the data from the -recount project. + https://jhubiostatistics.shinyapps.io/recount/. Using the recount package you can + download RangedSummarizedExperiment objects at the gene, exon or exon-exon junctions level, + the raw counts, the phenotype metadata used, the urls to the sample coverage + bigWig files or the mean coverage bigWig file for a particular study. The + RangedSummarizedExperiment objects can be used by different packages for + performing differential expression analysis. Using + http://bioconductor.org/packages/derfinder you can perform + annotation-agnostic differential expression analyses with the data from the + recount project as described at http://www.nature.com/nbt/journal/v35/n4/full/nbt.3838.html. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://github.com/leekgroup/recount} + \item Report bugs at \url{https://support.bioconductor.org/t/recount/} +} + } \author{ -Leonardo Collado-Torres \href{mailto:lcolladotor@gmail.com}{lcolladotor@gmail.com} +\strong{Maintainer}: Leonardo Collado-Torres \email{lcolladotor@gmail.com} (\href{https://orcid.org/0000-0003-2140-308X}{ORCID}) + +Authors: +\itemize{ + \item Jeffrey T. Leek \email{jtleek@gmail.com} (\href{https://orcid.org/0000-0002-2873-2671}{ORCID}) [thesis advisor] +} + +Other contributors: +\itemize{ + \item Abhinav Nellore \email{anellore@gmail.com} [contributor] + \item Andrew E. Jaffe \email{andrew.jaffe@libd.org} (\href{https://orcid.org/0000-0001-6886-1454}{ORCID}) [contributor] + \item Margaret A. Taub \email{mtaub@jhsph.edu} [contributor] + \item Kai Kammers \email{kai.kammers@gmail.com} [contributor] + \item Shannon E. 
Ellis \email{sellis18@jhmi.edu} (\href{https://orcid.org/0000-0002-9231-0481}{ORCID}) [contributor] + \item Kasper Daniel Hansen \email{kasperdanielhansen@gmail.com} (\href{https://orcid.org/0000-0003-0086-0687}{ORCID}) [contributor] + \item Ben Langmead \email{langmea@cs.jhu.edu} (\href{https://orcid.org/0000-0003-2437-1976}{ORCID}) [contributor] +} + } -\keyword{package} +\keyword{internal} diff --git a/man/recount_abstract.Rd b/man/recount_abstract.Rd index 6e2dbe0..0815f91 100644 --- a/man/recount_abstract.Rd +++ b/man/recount_abstract.Rd @@ -4,14 +4,16 @@ \name{recount_abstract} \alias{recount_abstract} \title{Summary information at the project level for the recount project} -\format{A data.frame with 4 columns. +\format{ +A data.frame with 4 columns. \describe{ \item{number_samples }{ the number of samples in the study,} \item{species }{ the species of the study,} \item{abstract }{ the abstract text as provided by the SRAdb Bioconductor package,} \item{project }{ the SRA project id.} -}} +} +} \description{ A data.frame with summary information at the project level for the studies analyzed in the recount project. diff --git a/man/recount_exons.Rd b/man/recount_exons.Rd index 09b1ca4..26843d1 100644 --- a/man/recount_exons.Rd +++ b/man/recount_exons.Rd @@ -4,8 +4,10 @@ \name{recount_exons} \alias{recount_exons} \title{Exon annotation used in recount} -\format{A \link[GenomicRanges:GRangesList-class]{GRangesList-class} with one element per -gene. The names match the gene Gencode v25 ids.} +\format{ +A \link[GenomicRanges:GRangesList-class]{GRangesList-class} with one element per +gene. The names match the gene Gencode v25 ids. +} \description{ Exon annotation extracted from Gencode v25 (GRCh38.p7) used in recount. Data extraced on October 12th, 2017. 
diff --git a/man/recount_genes.Rd b/man/recount_genes.Rd index 5433cd3..be3d411 100644 --- a/man/recount_genes.Rd +++ b/man/recount_genes.Rd @@ -4,7 +4,8 @@ \name{recount_genes} \alias{recount_genes} \title{Gene annotation used in recount} -\format{A \link[GenomicRanges:GRanges-class]{GRanges-class} with one range per gene. The +\format{ +A \link[GenomicRanges:GRanges-class]{GRanges-class} with one range per gene. The names match their Gencode v25 ids. The \link[GenomicRanges:GRanges-class]{GRanges-class} has three metadata columns. \describe{ @@ -12,7 +13,8 @@ has three metadata columns. \item{bp_length }{ the sum of the width of the disjoint exons for that given gene.} \item{symbol }{ a CharacterList with the corresponding gene symbols.} -}} +} +} \description{ Gene annotation extracted from Gencode v25 (GRCh38.p7) used in recount. Data extraced on October 12th, 2017. The symbols were updated compared diff --git a/man/recount_url.Rd b/man/recount_url.Rd index c482e22..71557b3 100644 --- a/man/recount_url.Rd +++ b/man/recount_url.Rd @@ -4,7 +4,8 @@ \name{recount_url} \alias{recount_url} \title{Files and URLs hosted by the recount project} -\format{A data.frame with 6 columns. +\format{ +A data.frame with 6 columns. \describe{ \item{path }{ the original path to the file before being uploaded,} \item{file_name }{ the file name,} @@ -15,7 +16,8 @@ part of version 1 (reduced exons)}. updated in version 2 (disjoint exons)}. Further details in the recount website documentation tab. \item{url }{ the public URL for the given file.} -}} +} +} \description{ Files and URLs as provided by the recount project. This information is used internally in \link{download_study}. 
diff --git a/man/rse_gene_SRP009615.Rd b/man/rse_gene_SRP009615.Rd index 1c04076..a52d2cc 100644 --- a/man/rse_gene_SRP009615.Rd +++ b/man/rse_gene_SRP009615.Rd @@ -4,9 +4,11 @@ \name{rse_gene_SRP009615} \alias{rse_gene_SRP009615} \title{RangedSummarizedExperiment at the gene level for study SRP009615} -\format{A \link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment-class} as +\format{ +A \link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment-class} as created by the recount project for study with SRA id (accession number) -SRP009615.} +SRP009615. +} \description{ \link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment-class} at the gene level for study SRP009615. Used as an example in \link{scale_counts}. diff --git a/man/scale_counts.Rd b/man/scale_counts.Rd index cdf4ef5..a7df4c6 100644 --- a/man/scale_counts.Rd +++ b/man/scale_counts.Rd @@ -4,8 +4,14 @@ \alias{scale_counts} \title{Scale the raw counts provided by the recount project} \usage{ -scale_counts(rse, by = "auc", targetSize = 4e+07, L = 100, - factor_only = FALSE, round = TRUE) +scale_counts( + rse, + by = "auc", + targetSize = 4e+07, + L = 100, + factor_only = FALSE, + round = TRUE +) } \arguments{ \item{rse}{A \link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment-class} @@ -60,7 +66,7 @@ rse_gene <- rse_gene_SRP009615 rse <- scale_counts(rse_gene) ## Find the project used as an example -project_info <- abstract_search('GSE32465') +project_info <- abstract_search("GSE32465") ## See some summary information for this project project_info @@ -71,7 +77,7 @@ project_info download_study(project_info$project) ## Load file -load(file.path(project_info$project, 'rse_gene.Rdata')) +load(file.path(project_info$project, "rse_gene.Rdata")) identical(rse_gene, rse_gene_SRP009615) } diff --git a/man/snaptron_query.Rd b/man/snaptron_query.Rd index 2778bc2..0ea6a31 100644 --- 
a/man/snaptron_query.Rd +++ b/man/snaptron_query.Rd @@ -4,8 +4,7 @@ \alias{snaptron_query} \title{Query Snaptron to get data from exon-exon junctions present in Intropolis} \usage{ -snaptron_query(junctions, version = "srav1", verbose = TRUE, - async = TRUE) +snaptron_query(junctions, version = "srav1", verbose = TRUE, async = TRUE) } \arguments{ \item{junctions}{A \link[GenomicRanges:GRanges-class]{GRanges-class} object with the @@ -39,26 +38,28 @@ that are available via Intropolis as described in the vignette. } \examples{ -library('GenomicRanges') +library("GenomicRanges") ## Define some exon-exon junctions (hg19 coordinates) -junctions <- GRanges(seqnames = 'chr2', IRanges( +junctions <- GRanges(seqnames = "chr2", IRanges( start = c(28971710:28971712, 29555081:29555083, 29754982:29754984), - end = c(29462417:29462419, 29923338:29923340, 29917714:29917716))) + end = c(29462417:29462419, 29923338:29923340, 29917714:29917716) +)) ## Check against Snaptron SRA version 1 (hg19 coordinates) snaptron_query(junctions) - \dontrun{ -## Check another set of junctions against SRA version 2 (more data, hg38 +## Check another set of junctions against SRA version 2 (more data, hg38 ## coordinates) -junctions_v2 <- GRanges(seqnames = 'chr2', IRanges( - start = 29532116:29532118, end = 29694848:29694850)) -snaptron_query(junctions_v2, version = 'srav2') +junctions_v2 <- GRanges(seqnames = "chr2", IRanges( + start = 29532116:29532118, end = 29694848:29694850 +)) +snaptron_query(junctions_v2, version = "srav2") ## Check these junctions in GTEx and TCGA data -snaptron_query(junctions_v2, version = 'gtex') -snaptron_query(junctions_v2, version = 'tcga') +snaptron_query(junctions_v2, version = "gtex") +snaptron_query(junctions_v2, version = "tcga") } + } \references{ Please cite \url{http://snaptron.cs.jhu.edu} diff --git a/tests/test-all.R b/tests/test-all.R index fb1bd0d..1ea8ad0 100644 --- a/tests/test-all.R +++ b/tests/test-all.R @@ -1,7 +1,7 @@ ## Disable the tests if the 
system variable 'R_DISABLE_TESTS' is set to TRUE -flag <- as.logical(Sys.getenv('R_DISABLE_TESTS')) -if(is.na(flag) | flag == FALSE) { - library('testthat') - test_check('recount') +flag <- as.logical(Sys.getenv("R_DISABLE_TESTS")) +if (is.na(flag) | flag == FALSE) { + library("testthat") + test_check("recount") } diff --git a/tests/testthat/test-annotation.R b/tests/testthat/test-annotation.R index 2b5a57f..906b9ae 100644 --- a/tests/testthat/test-annotation.R +++ b/tests/testthat/test-annotation.R @@ -1,20 +1,20 @@ -context('Annotation') +context("Annotation") -library('GenomicRanges') +library("GenomicRanges") -ranges_both <- reproduce_ranges('both', db = 'Gencode.v25') +ranges_both <- reproduce_ranges("both", db = "Gencode.v25") exons <- ranges_both$exon genes <- ranges_both$gene -exons_ensembl <- reproduce_ranges('exon', db = 'EnsDb.Hsapiens.v79') +exons_ensembl <- reproduce_ranges("exon", db = "EnsDb.Hsapiens.v79") -test_that('Annotation length', { +test_that("Annotation length", { expect_equal(length(exons), length(genes)) expect_gte(length(genes), length(recount_genes)) expect_gte(length(exons), length(recount_exons)) expect_gte(length(exons_ensembl), 65774) - expect_equal('ENSG00000000003' %in% names(exons_ensembl), TRUE) + expect_equal("ENSG00000000003" %in% names(exons_ensembl), TRUE) expect_equal(length(genome(recount_exons)), 25) - expect_equal('GRCh38' %in% genome(exons_ensembl), TRUE) - expect_equal(colnames(mcols(recount_genes)), c('gene_id', 'bp_length', 'symbol')) - expect_equal(colnames(mcols(genes)), c('gene_id', 'bp_length', 'symbol')) + expect_equal("GRCh38" %in% genome(exons_ensembl), TRUE) + expect_equal(colnames(mcols(recount_genes)), c("gene_id", "bp_length", "symbol")) + expect_equal(colnames(mcols(genes)), c("gene_id", "bp_length", "symbol")) }) diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R index a2acd10..047cbf3 100644 --- a/tests/testthat/test-data.R +++ b/tests/testthat/test-data.R @@ -1,12 +1,12 @@ 
-context('Download data, scale, ER-level') +context("Download data, scale, ER-level") -library('SummarizedExperiment') +library("SummarizedExperiment") ## Download the example data included in the package -url <- download_study('SRP009615', outdir = file.path(tempdir(), 'SRP009615')) +url <- download_study("SRP009615", outdir = file.path(tempdir(), "SRP009615")) ## Load the data -load(file.path(tempdir(), 'SRP009615', 'rse_gene.Rdata')) +load(file.path(tempdir(), "SRP009615", "rse_gene.Rdata")) ## Check size once: # > file.path(tempdir(), 'SRP009615', 'rse_gene.Rdata') @@ -14,101 +14,136 @@ load(file.path(tempdir(), 'SRP009615', 'rse_gene.Rdata')) # > # > system('ls -lh /var/folders/cx/n9s558kx6fb7jf5z_pgszgb80000gn/T//Rtmptm6u0K/SRP009615/rse_gene.Rdata') # -rw-r--r-- 1 lcollado staff 3.0M Apr 27 16:35 /var/folders/cx/n9s558kx6fb7jf5z_pgszgb80000gn/T//Rtmptm6u0K/SRP009615/rse_gene.Rdata -unlink(file.path(tempdir(), 'SRP009615', 'rse_gene.Rdata')) +unlink(file.path(tempdir(), "SRP009615", "rse_gene.Rdata")) ## Compare the data -test_that('Example RSE', { +test_that("Example RSE", { expect_equivalent(rse_gene, rse_gene_SRP009615) }) ## Temporary download URLs -test_that('Download URLs', { - expect_equal(download_study('DRP000366', type = 'mean', download = FALSE), - 'http://duffel.rail.bio/recount/DRP000366/bw/mean_DRP000366.bw') - expect_equal(download_study('DRP000366', type = 'samples', - download = FALSE), - 'http://duffel.rail.bio/recount/DRP000366/bw/DRR000897.bw') - expect_equal(download_study('DRP000366', type = 'rse-fc', download = FALSE), - 'http://idies.jhu.edu/recount/data/fc_rc/rse_fc_DRP000366.Rdata') +test_that("Download URLs", { + expect_equal( + download_study("DRP000366", type = "mean", download = FALSE), + "http://duffel.rail.bio/recount/DRP000366/bw/mean_DRP000366.bw" + ) + expect_equal( + download_study("DRP000366", + type = "samples", + download = FALSE + ), + "http://duffel.rail.bio/recount/DRP000366/bw/DRR000897.bw" + ) + expect_equal( + 
download_study("DRP000366", type = "rse-fc", download = FALSE), + "http://idies.jhu.edu/recount/data/fc_rc/rse_fc_DRP000366.Rdata" + ) }) ## Test downloading a small project entirely -tmpdir <- file.path(tempdir(), 'SRP002001') -urls <- download_study('SRP002001', type = 'all', outdir = tmpdir) -expected_urls <- paste0('http://duffel.rail.bio/recount/', - rep(rep(c('v2/', ''), 3), c(2, 1, 3, 2, 1, 2)), 'SRP002001/', - c('rse_gene.Rdata', 'rse_exon.Rdata', 'rse_jx.Rdata', - 'rse_tx.RData', 'counts_gene.tsv.gz', - 'counts_exon.tsv.gz', 'counts_jx.tsv.gz', - 'SRP002001.tsv', 'files_info.tsv', - 'bw/SRR036661.bw', 'bw/mean_SRP002001.bw')) -names(expected_urls) <- c('rse-gene', 'rse-exon', 'rse-jx', 'rse-tx', - 'counts-gene', 'counts-exon', 'counts-jx', - 'phenotype', 'files-info', 'samples', 'mean') +tmpdir <- file.path(tempdir(), "SRP002001") +urls <- download_study("SRP002001", type = "all", outdir = tmpdir) +expected_urls <- paste0( + "http://duffel.rail.bio/recount/", + rep(rep(c("v2/", ""), 3), c(2, 1, 3, 2, 1, 2)), "SRP002001/", + c( + "rse_gene.Rdata", "rse_exon.Rdata", "rse_jx.Rdata", + "rse_tx.RData", "counts_gene.tsv.gz", + "counts_exon.tsv.gz", "counts_jx.tsv.gz", + "SRP002001.tsv", "files_info.tsv", + "bw/SRR036661.bw", "bw/mean_SRP002001.bw" + ) +) +names(expected_urls) <- c( + "rse-gene", "rse-exon", "rse-jx", "rse-tx", + "counts-gene", "counts-exon", "counts-jx", + "phenotype", "files-info", "samples", "mean" +) ## Compute md5sum locally -localfiles <- c(list.files(tmpdir, '[.]', full.names = TRUE), - dir(file.path(tmpdir, 'bw'), full.names = TRUE)) -names(localfiles) <- c(list.files(tmpdir, '[.]'), dir(file.path(tmpdir, 'bw'))) +localfiles <- c( + list.files(tmpdir, "[.]", full.names = TRUE), + dir(file.path(tmpdir, "bw"), full.names = TRUE) +) +names(localfiles) <- c(list.files(tmpdir, "[.]"), dir(file.path(tmpdir, "bw"))) -library('tools') +library("tools") md5 <- sapply(localfiles, md5sum) names(md5) <- names(localfiles) ## md5sum doesn't match for 
jx files? -md5 <- md5[-which(names(md5) %in% c('files_info.tsv', 'counts_jx.tsv.gz', - 'rse_jx.Rdata'))] +md5 <- md5[-which(names(md5) %in% c( + "files_info.tsv", "counts_jx.tsv.gz", + "rse_jx.Rdata" +))] ## Get original md5sum -fileinfo <- read.table(file.path(tmpdir, 'files_info.tsv'), header = TRUE, - stringsAsFactors = FALSE) +fileinfo <- read.table(file.path(tmpdir, "files_info.tsv"), + header = TRUE, + stringsAsFactors = FALSE +) -test_that('Project SRP002001', { +test_that("Project SRP002001", { expect_equal(urls, expected_urls) expect_equivalent(fileinfo$md5sum[match(names(md5), fileinfo$file)], md5) }) scaleFac <- scale_counts(rse_gene_SRP009615, factor_only = TRUE) -scaleFac_mapped <- scale_counts(rse_gene_SRP009615, by = 'mapped_reads', - factor_only = TRUE) +scaleFac_mapped <- scale_counts(rse_gene_SRP009615, + by = "mapped_reads", + factor_only = TRUE +) rse <- scale_counts(rse_gene_SRP009615, round = FALSE) -test_that('Scaling', { - expect_equal(round(head(scaleFac), 2), c('SRR387777' = 0.04, - 'SRR387778' = 0.03, 'SRR387779' = 0.03, 'SRR387780' = 0.04, - 'SRR389077' = 0.04, 'SRR389078' = 0.04)) +test_that("Scaling", { + expect_equal(round(head(scaleFac), 2), c( + "SRR387777" = 0.04, + "SRR387778" = 0.03, "SRR387779" = 0.03, "SRR387780" = 0.04, + "SRR389077" = 0.04, "SRR389078" = 0.04 + )) expect_gt(scaleFac_mapped[1], scaleFac[1]) expect_gt(scaleFac_mapped[2], scaleFac[2]) expect_gt(scaleFac_mapped[3], scaleFac[3]) expect_equal(assay(rse, 1) / matrix(rep(scaleFac, each = 58037), - ncol = 12), assay(rse_gene_SRP009615, 1)) + ncol = 12 + ), assay(rse_gene_SRP009615, 1)) }) -if(.Platform$OS.type != 'windows') { - regions <- expressed_regions('SRP002001', 'chrY', cutoff = 5) +if (.Platform$OS.type != "windows") { + regions <- expressed_regions("SRP002001", "chrY", cutoff = 5) ## Artificially remove the mean coverage file so that the file will have to ## get downloaded on the first test, then it'll be present for the second ## test - 
unlink(localfiles['mean_SRP002001.bw']) - - test_that('Expressed regions', { - expect_equal(regions, - expressed_regions('SRP002001', 'chrY', cutoff = 5, outdir = tmpdir)) - expect_equal(regions, - expressed_regions('SRP002001', 'chrY', cutoff = 5, outdir = tmpdir)) + unlink(localfiles["mean_SRP002001.bw"]) + + test_that("Expressed regions", { + expect_equal( + regions, + expressed_regions("SRP002001", "chrY", cutoff = 5, outdir = tmpdir) + ) + expect_equal( + regions, + expressed_regions("SRP002001", "chrY", cutoff = 5, outdir = tmpdir) + ) }) - rse_ER <- coverage_matrix('SRP002001', 'chrY', regions) + rse_ER <- coverage_matrix("SRP002001", "chrY", regions) ## Same for the phenotype data and the sample bigwig file - unlink(localfiles['SRP002001.tsv']) - unlink(localfiles['SRR036661.bw']) - - test_that('Coverage matrix', { - expect_equal(rse_ER, - coverage_matrix('SRP002001', 'chrY', regions, outdir = tmpdir)) - expect_equal(rse_ER, - coverage_matrix('SRP002001', 'chrY', regions, outdir = tmpdir, - chunksize = 500)) + unlink(localfiles["SRP002001.tsv"]) + unlink(localfiles["SRR036661.bw"]) + + test_that("Coverage matrix", { + expect_equal( + rse_ER, + coverage_matrix("SRP002001", "chrY", regions, outdir = tmpdir) + ) + expect_equal( + rse_ER, + coverage_matrix("SRP002001", "chrY", regions, + outdir = tmpdir, + chunksize = 500 + ) + ) }) } @@ -121,78 +156,88 @@ unlink(tmpdir, recursive = TRUE) metadata <- all_metadata() -test_that('All metadata', { +test_that("All metadata", { expect_equal(nrow(metadata), 50099) }) -phenoFile <- download_study(project = 'SRP012289', type = 'phenotype', - download = FALSE) -pheno <- read.table(phenoFile, header = TRUE, stringsAsFactors = FALSE, - sep = '\t') -test_that('Correct phenotype information', { +phenoFile <- download_study( + project = "SRP012289", type = "phenotype", + download = FALSE +) +pheno <- read.table(phenoFile, + header = TRUE, stringsAsFactors = FALSE, + sep = "\t" +) +test_that("Correct phenotype information", { 
expect_equal(pheno$auc, 159080954) }) ## Test Snaptron -library('GenomicRanges') -junctions <- GRanges(seqnames = 'chr2', IRanges( +library("GenomicRanges") +junctions <- GRanges(seqnames = "chr2", IRanges( start = c(28971710:28971712, 29555081:29555083, 29754982:29754984), - end = c(29462417:29462419, 29923338:29923340, 29917714:29917716))) - -junctions_v2 <- GRanges(seqnames = 'chr2', IRanges( - start = 29532116:29532118, end = 29694848:29694850)) + end = c(29462417:29462419, 29923338:29923340, 29917714:29917716) +)) + +junctions_v2 <- GRanges(seqnames = "chr2", IRanges( + start = 29532116:29532118, end = 29694848:29694850 +)) snap <- snaptron_query(junctions) -snap_v2 <- snaptron_query(junctions_v2, version = 'srav2') -snap_gtex <- snaptron_query(junctions_v2, version = 'gtex') -snap_tcga <- snaptron_query(junctions_v2, version = 'tcga') +snap_v2 <- snaptron_query(junctions_v2, version = "srav2") +snap_gtex <- snaptron_query(junctions_v2, version = "gtex") +snap_tcga <- snaptron_query(junctions_v2, version = "tcga") -test_that('Snaptron', { +test_that("Snaptron", { expect_equal(length(snap), 3) expect_equal(ncol(mcols(snap)), 14) expect_equal(snap$left_annotated[[1]], as.character(NA)) expect_equal(snaptron_query(junctions[1], verbose = FALSE), NULL) - expect_equal(is(snap_v2$annotated, 'CompressedCharacterList'), TRUE) + expect_equal(is(snap_v2$annotated, "CompressedCharacterList"), TRUE) expect_equal(snaptron_query(junctions_v2, verbose = FALSE), NULL) - expect_equal(snap_gtex$type == 'GTEx:I', TRUE) - expect_equal(snap_tcga$type == 'TCGA:I', TRUE) + expect_equal(snap_gtex$type == "GTEx:I", TRUE) + expect_equal(snap_tcga$type == "TCGA:I", TRUE) }) ## Weird pheno files ## First 2 are separate from the rest -projects <- c('SRP036843', 'SRP029334', 'SRP050563', 'SRP055438', 'SRP055749', - 'SRP058120', 'SRP005342', 'SRP007508', 'SRP015668') -sapply(projects, download_study, type = 'phenotype') -phenoFiles <- sapply(projects, function(x) { file.path(x, paste0(x, 
'.tsv')) }) +projects <- c( + "SRP036843", "SRP029334", "SRP050563", "SRP055438", "SRP055749", + "SRP058120", "SRP005342", "SRP007508", "SRP015668" +) +sapply(projects, download_study, type = "phenotype") +phenoFiles <- sapply(projects, function(x) { + file.path(x, paste0(x, ".tsv")) +}) pheno <- mapply(recount:::.read_pheno, phenoFiles, projects, SIMPLIFY = FALSE) -pheno_tcga <- recount:::.read_pheno('doesntexist', 'TCGA') -test_that('Weird pheno files', { +pheno_tcga <- recount:::.read_pheno("doesntexist", "TCGA") +test_that("Weird pheno files", { expect_equivalent(sapply(pheno, nrow), c(3, 5, 4, 33, 16, 30, 12, 5, 33)) expect_equal(dim(pheno_tcga), c(11284, 864)) }) -test_that('RPKM', { - expect_equal(getRPKM(rse_gene_SRP009615)[1, ], assays(rse_gene_SRP009615)$count[1, ] / (rowData(rse_gene_SRP009615)$bp_length[1] / 1000) / (colSums(assays(rse_gene_SRP009615)$count)/1e6)) - expect_equal(getRPKM(rse_gene_SRP009615, length_var = NULL)[1, ], assays(rse_gene_SRP009615)$count[1, ] / (width(rowRanges(rse_gene_SRP009615))[1] / 1000) / (colSums(assays(rse_gene_SRP009615)$count)/1e6)) - expect_equal(getRPKM(rse_gene_SRP009615, mapped_var = 'mapped_read_count')[1, ], assays(rse_gene_SRP009615)$count[1, ] / (rowData(rse_gene_SRP009615)$bp_length[1] / 1000) / (colData(rse_gene_SRP009615)$mapped_read_count/1e6)) - expect_equal(getRPKM(rse_gene_SRP009615, length_var = NULL, mapped_var = 'mapped_read_count')[1, ], assays(rse_gene_SRP009615)$count[1, ] / (width(rowRanges(rse_gene_SRP009615))[1] / 1000) / (colData(rse_gene_SRP009615)$mapped_read_count/1e6)) +test_that("RPKM", { + expect_equal(getRPKM(rse_gene_SRP009615)[1, ], assays(rse_gene_SRP009615)$count[1, ] / (rowData(rse_gene_SRP009615)$bp_length[1] / 1000) / (colSums(assays(rse_gene_SRP009615)$count) / 1e6)) + expect_equal(getRPKM(rse_gene_SRP009615, length_var = NULL)[1, ], assays(rse_gene_SRP009615)$count[1, ] / (width(rowRanges(rse_gene_SRP009615))[1] / 1000) / (colSums(assays(rse_gene_SRP009615)$count) / 1e6)) + 
expect_equal(getRPKM(rse_gene_SRP009615, mapped_var = "mapped_read_count")[1, ], assays(rse_gene_SRP009615)$count[1, ] / (rowData(rse_gene_SRP009615)$bp_length[1] / 1000) / (colData(rse_gene_SRP009615)$mapped_read_count / 1e6)) + expect_equal(getRPKM(rse_gene_SRP009615, length_var = NULL, mapped_var = "mapped_read_count")[1, ], assays(rse_gene_SRP009615)$count[1, ] / (width(rowRanges(rse_gene_SRP009615))[1] / 1000) / (colData(rse_gene_SRP009615)$mapped_read_count / 1e6)) }) rse_pred <- add_predictions(rse_gene_SRP009615) -test_that('Predictions', { - expect_equal(all(c('reported_sex', 'predicted_sex', 'accuracy_sex', 'reported_samplesource', 'predicted_samplesource', 'accuracy_samplesource', 'reported_tissue', 'predicted_tissue', 'accuracy_tissue', 'reported_sequencingstrategy', 'predicted_sequencingstrategy', 'accuracy_sequencingstrategy') %in% colnames(colData(rse_pred))), TRUE) +test_that("Predictions", { + expect_equal(all(c("reported_sex", "predicted_sex", "accuracy_sex", "reported_samplesource", "predicted_samplesource", "accuracy_samplesource", "reported_tissue", "predicted_tissue", "accuracy_tissue", "reported_sequencingstrategy", "predicted_sequencingstrategy", "accuracy_sequencingstrategy") %in% colnames(colData(rse_pred))), TRUE) }) ## read_counts -download_study('DRP000499') -load('DRP000499/rse_gene.Rdata') -test_that('read_counts', { +download_study("DRP000499") +load("DRP000499/rse_gene.Rdata") +test_that("read_counts", { expect_equal(all(colSums(assays(read_counts(rse_gene, use_paired_end = FALSE))$counts) / colSums(assays(read_counts(rse_gene))$counts) == 2), TRUE) - expect_equal(all(sign(colSums(assays(read_counts(rse_gene))$counts) / 1e6 - colData(rse_gene)$reads_downloaded / 1e6 /2) == -1), TRUE) + expect_equal(all(sign(colSums(assays(read_counts(rse_gene))$counts) / 1e6 - colData(rse_gene)$reads_downloaded / 1e6 / 2) == -1), TRUE) }) ## Clean up -for(p in c(projects, 'DRP000499')) unlink(p, recursive = TRUE) +for (p in c(projects, "DRP000499")) 
unlink(p, recursive = TRUE) diff --git a/tests/testthat/test-download_retry.R b/tests/testthat/test-download_retry.R index af1ba82..564424b 100644 --- a/tests/testthat/test-download_retry.R +++ b/tests/testthat/test-download_retry.R @@ -1,5 +1,5 @@ test_that("Downloading with retries", { - expect_error(download_retry('http://duffel.rail.bio/recount/v2/DRP000366/files_info.tsv_fake', N.TRIES = 1)) - expect_error(download_retry('http://duffel.rail.bio/recount/v2/DRP000366/files_info.tsv', N.TRIES = 0)) - expect_error(download_retry('http://duffel.rail.bio/recount/v2/DRP000366/files_info.tsv', N.TRIES = -1L)) + expect_error(download_retry("http://duffel.rail.bio/recount/v2/DRP000366/files_info.tsv_fake", N.TRIES = 1)) + expect_error(download_retry("http://duffel.rail.bio/recount/v2/DRP000366/files_info.tsv", N.TRIES = 0)) + expect_error(download_retry("http://duffel.rail.bio/recount/v2/DRP000366/files_info.tsv", N.TRIES = -1L)) }) diff --git a/tests/testthat/test-misc.R b/tests/testthat/test-misc.R index 5bfae2b..652542c 100644 --- a/tests/testthat/test-misc.R +++ b/tests/testthat/test-misc.R @@ -1,36 +1,37 @@ -context('Misc') +context("Misc") -test_that('Loading packages', { - expect_equal(recount:::.load_check('recount'), NULL) - expect_error(recount:::.load_check('somecrazyname')) +test_that("Loading packages", { + expect_equal(recount:::.load_check("recount"), NULL) + expect_error(recount:::.load_check("somecrazyname")) }) -info <- geo_info('GSM836270') -chars <- data.frame('cells' = 'K562', 'shrna.expression' = 'no', - 'treatment' = 'Puromycin', stringsAsFactors = FALSE) -test_that('Geo info', { - expect_equal(info, geo_info('GSM836270', TRUE)) - expect_equal(info$geo_accession, 'GSM836270') - expect_equal(find_geo('SRX110461'), 'GSM836270') - expect_equal(find_geo(''), NA) +info <- geo_info("GSM836270") +chars <- data.frame( + "cells" = "K562", "shrna.expression" = "no", + "treatment" = "Puromycin", stringsAsFactors = FALSE +) +test_that("Geo info", { + 
expect_equal(info, geo_info("GSM836270", TRUE)) + expect_equal(info$geo_accession, "GSM836270") + expect_equal(find_geo("SRX110461"), "GSM836270") + expect_equal(find_geo(""), NA) expect_equal(geo_characteristics(info), chars) expect_equal(geo_info(NA), NULL) - expect_equal(colnames(geo_characteristics(geo_info('GSM359183'))), 'characteristics') - #expect_equal(geo_info('GSM1062236'), S4Vectors::DataFrame()) + expect_equal(colnames(geo_characteristics(geo_info("GSM359183"))), "characteristics") + # expect_equal(geo_info('GSM1062236'), S4Vectors::DataFrame()) }) -if(interactive()) { +if (interactive()) { ## Open shiny app when running test in interactive mode. ## This will force the app to be available, which will then run the test ## successfully. ## If there is a better way to test this, please let me know! ## See thread at https://groups.google.com/forum/#!topic/shinyapps-users/ElMO_v1eurQ - browseURL('https://jhubiostatistics.shinyapps.io/recount/') - test_that('Shiny app is up', { + browseURL("https://jhubiostatistics.shinyapps.io/recount/") + test_that("Shiny app is up", { expect_equal( - RCurl::url.exists('https://jhubiostatistics.shinyapps.io/recount/'), + RCurl::url.exists("https://jhubiostatistics.shinyapps.io/recount/"), TRUE ) }) } - diff --git a/tests/testthat/test-web-metadata.R b/tests/testthat/test-web-metadata.R index 3a65c39..cc94d32 100644 --- a/tests/testthat/test-web-metadata.R +++ b/tests/testthat/test-web-metadata.R @@ -1,20 +1,24 @@ -context('Website metadata') +context("Website metadata") -test_that('Browse studies at SRA', { - expect_equal(length(browse_study(c('ERP001942', 'ERP009768'), FALSE)), 2) - expect_equal(browse_study('ERP001942', FALSE), 'https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=ERP001942') +test_that("Browse studies at SRA", { + expect_equal(length(browse_study(c("ERP001942", "ERP009768"), FALSE)), 2) + expect_equal(browse_study("ERP001942", FALSE), "https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=ERP001942") }) 
-test_that('Download studies', { - expect_equal(download_study('ERP001942', download = FALSE), 'http://duffel.rail.bio/recount/v2/ERP001942/rse_gene.Rdata') - expect_equal(download_study('ERP001942', download = FALSE, version = 1), 'http://duffel.rail.bio/recount/ERP001942/rse_gene.Rdata') - expect_equal(download_study('ERP001942', type = 'mean', download = FALSE), 'http://duffel.rail.bio/recount/ERP001942/bw/mean_ERP001942.bw') +test_that("Download studies", { + expect_equal(download_study("ERP001942", download = FALSE), "http://duffel.rail.bio/recount/v2/ERP001942/rse_gene.Rdata") + expect_equal(download_study("ERP001942", download = FALSE, version = 1), "http://duffel.rail.bio/recount/ERP001942/rse_gene.Rdata") + expect_equal(download_study("ERP001942", type = "mean", download = FALSE), "http://duffel.rail.bio/recount/ERP001942/bw/mean_ERP001942.bw") }) -test_that('Finding abstracts', { - expect_equal(abstract_search('Geuvadis consortium', id_only = TRUE), - 'ERP001942') - expect_equal(abstract_search('Geuvadis consortium'), - recount_abstract[recount_abstract$project == 'ERP001942', ]) +test_that("Finding abstracts", { + expect_equal( + abstract_search("Geuvadis consortium", id_only = TRUE), + "ERP001942" + ) + expect_equal( + abstract_search("Geuvadis consortium"), + recount_abstract[recount_abstract$project == "ERP001942", ] + ) }) diff --git a/vignettes/recount-quickstart.Rmd b/vignettes/recount-quickstart.Rmd index db25be7..a81ce81 100644 --- a/vignettes/recount-quickstart.Rmd +++ b/vignettes/recount-quickstart.Rmd @@ -26,62 +26,70 @@ vignette: > startTime <- Sys.time() ## Bib setup -library('knitcitations') +library("knitcitations") ## Load knitcitations with a clean bibliography cleanbib() -cite_options(hyperlink = 'to.doc', citation_format = 'text', style = 'html') +cite_options(hyperlink = "to.doc", citation_format = "text", style = "html") # Note links won't show for now due to the following issue # https://github.com/cboettig/knitcitations/issues/63 
## Write bibliography information bib <- c( R = citation(), - AnnotationDbi = RefManageR::BibEntry(bibtype = 'manual', - key = 'AnnotationDbi', - author = 'Hervé Pagès and Marc Carlson and Seth Falcon and Nianhua Li', - title = 'AnnotationDbi: Annotation Database Interface', - year = 2017, doi = '10.18129/B9.bioc.AnnotationDbi'), - BiocParallel = citation('BiocParallel'), - BiocStyle = citation('BiocStyle'), - derfinder = citation('derfinder')[1], - DESeq2 = citation('DESeq2'), - sessioninfo = citation('sessioninfo'), - downloader = citation('downloader'), - EnsDb.Hsapiens.v79 = citation('EnsDb.Hsapiens.v79'), - GEOquery = citation('GEOquery'), - GenomeInfoDb = RefManageR::BibEntry(bibtype = 'manual', - key = 'GenomeInfoDb', - author = 'Sonali Arora and Martin Morgan and Marc Carlson and H. Pagès', + AnnotationDbi = RefManageR::BibEntry( + bibtype = "manual", + key = "AnnotationDbi", + author = "Hervé Pagès and Marc Carlson and Seth Falcon and Nianhua Li", + title = "AnnotationDbi: Annotation Database Interface", + year = 2017, doi = "10.18129/B9.bioc.AnnotationDbi" + ), + BiocParallel = citation("BiocParallel"), + BiocStyle = citation("BiocStyle"), + derfinder = citation("derfinder")[1], + DESeq2 = citation("DESeq2"), + sessioninfo = citation("sessioninfo"), + downloader = citation("downloader"), + EnsDb.Hsapiens.v79 = citation("EnsDb.Hsapiens.v79"), + GEOquery = citation("GEOquery"), + GenomeInfoDb = RefManageR::BibEntry( + bibtype = "manual", + key = "GenomeInfoDb", + author = "Sonali Arora and Martin Morgan and Marc Carlson and H. 
Pagès", title = "GenomeInfoDb: Utilities for manipulating chromosome and other 'seqname' identifiers", - year = 2017, doi = '10.18129/B9.bioc.GenomeInfoDb'), - GenomicFeatures = citation('GenomicFeatures'), - GenomicRanges = citation('GenomicRanges'), - IRanges = citation('IRanges'), - knitcitations = citation('knitcitations'), - knitr = citation('knitr')[3], - org.Hs.eg.db = citation('org.Hs.eg.db'), - RCurl = citation('RCurl'), - recount = citation('recount')[1], - recountworkflow = citation('recount')[2], - phenopredict = citation('recount')[3], - regionReport = citation('regionReport'), - rentrez = citation('rentrez'), - rmarkdown = citation('rmarkdown'), - rtracklayer = citation('rtracklayer'), - S4Vectors = RefManageR::BibEntry(bibtype = 'manual', key = 'S4Vectors', - author = 'Hervé Pagès and Michael Lawrence and Patrick Aboyoun', + year = 2017, doi = "10.18129/B9.bioc.GenomeInfoDb" + ), + GenomicFeatures = citation("GenomicFeatures"), + GenomicRanges = citation("GenomicRanges"), + IRanges = citation("IRanges"), + knitcitations = citation("knitcitations"), + knitr = citation("knitr")[3], + org.Hs.eg.db = citation("org.Hs.eg.db"), + RCurl = citation("RCurl"), + recount = citation("recount")[1], + recountworkflow = citation("recount")[2], + phenopredict = citation("recount")[3], + regionReport = citation("regionReport"), + rentrez = citation("rentrez"), + rmarkdown = citation("rmarkdown")[1], + rtracklayer = citation("rtracklayer"), + S4Vectors = RefManageR::BibEntry( + bibtype = "manual", key = "S4Vectors", + author = "Hervé Pagès and Michael Lawrence and Patrick Aboyoun", title = "S4Vectors: S4 implementation of vector-like and list-like objects", - year = 2017, doi = '10.18129/B9.bioc.S4Vectors'), - SummarizedExperiment = RefManageR::BibEntry(bibtype = 'manual', - key = 'SummarizedExperiment', - author = 'Martin Morgan and Valerie Obenchain and Jim Hester and Hervé Pagès', - title = 'SummarizedExperiment: SummarizedExperiment container', - year = 2017, doi 
= '10.18129/B9.bioc.SummarizedExperiment'), - testthat = citation('testthat') + year = 2017, doi = "10.18129/B9.bioc.S4Vectors" + ), + SummarizedExperiment = RefManageR::BibEntry( + bibtype = "manual", + key = "SummarizedExperiment", + author = "Martin Morgan and Valerie Obenchain and Jim Hester and Hervé Pagès", + title = "SummarizedExperiment: SummarizedExperiment container", + year = 2017, doi = "10.18129/B9.bioc.SummarizedExperiment" + ), + testthat = citation("testthat") ) -write.bibtex(bib, file = 'quickstartRef.bib') +write.bibtex(bib, file = "quickstartRef.bib") ``` # Basics @@ -91,8 +99,9 @@ write.bibtex(bib, file = 'quickstartRef.bib') `R` is an open-source statistical environment which can be easily modified to enhance its functionality via packages. `r Biocpkg('recount')` is a `R` package available via the [Bioconductor](http://bioconductor/packages/recount) repository for packages. `R` can be installed on any operating system from [CRAN](https://cran.r-project.org/) after which you can install `r Biocpkg('recount')` by using the following commands in your `R` session: ```{r 'installDer', eval = FALSE} -if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager") +if (!requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") + } BiocManager::install("recount") @@ -124,7 +133,7 @@ We hope that `r Biocpkg('recount')` will be useful for your research. Please use ```{r 'citation'} ## Citation info -citation('recount') +citation("recount") ``` # Quick start to using to `r Biocpkg('recount')` @@ -144,16 +153,16 @@ This quick analysis is explained in more detail later on in this document. 
Furth ```{r 'ultraQuick', eval = FALSE} ## Load library -library('recount') +library("recount") ## Find a project of interest -project_info <- abstract_search('GSE32465') +project_info <- abstract_search("GSE32465") ## Download the gene-level RangedSummarizedExperiment data download_study(project_info$project) ## Load the data -load(file.path(project_info$project, 'rse_gene.Rdata')) +load(file.path(project_info$project, "rse_gene.Rdata")) ## Browse the project at SRA browse_study(project_info$project) @@ -167,8 +176,8 @@ geochar <- lapply(split(colData(rse_gene), seq_len(nrow(colData(rse_gene)))), ge ## Note that the information for this study is a little inconsistent, so we ## have to fix it. geochar <- do.call(rbind, lapply(geochar, function(x) { - if('cells' %in% colnames(x)) { - colnames(x)[colnames(x) == 'cells'] <- 'cell.line' + if ("cells" %in% colnames(x)) { + colnames(x)[colnames(x) == "cells"] <- "cell.line" return(x) } else { return(x) @@ -178,9 +187,13 @@ geochar <- do.call(rbind, lapply(geochar, function(x) { ## We can now define some sample information to use sample_info <- data.frame( run = colData(rse_gene)$run, - group = ifelse(grepl('uninduced', colData(rse_gene)$title), 'uninduced', 'induced'), - gene_target = sapply(colData(rse_gene)$title, function(x) { strsplit(strsplit(x, - 'targeting ')[[1]][2], ' gene')[[1]][1] }), + group = ifelse(grepl("uninduced", colData(rse_gene)$title), "uninduced", "induced"), + gene_target = sapply(colData(rse_gene)$title, function(x) { + strsplit(strsplit( + x, + "targeting " + )[[1]][2], " gene")[[1]][1] + }), cell.line = geochar$cell.line ) @@ -192,34 +205,38 @@ colData(rse)$group <- sample_info$group colData(rse)$gene_target <- sample_info$gene_target ## Perform differential expression analysis with DESeq2 -library('DESeq2') +library("DESeq2") ## Specify design and switch to DESeq2 format dds <- DESeqDataSet(rse, ~ gene_target + group) ## Perform DE analysis -dds <- DESeq(dds, test = 'LRT', reduced = ~ 
gene_target, fitType = 'local') +dds <- DESeq(dds, test = "LRT", reduced = ~gene_target, fitType = "local") res <- results(dds) ## Explore results -plotMA(res, main="DESeq2 results for SRP009615") +plotMA(res, main = "DESeq2 results for SRP009615") ## Make a report with the results -library('regionReport') -DESeq2Report(dds, res = res, project = 'SRP009615', - intgroup = c('group', 'gene_target'), outdir = '.', - output = 'SRP009615-results') +library("regionReport") +DESeq2Report(dds, + res = res, project = "SRP009615", + intgroup = c("group", "gene_target"), outdir = ".", + output = "SRP009615-results" +) ``` The [recount project](https://jhubiostatistics.shinyapps.io/recount/) also hosts the necessary data to perform annotation-agnostic differential expression analyses with `r Biocpkg('derfinder')` `r citep(bib[['derfinder']])`. An example analysis would like this: ```{r 'er_analysis', eval = FALSE} ## Define expressed regions for study SRP009615, only for chromosome Y -regions <- expressed_regions('SRP009615', 'chrY', cutoff = 5L, - maxClusterGap = 3000L) +regions <- expressed_regions("SRP009615", "chrY", + cutoff = 5L, + maxClusterGap = 3000L +) ## Compute coverage matrix for study SRP009615, only for chromosome Y -system.time( rse_ER <- coverage_matrix('SRP009615', 'chrY', regions) ) +system.time(rse_ER <- coverage_matrix("SRP009615", "chrY", regions)) ## Round the coverage matrix to integers covMat <- round(assays(rse_ER)$counts, 0) @@ -232,22 +249,28 @@ m <- match(pheno$run, sample_info$run) pheno <- cbind(pheno, sample_info[m, 2:3]) ## Build a DESeqDataSet -dds_ers <- DESeqDataSetFromMatrix(countData = covMat, colData = pheno, - design = ~ gene_target + group) +dds_ers <- DESeqDataSetFromMatrix( + countData = covMat, colData = pheno, + design = ~ gene_target + group +) ## Perform differential expression analysis with DESeq2 at the ER-level -dds_ers <- DESeq(dds_ers, test = 'LRT', reduced = ~ gene_target, - fitType = 'local') +dds_ers <- DESeq(dds_ers, + 
test = "LRT", reduced = ~gene_target, + fitType = "local" +) res_ers <- results(dds_ers) ## Explore results -plotMA(res_ers, main="DESeq2 results for SRP009615 (ER-level, chrY)") +plotMA(res_ers, main = "DESeq2 results for SRP009615 (ER-level, chrY)") ## Create a more extensive exploratory report -DESeq2Report(dds_ers, res = res_ers, - project = 'SRP009615 (ER-level, chrY)', - intgroup = c('group', 'gene_target'), outdir = '.', - output = 'SRP009615-results-ER-level-chrY') +DESeq2Report(dds_ers, + res = res_ers, + project = "SRP009615 (ER-level, chrY)", + intgroup = c("group", "gene_target"), outdir = ".", + output = "SRP009615-results-ER-level-chrY" +) ``` @@ -274,14 +297,14 @@ Next we load the required packages. Loading `r Biocpkg('recount')` will load the ```{r 'start', message=FALSE} ## Load recount R package -library('recount') +library("recount") ``` Lets say that we don't know the actual SRA accession number for this study but we do know a particular term which will help us identify it. If that's the case, we can use the `abstract_search()` function to identify the study of interest as shown below. ```{r 'search_abstract'} ## Find a project of interest -project_info <- abstract_search('GSE32465') +project_info <- abstract_search("GSE32465") ## Explore info project_info @@ -296,7 +319,7 @@ Now that we have a study that we are interested in, we can download the _RangedS download_study(project_info$project) ## Load the data -load(file.path(project_info$project, 'rse_gene.Rdata')) +load(file.path(project_info$project, "rse_gene.Rdata")) ## Delete it if you don't need it anymore unlink(project_info$project, recursive = TRUE) @@ -311,7 +334,7 @@ rse_gene colData(rse_gene) ## At the gene level, the row data includes the gene Gencode ids, the gene -## symbols and the sum of the disjoint exons widths, which can be used for +## symbols and the sum of the disjoint exons widths, which can be used for ## taking into account the gene length. 
rowData(rse_gene) @@ -342,8 +365,8 @@ geochar <- lapply(split(colData(rse_gene), seq_len(nrow(colData(rse_gene)))), ge ## Note that the information for this study is a little inconsistent, so we ## have to fix it. geochar <- do.call(rbind, lapply(geochar, function(x) { - if('cells' %in% colnames(x)) { - colnames(x)[colnames(x) == 'cells'] <- 'cell.line' + if ("cells" %in% colnames(x)) { + colnames(x)[colnames(x) == "cells"] <- "cell.line" return(x) } else { return(x) @@ -353,9 +376,13 @@ geochar <- do.call(rbind, lapply(geochar, function(x) { ## We can now define some sample information to use sample_info <- data.frame( run = colData(rse_gene)$run, - group = ifelse(grepl('uninduced', colData(rse_gene)$title), 'uninduced', 'induced'), - gene_target = sapply(colData(rse_gene)$title, function(x) { strsplit(strsplit(x, - 'targeting ')[[1]][2], ' gene')[[1]][1] }), + group = ifelse(grepl("uninduced", colData(rse_gene)$title), "uninduced", "induced"), + gene_target = sapply(colData(rse_gene)$title, function(x) { + strsplit(strsplit( + x, + "targeting " + )[[1]][2], " gene")[[1]][1] + }), cell.line = geochar$cell.line ) ``` @@ -401,13 +428,13 @@ In this particular analysis, we'll test whether there is a group difference adju ```{r 'de_analysis'} ## Perform differential expression analysis with DESeq2 -library('DESeq2') +library("DESeq2") ## Specify design and switch to DESeq2 format dds <- DESeqDataSet(rse, ~ gene_target + group) ## Perform DE analysis -dds <- DESeq(dds, test = 'LRT', reduced = ~ gene_target, fitType = 'local') +dds <- DESeq(dds, test = "LRT", reduced = ~gene_target, fitType = "local") res <- results(dds) ``` @@ -415,7 +442,7 @@ We can now use functions from `r Biocpkg('DESeq2')` to explore the results. 
For ```{r 'maplot', fig.cap = "MA plot for group differences adjusted by gene target for SRP009615 using gene-level data."} ## Explore results -plotMA(res, main="DESeq2 results for SRP009615") +plotMA(res, main = "DESeq2 results for SRP009615") ``` ## Report results @@ -426,34 +453,39 @@ We can also use the `r Biocpkg('regionReport')` package to generate interactive ```{r 'make_report', eval = FALSE} ## Make a report with the results -library('regionReport') -report <- DESeq2Report(dds, res = res, project = 'SRP009615', - intgroup = c('group', 'gene_target'), outdir = '.', - output = 'SRP009615-results', nBest = 10, nBestFeatures = 2) +library("regionReport") +report <- DESeq2Report(dds, + res = res, project = "SRP009615", + intgroup = c("group", "gene_target"), outdir = ".", + output = "SRP009615-results", nBest = 10, nBestFeatures = 2 +) ``` ```{r 'make_report_real', echo = FALSE, results = 'hide'} -library('regionReport') +library("regionReport") ## Make it so that the report will be available as a vignette -original <- readLines(system.file('DESeq2Exploration', 'DESeq2Exploration.Rmd', - package = 'regionReport')) +original <- readLines(system.file("DESeq2Exploration", "DESeq2Exploration.Rmd", + package = "regionReport" +)) vignetteInfo <- c( - 'vignette: >', - ' %\\VignetteEngine{knitr::rmarkdown}', - ' %\\VignetteIndexEntry{Basic DESeq2 results exploration}', - ' %\\VignetteEncoding{UTF-8}' + "vignette: >", + " %\\VignetteEngine{knitr::rmarkdown}", + " %\\VignetteIndexEntry{Basic DESeq2 results exploration}", + " %\\VignetteEncoding{UTF-8}" ) new <- c(original[1:16], vignetteInfo, original[17:length(original)]) -writeLines(new, 'SRP009615-results-template.Rmd') +writeLines(new, "SRP009615-results-template.Rmd") ## Now create the report -report <- DESeq2Report(dds, res = res, project = 'SRP009615', - intgroup = c('group', 'gene_target'), outdir = '.', - output = 'SRP009615-results', device = 'png', template = 'SRP009615-results-template.Rmd', nBest = 10, 
nBestFeatures = 2) - +report <- DESeq2Report(dds, + res = res, project = "SRP009615", + intgroup = c("group", "gene_target"), outdir = ".", + output = "SRP009615-results", device = "png", template = "SRP009615-results-template.Rmd", nBest = 10, nBestFeatures = 2 +) + ## Clean up -file.remove('SRP009615-results-template.Rmd') +file.remove("SRP009615-results-template.Rmd") ``` You can view the final report [here](SRP009615-results.html). @@ -465,14 +497,16 @@ The gene Gencode ids are included in several objects in `recount`. With these id ```{r 'geneSymbols'} ## Load required library -library('org.Hs.eg.db') +library("org.Hs.eg.db") ## Extract Gencode gene ids -gencode <- gsub('\\..*', '', names(recount_genes)) +gencode <- gsub("\\..*", "", names(recount_genes)) ## Find the gene information we are interested in -gene_info <- select(org.Hs.eg.db, gencode, c('ENTREZID', 'GENENAME', 'SYMBOL', - 'ENSEMBL'), 'ENSEMBL') +gene_info <- select(org.Hs.eg.db, gencode, c( + "ENTREZID", "GENENAME", "SYMBOL", + "ENSEMBL" +), "ENSEMBL") ## Explore part of the results dim(gene_info) @@ -490,8 +524,10 @@ For an annotation-agnostic differential expression analysis, we first need to de ```{r 'define_ers', eval = .Platform$OS.type != 'windows'} ## Define expressed regions for study SRP009615, only for chromosome Y -regions <- expressed_regions('SRP009615', 'chrY', cutoff = 5L, - maxClusterGap = 3000L) +regions <- expressed_regions("SRP009615", "chrY", + cutoff = 5L, + maxClusterGap = 3000L +) ## Briefly explore the resulting regions regions @@ -505,7 +541,7 @@ Having defined the expressed regions, we can now compute a coverage matrix for t ```{r 'compute_covMat', eval = .Platform$OS.type != 'windows'} ## Compute coverage matrix for study SRP009615, only for chromosome Y -system.time( rse_ER <- coverage_matrix('SRP009615', 'chrY', regions) ) +system.time(rse_ER <- coverage_matrix("SRP009615", "chrY", regions)) ## Explore the RSE a bit dim(rse_ER) @@ -541,8 +577,10 @@ Now that we have the 
necessary data, we can construct a `DESeqDataSet` object us ```{r 'ers_dds', eval = .Platform$OS.type != 'windows'} ## Build a DESeqDataSet -dds_ers <- DESeqDataSetFromMatrix(countData = covMat, colData = pheno, - design = ~ gene_target + group) +dds_ers <- DESeqDataSetFromMatrix( + countData = covMat, colData = pheno, + design = ~ gene_target + group +) ``` @@ -552,8 +590,10 @@ With the `DESeqDataSet` object in place, we can then use the function `DESeq()` ```{r 'de_analysis_ers', eval = .Platform$OS.type != 'windows'} ## Perform differential expression analysis with DESeq2 at the ER-level -dds_ers <- DESeq(dds_ers, test = 'LRT', reduced = ~ gene_target, - fitType = 'local') +dds_ers <- DESeq(dds_ers, + test = "LRT", reduced = ~gene_target, + fitType = "local" +) res_ers <- results(dds_ers) ``` @@ -561,17 +601,19 @@ We can then visually explore the results as shown in Figure \@ref(fig:maploters) ```{r 'maploters', eval = .Platform$OS.type != 'windows', fig.cap = "MA plot for group differences adjusted by gene target for SRP009615 using ER-level data (just chrY)."} ## Explore results -plotMA(res_ers, main="DESeq2 results for SRP009615 (ER-level, chrY)") +plotMA(res_ers, main = "DESeq2 results for SRP009615 (ER-level, chrY)") ``` We can also use `r Biocpkg('regionReport')` to create a more extensive exploratory report. ```{r 'report2', eval = FALSE} ## Create the report -report2 <- DESeq2Report(dds_ers, res = res_ers, - project = 'SRP009615 (ER-level, chrY)', - intgroup = c('group', 'gene_target'), outdir = '.', - output = 'SRP009615-results-ER-level-chrY') +report2 <- DESeq2Report(dds_ers, + res = res_ers, + project = "SRP009615 (ER-level, chrY)", + intgroup = c("group", "gene_target"), outdir = ".", + output = "SRP009615-results-ER-level-chrY" +) ``` # Annotation used @@ -593,11 +635,11 @@ For an `rse_exon` object, the `rownames()` correspond to the Gencode `gene_id`. 
```{r, 'exon_annotation'} ## Get the rse_exon object for study SRP009615 -download_study('SRP009615', type = 'rse-exon') -load(file.path('SRP009615', 'rse_exon.Rdata')) +download_study("SRP009615", type = "rse-exon") +load(file.path("SRP009615", "rse_exon.Rdata")) ## Delete it if you don't need it anymore -unlink('SRP009615', recursive = TRUE) +unlink("SRP009615", recursive = TRUE) ## Annotation information rowRanges(rse_exon) @@ -606,7 +648,7 @@ rowRanges(rse_exon) rowRanges(rse_exon)$gene_id <- rownames(rse_exon) ## Example subset -rse_exon_subset <- subset(rse_exon, subset = gene_id == 'ENSG00000000003.14') +rse_exon_subset <- subset(rse_exon, subset = gene_id == "ENSG00000000003.14") rowRanges(rse_exon_subset) ``` @@ -626,24 +668,27 @@ If you are re-processing a small set of samples, it simply might be easier to us ```{r, eval = .Platform$OS.type != 'windows'} ## Get the disjoint exons based on EnsDb.Hsapiens.v79 which matches hg38 -exons <- reproduce_ranges('exon', db = 'EnsDb.Hsapiens.v79') +exons <- reproduce_ranges("exon", db = "EnsDb.Hsapiens.v79") ## Change the chromosome names to match those used in the BigWig files -library('GenomeInfoDb') -seqlevelsStyle(exons) <- 'UCSC' +library("GenomeInfoDb") +seqlevelsStyle(exons) <- "UCSC" ## Get the count matrix at the exon level for chrY -exons_chrY <- keepSeqlevels(exons, 'chrY', pruning.mode = 'coarse') -exonRSE <- coverage_matrix('SRP009615', 'chrY', unlist(exons_chrY), - chunksize = 3000) +exons_chrY <- keepSeqlevels(exons, "chrY", pruning.mode = "coarse") +exonRSE <- coverage_matrix("SRP009615", "chrY", unlist(exons_chrY), + chunksize = 3000 +) exonMatrix <- assays(exonRSE)$counts dim(exonMatrix) head(exonMatrix) ## Summary the information at the gene level for chrY exon_gene <- rep(names(exons_chrY), elementNROWS(exons_chrY)) -geneMatrix <- do.call(rbind, lapply(split(as.data.frame(exonMatrix), - exon_gene), colSums)) +geneMatrix <- do.call(rbind, lapply(split( + as.data.frame(exonMatrix), + exon_gene +), 
colSums)) dim(geneMatrix) head(geneMatrix) ``` @@ -658,20 +703,20 @@ Finally, the BigWig files hosted by recount have the following chromosome names If you are interested in finding possible gene fusion events in a particular study, you can download the _RangedSummarizedExperiment_ object at the exon-exon junctions level for that study. The objects we provide classify exon-exon junction events as shown below. ```{r 'gene_fusions'} -library('recount') +library("recount") ## Download and load RSE object for SRP009615 study -download_study('SRP009615', type = 'rse-jx') -load(file.path('SRP009615', 'rse_jx.Rdata')) +download_study("SRP009615", type = "rse-jx") +load(file.path("SRP009615", "rse_jx.Rdata")) ## Delete it if you don't need it anymore -unlink('SRP009615', recursive = TRUE) +unlink("SRP009615", recursive = TRUE) ## Exon-exon junctions by class table(rowRanges(rse_jx)$class) ## Potential gene fusions by chromosome -fusions <- rowRanges(rse_jx)[rowRanges(rse_jx)$class == 'fusion'] +fusions <- rowRanges(rse_jx)[rowRanges(rse_jx)$class == "fusion"] fusions_by_chr <- sort(table(seqnames(fusions)), decreasing = TRUE) fusions_by_chr[fusions_by_chr > 0] @@ -687,10 +732,11 @@ If you are interested in checking the frequency of a particular exon-exon juncti Our collaborators built _SnaptronUI_ to query the [Intropolis](https://github.com/nellore/intropolis) database of the exon-exon junctions found in SRA. _SnaptronUI_ allows you to do several types of queries manually and perform analyses. `r Biocpkg('recount')` has the function `snaptron_query()` which uses [Snaptron](https://github.com/ChristopherWilks/snaptron) to check if particular exon-exon junctions are present in _Intropolis_. For example, here we use `snaptron_query()` to get in `R`-friendly format the information for 3 exon-exon junctions using _Intropolis_ version 1 which uses hg19 coordinates. 
Version 2 uses hg38 coordinates and the exon-exon junctions were derived from a larger data set: about 50,000 samples vs 21,000 in version 1. `snaptron_query()` can also be used to access the 30 million and 36 million exon-exon junctions derived from the GTEx and TCGA consortiums respectively (about 10 and 11 thousand samples respectively). ```{r snaptron} -library('GenomicRanges') -junctions <- GRanges(seqnames = 'chr2', IRanges( +library("GenomicRanges") +junctions <- GRanges(seqnames = "chr2", IRanges( start = c(28971711, 29555082, 29754983), - end = c(29462418, 29923339, 29917715))) + end = c(29462418, 29923339, 29917715) +)) snaptron_query(junctions) ``` @@ -705,7 +751,7 @@ Thanks to [Imada, Sanchez, et al., bioRxiv, 2019](https://www.biorxiv.org/conten ## Example use of download_study() ## for downloading the FANTOM-CAT `rse_fc` ## file for a given study -download_study('DRP000366', type = 'rse-fc', download = FALSE) +download_study("DRP000366", type = "rse-fc", download = FALSE) ``` @@ -715,7 +761,7 @@ The `recount-brain` project by [Ramzara et al., bioRxiv, 2019](https://www.biorx ```{r 'recount-brain'} ## recount-brain citation details -citation('recount')[5] +citation("recount")[5] ``` @@ -725,7 +771,7 @@ If you are interested in downloading all the data you can do so using the `recou ```{r 'downloadAll'} ## Get data.frame with all the URLs -library('recount') +library("recount") dim(recount_url) ## Explore URLs @@ -736,7 +782,7 @@ You can then download all the data using the `download_study()` function or howe ```{r 'actuallyDownload', eval = FALSE} ## Download all files (will take a while! It's over 8 TB of data) -sapply(unique(recount_url$project), download_study, type = 'all') +sapply(unique(recount_url$project), download_study, type = "all") ``` You can find the code for the website at [leekgroup/recount-website](https://github.com/leekgroup/recount-website/tree/master/website) if you want to deploy your own local mirror. 
Please let us know if you choose to do deploy a mirror. @@ -790,8 +836,10 @@ Several functions in the `r Biocpkg('recount')` package have the `outdir` argume ```{r 'ER-sciserver', eval = FALSE} ## Expressed-regions SciServer example -regions <- expressed_regions('SRP009615', 'chrY', cutoff = 5L, - maxClusterGap = 3000L, outdir = file.path(scipath, 'SRP009615')) +regions <- expressed_regions("SRP009615", "chrY", + cutoff = 5L, + maxClusterGap = 3000L, outdir = file.path(scipath, "SRP009615") +) regions ``` @@ -838,17 +886,17 @@ Code for creating the vignette ```{r createVignette, eval=FALSE} ## Create the vignette -library('rmarkdown') -system.time(render('recount-quickstart.Rmd', 'BiocStyle::html_document')) +library("rmarkdown") +system.time(render("recount-quickstart.Rmd", "BiocStyle::html_document")) ## Extract the R code -library('knitr') -knit('recount-quickstart.Rmd', tangle = TRUE) +library("knitr") +knit("recount-quickstart.Rmd", tangle = TRUE) ``` ```{r createVignette2} ## Clean up -file.remove('quickstartRef.bib') +file.remove("quickstartRef.bib") ``` Date the vignette was generated. @@ -863,14 +911,14 @@ Wallclock time spent generating the vignette. ```{r reproduce2, echo=FALSE} ## Processing time in seconds totalTime <- diff(c(startTime, Sys.time())) -round(totalTime, digits=3) +round(totalTime, digits = 3) ``` `R` session information. 
```{r reproduce3, echo=FALSE} ## Session info -library('sessioninfo') +library("sessioninfo") options(width = 120) session_info() ``` @@ -879,58 +927,68 @@ session_info() ## Code for re-creating the data distributed in this package ## Genes/exons -library('recount') -recount_both <- reproduce_ranges('both', db = 'Gencode.v25') +library("recount") +recount_both <- reproduce_ranges("both", db = "Gencode.v25") recount_exons <- recount_both$exon -save(recount_exons, file = '../data/recount_exons.RData', compress = 'xz') +save(recount_exons, file = "../data/recount_exons.RData", compress = "xz") recount_genes <- recount_both$gene -save(recount_genes, file = '../data/recount_genes.RData', compress = 'xz') +save(recount_genes, file = "../data/recount_genes.RData", compress = "xz") ## URL table -load('../../recount-website/fileinfo/upload_table.Rdata') +load("../../recount-website/fileinfo/upload_table.Rdata") recount_url <- upload_table ## Create URLs -is.bw <- grepl('[.]bw$', recount_url$file_name) +is.bw <- grepl("[.]bw$", recount_url$file_name) recount_url$url <- NA -recount_url$url[!is.bw] <-paste0('http://duffel.rail.bio/recount/', - recount_url$project[!is.bw], '/', recount_url$file_name[!is.bw]) +recount_url$url[!is.bw] <- paste0( + "http://duffel.rail.bio/recount/", + recount_url$project[!is.bw], "/", recount_url$file_name[!is.bw] +) recount_url$url[!is.bw & recount_url$version2] <- - paste0('http://duffel.rail.bio/recount/v2/', - recount_url$project[!is.bw & recount_url$version2], '/', - recount_url$file_name[!is.bw & recount_url$version2]) -recount_url$url[is.bw] <- paste0('http://duffel.rail.bio/recount/', - recount_url$project[is.bw], '/bw/', recount_url$file_name[is.bw]) -save(recount_url, file = '../data/recount_url.RData', compress = 'xz') + paste0( + "http://duffel.rail.bio/recount/v2/", + recount_url$project[!is.bw & recount_url$version2], "/", + recount_url$file_name[!is.bw & recount_url$version2] + ) +recount_url$url[is.bw] <- paste0( + 
"http://duffel.rail.bio/recount/", + recount_url$project[is.bw], "/bw/", recount_url$file_name[is.bw] +) +save(recount_url, file = "../data/recount_url.RData", compress = "xz") ## Add FC-RC2 data -load('../data/recount_url.RData') -load('../../recount-website/fileinfo/fc_rc_url_table.Rdata') +load("../data/recount_url.RData") +load("../../recount-website/fileinfo/fc_rc_url_table.Rdata") recount_url <- rbind(recount_url, fc_rc_url_table) rownames(recount_url) <- NULL -save(recount_url, file = '../data/recount_url.RData', compress = 'xz') +save(recount_url, file = "../data/recount_url.RData", compress = "xz") ## Abstract info -load('../../recount-website/website/meta_web.Rdata') -recount_abstract <- meta_web[, c('number_samples', 'species', 'abstract')] -recount_abstract$project <- gsub('.*">|', '', meta_web$accession) -Encoding(recount_abstract$abstract) <- 'latin1' -recount_abstract$abstract <- iconv(recount_abstract$abstract, 'latin1', 'UTF-8') -save(recount_abstract, file = '../data/recount_abstract.RData', - compress = 'bzip2') +load("../../recount-website/website/meta_web.Rdata") +recount_abstract <- meta_web[, c("number_samples", "species", "abstract")] +recount_abstract$project <- gsub('.*">|', "", meta_web$accession) +Encoding(recount_abstract$abstract) <- "latin1" +recount_abstract$abstract <- iconv(recount_abstract$abstract, "latin1", "UTF-8") +save(recount_abstract, + file = "../data/recount_abstract.RData", + compress = "bzip2" +) ## Example rse_gene file -system('scp e:/dcl01/leek/data/recount-website/rse/rse_sra/SRP009615/rse_gene.Rdata .') -load('rse_gene.Rdata') +system("scp e:/dcl01/leek/data/recount-website/rse/rse_sra/SRP009615/rse_gene.Rdata .") +load("rse_gene.Rdata") rse_gene_SRP009615 <- rse_gene -save(rse_gene_SRP009615, file = '../data/rse_gene_SRP009615.RData', - compress = 'xz') -unlink('rse_gene.Rdata') +save(rse_gene_SRP009615, + file = "../data/rse_gene_SRP009615.RData", + compress = "xz" +) +unlink("rse_gene.Rdata") ``` ```{r 
'cleanup', echo = FALSE, results = 'hide'} ## Clean up -unlink('SRP009615', recursive = TRUE) +unlink("SRP009615", recursive = TRUE) ```