preprint_details.Rmd

---
title: "R Notebook"
---

# Libraries

```{r}

library(tidyverse)
library(rcrossref)
library(roadoi)
library(rvest)
library(fuzzyjoin)
library(rentrez)
library(XML)
library(lubridate)

```

# Retrieve basic metadata of all preprints via bioRxiv API

```{r}

# Query settings for the bioRxiv API (documented at https://api.biorxiv.org).
# Although undocumented, the 'server' parameter lets the same endpoint return
# records from both bioRxiv and medRxiv.

base_url <- "https://api.biorxiv.org/details/"
max_results_per_page <- 100 # largest page size the API allows

# Date window to harvest
start <- "2013-11-01" # launch of bioRxiv
end <- "2020-10-31"

# Download the metadata of all preprints posted on `server` ("biorxiv" or
# "medrxiv") between the global `start` and `end` dates.
#
# Returns a list with one element per preprint record. A page that does not
# come back with status "ok" contributes a single element named "error" that
# holds the status message instead of records.
getPreprintData <- function(server) {

  # Request one page of results, retrying on transient server errors
  fetchPage <- function(cursor) {
    url <- paste0(base_url, server, "/", start, "/", end, "/", cursor)
    request <- httr::RETRY("GET", url, times = 5, pause_base = 1, pause_cap = 60)
    httr::content(request, as = "parsed")
  }

  # Make initial request
  content <- fetchPage(0)

  # Determine total number of results and required iterations for paging.
  # A missing or unparseable total is treated as "no further pages".
  total_results <- suppressWarnings(as.numeric(content$messages[[1]]$total))
  if (length(total_results) != 1 || is.na(total_results)) {
    total_results <- 0
  }
  pages <- max(ceiling(total_results / max_results_per_page) - 1, 0)

  # Collect each page in a preallocated list; `x[i] <- list(value)` is used so
  # that a NULL collection does not delete the slot
  results <- vector("list", pages + 1)
  results[1] <- list(content$collection)

  # seq_len() yields no iterations when everything fitted on the first page
  # (1:pages would iterate over c(1, 0) and request pages that do not exist)
  for (i in seq_len(pages)) {
    # otherwise page 100000 becomes 1e05, which the api does not recognise
    cursor <- format(i * max_results_per_page, scientific = FALSE)
    content <- fetchPage(cursor)
    status <- content$messages[[1]]$status
    if (identical(status, "ok")) {
      results[i + 1] <- list(content$collection)
    } else {
      results[i + 1] <- list(list("error" = status))
    }
    Sys.sleep(0.1) # don't hit the API too hard
  }

  # Flatten the per-page lists into one list of records
  do.call(c, results)
}

preprint_data <- purrr::map(c("biorxiv", "medrxiv"), getPreprintData)

```


```{r}

# Convert one element of the downloaded API results into a one-row tibble.
#
# `item` is either a list of fields describing one preprint version, or a
# plain string holding an API status message for a page that could not be
# retrieved (e.g. "no posts found"). Status messages yield NULL, which
# map_dfr() silently skips.
parsePreprintData <- function(item) {
  # Preprint records are lists; anything else is a status/error placeholder
  if (!is.list(item)) {
    return(NULL)
  }

  # The API encodes "not published" as the string "NA"; also tolerate a
  # missing field so that `if` never sees a zero-length condition
  published <- item$published
  if (is.null(published) || is.na(published) || published == "NA") {
    published <- NA_character_
  }

  tibble(
    source = item$server,
    doi = item$doi,
    title = item$title,
    abstract = item$abstract,
    authors = item$authors,
    author_corresponding = item$author_corresponding,
    author_corresponding_institution = item$author_corresponding_institution,
    posted_date = item$date,
    version = item$version,
    license = item$license,
    type = item$type,
    category = item$category,
    published_doi = published
  )
}

# Build a search string containing terms related to COVID-19
search_string_covid <- "coronavirus|covid-19|sars-cov|ncov-2019|2019-ncov|hcov-19|sars-2"

# Set date of first case of COVID-19
covid_start <- "2019-12-31"

# Parse data to dataframe
preprints <- map_dfr(preprint_data, ~ map_dfr(.x, parsePreprintData)) %>%
  group_by(doi) %>%
  # calculate number of versions of a preprint; count distinct version numbers
  # rather than rows, because the API results contain some duplicated records
  # that would otherwise inflate the count
  mutate(n_versions = n_distinct(version)) %>%
  ungroup() %>%
  # keep the first version record
  filter(version == 1) %>%
  select(-version) %>%
  mutate(
    # clean up DOIs for later matching
    doi = str_trim(str_to_lower(doi)),
    published_doi = str_trim(str_to_lower(published_doi)),
    # flag preprints posted after the first COVID-19 case that mention a
    # COVID-19 term in the title or abstract; case_when() maps missing
    # titles/abstracts to FALSE instead of NA
    covid_preprint = case_when(
      str_detect(title, regex(search_string_covid, ignore_case = TRUE)) & posted_date > covid_start ~ TRUE,
      str_detect(abstract, regex(search_string_covid, ignore_case = TRUE)) & posted_date > covid_start ~ TRUE,
      TRUE ~ FALSE),
    # number of authors = number of ";"-separated entries in the author string
    n_authors = lengths(strsplit(authors, split = ";"))
    ) %>%
  # some duplicates are included
  distinct()

```

# Save basic metadata of all preprints

```{r}

# Make sure the output directory exists; write_csv() does not create it
dir.create("data", showWarnings = FALSE, recursive = TRUE)

# Basic fields without abstracts
preprints %>% 
  # reorder elements
  select(source, doi, posted_date, covid_preprint, title, n_versions, 
         license, type, category, authors, n_authors, author_corresponding, 
         author_corresponding_institution,  published_doi) %>% 
  write_csv("data/preprints_basic_20131101_20201031.csv")

# Abstracts (separate annual files due to large file sizes)
preprints %>%
  mutate(year = as.character(year(posted_date))) %>%
  select(doi, abstract, year) %>%
  distinct() %>%
  # one row per year, with that year's abstracts in the list-column `data`
  nest(data = !year) %>%
  # pwalk() passes each row's columns as the named arguments `year` and `data`.
  # The output file is given positionally because the `path` argument of
  # write_csv() is deprecated in favour of `file` in recent readr versions.
  pwalk(function(year, data) {
    write_csv(data, str_c("data/preprint_abstracts_", year, ".csv"))
  })

```

# Subset preprints for COVID analysis (Jan 2019 - Oct 2020)

```{r}

preprints_analysis <- preprints %>%
  # Put the columns in the same order as the saved basic metadata file and
  # drop the abstracts. The column-range selections applied to this table
  # further down (e.g. source:author_corresponding_institution and
  # license:published_journal_is_oa) rely on this order; with the order
  # produced while parsing they would drop posted_date, license, type and
  # category.
  select(source, doi, posted_date, covid_preprint, title, n_versions,
         license, type, category, authors, n_authors, author_corresponding,
         author_corresponding_institution, published_doi) %>%
  # restrict to the analysis period
  filter(posted_date >= "2019-01-01",
         posted_date <= "2020-10-31")

```


# Add published article metadata (published article title, publication date, journal, publisher) via Crossref

```{r}
 
# Look up every published DOI in Crossref and keep the title, record creation
# date, journal and publisher of articles dated within the analysis period
published_articles <- preprints_analysis %>%
  filter(!is.na(published_doi)) %>%
  pull(published_doi) %>%
  # query each DOI only once, even if several preprints link to it
  unique() %>%
  map_dfr(~ cr_works(.x)$data) %>%
  select(
    published_doi = doi,
    published_title = title,
    published_date = created,
    published_journal = container.title,
    published_publisher = publisher
  ) %>%
  # normalise DOIs the same way as the preprint table so the later join on
  # published_doi cannot miss on letter case or stray whitespace
  mutate(published_doi = str_trim(str_to_lower(published_doi))) %>%
  filter(published_date >= "2019-01-01",
         published_date <= "2020-10-31") %>%
  distinct()

```

# Merge preprints and published articles data

```{r}

preprints_analysis <- preprints_analysis %>%
  # Blank out published DOIs that are absent from published_articles, i.e.
  # whose publication date falls outside our analysis period
  mutate(
    published_doi = if_else(
      published_doi %in% published_articles$published_doi,
      published_doi,
      NA_character_
    )
  ) %>%
  # Attach the published article metadata
  left_join(published_articles, by = "published_doi") %>%
  # Days elapsed between posting of the preprint and the published date
  mutate(delay_in_days = as.numeric(ymd(published_date) - ymd(posted_date))) %>%
  distinct()

```

# OA information for published articles

```{r}

# Fetch open access information for each published article via
# roadoi::oadoi_fetch(), one DOI per request
published_dois <- NULL
published_articles_oa <- preprints_analysis %>%
  filter(!is.na(published_doi)) %>%
  pull(published_doi) %>%
  map_dfr(function(published) {
    oadoi_fetch(dois = published, email = "n.fraser@zbw.eu")
  }) %>%
  # keep and rename only the fields needed for the analysis
  select(
    published_doi = doi,
    published_article_is_oa = is_oa,
    published_article_oa_status = oa_status,
    published_journal_is_oa = journal_is_oa
  )
rm(published_dois)

```

# Merge preprints and OA data

```{r}

# Attach the OA fields to each preprint via its published DOI, then drop any
# rows that are exact duplicates
preprints_analysis <- distinct(
  left_join(preprints_analysis, published_articles_oa, by = "published_doi")
)

```

# Disambiguated author affiliations via ROR affiliation matching (preprints only)

```{r}

# Some affiliation names are truncated at 160 characters - in these cases
# we can instead retrieve the full affiliations from the public webpage and
# match them to the corresponding author information from the API results
#truncated_institutions <- preprints_analysis %>% 
#  mutate(nchar(author_corresponding_institution) == 160)

# Look up the untruncated affiliation of the corresponding author on the
# preprint's web page, where bioRxiv and medRxiv list author affiliations in
# HTML meta tags.
#
# doi: DOI of the preprint (resolved via https://doi.org/)
# author_corresponding_institution: the affiliation string from the API
#
# Returns a tibble with columns `doi` and `author_corresponding_institution`,
# holding each distinct affiliation on the page that equals the API string
# after truncation to 160 characters.
getTruncatedAuthorInstitutions <- function(doi, author_corresponding_institution) {

  landing_page <- read_html(paste0("https://doi.org/", doi))

  # Every affiliation listed in the page's citation metadata
  listed <- html_attr(
    html_nodes(landing_page, "meta[name='citation_author_institution'][content]"),
    "content"
  )

  # Positions whose truncated form equals the API value; which() leaves out
  # NA comparisons, just as filter() would
  hits <- which(
    str_trunc(listed, width = 160, ellipsis = "") == author_corresponding_institution
  )
  full_names <- unique(listed[hits])

  tibble(
    doi = rep(doi, length(full_names)),
    author_corresponding_institution = full_names
  )

}

# Only affiliations of exactly 160 characters can have been cut off. The two
# selected columns are passed to the function as arguments of the same name.
truncated_institutions <- preprints_analysis %>%
  filter(nchar(author_corresponding_institution) == 160) %>%
  select(doi, author_corresponding_institution) %>%
  pmap_dfr(getTruncatedAuthorInstitutions)
  
```

```{r}

# See https://github.com/ror-community/ror-api
# The full string of the institution name is passed to the ROR affiliation
# matching API. The API returns a list of possible matches, each with a
# match score (between zero and one, one being a perfect match) and a match
# type (e.g. matching on common terms in institution names, or on the full
# phrase). The API-selected best match (field "chosen" = TRUE) is retained if
# its score = 1 and the match is based on full-phrase matching. Otherwise an
# empty tibble is returned.
# These are the most conservative settings customisable.

# Match an affiliation string against the ROR affiliation matching API.
#
# doi: DOI of the preprint the affiliation belongs to
# institution: affiliation string of the corresponding author
#
# Returns a tibble with at most one row (doi, match score, match type, matched
# organisation name, country name and country code) holding the API-chosen
# match, provided it is a full-phrase match with score 1. Returns NULL when
# there is nothing to match or the API returns no candidates.
#
# Uses the global progress bar `pb`, which must be created before calling.
getRORAffiliations <- function(doi, institution) {
  
  Sys.sleep(0.1) # Don't hit API too hard
  
  # update progress bar
  pb$tick()$print()
  
  # A missing or empty affiliation cannot be matched; without this guard the
  # literal text "NA" would be sent to the API
  if (is.na(institution) || !nzchar(institution)) {
    return(NULL)
  }
  
  base_url <- "https://api.ror.org/organizations?affiliation="
  # reserved = TRUE also escapes characters such as "&", "#" and "+", which
  # would otherwise cut off or alter the query (e.g. "Texas A&M University")
  encoded_institution <- URLencode(institution, reserved = TRUE)
  url <- paste0(base_url, encoded_institution)
  
  request <- httr::GET(url)
  content <- httr::content(request, as = "parsed")
  
  candidates <- content$items
  if (length(candidates) == 0) {
    return(NULL)
  }
  
  # Note: the pipeline must not start with return(...). With magrittr >= 2.0
  # a return() on the left-hand side of %>% exits the function immediately,
  # so the filtering steps after it would never run.
  tibble(
    doi = rep(doi, length(candidates)), 
    institution_match_name = map_chr(candidates, ~ .$organization$name),
    institution_match_country_name = map_chr(candidates, ~ .$organization$country$country_name),
    institution_match_country_code = map_chr(candidates, ~ .$organization$country$country_code),
    institution_match_type = map_chr(candidates, ~ .$matching_type),
    institution_match_score = map_dbl(candidates, ~ .$score),
    # keep the flag as a logical instead of coercing it to "TRUE"/"FALSE"
    chosen = map_lgl(candidates, ~ isTRUE(.$chosen))
  ) %>%
    filter(chosen & institution_match_type == "PHRASE" & institution_match_score == 1) %>%
    slice(1) %>%
    select(doi, institution_match_score, institution_match_type, institution_match_name, 
           institution_match_country_name, institution_match_country_code)
}

# Progress bar with one tick per preprint; getRORAffiliations() advances it
pb <- progress_estimated(nrow(preprints_analysis))

# Query ROR once per preprint with the corresponding author's affiliation
affiliations <- map2_dfr(
  preprints_analysis$doi,
  preprints_analysis$author_corresponding_institution,
  getRORAffiliations
)

# Normalise DOIs for joining and drop repeated rows
affiliations <- distinct(
  mutate(affiliations, doi = str_trim(str_to_lower(doi)))
)


```

# Merge authors data and apply some manual corrections

```{r}

preprints_analysis <- preprints_analysis %>%
  left_join(affiliations, by = "doi") %>%
  # These range selections depend on the column order of the joined table
  select(source:author_corresponding_institution,
         institution_match_score:institution_match_country_code,
         published_doi:published_journal_is_oa) %>% 
  # Apply manual correction to terms that lead to misdefined first cases, and terms among the top 50 most common terms that are matched incorrectly.
  # Country name and code are kept side by side in a single lookup table so
  # that the two cannot drift apart.
  left_join(
    tribble(
      ~author_corresponding_institution, ~corrected_country_name, ~corrected_country_code,
      "Universidad de los Andes", "Colombia", "CO",
      "CAS Key Laboratory of Pathogenic Microbiology and Immunology, Institute of Microbiology, Center for Influenza Research and Early-warning (CASCIRE), Chinese Acad", "China", "CN",
      "Metabiota", "Canada", "CA",
      "Jinan University", "China", "CN",
      "The School of Information Science and Technology, Jinan University", "China", "CN",
      "National Research Council", "Italy", "IT",
      "Northeastern University", "United States", "US",
      "Georgetown University", "United States", "US",
      "Institute of Virology, Charite-Universitaetsmedizin Berlin, corporate member of Freie Universitaet Berlin, Humboldt-Universitaet zu Berlin, and Berlin Institute", "Germany", "DE",
      "CNRS", "France", "FR",
      # previously corrected to "USA", unlike the other United States entries
      "UCSF", "United States", "US"
    ),
    by = "author_corresponding_institution"
  ) %>%
  mutate(
    # a manual correction, where present, overrides the ROR match
    institution_match_country_name = coalesce(corrected_country_name, institution_match_country_name),
    institution_match_country_code = coalesce(corrected_country_code, institution_match_country_code)
  ) %>%
  select(-corrected_country_name, -corrected_country_code)
```

# Preprint Word and Reference Counts

```{r}

# Count the space-separated tokens in each element of a character vector.
# Returns an integer vector of the same length as `string`; an empty string
# counts as 0 and empty input gives integer(0).
word_count <- function(string) {
  # lengths() always returns an integer vector, whereas sapply() returns
  # list() when given empty input
  lengths(strsplit(string, " "))
}

# Scrape the bioRxiv full-text page of the first version of a preprint and
# count its words, references, figures and tables.
#
# Returns a list with elements doi, n_words, n_refs, n_figs and n_tables.
# Uses the global progress bar `pb`, which must be created before calling.
getWordCounts <- function(doi) {

  page <- read_html(paste0("https://www.biorxiv.org/content/", doi, "v1.full"))

  # Turn a set of nodes into one cleaned string: trimmed node text, with
  # anything in square brackets removed, joined by single spaces
  flatten_text <- function(nodes) {
    nodes %>%
      html_text(trim = TRUE) %>%
      str_trim(side = "both") %>%
      str_remove_all("\\[(.*?)\\]") %>%
      str_c(collapse = " ")
  }

  # Get all nodes contained in article full text
  fulltext <- html_nodes(page, ".fulltext-view")

  # Figure and table captions; the node sets are also used for the counts
  figs <- html_nodes(fulltext, ".fig-caption")
  tables <- html_nodes(fulltext, ".table-caption")

  # NOTE(review): the two XPath expressions below start with "//", which is
  # normally evaluated against the whole document rather than only within the
  # full-text nodes - confirm this is intended
  abstract_nodes <- html_nodes(
    fulltext,
    xpath = "//div[contains(@id, 'abstract-') and @class='abstract']"
  )
  ack_nodes <- html_nodes(
    fulltext,
    xpath = "//div[contains(@id, 'ack-') and contains(@class, 'ack')]"
  )

  # Word totals: all paragraphs (anything in "<p>" tags), then each section
  # that is excluded from the final count
  words_paragraphs <- word_count(flatten_text(html_nodes(fulltext, "p")))
  words_abstract <- word_count(flatten_text(abstract_nodes))
  words_figs <- word_count(flatten_text(figs))
  words_tables <- word_count(flatten_text(tables))
  words_ack <- word_count(flatten_text(ack_nodes))

  # Calculate word counts (not including abstracts, fig and table captions, acknowledgements)
  n_words <- words_paragraphs - words_abstract - words_figs - words_tables - words_ack

  # References: every element on the page whose class contains 'ref-cit'
  ref_nodes <- html_nodes(page, xpath = "//*[contains(@class, 'ref-cit')]")
  n_refs <- length(html_text(ref_nodes))

  # update progress bar
  pb$tick()$print()

  list(
    doi = doi,
    n_words = n_words,
    n_refs = n_refs,
    n_figs = length(figs),
    n_tables = length(tables)
  )

}

# Retrieve word counts. Sometimes the bioRxiv/medRxiv websites time out and
# return an invalid response, so the scraper is wrapped in purrr::safely to
# stop a single failure from interrupting the whole run
getWordCountsSafely <- safely(getWordCounts)

# Word counts are only scraped for bioRxiv preprints
biorxiv_preprints <- filter(preprints_analysis, source == "biorxiv")

# set counter for progress bar
pb <- progress_estimated(nrow(biorxiv_preprints))

word_counts <- map(biorxiv_preprints$doi, getWordCountsSafely)

# One row per successful scrape; a failed attempt has a NULL result and so
# contributes no row
word_counts_df <- word_counts %>%
  map_dfr(function(attempt) {
    counts <- attempt$result
    tibble(
      doi = counts$doi,
      n_words = counts$n_words,
      n_refs = counts$n_refs,
      n_figs = counts$n_figs,
      n_tables = counts$n_tables
    )
  }) %>%
  # keep only rows with a non-zero word count and a non-zero reference count -
  # zeros are the result of a full text not being available (e.g. if a new
  # version is posted before the initial version has had a full text added)
  filter(n_words != 0,
         n_refs != 0)

```


```{r}
# Attach the word, reference, figure and table counts to the preprint table
# and place them directly after n_versions
preprints_analysis <- select(
  left_join(preprints_analysis, word_counts_df, by = "doi"),
  source:n_versions, n_words:n_tables, license:published_journal_is_oa
)
  
```

# Write final dataset to csv

```{r}

# Full analysis dataset (preprints posted Jan 2019 - Oct 2020)
preprints_analysis %>%
  write_csv("data/preprints_full_20190101_20201031.csv")

```