---
title: "Preprint Altmetrics"
---
# Load libraries
```{r}
library(tidyverse)
library(rtweet)
# httr and lubridate are also used below, via their namespaces
```
# Load data
```{r}
preprints <- read_csv("data/preprints_full_20190101_20201031.csv")
```
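
The rest of the notebook assumes the input file contains at least `doi` and `published_doi` columns (the names are taken from their usage below, not from the file itself). A minimal sanity check along those lines:

```{r}
# Minimal sanity check - `doi` and `published_doi` are assumed below
stopifnot(all(c("doi", "published_doi") %in% colnames(preprints)))
```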
# Counts data
```{r}
# Altmetric API key - set in your .Renviron file,
# e.g. add the line: ALTMETRIC_API_KEY=your-key-here
# An API key is not strictly necessary here, but removes rate limits
api_key <- Sys.getenv("ALTMETRIC_API_KEY")

# API request function
request <- function(doi) {
  # URL for API call (remove the query string if you have no API key)
  url <- paste0("http://api.altmetric.com/v1/doi/", doi, "?key=", api_key)
  # Make request
  response <- httr::GET(url)
  # A 404 means no Altmetric record exists for this DOI
  if (response$status_code == 404) {
    return(NULL)
  } else {
    return(response)
  }
}

# Get altmetrics data and parse it to a data frame
getAltmetrics <- function(doi) {
  response <- request(doi)
  # If no response, set all counts to 0
  if (!length(response)) {
    altmetrics_data <- tibble(
      "doi" = doi,
      "score" = 0,
      "twitter" = 0,
      "facebook" = 0,
      "blogs" = 0,
      "news" = 0,
      "wikipedia" = 0,
      "policies" = 0
    )
  } else {
    # Retrieve data
    data <- httr::content(response, as = "parsed")
    # Build a tibble of results. Where a field is missing,
    # counts default to 0 and the score defaults to NA
    altmetrics_data <- tibble(
      "doi" = doi,
      "score" = if (length(data$score)) data$score else NA,
      "twitter" = if (length(data$cited_by_tweeters_count)) data$cited_by_tweeters_count else 0,
      "facebook" = if (length(data$cited_by_fbwalls_count)) data$cited_by_fbwalls_count else 0,
      "blogs" = if (length(data$cited_by_feeds_count)) data$cited_by_feeds_count else 0,
      "news" = if (length(data$cited_by_msm_count)) data$cited_by_msm_count else 0,
      "wikipedia" = if (length(data$cited_by_wikipedia_count)) data$cited_by_wikipedia_count else 0,
      "policies" = if (length(data$cited_by_policies_count)) data$cited_by_policies_count else 0
    )
  }
  # Update progress bar
  pb$tick()$print()
  return(altmetrics_data)
}
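
# Optional spot check before the full run. The DOI below is a hypothetical
# placeholder, so the call should come back with the all-zero fallback row.
# (`pb` must exist first, since getAltmetrics() ticks it.)
# pb <- progress_estimated(1)
# getAltmetrics("10.1101/2020.01.01.000000")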
# Retrieve altmetric data for all preprints
# (progress_estimated() is deprecated in newer dplyr versions but still works)
pb <- progress_estimated(length(preprints$doi))
preprint_altmetric_counts <- map_dfr(preprints$doi, getAltmetrics) %>%
  mutate(doi = str_trim(str_to_lower(doi)))
```
```{r}
# Retrieve altmetric data for all published articles
published_dois <- preprints %>%
  filter(!is.na(published_doi)) %>%
  pull(published_doi)

pb <- progress_estimated(length(published_dois))

published_altmetric_counts <- map_dfr(published_dois, getAltmetrics) %>%
  rename(
    published_doi = doi,
    published_score = score,
    published_twitter = twitter,
    published_facebook = facebook,
    published_blogs = blogs,
    published_news = news,
    published_wikipedia = wikipedia,
    published_policies = policies
  ) %>%
  mutate(published_doi = str_trim(str_to_lower(published_doi)))
```
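
A rough coverage diagnostic (ours, not part of the original pipeline) can flag obvious problems before joining: the share of published DOIs with a positive Altmetric score, where 0 indicates a 404 fallback row.

```{r}
# Share of published DOIs that returned a positive Altmetric score
mean(published_altmetric_counts$published_score > 0, na.rm = TRUE)
```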
# Create final dataset
```{r}
preprints %>%
  inner_join(preprint_altmetric_counts, by = "doi") %>%
  left_join(published_altmetric_counts, by = "published_doi") %>%
  select(doi, score:policies, published_doi, published_score:published_policies) %>%
  distinct() %>%
  write_csv("data/preprint_altmetrics_20190101_20201031.csv")
```
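
A quick read-back of the file just written gives an at-a-glance check of column types and row counts:

```{r}
# Re-read the combined dataset and inspect its structure
read_csv("data/preprint_altmetrics_20190101_20201031.csv") %>%
  glimpse()
```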
# Full altmetrics for preprints (NOTE: check licensing agreements with Altmetric before making these data publicly available)
```{r}
# Altmetric API key - set in .Renviron file
# An API key is required here: the fetch endpoint cannot be queried without one
api_key <- Sys.getenv("ALTMETRIC_API_KEY")

# API request function. Retrieving full altmetrics data requires an API key
# that is authorised to use the 'fetch' API endpoint
fetchRequest <- function(doi) {
  # URL for API call
  url <- paste0("http://api.altmetric.com/v1/fetch/doi/", doi, "?key=", api_key)
  # Make request
  response <- httr::GET(url)
  # A 404 means no Altmetric record exists for this DOI
  if (response$status_code == 404) {
    return(NA_character_)
  } else {
    return(httr::content(response))
  }
}
altmetrics_data <- map(preprints$doi, fetchRequest)
```
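
The fetch calls are slow and the parsed responses are only held in memory, so it may be worth persisting the raw list to disk before moving on. A minimal sketch, assuming a writable `data/` directory (the `.rds` filename is a suggestion, not from the original pipeline):

```{r}
# Cache the raw fetch responses so they don't need re-querying
# (filename is an assumption, not part of the original pipeline)
saveRDS(altmetrics_data, "data/altmetrics_fetch_raw_20190101_20201031.rds")
```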
# Extract Twitter IDs
```{r}
getTweetIDs <- function(item) {
  # Failed requests were stored as NA_character_ above, so skip those
  if (!is.character(item)) {
    length_tweets <- length(item$posts$twitter)
    if (length_tweets > 0) {
      d <- tibble(
        doi = rep(item$citation$doi, length_tweets),
        tweet_id = map_chr(item$posts$twitter, ~ .x$tweet_id)
      )
      return(d)
    }
  }
}
tweet_ids <- map_dfr(altmetrics_data, getTweetIDs)
```
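
A quick tally of the extracted IDs helps gauge how many lookup batches the next step will need:

```{r}
# How many tweet IDs were extracted, and how many distinct preprints they cover
nrow(tweet_ids)
n_distinct(tweet_ids$doi)
```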
# Retrieve Twitter data
```{r}
# rtweet's lookup_tweets() is rate-limited to 90,000 statuses per
# 15-minute window, so the tweet IDs are processed in batches of 90,000
iterations <- ceiling(length(tweet_ids$tweet_id) / 90000)

getTweets <- function(i) {
  # The final batch may contain fewer than 90,000 IDs
  if (i == iterations) {
    ids <- tweet_ids$tweet_id[(((i - 1) * 90000) + 1):length(tweet_ids$tweet_id)]
  } else {
    ids <- tweet_ids$tweet_id[(((i - 1) * 90000) + 1):(i * 90000)]
  }
  t_start <- Sys.time() # time at start of data collection
  tweets <- lookup_tweets(ids, parse = TRUE) %>%
    select(user_id, status_id, status_url, created_at, screen_name, text,
           reply_to_status_id, reply_to_user_id, reply_to_screen_name, is_quote,
           is_retweet, favorite_count, retweet_count, quote_count, reply_count,
           hashtags, retweet_status_id, retweet_user_id, retweet_text,
           retweet_created_at, retweet_retweet_count, followers_count, location,
           description)
  t_end <- Sys.time() # time at end of data collection
  t_next_interval <- as.numeric(900 - difftime(t_end, t_start, units = "secs"))
  # Sleep until the next 15-minute window opens - add a 60-second buffer to be safe
  if (t_next_interval > 0) {
    Sys.sleep(t_next_interval + 60)
  } else {
    Sys.sleep(60)
  }
  return(tweets)
}

tweets <- map_dfr(1:iterations, getTweets)
```
# Write tweets to file
```{r}
tweet_ids %>%
  inner_join(tweets, by = c("tweet_id" = "status_id")) %>%
  # hashtags come as a list-column - collapse each to a single string
  mutate(hashtags = map_chr(hashtags, ~ str_c(.x, collapse = ", ")),
         created_at = lubridate::date(created_at)) %>%
  # limit tweets to those in our analysis period
  filter(created_at <= "2020-10-31") %>%
  # remove duplicates
  distinct() %>%
  write_csv("data/preprint_tweets_20190101_20201031.csv")
```