-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprint_usage.Rmd
101 lines (71 loc) · 2.09 KB
/
preprint_usage.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
---
title: "R Notebook"
---
# Libraries
```{r}
library(tidyverse)
library(lubridate)
library(rvest)
library(janitor)
```
# Load data
```{r}
# Read the two preprint CSV files and stack them into a single table
preprints <- list(
  "data/preprints_basic_20190101_20191231.csv",
  "data/preprints_basic_20200101_20201031.csv"
) %>%
  map(read_csv) %>%
  bind_rows()
```
# Retrieve usage data for preprints
```{r}
# Scrape the bioRxiv or medRxiv article-metrics page for one preprint.
#
# source: "biorxiv" selects the bioRxiv site; any other value selects the
#         medRxiv site.
# doi:    DOI of the preprint. "v1.article-metrics" is appended to it, so
#         the metrics page of version 1 is the one requested.
#
# Returns the first ".highwire-stats" table found on the page, with its
# first column renamed to `date`, `source` and `doi` columns added, and the
# column names normalised by janitor::clean_names().
#
# Note: this function uses a progress object named `pb` that must be defined
# outside the function before it is called.
getUsageData <- function(source, doi) {
  # Advance the progress bar when the function exits, whether the request
  # succeeded or raised an error, so failed requests still count as processed.
  on.exit(pb$tick()$print(), add = TRUE)

  if (source == "biorxiv") {
    base_url <- "https://www.biorxiv.org/content/"
  } else {
    base_url <- "https://www.medrxiv.org/content/"
  }
  url <- paste0(base_url, doi, "v1.article-metrics")

  read_html(url) %>%
    html_nodes(".highwire-stats") %>%
    html_table(fill = TRUE) %>%
    .[[1]] %>%
    rename(date = 1) %>%
    mutate(source = source,
           doi = doi) %>%
    janitor::clean_names()
}
# Progress bar sized to the number of DOIs that will be requested
pb <- progress_estimated(length(preprints$doi))

# The bioRxiv/medRxiv websites sometimes time out and return an invalid
# response. Wrapping the scraper with purrr::safely captures each error
# instead of letting it interrupt the whole iteration.
getUsageDataSafely <- safely(getUsageData)

# One request per (source, doi) pair
usage_data <- map2(preprints$source, preprints$doi, getUsageDataSafely)
```
# Create final dataset
```{r}
# Parse one response returned by the 'safely'-wrapped scraper.
#
# item: a list with elements `result` and `error`, as produced by a function
#       wrapped with purrr::safely().
#
# Returns NULL when the call recorded an error or produced an empty result,
# so that the item contributes no rows when results are row-bound. Otherwise
# returns a tibble with columns source, doi, collection_date,
# abstract_views, full_text_views and pdf_downloads.
parseUsageData <- function(item) {
  # A failed call carries a non-NULL `error`; test for that explicitly
  # instead of comparing the list slot against an empty string.
  if (!is.null(item[["error"]]) || length(item$result) == 0) {
    return(NULL)
  }

  usage <- item$result
  n_rows <- length(usage$doi)

  # When the scraped table has no `full` column, fill with NA. NA_real_
  # keeps the column type in line with the as.numeric() columns.
  full_text_views <- if (length(usage$full) > 0) {
    as.numeric(usage$full)
  } else {
    rep(NA_real_, n_rows)
  }

  tibble(
    source = usage$source,
    doi = as.character(usage$doi),
    collection_date = as.character(usage$date),
    abstract_views = as.numeric(usage$abstract),
    full_text_views = full_text_views,
    pdf_downloads = as.numeric(usage$pdf)
  )
}
# Parse every scraped response, stack the pieces into one table and write
# the result to disk
write_csv(
  map_dfr(usage_data, parseUsageData),
  "data/preprint_usage_20190101_20201031.csv"
)
```