---
title: "Preprint Altmetrics"
---
# Load libraries
```{r}
library(tidyverse)
library(rtweet)
# httr and lubridate are also used below, via their namespaces
```
# Load data
```{r}
preprints <- read_csv("data/preprints_full_20190101_20201031.csv")
```
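
The rest of the notebook assumes the input file contains at least `doi` and `published_doi` columns (the names are taken from their usage below, not from the file itself). A minimal sanity check along those lines:

```{r}
# Minimal sanity check - `doi` and `published_doi` are assumed below
stopifnot(all(c("doi", "published_doi") %in% colnames(preprints)))
```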
# Counts data
```{r}
# Altmetric API key - set in your .Renviron file,
# e.g. add the line: ALTMETRIC_API_KEY=your-key-here
# An API key is not strictly necessary here, but removes rate limits
api_key <- Sys.getenv("ALTMETRIC_API_KEY")

# API request function
request <- function(doi) {
  # URL for API call (remove the query string if you have no API key)
  url <- paste0("http://api.altmetric.com/v1/doi/", doi, "?key=", api_key)
  # Make request
  response <- httr::GET(url)
  # A 404 means no Altmetric record exists for this DOI
  if (response$status_code == 404) {
    return(NULL)
  } else {
    return(response)
  }
}

# Get altmetrics data and parse it to a data frame
getAltmetrics <- function(doi) {
  response <- request(doi)
  # If no response, set all counts to 0
  if (!length(response)) {
    altmetrics_data <- tibble(
      "doi" = doi,
      "score" = 0,
      "twitter" = 0,
      "facebook" = 0,
      "blogs" = 0,
      "news" = 0,
      "wikipedia" = 0,
      "policies" = 0
    )
  } else {
    # Retrieve data
    data <- httr::content(response, as = "parsed")
    # Build a tibble of results. Where a field is missing,
    # counts default to 0 and the score defaults to NA
    altmetrics_data <- tibble(
      "doi" = doi,
      "score" = if (length(data$score)) data$score else NA,
      "twitter" = if (length(data$cited_by_tweeters_count)) data$cited_by_tweeters_count else 0,
      "facebook" = if (length(data$cited_by_fbwalls_count)) data$cited_by_fbwalls_count else 0,
      "blogs" = if (length(data$cited_by_feeds_count)) data$cited_by_feeds_count else 0,
      "news" = if (length(data$cited_by_msm_count)) data$cited_by_msm_count else 0,
      "wikipedia" = if (length(data$cited_by_wikipedia_count)) data$cited_by_wikipedia_count else 0,
      "policies" = if (length(data$cited_by_policies_count)) data$cited_by_policies_count else 0
    )
  }
  # Update progress bar
  pb$tick()$print()
  return(altmetrics_data)
}
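
# Optional spot check before the full run. The DOI below is a hypothetical
# placeholder, so the call should come back with the all-zero fallback row.
# (`pb` must exist first, since getAltmetrics() ticks it.)
# pb <- progress_estimated(1)
# getAltmetrics("10.1101/2020.01.01.000000")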
# Retrieve altmetric data for all preprints
# (progress_estimated() is deprecated in newer dplyr versions but still works)
pb <- progress_estimated(length(preprints$doi))
preprint_altmetric_counts <- map_dfr(preprints$doi, getAltmetrics) %>%
  mutate(doi = str_trim(str_to_lower(doi)))
```
```{r}
# Retrieve altmetric data for all published articles
published_dois <- preprints %>%
  filter(!is.na(published_doi)) %>%
  pull(published_doi)

pb <- progress_estimated(length(published_dois))

published_altmetric_counts <- map_dfr(published_dois, getAltmetrics) %>%
  rename(
    published_doi = doi,
    published_score = score,
    published_twitter = twitter,
    published_facebook = facebook,
    published_blogs = blogs,
    published_news = news,
    published_wikipedia = wikipedia,
    published_policies = policies
  ) %>%
  mutate(published_doi = str_trim(str_to_lower(published_doi)))
```
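
A rough coverage diagnostic (ours, not part of the original pipeline) can flag obvious problems before joining: the share of published DOIs with a positive Altmetric score, where 0 indicates a 404 fallback row.

```{r}
# Share of published DOIs that returned a positive Altmetric score
mean(published_altmetric_counts$published_score > 0, na.rm = TRUE)
```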
# Create final dataset
```{r}
preprints %>%
  inner_join(preprint_altmetric_counts, by = "doi") %>%
  left_join(published_altmetric_counts, by = "published_doi") %>%
  select(doi, score:policies, published_doi, published_score:published_policies) %>%
  distinct() %>%
  write_csv("data/preprint_altmetrics_20190101_20201031.csv")
```
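
A quick read-back of the file just written gives an at-a-glance check of column types and row counts:

```{r}
# Re-read the combined dataset and inspect its structure
read_csv("data/preprint_altmetrics_20190101_20201031.csv") %>%
  glimpse()
```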
# Full altmetrics for preprints (NOTE: check licensing agreements with Altmetric before making these data publicly available)
```{r}
# Altmetric API key - set in .Renviron file
# An API key is required here: the fetch endpoint cannot be queried without one
api_key <- Sys.getenv("ALTMETRIC_API_KEY")

# API request function. Retrieving full altmetrics data requires an API key
# that is authorised to use the 'fetch' API endpoint
fetchRequest <- function(doi) {
  # URL for API call
  url <- paste0("http://api.altmetric.com/v1/fetch/doi/", doi, "?key=", api_key)
  # Make request
  response <- httr::GET(url)
  # A 404 means no Altmetric record exists for this DOI
  if (response$status_code == 404) {
    return(NA_character_)
  } else {
    return(httr::content(response))
  }
}
altmetrics_data <- map(preprints$doi, fetchRequest)
```
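
The fetch calls are slow and the parsed responses are only held in memory, so it may be worth persisting the raw list to disk before moving on. A minimal sketch, assuming a writable `data/` directory (the `.rds` filename is a suggestion, not from the original pipeline):

```{r}
# Cache the raw fetch responses so they don't need re-querying
# (filename is an assumption, not part of the original pipeline)
saveRDS(altmetrics_data, "data/altmetrics_fetch_raw_20190101_20201031.rds")
```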
# Extract Twitter IDs
```{r}
getTweetIDs <- function(item) {
  # Failed requests were stored as NA_character_ above, so skip those
  if (!is.character(item)) {
    length_tweets <- length(item$posts$twitter)
    if (length_tweets > 0) {
      d <- tibble(
        doi = rep(item$citation$doi, length_tweets),
        tweet_id = map_chr(item$posts$twitter, ~ .x$tweet_id)
      )
      return(d)
    }
  }
}
tweet_ids <- map_dfr(altmetrics_data, getTweetIDs)
```
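
A quick tally of the extracted IDs helps gauge how many lookup batches the next step will need:

```{r}
# How many tweet IDs were extracted, and how many distinct preprints they cover
nrow(tweet_ids)
n_distinct(tweet_ids$doi)
```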
# Retrieve Twitter data
```{r}
# rtweet's lookup_tweets() is rate-limited to 90,000 statuses per
# 15-minute window, so the tweet IDs are processed in batches of 90,000
iterations <- ceiling(length(tweet_ids$tweet_id) / 90000)

getTweets <- function(i) {
  # The final batch may contain fewer than 90,000 IDs
  if (i == iterations) {
    ids <- tweet_ids$tweet_id[(((i - 1) * 90000) + 1):length(tweet_ids$tweet_id)]
  } else {
    ids <- tweet_ids$tweet_id[(((i - 1) * 90000) + 1):(i * 90000)]
  }
  t_start <- Sys.time() # time at start of data collection
  tweets <- lookup_tweets(ids, parse = TRUE) %>%
    select(user_id, status_id, status_url, created_at, screen_name, text,
           reply_to_status_id, reply_to_user_id, reply_to_screen_name, is_quote,
           is_retweet, favorite_count, retweet_count, quote_count, reply_count,
           hashtags, retweet_status_id, retweet_user_id, retweet_text,
           retweet_created_at, retweet_retweet_count, followers_count, location,
           description)
  t_end <- Sys.time() # time at end of data collection
  t_next_interval <- as.numeric(900 - difftime(t_end, t_start, units = "secs"))
  # Sleep until the next 15-minute window opens - add a 60-second buffer to be safe
  if (t_next_interval > 0) {
    Sys.sleep(t_next_interval + 60)
  } else {
    Sys.sleep(60)
  }
  return(tweets)
}

tweets <- map_dfr(1:iterations, getTweets)
```
# Write tweets to file
```{r}
tweet_ids %>%
  inner_join(tweets, by = c("tweet_id" = "status_id")) %>%
  # hashtags come as a list-column - collapse each to a single string
  mutate(hashtags = map_chr(hashtags, ~ str_c(.x, collapse = ", ")),
         created_at = lubridate::date(created_at)) %>%
  # limit tweets to those in our analysis period
  filter(created_at <= "2020-10-31") %>%
  # remove duplicates
  distinct() %>%
  write_csv("data/preprint_tweets_20190101_20201031.csv")
```