-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprint_firstcase.Rmd
70 lines (53 loc) · 2.96 KB
/
preprint_firstcase.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
---
title: "R Notebook"
---
# Load libraries
```{r}
library(tidyverse)
```
# Load data
```{r}
# Preprint metadata (local file, parsed with readr).
preprints <- read_csv(
  file = "data/preprints_full_20190101_20201031.csv"
)

# Johns Hopkins CSSE global confirmed-case time series, one column per day.
# Reference: Dong E, Du H, Gardner L. An interactive web-based dashboard to
# track COVID-19 in real time. Lancet Infect Dis.
# Base read.csv() is used on purpose: downstream code refers to the syntactic
# column names it produces (Country.Region, Province.State, X<m>.<d>.<y>).
global_cases <- read.csv(
  file = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
)
```
# Calculate first preprint, first case per country
```{r}
# Identify the date of the first confirmed case per country.
#
# Steps:
#   1. Drop entries that are excluded by name (NOTE(review): presumably
#      because they cannot be mapped to an ISO 3166 alpha-2 code - confirm).
#   2. Manual correction: report Guadeloupe under its own name so it matches
#      the preprint author data.
#   3. Reshape the wide table (one column per day) to long format, parse the
#      "X<m>.<d>.<y>" column names as dates and attach ISO2 country codes.
#   4. Keep days with a non-zero case count and take the earliest per country
#      (countries reported as several provinces collapse to one row).
#
# Explicit assignment is used instead of magrittr's `%<>%`, which is not
# attached by library(tidyverse).
global_cases <- global_cases %>%
  filter(
    !(Country.Region %in% c("Diamond Princess", "Kosovo", "MS Zaandam"))
  ) %>%
  mutate(
    Country.Region = case_when(
      # Overwrite Guadeloupe as country to match preprint author data
      Province.State == "Guadeloupe" ~ "Guadeloupe",
      TRUE ~ as.character(Country.Region)
    )
  ) %>%
  select(-c(Province.State, Lat, Long)) %>%
  pivot_longer(-Country.Region, names_to = "date", values_to = "value") %>%
  mutate(
    date = as.Date(date, format = "X%m.%d.%y"),
    country_code = countrycode::countrycode(
      Country.Region,
      origin = "country.name",
      destination = "iso2c"
    )
  ) %>%
  filter(value != 0) %>%
  group_by(Country.Region, country_code) %>%
  summarise(first_case = min(date)) %>%
  ungroup() # do not leak the grouping into later chunks
# Identify the date of the first COVID-19 preprint per country (country of the
# corresponding author's matched institution), merge it with the date of the
# first confirmed case, and calculate the time difference between the two.
#
# The result has one row per country that appears in either source:
#   first_preprint  earliest posted_date within 2020-01-01..2020-10-31
#   first_case      first confirmed case date (overridden for China)
#   lag             first_preprint - first_case (NA if either date is missing)
case_author_dates_df <- preprints %>%
  filter(covid_preprint == TRUE) %>%
  select(
    doi, author_corresponding_institution,
    starts_with("institution"), posted_date
  ) %>%
  filter(
    posted_date >= as.Date("2020-01-01"),
    posted_date <= as.Date("2020-10-31")
  ) %>%
  group_by(
    institution_match_country_name,
    institution_match_country_code
  ) %>%
  slice(which.min(posted_date)) %>%
  ungroup() %>%
  rename(first_preprint = posted_date) %>%
  # na_matches = "never": a missing country code on one side must not be
  # paired with a missing country code on the other side (dplyr joins treat
  # NA keys as equal by default).
  full_join(
    global_cases,
    by = c("institution_match_country_code" = "country_code"),
    na_matches = "never"
  ) %>%
  mutate(
    # Override first case for China. `%in%` never yields NA, so rows without
    # a country code keep whatever first_case they already have.
    first_case = if_else(
      institution_match_country_code %in% "CN",
      as.Date("2019-12-31", format = "%Y-%m-%d"),
      first_case
    ),
    lag = first_preprint - first_case
  )
# Write csv: countries that have both a first preprint and a first case.
# The path uses "/" (as the input path does) so it resolves inside data/ on
# every platform, not only on Windows.
write.csv(
  case_author_dates_df %>%
    filter(!is.na(first_preprint) & !is.na(first_case)),
  "data/preprint_earliest_per_country_20200101_20201031.csv"
)

# Identify countries/territories with no associated corresponding authors of
# COVID-19 papers
no_preprints <- case_author_dates_df %>%
  filter(is.na(first_preprint)) %>%
  select(Country.Region, first_case)

# Identify countries/territories without case data for COVID-19
no_cases <- case_author_dates_df %>%
  filter(is.na(first_case)) %>%
  select(institution_match_country_name, first_preprint)
```