generated from opensafely/research-template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_prepare.R
187 lines (146 loc) · 6.39 KB
/
data_prepare.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
######################################
# Purpose:
# process (tidy, clean, reshape, derive) data extracted using ehrQL (or dummy data), including:
# - standardising variables (eg convert to factor) and derives some new ones
# - outputting skim files which summarise data types and distributions
# Outputs:
# - a prepared dataset
# - skim files
######################################
# Preliminaries ----
## Import libraries ----
library('tidyverse')
library('lubridate')
library('arrow')
library('here')
library("dtplyr")
## Import custom user functions from lib
source(here("analysis", "0-lib", "utility.R"))
## Import design elements
source(here("analysis", "0-lib", "design.R"))
# create output directory
output_dir <- here("output", "2-prepare")
fs::dir_create(output_dir)
# Import and process data ----
## Import extracted dataset ----
data_extract <- read_feather(here("output", "1-extract", "extract.arrow"))
# print details about extracted dataset
capture.output(
skimr::skim_without_charts(data_extract),
file = fs::path(output_dir, "data_extract_skim.txt"),
split = FALSE
)
stopifnot(
"inconsistency between ethnicity5 and ethnicity 16" = identical(data_extract$ethnicity5, ethnicity_16_to_5(data_extract$ethnicity16))
)
## Process extracted dataset ----
data_prepared <-
data_extract |>
mutate(
# use short product names
vax_product = factor(vax_product, vax_product_lookup, names(vax_product_lookup)) |> fct_na_value_to_level("other"),
# binary variable for the exposure
# helpful for various downstream matching / plotting / table functions
treatment = case_when(
vax_product==productA ~ 0L,
vax_product==productB ~ 1L,
TRUE ~ NA_integer_
),
vax_date = vax_date - 1L,
# vaccination date represented as an integer, using for matching instead of date-formatted variable to avoid issues
vax_day = as.integer(vax_date - study_dates$studystart_date),
# all subgroup dummy variable
all = factor("all"),
ageband = cut(
age_eligible, # use fixed date to ascertain age so that age bands align with eligibility. because age and vax date are closely matched, this doesn't cause any problems
breaks=c(-Inf, 18, 50, 65, 75, 80, 85, Inf),
labels=c("under 18", "18-49", "50-64", "65-74", "75-79", "80-84", "85+"),
right=FALSE
),
ethnicity5 = factor(ethnicity5, levels = factor_levels$ethnicity5) |> fct_na_value_to_level(level = "Unknown"),
ethnicity16 = factor(ethnicity16, levels = factor_levels$ethnicity16) |> fct_relabel(~ str_extract(.x, "(?<= - )(.*)")), # pick up everything after " - "
region = fct_collapse(
region,
`East of England` = "East",
`London` = "London",
`Midlands` = c("West Midlands", "East Midlands"),
`North East and Yorkshire` = c("Yorkshire and The Humber", "North East"),
`North West` = "North West",
`South East` = "South East",
`South West` = "South West"
),
imd_Q5 = cut(
imd,
breaks = (32844/5)*c(-0.1,1,2,3,4,5),
labels = c("1 most deprived", "2", "3", "4", "5 least deprived"),
include.lowest = TRUE,
right = FALSE
),
care_home_combined = care_home_tpp | care_home_code, # any carehome flag
# clinically at-risk group
cv = immunosuppressed | ckd | crd | diabetes | cld |
cns | chd | learndis | smi,
multimorb =
(severe_obesity) +
(chd) +
(ckd)+
(diabetes) +
(cld)+
(crd)+
(cns)+
#(cancer)+
#(learndis)+
#(sev_mental),
0,
multimorb = cut(multimorb, breaks = c(0, 1, 2, Inf), labels=c("0", "1", "2+"), right=FALSE),
## process partial covid vaccine history
prior_vax_count = vax_covid_prior_count,
prior_vax_interval = as.integer(vax_date - vax_covid_prior_1_date),
prior_vax_interval_atleast12weeks = prior_vax_interval >= 12*7,
vax_count_realistic = prior_vax_count <= max_prior_vax_count, # earlier than 2020-12-08 to include trial participants, but excludes "unknowns" coded as eg 1900-01-01
prior_vax_interval_bigM = if_else(prior_vax_count==0, 365L*5L, as.integer(prior_vax_interval)), #if current vaccine is first recorded vaccine, then set vax_interval to be very large
prior_vax_count_group = cut(
prior_vax_count,
breaks = c(0, 1, 3, 6, Inf),
labels = c("0", "1-2", "3-5", "6+"),
include.lowest = TRUE,
right = FALSE
),
## process outcomes data
censor_date = pmin(dereg_date, study_dates$followupend_date, na.rm=TRUE),
# latest covid event before study start
covid_death_date = if_else(death_cause_covid, death_date, NA_Date_),
noncovid_death_date = if_else(!is.na(death_date) & is.na(covid_death_date), death_date, as.Date(NA_character_)),
# replace events dates with more severe dates if they precede less severe dates
#covid_emergency_date = pmin(covid_emergency_date, covid_admitted_date, covid_critcare_date, covid_death_date, na.rm=TRUE),
#covid_admitted_date = pmin(covid_admitted_date, covid_critcare_date, covid_death_date, na.rm=TRUE),
#covid_critcare_date = pmin(covid_critcare_date, covid_death_date, na.rm=TRUE),
# earliest covid event after study start
#any_covid_date = pmin(covidemergency_date, covidadmitted_date, coviddeath_date, na.rm=TRUE),
cause_of_death = fct_case_when(
death_cause_covid ~ "covid-related",
!death_cause_covid ~ "not covid-related",
TRUE ~ NA_character_
),
# pericarditis_death_date = if_else(death_cause_pericarditis, death_date, NA_Date_),
# pericarditis_date = pmin(pericarditis_emergency_date, pericarditis_admitted_date, pericarditis_death_date, na.rm=TRUE),
#
# myocarditis_death_date = if_else(death_cause_myocarditis, death_date, NA_Date_),
# myocarditis_date = pmin(myocarditis_emergency_date, myocarditis_admitted_date, myocarditis_death_date, na.rm=TRUE),
# define cohorts
age75plus = age_eligible >= 75,
#cv = cv,
is_eligible = age_eligible | cv,
) %>%
select(
-vax_covid_prior_1_date, -vax_covid_prior_1_product,
-vax_covid_prior_2_date, -vax_covid_prior_2_product,
-vax_covid_prior_3_date, -vax_covid_prior_3_product
)
# print details about prepared dataset
capture.output(
skimr::skim_without_charts(data_prepared),
file = fs::path(output_dir, "data_prepared_skim.txt"),
split = FALSE
)
write_feather(data_prepared, sink = fs::path(output_dir, "data_prepared.arrow"))