-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFile3_UK_LLC_RScrptToMergeStudies.R
220 lines (149 loc) · 7.37 KB
/
File3_UK_LLC_RScrptToMergeStudies.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#This file is run to put together the cohort and NHS data within the UK LLC.
# This file creates the main spine of the data. Multiple files will need to be run in advance.
# Note this version was written on the 03May2023
# Updated 11 Feb 2025 to correct typos.
# NB I, Richard Shaw, have dyslexia and there may be a few typos I miss.
library(haven)
library(tidyverse)
library(readr)
library(expss)
library(feather)
library(lubridate)
library(stringr)
#### User created functions
#This takes selected column names from a data frame and creates the frequencies and proportions for them
frequencies_selected <- function(data_frame, column_names, decimal_places = 1) {
select(data_frame, column_names)|>
pivot_longer(column_names) |>
group_by(name) |>
count( value) |>
mutate(percent = round(n /sum(n)*100 , digits = decimal_places))
}
#### Importing data files
# Core denominator data
core_denominator <- readRDS("S:/LLC_0010/data/core/core_denominator_2022-11-04.rds")
### Contains ethnicity data
core_nhsd_derived <- readRDS("S:/LLC_0010/data/core/core_indicator_2023-02-08.rds")|>
drop_all_labels()
ethnic_valid <- core_nhsd_derived |>
select(llc_0010_stud_id, ethnic)
ethnic_valid$e_1 <-NA
ethnic_valid$e_2 <-NA
ethnic_valid$e_1[ethnic_valid$ethnic == "A"] <- 1
ethnic_valid$e_2[ethnic_valid$ethnic == "A"] <- 1
ethnic_valid$e_1[ethnic_valid$ethnic == "B"] <- 1
ethnic_valid$e_2[ethnic_valid$ethnic == "B"] <- 1
ethnic_valid$e_1[ethnic_valid$ethnic == "C"] <- 1
ethnic_valid$e_2[ethnic_valid$ethnic == "C"] <- 1
ethnic_valid$e_1[ethnic_valid$ethnic == "D"] <- 4
ethnic_valid$e_2[ethnic_valid$ethnic == "D"] <- 2
ethnic_valid$e_1[ethnic_valid$ethnic == "E"] <- 4
ethnic_valid$e_2[ethnic_valid$ethnic == "E"] <- 2
ethnic_valid$e_1[ethnic_valid$ethnic == "F"] <- 4
ethnic_valid$e_2[ethnic_valid$ethnic == "F"] <- 2
ethnic_valid$e_1[ethnic_valid$ethnic == "G"] <- 4
ethnic_valid$e_2[ethnic_valid$ethnic == "G"] <- 2
ethnic_valid$e_1[ethnic_valid$ethnic == "H"] <- 2
ethnic_valid$e_2[ethnic_valid$ethnic == "H"] <- 3
ethnic_valid$e_1[ethnic_valid$ethnic == "H"] <- 2
ethnic_valid$e_2[ethnic_valid$ethnic == "H"] <- 3
ethnic_valid$e_1[ethnic_valid$ethnic == "J"] <- 2
ethnic_valid$e_2[ethnic_valid$ethnic == "J"] <- 4
ethnic_valid$e_1[ethnic_valid$ethnic == "K"] <- 2
ethnic_valid$e_2[ethnic_valid$ethnic == "K"] <- 5
ethnic_valid$e_1[ethnic_valid$ethnic == "L"] <- 2
ethnic_valid$e_2[ethnic_valid$ethnic == "L"] <- 8
ethnic_valid$e_1[ethnic_valid$ethnic == "M"] <- 3
ethnic_valid$e_2[ethnic_valid$ethnic == "M"] <- 6
ethnic_valid$e_1[ethnic_valid$ethnic == "N"] <- 3
ethnic_valid$e_2[ethnic_valid$ethnic == "N"] <- 7
ethnic_valid$e_1[ethnic_valid$ethnic == "P"] <- 3
ethnic_valid$e_2[ethnic_valid$ethnic == "P"] <- 8
ethnic_valid$e_1[ethnic_valid$ethnic == "R"] <- 2
ethnic_valid$e_2[ethnic_valid$ethnic == "R"] <- 8
ethnic_valid$e_1[ethnic_valid$ethnic == "S"] <- 5
ethnic_valid$e_2[ethnic_valid$ethnic == "S"] <- 8
ethnic.df <- ethnic_valid |>
select(llc_0010_stud_id, e_1, e_2)
glimpse(ethnic_valid)
# NHS Covid data files
first_test <- readRDS("S:/LLC_0010/data/multiple/first_pos_test_2022-10-31.rds")
first_covid_hosp <- readRDS("S:/LLC_0010/data/multiple/first_hospital_covid_record_2022-11-04.rds")
first_covid_gp <- readRDS("S:/LLC_0010/data/multiple/first_gp_covid_record_2022-11-04.rds")
#Will probably need a vaccination date but for most people it may be after the outcome wave.
#Cohort data currently available
elsa <- read_dta("S:/LLC_0010/data/flow_chart/ELSA_flowchart_sample_26Apr23.dta")
usoc <- read_dta("S:/LLC_0010/data/flow_chart/USoc_flowchart_sample_26Apr23.dta")
bcs70 <- read_dta("S:/LLC_0010/data/flow_chart/BCS70_flowchart_sample_26Apr23.dta")
ncds58 <- read_dta("S:/LLC_0010/data/flow_chart/NCDS_flowchart_sample_23Apr23.dta")
nextsteps <- read_dta("S:/LLC_0010/data/flow_chart/nextsteps_flowchart_sample_23Apr23.dta")
# at a later stage factor order gets muddled up not using this although may have some advantages in some context so making a note of the code usoc_labeled <- as_factor(usoc, levels = "labels")
#Generation Scotland is being converted as labels and factors are problematic.
#cls cohorts
bcs70_convert <- bcs70 |>
mutate(survey_date = ymd(
paste0("2021-",
str_pad(as.character(cw3_enddatem), width = 2 , side = 'left', pad = '0'),
"-",
str_pad(as.character(cw3_enddated), width = 2 , side = 'left', pad = '0'))))
ncds58_convert <- ncds58 |>
mutate(survey_date = ymd(
paste0("2021-",
str_pad(as.character(cw3_enddatem), width = 2 , side = 'left', pad = '0'),
"-",
str_pad(as.character(cw3_enddated), width = 2 , side = 'left', pad = '0'))))
nextsteps_convert <- nextsteps |>
mutate(survey_date = ymd(
paste0("2021-",
str_pad(as.character(cw3_enddatem), width = 2 , side = 'left', pad = '0'),
"-",
str_pad(as.character(cw3_enddated), width = 2 , side = 'left', pad = '0'))))
cls_combined <- bcs70_convert |>
rbind(ncds58_convert)|>
rbind(nextsteps_convert) |>
select(!c(cw3_enddated, cw3_enddatem )) |>
drop_all_labels()
core_vars <- names(cls_combined)
class(core_vars)
names(nextsteps_convert)
# ELSA
elsa_convert <- elsa |>
mutate(survey_date = ymd(
paste0("2020-",
str_pad(as.character(cintdatm_w2), width = 2 , side = 'left', pad = '0'),
"-",
str_pad(as.character(cintdatd_w2), width = 2 , side = 'left', pad = '0')))) |>
select(c(core_vars))|>
drop_all_labels()
view(elsa_convert)
#usoc will need_to_versions depending on March on January
usoc_jan_convert <- usoc|>
mutate(survey_date = ymd_hms(survey_alt),
age = age_alt ,
employment_status = employment_status_alt) |>
select(c(core_vars))|>
drop_all_labels()
#Combining the data sets.
combined_data <- bind_rows(cls_combined, elsa_convert) |>
bind_rows(usoc_jan_convert) |>
mutate(survey_date = as.Date(survey_date))
#Alteration as the employment status currently includes furlough
combined_data$employment_status[combined_data$employment_status == 1] <- 0
combined_data$employment_status[combined_data$employment_status == 2] <- 1
### add in the additional ethnicity variables, English NHS data and then code English NHS data.
combined_data <- left_join(combined_data, ethnic.df, by = "llc_0010_stud_id" ) |>
left_join(first_test, by = "llc_0010_stud_id" ) |>
left_join(first_covid_hosp, by = "llc_0010_stud_id" ) |>
left_join(first_covid_gp, by = "llc_0010_stud_id" ) |>
mutate(
Test_positive = if_else(survey_date > test_date , 1, 0, missing = 0),
Covid_hospital = if_else(survey_date > acute_covid_hosp , 1, 0, missing = 0),
COVID19_GP = if_else(survey_date > COVID19_Date , 1, 0, missing = 0),
)
###############################################################################
# Now attaching the spine
### note previous version had done this before adding NHS data but because including Generation Scotland comes later.
core_denominator$in_spine <- 1
Spine <- full_join(core_denominator, combined_data, by = "llc_0010_stud_id")
##Saving a file for use in another package
write_dta(Spine,"S:/LLC_0010/data/Flow_Chart/FlowChart_24Apr2023.dta" )