From 77dee707228245fa62887606350cfd316ae4fd9e Mon Sep 17 00:00:00 2001 From: sammigd Date: Thu, 14 Jul 2022 14:58:09 -0400 Subject: [PATCH] clean code --- PCV_analysis_with_pharma_dataset1.Rmd | 156 ++++---------------------- R/functions.R | 5 +- 2 files changed, 24 insertions(+), 137 deletions(-) diff --git a/PCV_analysis_with_pharma_dataset1.Rmd b/PCV_analysis_with_pharma_dataset1.Rmd index 4c5452b..36ffeef 100644 --- a/PCV_analysis_with_pharma_dataset1.Rmd +++ b/PCV_analysis_with_pharma_dataset1.Rmd @@ -14,6 +14,7 @@ library(dplyr) library(ggplot2) #library(GGally) library(car) +source('./R/functions.R') ``` ```{r} @@ -29,174 +30,57 @@ sum(core_table$count) print("\n") sum(core_table_norace$count) ``` -## Factor variables in the data +## Define covariates ```{r} -## Factor categorical variable and change reference level for race (set White as the ref level) -col_names <- c("age","patient_gender_code","race_code","PCV_combined") -core_table[col_names] <- lapply(core_table[col_names], factor) -core_table$race_code <- relevel(core_table$race_code,ref="W") -core_table$PCV_combined <- relevel(core_table$PCV_combined,ref="U") +list_cov <- c("year_month","age","patient_gender_code","race_code","PCV_combined") ``` -## First step is to create another columnn of no response -```{r} -### Prepare the data and do some exploration -#baseline demographic characteristics (Table 1) -data <- core_table[core_table$COVID_severity %in% c("non_severe","resp_severe"),] -data1 <- core_table[core_table$COVID_severity == "non_severe",] -data2 <- core_table[core_table$COVID_severity =="resp_severe",] -data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined")) -``` - -```{r} -## Exploratory analysis of the dataset: -print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) -print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) -print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) -print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) -``` ## Run the first logistic regression analysis looking at outcome: severe respiratory outcome for Covid-19 versus non-severe outcomes ```{r} -m1<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined, data=data_comb, family = binomial("logit")) -#summary(m1) -m1.ci<-confint(m1) +list_out <- c("non_severe","resp_severe") +m1<-run_model(core_table,list_cov,list_out) +View(m1$m.table) ``` - -```{r} -m1.table <- cbind(coef(m1),m1.ci) -colnames(m1.table) <- c("estimate","lower","upper") -m1.table <- exp(m1.table) -View(m1.table) -``` - -##Check collinearity among the covariates using variance inflation factor (VIF): VIF answers the question: "How well is one of my covariates x_i explained by all other covariates x jointly?". +## Check collinearity among the covariates using variance inflation factor (VIF): VIF answers the question: "How well is one of my covariates x_i explained by all other covariates x jointly?". The VIF is defined as 1/(1-R_squared); this means that for VIF>=10, this specific covariate is explained by all other covariates to a large extent (~90%). There could be a problem of collinearity. - ```{r} -#X<-data_comb[,2:10] -#ggpairs(X) -vif(m1) +m1$vif ``` - - ## Second part of the analysis: restrict to people with severe non-respiratory vs non-severe outcomes for Covid-19 ```{r} -### Prepare the data and do some exploration -#baseline demographic characteristics (Table 1) -data <- core_table[core_table$COVID_severity %in% c("non_severe","non_resp_severe"),] -data1 <- core_table[core_table$COVID_severity == "non_severe",] -data2 <- core_table[core_table$COVID_severity =="non_resp_severe",] -data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined")) -``` - -```{r} -## Exploratory analysis of the dataset: -print(paste0("Total number of individuals: ",sum(data$count))) -print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) -print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) -print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) -print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) +list_out <- c("non_severe","non_resp_severe") +m2<-run_model(core_table,list_cov,list_out) +View(m2$m.table) ``` - -## Logistic regression -```{r} -m2<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined, data=data_comb, family = binomial("logit")) -#summary(m2) -m2.ci<-confint(m2) -``` - ##Check collinearity ```{r} -vif(m2) -``` - - -```{r} -m2.table <- cbind(coef(m2),m2.ci) -colnames(m2.table) <- c("estimate","lower","upper") -m2.table <- exp(m2.table) -View(m2.table) +m2$vif ``` ## Third part of the analysis: restrict to people with ICU critical care vs non-severe outcomes for Covid-19 ```{r} -### Prepare the data and do some exploration -#baseline demographic characteristics (Table 1) -data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","ICU_crit_care"),] -data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",] -data2 <- supp_table_filled[supp_table_filled$COVID_severity =="ICU_crit_care",] -data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod")) -``` - -```{r} -## Exploratory analysis of the dataset: -print(paste0("Total number of individuals: ",sum(data$count))) -print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) -print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) -print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) -print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) -print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count))) -print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count))) -``` - -## Logistic regression -```{r} -m3<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit")) -#summary(m3) -m3.ci<-confint(m3) -``` - -```{r} -m3.table <- cbind(coef(m3),m3.ci) -colnames(m3.table) <- c("estimate","lower","upper") -m3.table <- exp(m3.table) -View(m3.table) +list_out <- c("non_severe","ICU_crit_care") +m3<-run_model(core_table,list_cov,list_out) +View(m3$m.table) ``` ##Check collinearity ```{r} -vif(m3) +m3$vif ``` ## Fourth part of the analysis: restrict to people with ICU critical care vs respiratory severe outcomes for Covid-19 -```{r} -### Prepare the data and do some exploration -#baseline demographic characteristics (Table 1) -data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("ICU_crit_care","resp_severe"),] -data1 <- supp_table_filled[supp_table_filled$COVID_severity == "ICU_crit_care",] -data2 <- supp_table_filled[supp_table_filled$COVID_severity =="resp_severe",] -data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod")) -``` - -```{r} -## Exploratory analysis of the dataset: -print(paste0("Total number of individuals: ",sum(data$count))) -print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) -print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) -print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) -print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) -print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count))) -print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count))) -``` - -## Logistic regression -```{r} -m4<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit")) -#summary(m4) -m4.ci<-confint(m4) -``` ```{r} -m4.table <- cbind(coef(m4),m4.ci) -colnames(m4.table) <- c("estimate","lower","upper") -m4.table <- exp(m4.table) -View(m4.table) +list_out <- c("ICU_crit_care","resp_severe") +m4<-run_model(core_table,list_cov,list_out) +View(m4$m.table) ``` ##Check collinearity ```{r} -vif(m4) +m4$vif ``` diff --git a/R/functions.R b/R/functions.R index 1b7f9b0..093f740 100644 --- a/R/functions.R +++ b/R/functions.R @@ -1,6 +1,6 @@ run_model <- function(df,list_cov,list_out) {## Convert columns of list_cov to factors - colnames <- list_cov + col_names <- list_cov df[col_names] <- lapply(df[col_names], factor) df$race_code <- relevel(df$race_code,ref="W") df$PCV_combined <- relevel(df$PCV_combined,ref="U") @@ -9,6 +9,9 @@ run_model <- function(df,list_cov,list_out) data1 <- df[df$COVID_severity == list_out[1],] data2 <- df[df$COVID_severity ==list_out[2],] data_comb<-merge(data1,data2,by=c("year_month",list_cov)) + ## + if(sum(df$count)==sum(data_comb$count.x)+sum(data_comb$count.y)){print("OK")} + else print("ERROR") ## Run logistic regression model data_comb$COVID_severity.x<-NULL data_comb$COVID_severity.y<-NULL