From f20fd87f2736a3c98d10ae1cc92e992a73ca15da Mon Sep 17 00:00:00 2001 From: sammigd Date: Mon, 18 Jul 2022 10:57:53 -0400 Subject: [PATCH] push some fixes --- PCV_analysis_with_pharma_dataset1.Rmd | 8 +- PCV_analysis_with_pharma_dataset2.Rmd | 189 ++------------------------ PCV_analysis_with_pharma_dataset3.Rmd | 10 ++ R/functions.R | 13 +- 4 files changed, 33 insertions(+), 187 deletions(-) diff --git a/PCV_analysis_with_pharma_dataset1.Rmd b/PCV_analysis_with_pharma_dataset1.Rmd index 36ffeef..949e902 100644 --- a/PCV_analysis_with_pharma_dataset1.Rmd +++ b/PCV_analysis_with_pharma_dataset1.Rmd @@ -32,13 +32,13 @@ sum(core_table_norace$count) ``` ## Define covariates ```{r} -list_cov <- c("year_month","age","patient_gender_code","race_code","PCV_combined") +list_cov <- c("age","patient_gender_code","race_code","PCV_combined") ``` ## Run the first logistic regression analysis looking at outcome: severe respiratory outcome for Covid-19 versus non-severe outcomes ```{r} -list_out <- c("non_severe","resp_severe") +list_out <- c("resp_severe","non_severe") m1<-run_model(core_table,list_cov,list_out) View(m1$m.table) ``` @@ -49,7 +49,7 @@ m1$vif ``` ## Second part of the analysis: restrict to people with severe non-respiratory vs non-severe outcomes for Covid-19 ```{r} -list_out <- c("non_severe","non_resp_severe") +list_out <- c("non_resp_severe","non_severe") m2<-run_model(core_table,list_cov,list_out) View(m2$m.table) ``` @@ -60,7 +60,7 @@ m2$vif ## Third part of the analysis: restrict to people with ICU critical care vs non-severe outcomes for Covid-19 ```{r} -list_out <- c("non_severe","ICU_crit_care") +list_out <- c("ICU_crit_care","non_severe") m3<-run_model(core_table,list_cov,list_out) View(m3$m.table) ``` diff --git a/PCV_analysis_with_pharma_dataset2.Rmd b/PCV_analysis_with_pharma_dataset2.Rmd index f10904b..c662e0f 100644 --- a/PCV_analysis_with_pharma_dataset2.Rmd +++ b/PCV_analysis_with_pharma_dataset2.Rmd @@ -1,5 +1,5 @@ --- -title: "PCV_analysis -- Dataset 1 and 2" +title: "PCV_analysis -- Dataset 3" output: html_document --- @@ -18,189 +18,24 @@ library(car) ```{r} ### Load the data: -core_table <- read.csv("./Data/core_table_filled.csv") -core_table_norace <- read.csv("./Data/core_table_droprace_filled.csv") +supp_table <- read.csv("~/Documents/Ottavia/Data_pharma/suppl_table_filled.csv") ``` ## Check that individuals are the same across dataframes ```{r} print(paste0("Total number of individuals: ")) -sum(core_table$count) -print("\n") -sum(core_table_norace$count) +sum(supp_table$count) ``` -## Factor variables in the data +## Define covariates ```{r} -## Factor categorical variable and change reference level for race (set White as the ref level) -col_names <- c("age","patient_gender_code","race_code","income_est_mod","PCV_combined") -supp_table_filled[col_names] <- lapply(supp_table_filled[col_names], factor) -supp_table_filled$race_code <- relevel(supp_table_filled$race_code,ref="W") -supp_table_filled$PCV_combined <- relevel(supp_table_filled$PCV_combined,ref="U") +list_cov <- c("year_month","age","patient_gender_code","race_code","PCV_combined") #removed "income_est_mod" +## drop covariates +drop.cols <- c("bmi_30_plus","income_est_mod") +df_table <- supp_table %>% select(-all_of(drop.cols)) ``` - -## First step is to create another columnn of no response -```{r} -### Prepare the data and do some exploration -#baseline demographic characteristics (Table 1) -data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","resp_severe"),] -data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",] -data2 <- supp_table_filled[supp_table_filled$COVID_severity =="resp_severe",] -data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod")) -``` - -```{r} -## Exploratory analysis of the dataset: -print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) -print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) -print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) -print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) -print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count))) -print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count))) -``` - ## Run the first logistic regression analysis looking at outcome: severe respiratory outcome for Covid-19 versus non-severe outcomes ```{r} -m1<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit")) -#summary(m1) -m1.ci<-confint(m1) -``` - -```{r} -m1.table <- cbind(coef(m1),m1.ci) -colnames(m1.table) <- c("estimate","lower","upper") -m1.table <- exp(m1.table) -View(m1.table) -``` - -##Check collinearity among the covariates using variance inflation factor (VIF): VIF answers the question: "How well is one of my covariates x_i explained by all other covariates x jointly?". -The VIF is defined as 1/(1-R_squared); this means that for VIF>=10, this specific covariate is explained by all other covariates to a large extent (~90%). There could be a problem of collinearity. - -```{r} -#X<-data_comb[,2:10] -#ggpairs(X) -vif(m1) -``` - - -## Second part of the analysis: restrict to people with severe non-respiratory vs non-severe outcomes for Covid-19 -```{r} -### Prepare the data and do some exploration -#baseline demographic characteristics (Table 1) -data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","non_resp_severe"),] -data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",] -data2 <- supp_table_filled[supp_table_filled$COVID_severity =="non_resp_severe",] -data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod")) -``` - -```{r} -## Exploratory analysis of the dataset: -print(paste0("Total number of individuals: ",sum(data$count))) -print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) -print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) -print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) -print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) -print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count))) -print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count))) -``` - -## Logistic regression -```{r} -m2<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit")) -#summary(m2) -m2.ci<-confint(m2) -``` - -##Check collinearity -```{r} -vif(m2) -``` - - -```{r} -m2.table <- cbind(coef(m2),m2.ci) -colnames(m2.table) <- c("estimate","lower","upper") -m2.table <- exp(m2.table) -View(m2.table) -``` - -## Third part of the analysis: restrict to people with ICU critical care vs non-severe outcomes for Covid-19 -```{r} -### Prepare the data and do some exploration -#baseline demographic characteristics (Table 1) -data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","ICU_crit_care"),] -data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",] -data2 <- supp_table_filled[supp_table_filled$COVID_severity =="ICU_crit_care",] -data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod")) -``` - -```{r} -## Exploratory analysis of the dataset: -print(paste0("Total number of individuals: ",sum(data$count))) -print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) -print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) -print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) -print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) -print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count))) -print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count))) -``` - -## Logistic regression -```{r} -m3<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit")) -#summary(m3) -m3.ci<-confint(m3) -``` - -```{r} -m3.table <- cbind(coef(m3),m3.ci) -colnames(m3.table) <- c("estimate","lower","upper") -m3.table <- exp(m3.table) -View(m3.table) -``` - -##Check collinearity -```{r} -vif(m3) -``` - -## Fourth part of the analysis: restrict to people with ICU critical care vs respiratory severe outcomes for Covid-19 -```{r} -### Prepare the data and do some exploration -#baseline demographic characteristics (Table 1) -data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("ICU_crit_care","resp_severe"),] -data1 <- supp_table_filled[supp_table_filled$COVID_severity == "ICU_crit_care",] -data2 <- supp_table_filled[supp_table_filled$COVID_severity =="resp_severe",] -data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod")) -``` - -```{r} -## Exploratory analysis of the dataset: -print(paste0("Total number of individuals: ",sum(data$count))) -print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) -print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) -print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) -print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) -print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count))) -print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count))) -``` - -## Logistic regression -```{r} -m4<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit")) -#summary(m4) -m4.ci<-confint(m4) -``` - -```{r} -m4.table <- cbind(coef(m4),m4.ci) -colnames(m4.table) <- c("estimate","lower","upper") -m4.table <- exp(m4.table) -View(m4.table) -``` - -##Check collinearity -```{r} -vif(m4) -``` - - +list_out <- c("non_severe","resp_severe") +m1<-run_model(df_table,list_cov,list_out) +View(m1$m.table) +``` \ No newline at end of file diff --git a/PCV_analysis_with_pharma_dataset3.Rmd b/PCV_analysis_with_pharma_dataset3.Rmd index f10904b..0e9a882 100644 --- a/PCV_analysis_with_pharma_dataset3.Rmd +++ b/PCV_analysis_with_pharma_dataset3.Rmd @@ -47,6 +47,16 @@ data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",] data2 <- supp_table_filled[supp_table_filled$COVID_severity =="resp_severe",] data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod")) ``` +```{r} +d <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","resp_severe"),] +d1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",] +d2 <- supp_table_filled[supp_table_filled$COVID_severity =="resp_severe",] +d_comb<-merge(d1,d2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod")) + +``` + + + ```{r} ## Exploratory analysis of the dataset: diff --git a/R/functions.R b/R/functions.R index 093f740..f091982 100644 --- a/R/functions.R +++ b/R/functions.R @@ -1,16 +1,17 @@ run_model <- function(df,list_cov,list_out) {## Convert columns of list_cov to factors - col_names <- list_cov + col_names <- list_cov[2:length(list_cov)] df[col_names] <- lapply(df[col_names], factor) df$race_code <- relevel(df$race_code,ref="W") df$PCV_combined <- relevel(df$PCV_combined,ref="U") ## Create df of response and non-responses to be fed to the logistic regression - df <- df[df$COVID_severity %in% list_out,] - data1 <- df[df$COVID_severity == list_out[1],] - data2 <- df[df$COVID_severity ==list_out[2],] - data_comb<-merge(data1,data2,by=c("year_month",list_cov)) + df1 <- df[df$COVID_severity %in% list_out,] + data1 <- df1[df1$COVID_severity == list_out[1],] + data2 <- df1[df1$COVID_severity ==list_out[2],] + #list_cov_tot <- paste(c("year_month",list_cov)) + data_comb<-merge(data1,data2,all.x=TRUE,all.y=TRUE) ## - if(sum(df$count)==sum(data_comb$count.x)+sum(data_comb$count.y)){print("OK")} + if(sum(df1$count)==sum(data_comb$count.x)+sum(data_comb$count.y)){print("OK")} else print("ERROR") ## Run logistic regression model data_comb$COVID_severity.x<-NULL