From f20fd87f2736a3c98d10ae1cc92e992a73ca15da Mon Sep 17 00:00:00 2001
From: sammigd <sdean39@gmail.com>
Date: Mon, 18 Jul 2022 10:57:53 -0400
Subject: [PATCH] push some fixes

---
 PCV_analysis_with_pharma_dataset1.Rmd |   8 +-
 PCV_analysis_with_pharma_dataset2.Rmd | 189 ++------------------------
 PCV_analysis_with_pharma_dataset3.Rmd |  10 ++
 R/functions.R                         |  13 +-
 4 files changed, 33 insertions(+), 187 deletions(-)

diff --git a/PCV_analysis_with_pharma_dataset1.Rmd b/PCV_analysis_with_pharma_dataset1.Rmd
index 36ffeef..949e902 100644
--- a/PCV_analysis_with_pharma_dataset1.Rmd
+++ b/PCV_analysis_with_pharma_dataset1.Rmd
@@ -32,13 +32,13 @@ sum(core_table_norace$count)
 ```
 ## Define covariates
 ```{r}
-list_cov <- c("year_month","age","patient_gender_code","race_code","PCV_combined")
+list_cov <- c("age","patient_gender_code","race_code","PCV_combined")
 ```
 
 
 ## Run the first logistic regression analysis looking at outcome: severe respiratory outcome for Covid-19 versus non-severe outcomes
 ```{r}
-list_out <- c("non_severe","resp_severe")
+list_out <- c("resp_severe","non_severe")
 m1<-run_model(core_table,list_cov,list_out)
 View(m1$m.table)
 ```
@@ -49,7 +49,7 @@ m1$vif
 ```
 ## Second part of the analysis: restrict to people with severe non-respiratory vs non-severe outcomes for Covid-19
 ```{r}
-list_out <- c("non_severe","non_resp_severe")
+list_out <- c("non_resp_severe","non_severe")
 m2<-run_model(core_table,list_cov,list_out)
 View(m2$m.table)
 ```
@@ -60,7 +60,7 @@ m2$vif
 
 ## Third part of the analysis: restrict to people with ICU critical care vs non-severe outcomes for Covid-19
 ```{r}
-list_out <- c("non_severe","ICU_crit_care")
+list_out <- c("ICU_crit_care","non_severe")
 m3<-run_model(core_table,list_cov,list_out)
 View(m3$m.table)
 ```
diff --git a/PCV_analysis_with_pharma_dataset2.Rmd b/PCV_analysis_with_pharma_dataset2.Rmd
index f10904b..c662e0f 100644
--- a/PCV_analysis_with_pharma_dataset2.Rmd
+++ b/PCV_analysis_with_pharma_dataset2.Rmd
@@ -1,5 +1,5 @@
 ---
-title: "PCV_analysis -- Dataset 1 and 2"
+title: "PCV_analysis -- Dataset 3"
 output: html_document
 ---
 
@@ -18,189 +18,24 @@ library(car)
 
 ```{r}
 ### Load the data: 
-core_table <- read.csv("./Data/core_table_filled.csv")
-core_table_norace <- read.csv("./Data/core_table_droprace_filled.csv")
+supp_table <- read.csv("~/Documents/Ottavia/Data_pharma/suppl_table_filled.csv")
 ```
 
 ## Check that individuals are the same across dataframes
 ```{r}
 print(paste0("Total number of individuals: "))
-sum(core_table$count)
-print("\n")
-sum(core_table_norace$count)
+sum(supp_table$count)
 ```
-## Factor variables in the data
+## Define covariates
 ```{r}
-## Factor categorical variable and change reference level for race (set White as the ref level)
-col_names <- c("age","patient_gender_code","race_code","income_est_mod","PCV_combined")
-supp_table_filled[col_names] <- lapply(supp_table_filled[col_names], factor)
-supp_table_filled$race_code <- relevel(supp_table_filled$race_code,ref="W")
-supp_table_filled$PCV_combined <- relevel(supp_table_filled$PCV_combined,ref="U")
+list_cov <- c("year_month","age","patient_gender_code","race_code","PCV_combined") #removed "income_est_mod"
+## drop covariates
+drop.cols <- c("bmi_30_plus","income_est_mod")
+df_table <- supp_table %>% select(-all_of(drop.cols))
 ```
-
-## First step is to create another columnn of no response
-```{r}
-### Prepare the data and do some exploration
-#baseline demographic characteristics (Table 1)
-data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","resp_severe"),]
-data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",]
-data2 <- supp_table_filled[supp_table_filled$COVID_severity =="resp_severe",]
-data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod"))
-```
-
-```{r}
-## Exploratory analysis of the dataset: 
-print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count)))
-print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count)))
-print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count)))
-print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count)))
-print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count)))
-print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count)))
-```
-
 ## Run the first logistic regression analysis looking at outcome: severe respiratory outcome for Covid-19 versus non-severe outcomes
 ```{r}
-m1<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit"))
-#summary(m1)
-m1.ci<-confint(m1)
-```
-
-```{r}
-m1.table <- cbind(coef(m1),m1.ci)
-colnames(m1.table) <- c("estimate","lower","upper")
-m1.table <- exp(m1.table)
-View(m1.table)
-```
-
-##Check collinearity among the covariates using variance inflation factor (VIF): VIF answers the question: "How well is one of my covariates x_i explained by all other covariates x jointly?".
-The VIF is defined as 1/(1-R_squared); this means that for VIF>=10, this specific covariate is explained by all other covariates to a large extent (~90%). There could be a problem of collinearity.
-
-```{r}
-#X<-data_comb[,2:10]
-#ggpairs(X)
-vif(m1)
-```
-
-
-## Second part of the analysis: restrict to people with severe non-respiratory vs non-severe outcomes for Covid-19
-```{r}
-### Prepare the data and do some exploration
-#baseline demographic characteristics (Table 1)
-data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","non_resp_severe"),]
-data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",]
-data2 <- supp_table_filled[supp_table_filled$COVID_severity =="non_resp_severe",]
-data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod"))
-```
-
-```{r}
-## Exploratory analysis of the dataset: 
-print(paste0("Total number of individuals: ",sum(data$count)))
-print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count)))
-print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count)))
-print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count)))
-print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count)))
-print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count)))
-print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count)))
-```
-
-## Logistic regression
-```{r}
-m2<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit"))
-#summary(m2)
-m2.ci<-confint(m2)
-```
-
-##Check collinearity
-```{r}
-vif(m2)
-```
-
-
-```{r}
-m2.table <- cbind(coef(m2),m2.ci)
-colnames(m2.table) <- c("estimate","lower","upper")
-m2.table <- exp(m2.table)
-View(m2.table)
-```
-
-## Third part of the analysis: restrict to people with ICU critical care vs non-severe outcomes for Covid-19
-```{r}
-### Prepare the data and do some exploration
-#baseline demographic characteristics (Table 1)
-data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","ICU_crit_care"),]
-data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",]
-data2 <- supp_table_filled[supp_table_filled$COVID_severity =="ICU_crit_care",]
-data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod"))
-```
-
-```{r}
-## Exploratory analysis of the dataset: 
-print(paste0("Total number of individuals: ",sum(data$count)))
-print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count)))
-print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count)))
-print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count)))
-print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count)))
-print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count)))
-print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count)))
-```
-
-## Logistic regression
-```{r}
-m3<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit"))
-#summary(m3)
-m3.ci<-confint(m3)
-```
-
-```{r}
-m3.table <- cbind(coef(m3),m3.ci)
-colnames(m3.table) <- c("estimate","lower","upper")
-m3.table <- exp(m3.table)
-View(m3.table)
-```
-
-##Check collinearity
-```{r}
-vif(m3)
-```
-
-## Fourth part of the analysis: restrict to people with ICU critical care vs respiratory severe outcomes for Covid-19
-```{r}
-### Prepare the data and do some exploration
-#baseline demographic characteristics (Table 1)
-data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("ICU_crit_care","resp_severe"),]
-data1 <- supp_table_filled[supp_table_filled$COVID_severity == "ICU_crit_care",]
-data2 <- supp_table_filled[supp_table_filled$COVID_severity =="resp_severe",]
-data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod"))
-```
-
-```{r}
-## Exploratory analysis of the dataset: 
-print(paste0("Total number of individuals: ",sum(data$count)))
-print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count)))
-print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count)))
-print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count)))
-print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count)))
-print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count)))
-print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count)))
-```
-
-## Logistic regression
-```{r}
-m4<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit"))
-#summary(m4)
-m4.ci<-confint(m4)
-```
-
-```{r}
-m4.table <- cbind(coef(m4),m4.ci)
-colnames(m4.table) <- c("estimate","lower","upper")
-m4.table <- exp(m4.table)
-View(m4.table)
-```
-
-##Check collinearity
-```{r}
-vif(m4)
-```
-
-
+list_out <- c("non_severe","resp_severe")
+m1<-run_model(df_table,list_cov,list_out)
+View(m1$m.table)
+```
\ No newline at end of file
diff --git a/PCV_analysis_with_pharma_dataset3.Rmd b/PCV_analysis_with_pharma_dataset3.Rmd
index f10904b..0e9a882 100644
--- a/PCV_analysis_with_pharma_dataset3.Rmd
+++ b/PCV_analysis_with_pharma_dataset3.Rmd
@@ -47,6 +47,16 @@ data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",]
 data2 <- supp_table_filled[supp_table_filled$COVID_severity =="resp_severe",]
 data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod"))
 ```
+```{r}
+d <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","resp_severe"),]
+d1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",]
+d2 <- supp_table_filled[supp_table_filled$COVID_severity =="resp_severe",]
+d_comb<-merge(d1,d2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod"))
+
+```
+
+
+
 
 ```{r}
 ## Exploratory analysis of the dataset: 
diff --git a/R/functions.R b/R/functions.R
index 093f740..f091982 100644
--- a/R/functions.R
+++ b/R/functions.R
@@ -1,16 +1,17 @@
 run_model <- function(df,list_cov,list_out)
   {## Convert columns of list_cov to factors
-  col_names <- list_cov
+  col_names <- list_cov[2:length(list_cov)]
   df[col_names] <- lapply(df[col_names], factor)
   df$race_code <- relevel(df$race_code,ref="W")
   df$PCV_combined <- relevel(df$PCV_combined,ref="U")
   ## Create df of response and non-responses to be fed to the logistic regression
-  df <- df[df$COVID_severity %in% list_out,]
-  data1 <- df[df$COVID_severity == list_out[1],]
-  data2 <- df[df$COVID_severity ==list_out[2],]
-  data_comb<-merge(data1,data2,by=c("year_month",list_cov))
+  df1 <- df[df$COVID_severity %in% list_out,]
+  data1 <- df1[df1$COVID_severity == list_out[1],]
+  data2 <- df1[df1$COVID_severity ==list_out[2],]
+  #list_cov_tot <- paste(c("year_month",list_cov))
+  data_comb<-merge(data1,data2,all.x=TRUE,all.y=TRUE)
   ## 
-  if(sum(df$count)==sum(data_comb$count.x)+sum(data_comb$count.y)){print("OK")}
+  if(sum(df1$count)==sum(data_comb$count.x)+sum(data_comb$count.y)){print("OK")}
   else print("ERROR")
   ## Run logistic regression model
   data_comb$COVID_severity.x<-NULL