From 2db35fa120812f2f63dba2f6962d9ccb58929ca8 Mon Sep 17 00:00:00 2001
From: sammigd <sdean39@gmail.com>
Date: Thu, 14 Jul 2022 14:35:06 -0400
Subject: [PATCH] Organize folder with functions

---
 .DS_Store                                     | Bin 0 -> 6148 bytes
 PCV_analysis_with_pharma_dataset1.Rmd         | 202 +++++++++++++++++
 ...d => PCV_analysis_with_pharma_dataset2.Rmd |  70 +++---
 PCV_analysis_with_pharma_dataset3.Rmd         | 206 ++++++++++++++++++
 PCV_analysis_with_pharma_exploratory.Rmd      |  50 +++++
 .../Example_logistic_reg_agg_data.R           |   0
 Tests.R => R/Tests.R                          |   0
 R/functions.R                                 |  24 ++
 8 files changed, 514 insertions(+), 38 deletions(-)
 create mode 100644 .DS_Store
 create mode 100644 PCV_analysis_with_pharma_dataset1.Rmd
 rename PCV_analysis_with_pharma.Rmd => PCV_analysis_with_pharma_dataset2.Rmd (83%)
 create mode 100644 PCV_analysis_with_pharma_dataset3.Rmd
 create mode 100644 PCV_analysis_with_pharma_exploratory.Rmd
 rename Example_logistic_reg_agg_data.R => R/Example_logistic_reg_agg_data.R (100%)
 rename Tests.R => R/Tests.R (100%)
 create mode 100644 R/functions.R

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..3cddd6bc6944608891e015960e90de8117f2419f
GIT binary patch
literal 6148
zcmeHKJ5Iwu5S@V(mS|E^?iIobmdIQnX(&;M1VtjJp^W4PaWQVeWq9)eVXdNc6!1ox
zdEWW#+OOF0h=?v<whNKDh;&dxc{9_B#Z4QzNY4zYw#WX^<eYOv?P~Y5SU(PvdX?3F
z==;rX*kF_H{`KQ{Q{Qj>EJU^^B8TT@r*rSXI2j-VWPl8if&at+dbUY-6-Y}4$N(Am
zX29-;0yV6OW1wFhFg^kR?Kj>9&pt~4i#dQbaSVh9qEQNrQq2)Vqa6N{c{OnijB-&w
z8T;hPniGop>F^g%7p;M`WPl7z8Mut)-1`3=^u+u>E#is{kbytNfX@1Lzr-hHZ*4x#
wdToI|K)(dJo(|)!80f7S3v0#KKk|w_BVH57Kre^i%Ypn6FkUER;5Qif1VI}hod5s;

literal 0
HcmV?d00001

diff --git a/PCV_analysis_with_pharma_dataset1.Rmd b/PCV_analysis_with_pharma_dataset1.Rmd
new file mode 100644
index 0000000..4c5452b
--- /dev/null
+++ b/PCV_analysis_with_pharma_dataset1.Rmd
@@ -0,0 +1,202 @@
+---
+title: "PCV_analysis -- Dataset 1 and 2"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Load required R packages
+```{r}
+library(table1)
+library(dplyr)
+library(ggplot2)
+#library(GGally)
+library(car)
+```
+
+```{r}
+### Load the data: 
+core_table <- read.csv("./Data/core_table_filled.csv")
+core_table_norace <- read.csv("./Data/core_table_droprace_filled.csv")
+```
+
+## Check that individuals are the same across dataframes
+```{r}
+print(paste0("Total number of individuals: "))
+sum(core_table$count)
+print("\n")
+sum(core_table_norace$count)
+```
+## Factor variables in the data
+```{r}
+## Factor categorical variable and change reference level for race (set White as the ref level)
+col_names <- c("age","patient_gender_code","race_code","PCV_combined")
+core_table[col_names] <- lapply(core_table[col_names], factor)
+core_table$race_code <- relevel(core_table$race_code,ref="W")
+core_table$PCV_combined <- relevel(core_table$PCV_combined,ref="U")
+```
+
+## First step is to create another columnn of no response
+```{r}
+### Prepare the data and do some exploration
+#baseline demographic characteristics (Table 1)
+data <- core_table[core_table$COVID_severity %in% c("non_severe","resp_severe"),]
+data1 <- core_table[core_table$COVID_severity == "non_severe",]
+data2 <- core_table[core_table$COVID_severity =="resp_severe",]
+data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined"))
+```
+
+```{r}
+## Exploratory analysis of the dataset: 
+print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count)))
+print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count)))
+print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count)))
+print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count)))
+```
+
+## Run the first logistic regression analysis looking at outcome: severe respiratory outcome for Covid-19 versus non-severe outcomes
+```{r}
+m1<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined, data=data_comb, family = binomial("logit"))
+#summary(m1)
+m1.ci<-confint(m1)
+```
+
+```{r}
+m1.table <- cbind(coef(m1),m1.ci)
+colnames(m1.table) <- c("estimate","lower","upper")
+m1.table <- exp(m1.table)
+View(m1.table)
+```
+
+##Check collinearity among the covariates using variance inflation factor (VIF): VIF answers the question: "How well is one of my covariates x_i explained by all other covariates x jointly?".
+The VIF is defined as 1/(1-R_squared); this means that for VIF>=10, this specific covariate is explained by all other covariates to a large extent (~90%). There could be a problem of collinearity.
+
+```{r}
+#X<-data_comb[,2:10]
+#ggpairs(X)
+vif(m1)
+```
+
+
+## Second part of the analysis: restrict to people with severe non-respiratory vs non-severe outcomes for Covid-19
+```{r}
+### Prepare the data and do some exploration
+#baseline demographic characteristics (Table 1)
+data <- core_table[core_table$COVID_severity %in% c("non_severe","non_resp_severe"),]
+data1 <- core_table[core_table$COVID_severity == "non_severe",]
+data2 <- core_table[core_table$COVID_severity =="non_resp_severe",]
+data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined"))
+```
+
+```{r}
+## Exploratory analysis of the dataset: 
+print(paste0("Total number of individuals: ",sum(data$count)))
+print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count)))
+print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count)))
+print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count)))
+print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count)))
+```
+
+## Logistic regression
+```{r}
+m2<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined, data=data_comb, family = binomial("logit"))
+#summary(m2)
+m2.ci<-confint(m2)
+```
+
+##Check collinearity
+```{r}
+vif(m2)
+```
+
+
+```{r}
+m2.table <- cbind(coef(m2),m2.ci)
+colnames(m2.table) <- c("estimate","lower","upper")
+m2.table <- exp(m2.table)
+View(m2.table)
+```
+
+## Third part of the analysis: restrict to people with ICU critical care vs non-severe outcomes for Covid-19
+```{r}
+### Prepare the data and do some exploration
+#baseline demographic characteristics (Table 1)
+data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","ICU_crit_care"),]
+data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",]
+data2 <- supp_table_filled[supp_table_filled$COVID_severity =="ICU_crit_care",]
+data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod"))
+```
+
+```{r}
+## Exploratory analysis of the dataset: 
+print(paste0("Total number of individuals: ",sum(data$count)))
+print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count)))
+print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count)))
+print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count)))
+print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count)))
+print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count)))
+print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count)))
+```
+
+## Logistic regression
+```{r}
+m3<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit"))
+#summary(m3)
+m3.ci<-confint(m3)
+```
+
+```{r}
+m3.table <- cbind(coef(m3),m3.ci)
+colnames(m3.table) <- c("estimate","lower","upper")
+m3.table <- exp(m3.table)
+View(m3.table)
+```
+
+##Check collinearity
+```{r}
+vif(m3)
+```
+
+## Fourth part of the analysis: restrict to people with ICU critical care vs respiratory severe outcomes for Covid-19
+```{r}
+### Prepare the data and do some exploration
+#baseline demographic characteristics (Table 1)
+data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("ICU_crit_care","resp_severe"),]
+data1 <- supp_table_filled[supp_table_filled$COVID_severity == "ICU_crit_care",]
+data2 <- supp_table_filled[supp_table_filled$COVID_severity =="resp_severe",]
+data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod"))
+```
+
+```{r}
+## Exploratory analysis of the dataset: 
+print(paste0("Total number of individuals: ",sum(data$count)))
+print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count)))
+print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count)))
+print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count)))
+print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count)))
+print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count)))
+print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count)))
+```
+
+## Logistic regression
+```{r}
+m4<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit"))
+#summary(m4)
+m4.ci<-confint(m4)
+```
+
+```{r}
+m4.table <- cbind(coef(m4),m4.ci)
+colnames(m4.table) <- c("estimate","lower","upper")
+m4.table <- exp(m4.table)
+View(m4.table)
+```
+
+##Check collinearity
+```{r}
+vif(m4)
+```
+
+
diff --git a/PCV_analysis_with_pharma.Rmd b/PCV_analysis_with_pharma_dataset2.Rmd
similarity index 83%
rename from PCV_analysis_with_pharma.Rmd
rename to PCV_analysis_with_pharma_dataset2.Rmd
index b1f07d0..f10904b 100644
--- a/PCV_analysis_with_pharma.Rmd
+++ b/PCV_analysis_with_pharma_dataset2.Rmd
@@ -1,5 +1,5 @@
 ---
-title: "PCV_analysis"
+title: "PCV_analysis -- Dataset 1 and 2"
 output: html_document
 ---
 
@@ -12,15 +12,14 @@ knitr::opts_chunk$set(echo = TRUE)
 library(table1)
 library(dplyr)
 library(ggplot2)
-library(GGally)
+#library(GGally)
+library(car)
 ```
 
 ```{r}
 ### Load the data: 
 core_table <- read.csv("./Data/core_table_filled.csv")
 core_table_norace <- read.csv("./Data/core_table_droprace_filled.csv")
-supp_table_filled <-read.csv("~/Documents/Ottavia/Data_pharma/suppl_table_filled.csv")
-supp_table_6mo_filled <- read.csv("./Data/suppl_6mo_table_filled.csv")
 ```
 
 ## Check that individuals are the same across dataframes
@@ -29,25 +28,6 @@ print(paste0("Total number of individuals: "))
 sum(core_table$count)
 print("\n")
 sum(core_table_norace$count)
-print("\n")
-sum(supp_table_filled$count)
-print("\n")
-sum(supp_table_6mo_filled$count)
-```
-
-## Exploratory analysis: descriptive stats
-```{r}
-Tot_ind <- sum(supp_table_filled$count)
-vax_by_age<-supp_table_filled %>% group_by(age,PCV_combined) %>% summarise(prop = sum(count)/Tot_ind)
-vax_by_com<-supp_table_filled %>% group_by(comorbidities,PCV_combined) %>% summarise(prop = sum(count)/Tot_ind)
-```
-
-## Plot histograms
-```{r}
-ggplot(vax_by_age,aes(age,prop,fill=PCV_combined))+
-  geom_bar(stat="identity",position='dodge')
-ggplot(vax_by_com,aes(comorbidities,prop,fill=PCV_combined))+
-  geom_bar(stat="identity",position='dodge')
 ```
 ## Factor variables in the data
 ```{r}
@@ -58,8 +38,6 @@ supp_table_filled$race_code <- relevel(supp_table_filled$race_code,ref="W")
 supp_table_filled$PCV_combined <- relevel(supp_table_filled$PCV_combined,ref="U")
 ```
 
-
-
 ## First step is to create another columnn of no response
 ```{r}
 ### Prepare the data and do some exploration
@@ -86,14 +64,6 @@ m1<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined
 #summary(m1)
 m1.ci<-confint(m1)
 ```
-##Check collinearity among the covariates using pair-wise correlation matrix
-```{r}
-X<-data_comb[,2:10]
-ggpairs(X)
-
-```
-
-
 
 ```{r}
 m1.table <- cbind(coef(m1),m1.ci)
@@ -102,6 +72,15 @@ m1.table <- exp(m1.table)
 View(m1.table)
 ```
 
+##Check collinearity among the covariates using variance inflation factor (VIF): VIF answers the question: "How well is one of my covariates x_i explained by all other covariates x jointly?".
+The VIF is defined as 1/(1-R_squared); this means that for VIF>=10, this specific covariate is explained by all other covariates to a large extent (~90%). There could be a problem of collinearity.
+
+```{r}
+#X<-data_comb[,2:10]
+#ggpairs(X)
+vif(m1)
+```
+
 
 ## Second part of the analysis: restrict to people with severe non-respiratory vs non-severe outcomes for Covid-19
 ```{r}
@@ -124,13 +103,19 @@ print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$c
 print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count)))
 ```
 
-## Run the first logistic regression analysis looking at outcome: severe respiratory outcome for Covid-19 versus non-severe respiratory outcome
+## Logistic regression
 ```{r}
 m2<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit"))
 #summary(m2)
 m2.ci<-confint(m2)
 ```
 
+##Check collinearity
+```{r}
+vif(m2)
+```
+
+
 ```{r}
 m2.table <- cbind(coef(m2),m2.ci)
 colnames(m2.table) <- c("estimate","lower","upper")
@@ -138,7 +123,7 @@ m2.table <- exp(m2.table)
 View(m2.table)
 ```
 
-## Second part of the analysis: restrict to people with severe non-respiratory vs non-severe outcomes for Covid-19
+## Third part of the analysis: restrict to people with ICU critical care vs non-severe outcomes for Covid-19
 ```{r}
 ### Prepare the data and do some exploration
 #baseline demographic characteristics (Table 1)
@@ -159,7 +144,7 @@ print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$c
 print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count)))
 ```
 
-## Run the first logistic regression analysis looking at outcome: severe respiratory outcome for Covid-19 versus non-severe respiratory outcome
+## Logistic regression
 ```{r}
 m3<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit"))
 #summary(m3)
@@ -173,7 +158,12 @@ m3.table <- exp(m3.table)
 View(m3.table)
 ```
 
-## Fourth part of the analysis: restrict to people with severe non-respiratory vs non-severe outcomes for Covid-19
+##Check collinearity
+```{r}
+vif(m3)
+```
+
+## Fourth part of the analysis: restrict to people with ICU critical care vs respiratory severe outcomes for Covid-19
 ```{r}
 ### Prepare the data and do some exploration
 #baseline demographic characteristics (Table 1)
@@ -194,7 +184,7 @@ print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$c
 print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count)))
 ```
 
-## Run the first logistic regression analysis looking at outcome: severe respiratory outcome for Covid-19 versus non-severe respiratory outcome
+## Logistic regression
 ```{r}
 m4<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit"))
 #summary(m4)
@@ -208,5 +198,9 @@ m4.table <- exp(m4.table)
 View(m4.table)
 ```
 
+##Check collinearity
+```{r}
+vif(m4)
+```
 
 
diff --git a/PCV_analysis_with_pharma_dataset3.Rmd b/PCV_analysis_with_pharma_dataset3.Rmd
new file mode 100644
index 0000000..f10904b
--- /dev/null
+++ b/PCV_analysis_with_pharma_dataset3.Rmd
@@ -0,0 +1,206 @@
+---
+title: "PCV_analysis -- Dataset 1 and 2"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Load required R packages
+```{r}
+library(table1)
+library(dplyr)
+library(ggplot2)
+#library(GGally)
+library(car)
+```
+
+```{r}
+### Load the data: 
+core_table <- read.csv("./Data/core_table_filled.csv")
+core_table_norace <- read.csv("./Data/core_table_droprace_filled.csv")
+```
+
+## Check that individuals are the same across dataframes
+```{r}
+print(paste0("Total number of individuals: "))
+sum(core_table$count)
+print("\n")
+sum(core_table_norace$count)
+```
+## Factor variables in the data
+```{r}
+## Factor categorical variable and change reference level for race (set White as the ref level)
+col_names <- c("age","patient_gender_code","race_code","income_est_mod","PCV_combined")
+supp_table_filled[col_names] <- lapply(supp_table_filled[col_names], factor)
+supp_table_filled$race_code <- relevel(supp_table_filled$race_code,ref="W")
+supp_table_filled$PCV_combined <- relevel(supp_table_filled$PCV_combined,ref="U")
+```
+
+## First step is to create another columnn of no response
+```{r}
+### Prepare the data and do some exploration
+#baseline demographic characteristics (Table 1)
+data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","resp_severe"),]
+data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",]
+data2 <- supp_table_filled[supp_table_filled$COVID_severity =="resp_severe",]
+data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod"))
+```
+
+```{r}
+## Exploratory analysis of the dataset: 
+print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count)))
+print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count)))
+print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count)))
+print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count)))
+print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count)))
+print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count)))
+```
+
+## Run the first logistic regression analysis looking at outcome: severe respiratory outcome for Covid-19 versus non-severe outcomes
+```{r}
+m1<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit"))
+#summary(m1)
+m1.ci<-confint(m1)
+```
+
+```{r}
+m1.table <- cbind(coef(m1),m1.ci)
+colnames(m1.table) <- c("estimate","lower","upper")
+m1.table <- exp(m1.table)
+View(m1.table)
+```
+
+##Check collinearity among the covariates using variance inflation factor (VIF): VIF answers the question: "How well is one of my covariates x_i explained by all other covariates x jointly?".
+The VIF is defined as 1/(1-R_squared); this means that for VIF>=10, this specific covariate is explained by all other covariates to a large extent (~90%). There could be a problem of collinearity.
+
+```{r}
+#X<-data_comb[,2:10]
+#ggpairs(X)
+vif(m1)
+```
+
+
+## Second part of the analysis: restrict to people with severe non-respiratory vs non-severe outcomes for Covid-19
+```{r}
+### Prepare the data and do some exploration
+#baseline demographic characteristics (Table 1)
+data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","non_resp_severe"),]
+data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",]
+data2 <- supp_table_filled[supp_table_filled$COVID_severity =="non_resp_severe",]
+data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod"))
+```
+
+```{r}
+## Exploratory analysis of the dataset: 
+print(paste0("Total number of individuals: ",sum(data$count)))
+print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count)))
+print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count)))
+print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count)))
+print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count)))
+print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count)))
+print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count)))
+```
+
+## Logistic regression
+```{r}
+m2<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit"))
+#summary(m2)
+m2.ci<-confint(m2)
+```
+
+##Check collinearity
+```{r}
+vif(m2)
+```
+
+
+```{r}
+m2.table <- cbind(coef(m2),m2.ci)
+colnames(m2.table) <- c("estimate","lower","upper")
+m2.table <- exp(m2.table)
+View(m2.table)
+```
+
+## Third part of the analysis: restrict to people with ICU critical care vs non-severe outcomes for Covid-19
+```{r}
+### Prepare the data and do some exploration
+#baseline demographic characteristics (Table 1)
+data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","ICU_crit_care"),]
+data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",]
+data2 <- supp_table_filled[supp_table_filled$COVID_severity =="ICU_crit_care",]
+data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod"))
+```
+
+```{r}
+## Exploratory analysis of the dataset: 
+print(paste0("Total number of individuals: ",sum(data$count)))
+print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count)))
+print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count)))
+print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count)))
+print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count)))
+print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count)))
+print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count)))
+```
+
+## Logistic regression
+```{r}
+m3<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit"))
+#summary(m3)
+m3.ci<-confint(m3)
+```
+
+```{r}
+m3.table <- cbind(coef(m3),m3.ci)
+colnames(m3.table) <- c("estimate","lower","upper")
+m3.table <- exp(m3.table)
+View(m3.table)
+```
+
+##Check collinearity
+```{r}
+vif(m3)
+```
+
+## Fourth part of the analysis: restrict to people with ICU critical care vs respiratory severe outcomes for Covid-19
+```{r}
+### Prepare the data and do some exploration
+#baseline demographic characteristics (Table 1)
+data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("ICU_crit_care","resp_severe"),]
+data1 <- supp_table_filled[supp_table_filled$COVID_severity == "ICU_crit_care",]
+data2 <- supp_table_filled[supp_table_filled$COVID_severity =="resp_severe",]
+data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod"))
+```
+
+```{r}
+## Exploratory analysis of the dataset: 
+print(paste0("Total number of individuals: ",sum(data$count)))
+print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count)))
+print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count)))
+print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count)))
+print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count)))
+print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count)))
+print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count)))
+```
+
+## Logistic regression
+```{r}
+m4<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit"))
+#summary(m4)
+m4.ci<-confint(m4)
+```
+
+```{r}
+m4.table <- cbind(coef(m4),m4.ci)
+colnames(m4.table) <- c("estimate","lower","upper")
+m4.table <- exp(m4.table)
+View(m4.table)
+```
+
+##Check collinearity
+```{r}
+vif(m4)
+```
+
+
diff --git a/PCV_analysis_with_pharma_exploratory.Rmd b/PCV_analysis_with_pharma_exploratory.Rmd
new file mode 100644
index 0000000..cc50b2b
--- /dev/null
+++ b/PCV_analysis_with_pharma_exploratory.Rmd
@@ -0,0 +1,50 @@
+---
+title: "PCV_analysis"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Load required R packages
+```{r}
+library(table1)
+library(dplyr)
+library(ggplot2)
+```
+
+```{r}
+### Load the data: 
+core_table <- read.csv("./Data/core_table_filled.csv")
+core_table_norace <- read.csv("./Data/core_table_droprace_filled.csv")
+supp_table_filled <-read.csv("~/Documents/Ottavia/Data_pharma/suppl_table_filled.csv")
+supp_table_6mo_filled <- read.csv("./Data/suppl_6mo_table_filled.csv")
+```
+
+## Check that individuals are the same across dataframes
+```{r}
+print(paste0("Total number of individuals: "))
+sum(core_table$count)
+print("\n")
+sum(core_table_norace$count)
+print("\n")
+sum(supp_table_filled$count)
+print("\n")
+sum(supp_table_6mo_filled$count)
+```
+
+## Exploratory analysis: descriptive stats
+```{r}
+Tot_ind <- sum(supp_table_filled$count)
+vax_by_age<-supp_table_filled %>% group_by(age,PCV_combined) %>% summarise(prop = sum(count)/Tot_ind)
+vax_by_com<-supp_table_filled %>% group_by(comorbidities,PCV_combined) %>% summarise(prop = sum(count)/Tot_ind)
+```
+
+## Plot histograms
+```{r}
+ggplot(vax_by_age,aes(age,prop,fill=PCV_combined))+
+  geom_bar(stat="identity",position='dodge')
+ggplot(vax_by_com,aes(comorbidities,prop,fill=PCV_combined))+
+  geom_bar(stat="identity",position='dodge')
+```
diff --git a/Example_logistic_reg_agg_data.R b/R/Example_logistic_reg_agg_data.R
similarity index 100%
rename from Example_logistic_reg_agg_data.R
rename to R/Example_logistic_reg_agg_data.R
diff --git a/Tests.R b/R/Tests.R
similarity index 100%
rename from Tests.R
rename to R/Tests.R
diff --git a/R/functions.R b/R/functions.R
new file mode 100644
index 0000000..1b7f9b0
--- /dev/null
+++ b/R/functions.R
@@ -0,0 +1,24 @@
+run_model <- function(df,list_cov,list_out)
+  {## Convert columns of list_cov to factors
+  colnames <- list_cov
+  df[col_names] <- lapply(df[col_names], factor)
+  df$race_code <- relevel(df$race_code,ref="W")
+  df$PCV_combined <- relevel(df$PCV_combined,ref="U")
+  ## Create df of response and non-responses to be fed to the logistic regression
+  df <- df[df$COVID_severity %in% list_out,]
+  data1 <- df[df$COVID_severity == list_out[1],]
+  data2 <- df[df$COVID_severity ==list_out[2],]
+  data_comb<-merge(data1,data2,by=c("year_month",list_cov))
+  ## Run logistic regression model
+  data_comb$COVID_severity.x<-NULL
+  data_comb$COVID_severity.y<-NULL
+  m<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+PCV_combined+. -year_month, data=data_comb, family = binomial("logit"))
+  #summary(m1)
+  m.ci<-confint(m)
+  m.table <- cbind(coef(m),m.ci)
+  colnames(m.table) <- c("estimate","lower","upper")
+  m.table <- exp(m.table)
+  m.vif <- vif(m)
+  result <- list("m.table"=m.table,"vif"=m.vif)
+  return(result)
+}
\ No newline at end of file