From 2db35fa120812f2f63dba2f6962d9ccb58929ca8 Mon Sep 17 00:00:00 2001 From: sammigd Date: Thu, 14 Jul 2022 14:35:06 -0400 Subject: [PATCH] Organize folder with functions --- .DS_Store | Bin 0 -> 6148 bytes PCV_analysis_with_pharma_dataset1.Rmd | 202 +++++++++++++++++ ...d => PCV_analysis_with_pharma_dataset2.Rmd | 70 +++--- PCV_analysis_with_pharma_dataset3.Rmd | 206 ++++++++++++++++++ PCV_analysis_with_pharma_exploratory.Rmd | 50 +++++ .../Example_logistic_reg_agg_data.R | 0 Tests.R => R/Tests.R | 0 R/functions.R | 24 ++ 8 files changed, 514 insertions(+), 38 deletions(-) create mode 100644 .DS_Store create mode 100644 PCV_analysis_with_pharma_dataset1.Rmd rename PCV_analysis_with_pharma.Rmd => PCV_analysis_with_pharma_dataset2.Rmd (83%) create mode 100644 PCV_analysis_with_pharma_dataset3.Rmd create mode 100644 PCV_analysis_with_pharma_exploratory.Rmd rename Example_logistic_reg_agg_data.R => R/Example_logistic_reg_agg_data.R (100%) rename Tests.R => R/Tests.R (100%) create mode 100644 R/functions.R diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..3cddd6bc6944608891e015960e90de8117f2419f GIT binary patch literal 6148 zcmeHKJ5Iwu5S@V(mS|E^?iIobmdIQnX(&;M1VtjJp^W4PaWQVeWq9)eVXdNc6!1ox zdEWW#+OOF0h=?vEJU^^B8TT@r*rSXI2j-VWPl8if&at+dbUY-6-Y}4$N(Am zX29-;0yV6OW1wFhFg^kR?Kj>9&pt~4i#dQbaSVh9qEQNrQq2)Vqa6N{c{OnijB-&w z8T;hPniGop>F^g%7p;M`WPl7z8Mut)-1`3=^u+u>E#is{kbytNfX@1Lzr-hHZ*4x# wdToI|K)(dJo(|)!80f7S3v0#KKk|w_BVH57Kre^i%Ypn6FkUER;5Qif1VI}hod5s; literal 0 HcmV?d00001 diff --git a/PCV_analysis_with_pharma_dataset1.Rmd b/PCV_analysis_with_pharma_dataset1.Rmd new file mode 100644 index 0000000..4c5452b --- /dev/null +++ b/PCV_analysis_with_pharma_dataset1.Rmd @@ -0,0 +1,202 @@ +--- +title: "PCV_analysis -- Dataset 1 and 2" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +## Load required R packages +```{r} +library(table1) +library(dplyr) +library(ggplot2) +#library(GGally) +library(car) +``` + +```{r} +### Load the data: +core_table <- read.csv("./Data/core_table_filled.csv") +core_table_norace <- read.csv("./Data/core_table_droprace_filled.csv") +``` + +## Check that individuals are the same across dataframes +```{r} +print(paste0("Total number of individuals: ")) +sum(core_table$count) +print("\n") +sum(core_table_norace$count) +``` +## Factor variables in the data +```{r} +## Factor categorical variable and change reference level for race (set White as the ref level) +col_names <- c("age","patient_gender_code","race_code","PCV_combined") +core_table[col_names] <- lapply(core_table[col_names], factor) +core_table$race_code <- relevel(core_table$race_code,ref="W") +core_table$PCV_combined <- relevel(core_table$PCV_combined,ref="U") +``` + +## First step is to create another columnn of no response +```{r} +### Prepare the data and do some exploration +#baseline demographic characteristics (Table 1) +data <- core_table[core_table$COVID_severity %in% c("non_severe","resp_severe"),] +data1 <- core_table[core_table$COVID_severity == "non_severe",] +data2 <- core_table[core_table$COVID_severity =="resp_severe",] +data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined")) +``` + +```{r} +## Exploratory analysis of the dataset: +print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) +print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) +print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) +print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) +``` + +## Run the first logistic regression analysis looking at outcome: severe respiratory outcome for Covid-19 versus non-severe outcomes +```{r} +m1<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined, data=data_comb, family = binomial("logit")) +#summary(m1) +m1.ci<-confint(m1) +``` + +```{r} +m1.table <- cbind(coef(m1),m1.ci) +colnames(m1.table) <- c("estimate","lower","upper") +m1.table <- exp(m1.table) +View(m1.table) +``` + +##Check collinearity among the covariates using variance inflation factor (VIF): VIF answers the question: "How well is one of my covariates x_i explained by all other covariates x jointly?". +The VIF is defined as 1/(1-R_squared); this means that for VIF>=10, this specific covariate is explained by all other covariates to a large extent (~90%). There could be a problem of collinearity. + +```{r} +#X<-data_comb[,2:10] +#ggpairs(X) +vif(m1) +``` + + +## Second part of the analysis: restrict to people with severe non-respiratory vs non-severe outcomes for Covid-19 +```{r} +### Prepare the data and do some exploration +#baseline demographic characteristics (Table 1) +data <- core_table[core_table$COVID_severity %in% c("non_severe","non_resp_severe"),] +data1 <- core_table[core_table$COVID_severity == "non_severe",] +data2 <- core_table[core_table$COVID_severity =="non_resp_severe",] +data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined")) +``` + +```{r} +## Exploratory analysis of the dataset: +print(paste0("Total number of individuals: ",sum(data$count))) +print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) +print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) +print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) +print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) +``` + +## Logistic regression +```{r} +m2<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined, data=data_comb, family = binomial("logit")) +#summary(m2) +m2.ci<-confint(m2) +``` + +##Check collinearity +```{r} +vif(m2) +``` + + +```{r} +m2.table <- cbind(coef(m2),m2.ci) +colnames(m2.table) <- c("estimate","lower","upper") +m2.table <- exp(m2.table) +View(m2.table) +``` + +## Third part of the analysis: restrict to people with ICU critical care vs non-severe outcomes for Covid-19 +```{r} +### Prepare the data and do some exploration +#baseline demographic characteristics (Table 1) +data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","ICU_crit_care"),] +data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",] +data2 <- supp_table_filled[supp_table_filled$COVID_severity =="ICU_crit_care",] +data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod")) +``` + +```{r} +## Exploratory analysis of the dataset: +print(paste0("Total number of individuals: ",sum(data$count))) +print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) +print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) +print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) +print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) +print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count))) +print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count))) +``` + +## Logistic regression +```{r} +m3<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit")) +#summary(m3) +m3.ci<-confint(m3) +``` + +```{r} +m3.table <- cbind(coef(m3),m3.ci) +colnames(m3.table) <- c("estimate","lower","upper") +m3.table <- exp(m3.table) +View(m3.table) +``` + +##Check collinearity +```{r} +vif(m3) +``` + +## Fourth part of the analysis: restrict to people with ICU critical care vs respiratory severe outcomes for Covid-19 +```{r} +### Prepare the data and do some exploration +#baseline demographic characteristics (Table 1) +data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("ICU_crit_care","resp_severe"),] +data1 <- supp_table_filled[supp_table_filled$COVID_severity == "ICU_crit_care",] +data2 <- supp_table_filled[supp_table_filled$COVID_severity =="resp_severe",] +data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod")) +``` + +```{r} +## Exploratory analysis of the dataset: +print(paste0("Total number of individuals: ",sum(data$count))) +print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) +print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) +print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) +print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) +print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count))) +print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count))) +``` + +## Logistic regression +```{r} +m4<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit")) +#summary(m4) +m4.ci<-confint(m4) +``` + +```{r} +m4.table <- cbind(coef(m4),m4.ci) +colnames(m4.table) <- c("estimate","lower","upper") +m4.table <- exp(m4.table) +View(m4.table) +``` + +##Check collinearity +```{r} +vif(m4) +``` + + diff --git a/PCV_analysis_with_pharma.Rmd b/PCV_analysis_with_pharma_dataset2.Rmd similarity index 83% rename from PCV_analysis_with_pharma.Rmd rename to PCV_analysis_with_pharma_dataset2.Rmd index b1f07d0..f10904b 100644 --- a/PCV_analysis_with_pharma.Rmd +++ b/PCV_analysis_with_pharma_dataset2.Rmd @@ -1,5 +1,5 @@ --- -title: "PCV_analysis" +title: "PCV_analysis -- Dataset 1 and 2" output: html_document --- @@ -12,15 +12,14 @@ knitr::opts_chunk$set(echo = TRUE) library(table1) library(dplyr) library(ggplot2) -library(GGally) +#library(GGally) +library(car) ``` ```{r} ### Load the data: core_table <- read.csv("./Data/core_table_filled.csv") core_table_norace <- read.csv("./Data/core_table_droprace_filled.csv") -supp_table_filled <-read.csv("~/Documents/Ottavia/Data_pharma/suppl_table_filled.csv") -supp_table_6mo_filled <- read.csv("./Data/suppl_6mo_table_filled.csv") ``` ## Check that individuals are the same across dataframes @@ -29,25 +28,6 @@ print(paste0("Total number of individuals: ")) sum(core_table$count) print("\n") sum(core_table_norace$count) -print("\n") -sum(supp_table_filled$count) -print("\n") -sum(supp_table_6mo_filled$count) -``` - -## Exploratory analysis: descriptive stats -```{r} -Tot_ind <- sum(supp_table_filled$count) -vax_by_age<-supp_table_filled %>% group_by(age,PCV_combined) %>% summarise(prop = sum(count)/Tot_ind) -vax_by_com<-supp_table_filled %>% group_by(comorbidities,PCV_combined) %>% summarise(prop = sum(count)/Tot_ind) -``` - -## Plot histograms -```{r} -ggplot(vax_by_age,aes(age,prop,fill=PCV_combined))+ - geom_bar(stat="identity",position='dodge') -ggplot(vax_by_com,aes(comorbidities,prop,fill=PCV_combined))+ - geom_bar(stat="identity",position='dodge') ``` ## Factor variables in the data ```{r} @@ -58,8 +38,6 @@ supp_table_filled$race_code <- relevel(supp_table_filled$race_code,ref="W") supp_table_filled$PCV_combined <- relevel(supp_table_filled$PCV_combined,ref="U") ``` - - ## First step is to create another columnn of no response ```{r} ### Prepare the data and do some exploration @@ -86,14 +64,6 @@ m1<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined #summary(m1) m1.ci<-confint(m1) ``` -##Check collinearity among the covariates using pair-wise correlation matrix -```{r} -X<-data_comb[,2:10] -ggpairs(X) - -``` - - ```{r} m1.table <- cbind(coef(m1),m1.ci) @@ -102,6 +72,15 @@ m1.table <- exp(m1.table) View(m1.table) ``` +##Check collinearity among the covariates using variance inflation factor (VIF): VIF answers the question: "How well is one of my covariates x_i explained by all other covariates x jointly?". +The VIF is defined as 1/(1-R_squared); this means that for VIF>=10, this specific covariate is explained by all other covariates to a large extent (~90%). There could be a problem of collinearity. + +```{r} +#X<-data_comb[,2:10] +#ggpairs(X) +vif(m1) +``` + ## Second part of the analysis: restrict to people with severe non-respiratory vs non-severe outcomes for Covid-19 ```{r} @@ -124,13 +103,19 @@ print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$c print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count))) ``` -## Run the first logistic regression analysis looking at outcome: severe respiratory outcome for Covid-19 versus non-severe respiratory outcome +## Logistic regression ```{r} m2<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit")) #summary(m2) m2.ci<-confint(m2) ``` +##Check collinearity +```{r} +vif(m2) +``` + + ```{r} m2.table <- cbind(coef(m2),m2.ci) colnames(m2.table) <- c("estimate","lower","upper") @@ -138,7 +123,7 @@ m2.table <- exp(m2.table) View(m2.table) ``` -## Second part of the analysis: restrict to people with severe non-respiratory vs non-severe outcomes for Covid-19 +## Third part of the analysis: restrict to people with ICU critical care vs non-severe outcomes for Covid-19 ```{r} ### Prepare the data and do some exploration #baseline demographic characteristics (Table 1) @@ -159,7 +144,7 @@ print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$c print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count))) ``` -## Run the first logistic regression analysis looking at outcome: severe respiratory outcome for Covid-19 versus non-severe respiratory outcome +## Logistic regression ```{r} m3<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit")) #summary(m3) @@ -173,7 +158,12 @@ m3.table <- exp(m3.table) View(m3.table) ``` -## Fourth part of the analysis: restrict to people with severe non-respiratory vs non-severe outcomes for Covid-19 +##Check collinearity +```{r} +vif(m3) +``` + +## Fourth part of the analysis: restrict to people with ICU critical care vs respiratory severe outcomes for Covid-19 ```{r} ### Prepare the data and do some exploration #baseline demographic characteristics (Table 1) @@ -194,7 +184,7 @@ print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$c print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count))) ``` -## Run the first logistic regression analysis looking at outcome: severe respiratory outcome for Covid-19 versus non-severe respiratory outcome +## Logistic regression ```{r} m4<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit")) #summary(m4) @@ -208,5 +198,9 @@ m4.table <- exp(m4.table) View(m4.table) ``` +##Check collinearity +```{r} +vif(m4) +``` diff --git a/PCV_analysis_with_pharma_dataset3.Rmd b/PCV_analysis_with_pharma_dataset3.Rmd new file mode 100644 index 0000000..f10904b --- /dev/null +++ b/PCV_analysis_with_pharma_dataset3.Rmd @@ -0,0 +1,206 @@ +--- +title: "PCV_analysis -- Dataset 1 and 2" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +## Load required R packages +```{r} +library(table1) +library(dplyr) +library(ggplot2) +#library(GGally) +library(car) +``` + +```{r} +### Load the data: +core_table <- read.csv("./Data/core_table_filled.csv") +core_table_norace <- read.csv("./Data/core_table_droprace_filled.csv") +``` + +## Check that individuals are the same across dataframes +```{r} +print(paste0("Total number of individuals: ")) +sum(core_table$count) +print("\n") +sum(core_table_norace$count) +``` +## Factor variables in the data +```{r} +## Factor categorical variable and change reference level for race (set White as the ref level) +col_names <- c("age","patient_gender_code","race_code","income_est_mod","PCV_combined") +supp_table_filled[col_names] <- lapply(supp_table_filled[col_names], factor) +supp_table_filled$race_code <- relevel(supp_table_filled$race_code,ref="W") +supp_table_filled$PCV_combined <- relevel(supp_table_filled$PCV_combined,ref="U") +``` + +## First step is to create another columnn of no response +```{r} +### Prepare the data and do some exploration +#baseline demographic characteristics (Table 1) +data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","resp_severe"),] +data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",] +data2 <- supp_table_filled[supp_table_filled$COVID_severity =="resp_severe",] +data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod")) +``` + +```{r} +## Exploratory analysis of the dataset: +print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) +print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) +print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) +print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) +print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count))) +print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count))) +``` + +## Run the first logistic regression analysis looking at outcome: severe respiratory outcome for Covid-19 versus non-severe outcomes +```{r} +m1<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit")) +#summary(m1) +m1.ci<-confint(m1) +``` + +```{r} +m1.table <- cbind(coef(m1),m1.ci) +colnames(m1.table) <- c("estimate","lower","upper") +m1.table <- exp(m1.table) +View(m1.table) +``` + +##Check collinearity among the covariates using variance inflation factor (VIF): VIF answers the question: "How well is one of my covariates x_i explained by all other covariates x jointly?". +The VIF is defined as 1/(1-R_squared); this means that for VIF>=10, this specific covariate is explained by all other covariates to a large extent (~90%). There could be a problem of collinearity. + +```{r} +#X<-data_comb[,2:10] +#ggpairs(X) +vif(m1) +``` + + +## Second part of the analysis: restrict to people with severe non-respiratory vs non-severe outcomes for Covid-19 +```{r} +### Prepare the data and do some exploration +#baseline demographic characteristics (Table 1) +data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","non_resp_severe"),] +data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",] +data2 <- supp_table_filled[supp_table_filled$COVID_severity =="non_resp_severe",] +data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod")) +``` + +```{r} +## Exploratory analysis of the dataset: +print(paste0("Total number of individuals: ",sum(data$count))) +print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) +print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) +print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) +print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) +print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count))) +print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count))) +``` + +## Logistic regression +```{r} +m2<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit")) +#summary(m2) +m2.ci<-confint(m2) +``` + +##Check collinearity +```{r} +vif(m2) +``` + + +```{r} +m2.table <- cbind(coef(m2),m2.ci) +colnames(m2.table) <- c("estimate","lower","upper") +m2.table <- exp(m2.table) +View(m2.table) +``` + +## Third part of the analysis: restrict to people with ICU critical care vs non-severe outcomes for Covid-19 +```{r} +### Prepare the data and do some exploration +#baseline demographic characteristics (Table 1) +data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("non_severe","ICU_crit_care"),] +data1 <- supp_table_filled[supp_table_filled$COVID_severity == "non_severe",] +data2 <- supp_table_filled[supp_table_filled$COVID_severity =="ICU_crit_care",] +data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod")) +``` + +```{r} +## Exploratory analysis of the dataset: +print(paste0("Total number of individuals: ",sum(data$count))) +print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) +print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) +print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) +print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) +print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count))) +print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count))) +``` + +## Logistic regression +```{r} +m3<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit")) +#summary(m3) +m3.ci<-confint(m3) +``` + +```{r} +m3.table <- cbind(coef(m3),m3.ci) +colnames(m3.table) <- c("estimate","lower","upper") +m3.table <- exp(m3.table) +View(m3.table) +``` + +##Check collinearity +```{r} +vif(m3) +``` + +## Fourth part of the analysis: restrict to people with ICU critical care vs respiratory severe outcomes for Covid-19 +```{r} +### Prepare the data and do some exploration +#baseline demographic characteristics (Table 1) +data <- supp_table_filled[supp_table_filled$COVID_severity %in% c("ICU_crit_care","resp_severe"),] +data1 <- supp_table_filled[supp_table_filled$COVID_severity == "ICU_crit_care",] +data2 <- supp_table_filled[supp_table_filled$COVID_severity =="resp_severe",] +data_comb<-merge(data1,data2,by=c("year_month","age","patient_gender_code","race_code","PCV_combined","flu_vacc","zoster_vacc","bmi_30_plus","comorbidities","income_est_mod")) +``` + +```{r} +## Exploratory analysis of the dataset: +print(paste0("Total number of individuals: ",sum(data$count))) +print(paste0("PCV13 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_only"])/sum(data$count))) +print(paste0("PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PPSV23_10yrs"])/sum(data$count))) +print(paste0("PCV13 and PPSV23 vaccine coverage: ",sum(data$count[data$PCV_combined=="PCV13_PPSV23_10yrs"])/sum(data$count))) +print(paste0("Unknown PCV vaccine coverage: ",sum(data$count[data$PCV_combined=="U"])/sum(data$count))) +print(paste0("Flu vaccine: ",sum(data$count[data$"flu_vacc"=="TRUE"])/sum(data$count))) +print(paste0("Zooster vaccine: ",sum(data$count[data$"zoster_vacc"=="TRUE"])/sum(data$count))) +``` + +## Logistic regression +```{r} +m4<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+race_code+PCV_combined+flu_vacc+zoster_vacc+bmi_30_plus+comorbidities+income_est_mod, data=data_comb, family = binomial("logit")) +#summary(m4) +m4.ci<-confint(m4) +``` + +```{r} +m4.table <- cbind(coef(m4),m4.ci) +colnames(m4.table) <- c("estimate","lower","upper") +m4.table <- exp(m4.table) +View(m4.table) +``` + +##Check collinearity +```{r} +vif(m4) +``` + + diff --git a/PCV_analysis_with_pharma_exploratory.Rmd b/PCV_analysis_with_pharma_exploratory.Rmd new file mode 100644 index 0000000..cc50b2b --- /dev/null +++ b/PCV_analysis_with_pharma_exploratory.Rmd @@ -0,0 +1,50 @@ +--- +title: "PCV_analysis" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +## Load required R packages +```{r} +library(table1) +library(dplyr) +library(ggplot2) +``` + +```{r} +### Load the data: +core_table <- read.csv("./Data/core_table_filled.csv") +core_table_norace <- read.csv("./Data/core_table_droprace_filled.csv") +supp_table_filled <-read.csv("~/Documents/Ottavia/Data_pharma/suppl_table_filled.csv") +supp_table_6mo_filled <- read.csv("./Data/suppl_6mo_table_filled.csv") +``` + +## Check that individuals are the same across dataframes +```{r} +print(paste0("Total number of individuals: ")) +sum(core_table$count) +print("\n") +sum(core_table_norace$count) +print("\n") +sum(supp_table_filled$count) +print("\n") +sum(supp_table_6mo_filled$count) +``` + +## Exploratory analysis: descriptive stats +```{r} +Tot_ind <- sum(supp_table_filled$count) +vax_by_age<-supp_table_filled %>% group_by(age,PCV_combined) %>% summarise(prop = sum(count)/Tot_ind) +vax_by_com<-supp_table_filled %>% group_by(comorbidities,PCV_combined) %>% summarise(prop = sum(count)/Tot_ind) +``` + +## Plot histograms +```{r} +ggplot(vax_by_age,aes(age,prop,fill=PCV_combined))+ + geom_bar(stat="identity",position='dodge') +ggplot(vax_by_com,aes(comorbidities,prop,fill=PCV_combined))+ + geom_bar(stat="identity",position='dodge') +``` diff --git a/Example_logistic_reg_agg_data.R b/R/Example_logistic_reg_agg_data.R similarity index 100% rename from Example_logistic_reg_agg_data.R rename to R/Example_logistic_reg_agg_data.R diff --git a/Tests.R b/R/Tests.R similarity index 100% rename from Tests.R rename to R/Tests.R diff --git a/R/functions.R b/R/functions.R new file mode 100644 index 0000000..1b7f9b0 --- /dev/null +++ b/R/functions.R @@ -0,0 +1,24 @@ +run_model <- function(df,list_cov,list_out) + {## Convert columns of list_cov to factors + colnames <- list_cov + df[col_names] <- lapply(df[col_names], factor) + df$race_code <- relevel(df$race_code,ref="W") + df$PCV_combined <- relevel(df$PCV_combined,ref="U") + ## Create df of response and non-responses to be fed to the logistic regression + df <- df[df$COVID_severity %in% list_out,] + data1 <- df[df$COVID_severity == list_out[1],] + data2 <- df[df$COVID_severity ==list_out[2],] + data_comb<-merge(data1,data2,by=c("year_month",list_cov)) + ## Run logistic regression model + data_comb$COVID_severity.x<-NULL + data_comb$COVID_severity.y<-NULL + m<-glm(cbind(count.x, count.y) ~ age+patient_gender_code+PCV_combined+. -year_month, data=data_comb, family = binomial("logit")) + #summary(m1) + m.ci<-confint(m) + m.table <- cbind(coef(m),m.ci) + colnames(m.table) <- c("estimate","lower","upper") + m.table <- exp(m.table) + m.vif <- vif(m) + result <- list("m.table"=m.table,"vif"=m.vif) + return(result) +} \ No newline at end of file