# File: 00_introduction.R
# Forked from harris-coding-lab/harris-coding-lab.github.io
# Coding Lab Week 0
# Introduction
# load the Tidyverse library
library(tidyverse)
# Download the CDC data set (view id qfhf-uhaa) as CSV and give the columns
# used later in this script short snake_case names.
# The `Suppress` column is forced to character instead of being type-guessed.
# NOTE(review): the query string ends in "%20target=", which looks like a
# copy-paste leftover from an HTML link -- confirm before changing the URL.
covid_data <- read_csv(
  file = paste0(
    "https://data.cdc.gov/api/views/qfhf-uhaa/rows.csv?",
    "accessType=DOWNLOAD&bom=true&format=true%20target="
  ),
  col_types = cols(Suppress = col_character())
) %>%
  rename(
    week = `Week Ending Date`,
    time_period = `Time Period`,
    race_ethnicity = `Race/Ethnicity`,
    deaths = `Number of Deaths`,
    diff = `Difference from 2015-2019 to 2020`,
    state = `State Abbreviation`
  )
# Plot, for four race/ethnicity groups, the weekly percentage of deaths above
# the expected number of deaths for MMWR weeks 1-30 of 2020.
covid_data %>%
  # `week` is parsed with mdy(), i.e. as month/day/year text
  mutate(week = lubridate::mdy(week)) %>%
  filter(
    race_ethnicity %in% c(
      "Hispanic", "Non-Hispanic White", "Non-Hispanic Black",
      "Non-Hispanic Asian"
    )
  ) %>%
  filter(time_period == "2020", MMWRWeek <= 30) %>%
  group_by(race_ethnicity, week) %>%
  summarize(
    actual_deaths = sum(deaths, na.rm = TRUE),
    diff_deaths = sum(diff, na.rm = TRUE),
    # expected deaths = actual deaths minus the reported difference
    expected_deaths = actual_deaths - diff_deaths,
    perc_above_expected = 100 * diff_deaths / expected_deaths,
    # return an ungrouped tibble; without this the result stays grouped by
    # race_ethnicity and dplyr prints a regrouping message
    .groups = "drop"
  ) %>%
  ggplot(aes(x = week, color = race_ethnicity)) +
  geom_line(aes(y = perc_above_expected)) +
  theme_minimal() +
  labs(
    y = "Percent of deaths above expected deaths\n(weekly mean 2015-2019 for given week)",
    x = "",
    title = "Racial disparities of Covid-19, United States 2020",
    subtitle = "Data source: CDC",
    color = ""
  ) +
  theme(legend.position = "bottom")
# store the value 4 under the name my_number
my_number <- 4
# typing a name on its own line prints the value it holds
my_number
# assigning again replaces the old value
my_number <- 2
# use my_number in a calculation and keep the result in my_output
my_output <- sqrt(12 * my_number + 1)
# fit a linear regression of mpg on disp, cyl and hp using the built-in
# mtcars data, and keep the fitted model in model_fit
model_fit <- lm(formula = mpg ~ disp + cyl + hp, data = mtcars)
# print a summary of the fitted model
summary(model_fit)
# sqrt() is a function: it returns the square root of its argument
sqrt(4)
# median() takes a vector of numbers; c() builds that vector
median(c(3, 4, 5, 6, 7))
# define a new function called f
# arguments: x and y
# result:    two times x, plus y
f <- function(x, y) {
  doubled_x <- 2 * x
  doubled_x + y
}
# call f with x = 7 and y = 0
# arguments are matched by position here, so the result is 2 * 7 + 0 = 14
f(7, 0)
# call f with x = 0 and y = 7
# arguments are matched by name here, so their order does not matter;
# the result is 2 * 0 + 7 = 7
f(y = 7, x = 0)
# read the documentation of the sum function
# (putting ? in front of a function name opens its help page)
?sum
# try calculating a few different sums
# five separate numbers
sum(1, 2, 3, 4, 5)
# a missing value (NA) among the inputs makes the whole sum NA
sum(1:5, NA)
# na.rm = TRUE removes missing values before adding
sum(1:5, NA, na.rm = TRUE)
# try to figure out what 1:5 does
# (run it and look at what gets printed)
1:5
# install the readxl package
# typically, you should put this in the console (at the bottom of the window)
# instead of a script like this (at the top of the window)
#
# a script is meant to be used for code that you might want to run more than once
# but you only need to install a package one time and it'll stay on the computer
#
# the check below installs readxl only when it is not already available,
# so re-running this script does not reinstall the package
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
# load the package in the script so R knows to use it every time you restart
library(readxl)
# call a function defined within the readxl package after loading the package
# read_excel() detects whether the file is .xls or .xlsx; read_xlsx() only
# reads the .xlsx format and so does not match a ".xls" file
read_excel("some_data.xls")
# call a function defined within the readxl package *without* loading the package
# (the package::function form works even if library(readxl) was never run)
readxl::read_excel("some_data.xls")
# try this:
# install haven only when it is missing, so re-running the script does not
# reinstall it; requireNamespace() checks for the package without attaching
# it, so read_dta() is still not visible after this step
if (!requireNamespace("haven", quietly = TRUE)) {
  install.packages("haven")
}
# this call is meant to fail if haven has not been loaded in this session:
# installing a package does not load it
our_data <- read_dta("my_file.dta")
# what is the error message telling you?
# you need to load the package before using its functions:
library(haven)
our_data <- read_dta("my_file.dta")
# one of the most useful packages we'll use is the Tidyverse:
# make sure you have this installed
library(tidyverse)