forked from harris-coding-lab/harris-coding-lab.github.io
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path07_for_loops.R
194 lines (135 loc) · 3.61 KB
/
07_for_loops.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
## For-loops and Iterations
library(tidyverse)
library(readxl)
# Print each element of a numeric vector, one per iteration.
for (value in c(1, 2, 3, 4, 5)) {
  print(value)
}

# simple for-loop
# The loop variable `x` takes each element of the vector in turn.
for (x in c(3, 6, 9)) {
  print(x)
}

# simple for-loop: what is going on?
# The loop above is shorthand for these repeated assign-then-print steps.
x <- 3
print(x)

x <- 6
print(x)

x <- 9
print(x)
## basic structure of for-loop
# (templates only -- kept as comments because they are not runnable R)
# for (value in list_of_values) {
#   do something (based on value)
# }
# for (index in list_of_indices) {
#   do something (based on index)
# }
# example: find sample means
# Manual version: one line per sample size, each drawing standard-normal
# values and averaging them.
mean1 <- mean(rnorm(n = 5))
mean2 <- mean(rnorm(n = 10))
mean3 <- mean(rnorm(n = 15))
mean4 <- mean(rnorm(n = 20))
mean5 <- mean(rnorm(n = 25000))
means <- c(mean1, mean2, mean3, mean4, mean5)
means

# avoid repeating by using a for-loop
# The sample sizes live in one vector; the loop fills a pre-allocated
# result vector of the same length, position by position.
sample_sizes <- c(5, 10, 15, 20, 25000)
sample_means <- numeric(length(sample_sizes))
for (i in seq_along(sample_sizes)) {
  sample_means[i] <- mean(rnorm(n = sample_sizes[i]))
}
sample_means
## finding sample means, broken down

# determine what to loop over
sample_sizes <- c(5, 10, 15, 20, 25000)

# pre-allocate space to store output
sample_means <- rep(0, times = length(sample_sizes))

# determine what 'sample_means' currently looks like
# (a vector of zeros, one slot per sample size)
sample_means <- rep(0, times = length(sample_sizes))
sample_means

# alternative ways to pre-allocate space
# Both forms below create a length-5 double vector filled with zeros.
sample_means <- vector(mode = "double", length = 5)
sample_means <- double(length = 5)

# using lists
# A list can hold arbitrary objects; every slot starts out as NULL.
data_list <- vector(mode = "list", length = 5)

## adding data to a vector, broken down
# Skeleton of the loop: `i` runs over the positions of `sample_sizes`.
# Note that `1:length(x)` yields c(1, 0) when `x` is empty; `seq_along()`
# does not have that problem.
for (i in 1:length(sample_sizes)) {
  # loop body goes here
}
# 'seq_along' helper function
# For a non-empty vector, both expressions give the positions 1, 2, 3.
vec <- c("x", "y", "z")
1:length(vec)
seq_along(vec)

# Positions of the sample-size vector: 1 through 5.
sample_sizes <- c(5, 10, 15, 20, 25000)
seq_along(sample_sizes)

# Loop skeleton: iterate over positions, with the output pre-allocated.
sample_sizes <- c(5, 10, 15, 20, 25000)
sample_means <- rep(0, times = length(sample_sizes))
for (i in seq_along(sample_sizes)) {
  # loop body goes here
}

# Complete loop: store the mean of each random sample at position `i`.
sample_sizes <- c(5, 10, 15, 20, 25000)
sample_means <- numeric(length(sample_sizes))
for (i in seq_along(sample_sizes)) {
  sample_means[i] <- mean(rnorm(n = sample_sizes[i]))
}
sample_means
## common error
# The loop body computes each mean but never assigns it, so nothing is
# stored: `sample_means` still holds its initial zeros after the loop.
sample_sizes <- c(5, 10, 15, 20, 25000)
sample_means <- numeric(length(sample_sizes))
for (i in seq_along(sample_sizes)) {
  mean(rnorm(n = sample_sizes[i]))
}
sample_means
## reading data into R and storing as single data set
# Repetitive approach: one read_csv() call per yearly file, then stack all
# of the results with bind_rows().
# NOTE: this is an abbreviated sketch rather than runnable code -- the bare
# `...` below (and inside the bind_rows() call) is a placeholder for the
# omitted file_3 through file_21 statements and is not meant to be executed.
# File names are relative, so they are resolved against the directory that
# setwd() switches to.
setwd("../data/loops")
file_1 <- read_csv("data_1999.csv")
file_2 <- read_csv("data_2000.csv")
...
file_22 <- read_csv("data_2020.csv")
data <- bind_rows(file_1, file_2, ..., file_22)
# fake data used for this exercise
# Writes one simulated CSV per year, 1999 through 2020, into the working
# directory. Each file has 100 rows: an id, an `employed` flag drawn from
# c(0, 1, 1, 1) and a `happy` flag drawn from c(0, 1), both with replacement.
setwd("../data/loops")
file_list <- paste0("data_", 1999:2020, ".csv")
for (file in file_list) {
  data <- tibble(
    id = seq_len(100),
    employed = sample(c(0, 1, 1, 1), size = 100, replace = TRUE),
    happy = sample(c(0, 1), size = 100, replace = TRUE)
  )
  write_csv(data, file)
}
# 'bind_rows' function
# Stack two tibbles that share the same column names: a one-row tibble
# followed by a two-row tibble gives a three-row result.
df_1 <- tibble(col1 = 1, col2 = "A")
df_2 <- tibble(col1 = 2:3, col2 = c("B", "C"))
df_1 %>%
  bind_rows(df_2)
# 'list.files' function
# Returns the names of the files in a directory. `pattern` is a regular
# expression, not a shell glob, so the dot is escaped and the match is
# anchored at the end: only names ending in ".csv" are listed.
list.files("../data/loops", pattern = "\\.csv$")
# loop to read data
# Collect the CSV files in the working directory. `pattern` is a regular
# expression (not a glob): "\\.csv$" matches names ending in ".csv".
file_names <- list.files(pattern = "\\.csv$")

# Pre-allocate one list slot per file.
output <- vector("list", length(file_names))

# Read each file and tag its rows with the first four-digit run found in
# the file name, stored in a new `year` column.
for (i in seq_along(file_names)) {
  output[[i]] <- read_csv(file_names[[i]]) %>%
    mutate(year = str_extract(file_names[[i]], "[0-9]{4}"))
}

# Stack the per-file tibbles into a single data set and open it in the viewer.
data <- bind_rows(output)
View(data)
# alternative loop to read data
setwd("../data/loops")

# by default, reads files in working directory
# `pattern` is a regular expression (not a glob): "\\.csv$" matches names
# ending in ".csv".
file_list <- list.files(pattern = "\\.csv$")

# Start from an empty tibble and append each file's rows as it is read.
# This loops over the file names directly instead of over indices; the
# trade-off is that `out` grows on every pass, which re-copies the rows
# accumulated so far.
out <- tibble()
for (file in file_list) {
  temp <- read_csv(file)
  out <- bind_rows(out, temp)
}
nrow(out)
# Review: Bad example of loop
# Element-by-element addition written out as an explicit loop over positions.
a <- 7:11
b <- 8:12
out <- integer(length(a))
for (i in seq_along(a)) {
  out[i] <- a[i] + b[i]
}
out

# Review: Better alternative is vectorized operations
# `+` already works element-wise on whole vectors, so no loop is needed.
a <- 7:11
b <- 8:12
out <- a + b
out