-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathChapter_4_SampleStatistics.Rmd
222 lines (169 loc) · 5.41 KB
/
Chapter_4_SampleStatistics.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
---
title: "Block4: sample statistics"
output: github_document
---
```{r}
library(tidyverse)
library(here)
library(e1071)
library(skimr)
library(lubridate)
library(forcats)
library(ggplot2)
library(infer)
# Read the sales table stored under the project root.
sales <- here::here("data/sales.rds") %>%
  readRDS()

# Mean resale price over the whole table, kept in a one-row table with a
# single `stat` column.
mu <- summarize(sales, stat = mean(resale_price))
mu
```
```{r}
# Mean resale price of a single random draw of 100 rows.
summarize(
  sample_n(sales, 100),
  stat = mean(resale_price)
)
```
```{r}
# Draw `size` rows at random from `data` and return a one-row table whose
# `stat` column holds the mean of `resale_price` within that draw.
#
# data: a table containing a `resale_price` column.
# size: number of rows to draw (default 500).
sample_mean <- function(data, size = 500) {
  drawn_rows <- sample_n(data, size)
  summarize(drawn_rows, stat = mean(resale_price))
}
# Repeat the default 500-row sampling 500 times and stack the one-row results
# into a single table of sample means. `map()` over a counter replaces purrr's
# deprecated `rerun()`; the counter itself is not used.
sampled_means <- map(seq_len(500), function(i) sample_mean(sales)) %>%
  bind_rows()

# Distribution of the sample means, with the mean of the full table drawn as a
# reference line. `bins = 30` is the ggplot2 default, stated explicitly.
sampled_means %>%
  ggplot(aes(x = stat)) +
  geom_histogram(bins = 30) +
  geom_vline(xintercept = mu$stat)
```
```{r}
# Draw one sample of 500 rows from the complete sales table.
sales_sample <- sample_n(sales, 500)

# Mean resale price of that sample, computed with dplyr.
x_bar <- summarize(sales_sample, stat = mean(resale_price))
x_bar

# The same statistic, obtained through the infer workflow instead.
calculate(
  specify(sales_sample, response = resale_price),
  stat = "mean"
)
```
```{r}
# Bootstrapping: generate 500 resamples of the sample and compute the mean
# resale price of each one. The resampling type is stated explicitly rather
# than left for `generate()` to infer.
bootstrapped_resale_price <- sales_sample %>%
  specify(response = resale_price) %>%
  generate(reps = 500, type = "bootstrap") %>%
  calculate(stat = "mean")

# Plot the bootstrap distribution and mark the mean of the original sample.
bootstrapped_resale_price %>%
  visualise() +
  geom_vline(xintercept = x_bar$stat)
```
```{r}
# Confidence interval for the mean, taken from the bootstrap distribution
# with get_ci()'s default settings.
get_ci(bootstrapped_resale_price)
```
```{r}
#Exercise 1: Using RStudio’s help panel, find out how to use the standard error method with get_ci. Do you find the same values?
#Exercise 2: Try changing the value of your confidence level (by default, get_ci uses 0.95). What do you observe as you ask for higher confidence levels?
#Exercise 3: Repeat the previous steps using a different value for n, the size of our original sample. How does the new confidence interval compare to the previous one?
```
```{r}
# Confidence intervals for subsets:
#x_bar_mp_all <- sales %>%
# filter(town == "MARINE PARADE") %>%
# summarize(stat = mean(resale_price))
#x_bar_mp_all
```
```{r}
# All Marine Parade transactions.
marine_parade_sales <- sales %>%
  # filter(floor_area_sqm == 67) %>%
  filter(town == "MARINE PARADE")

# Open the data viewer only in an interactive session, so that knitting the
# document does not depend on a viewer being available.
if (interactive()) {
  View(marine_parade_sales)
}

# Sample of 100 Marine Parade transactions. A larger sample without
# replacement was rejected by dplyr with:
#   `size` must be less or equal than 485 (size of data),
#   set `replace` = TRUE to use sampling with replacement.
sales_sample_mp <- marine_parade_sales %>%
  sample_n(100)

# Mean resale price of the Marine Parade sample.
x_bar_mp <- sales_sample_mp %>%
  summarize(stat = mean(resale_price))
x_bar_mp

# Bootstrap distribution of the mean. The sample already contains only
# Marine Parade rows, so no further filtering is needed.
bootstrapped_resale_price_mp <- sales_sample_mp %>%
  specify(response = resale_price) %>%
  generate(reps = 500, type = "bootstrap") %>%
  calculate(stat = "mean")

# Mark the mean of the Marine Parade sample (not the overall sample mean) on
# the bootstrap distribution.
bootstrapped_resale_price_mp %>%
  visualise() +
  geom_vline(xintercept = x_bar_mp$stat)

# Confidence interval for the Marine Parade mean.
bootstrapped_resale_price_mp %>%
  get_ci()
```
```{r}
# 4.4 Comparing means between groups
town_means <- sales %>%
filter(town == "QUEENSTOWN" | town == "MARINE PARADE") %>%
group_by(town) %>%
summarise(mean = mean(resale_price))
town_means
```
```{r}
# Density-scaled histograms of resale price for the two towns, dodged side by
# side, with each town's mean drawn as a vertical line.
# `after_stat(density)` replaces the deprecated `..density..` notation.
ggplot() +
  geom_histogram(
    data = sales %>%
      filter(town == "QUEENSTOWN" | town == "MARINE PARADE"),
    mapping = aes(
      x = resale_price,
      y = after_stat(density),
      group = town,
      fill = town
    ),
    position = position_dodge()
  ) +
  geom_vline(
    data = town_means,
    mapping = aes(xintercept = mean, group = town, color = town)
  )
```
```{r}
# Bootstrap distribution of the mean resale price in Queenstown, built from
# 100 resamples of all Queenstown rows. The resampling type is stated
# explicitly rather than left for `generate()` to infer.
bootstrapped_resale_price_qt <- sales %>%
  filter(town == "QUEENSTOWN") %>%
  specify(response = resale_price) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "mean")

# Confidence interval for the Queenstown mean.
bootstrapped_resale_price_qt %>%
  get_ci()
```
```{r}
# Test the significance of the difference more formally with infer.
# Observed difference in mean resale price between the two towns, with the
# towns ordered as QUEENSTOWN, MARINE PARADE.
mean_diff <- sales %>%
  filter(town %in% c("MARINE PARADE", "QUEENSTOWN")) %>%
  specify(resale_price ~ town) %>%
  calculate(
    stat = "diff in means",
    order = c("QUEENSTOWN", "MARINE PARADE")
  )
mean_diff
```
```{r}
# We then generate a set of sampled datasets. Instead of a simple bootstrap,
# each sample is generated under the null hypothesis, namely that there is no
# difference between the means of the two towns.
null_distribution <- sales %>%
  filter(town %in% c("MARINE PARADE", "QUEENSTOWN")) %>%
  specify(resale_price ~ town) %>%
  hypothesize(null = "independence") %>%
  generate(type = "permute", reps = 500) %>%
  calculate(
    stat = "diff in means",
    order = c("QUEENSTOWN", "MARINE PARADE")
  )

# Plot the null distribution and shade the region beyond the observed
# difference in the "greater" direction.
visualise(null_distribution, bins = 100) +
  shade_p_value(obs_stat = mean_diff, direction = "greater")
```
```{r}
# p-value of the observed difference against the null distribution, in the
# "greater" direction.
get_pvalue(
  null_distribution,
  obs_stat = mean_diff,
  direction = "greater"
)
```
```{r}
# Confidence interval for the difference in mean resale price between the two
# towns, from 500 bootstrap resamples (no null hypothesis is declared).
sales %>%
  filter(town %in% c("MARINE PARADE", "QUEENSTOWN")) %>%
  specify(resale_price ~ town) %>%
  # hypothesize(null = "independence") %>% # disable the null hypothesis
  generate(type = "bootstrap", reps = 500) %>% # set type to bootstrap
  calculate(
    stat = "diff in means",
    order = c("QUEENSTOWN", "MARINE PARADE")
  ) %>%
  get_ci()
```
## Exercise 4: In the RStudio help, find out what the `direction` argument does in the `get_p_value` and `shade_p_value` functions. Try setting it to `"both"` on the data above.
```{r}
```
## Exercise 5: Are the mean resale prices of Marine Parade and Bukit Timah significantly different?
```{r}
```
## Exercise 6: During the previous week, we saw that the price of a flat can be quite different depending on what storey the flat is located on. Use the same procedure to see if the average price of 3 room flats on storey 04-06 is significantly different from 3 room flats on storey 07-09.
```{r}
```