setwd("D:\\study\\DataScience")
#1.basics
#(1)likelihood ratios: P(D|+)/P(D_c|+) = [P(+|D)/P(+|D_c)] * [P(D)/P(D_c)]
#(2)Some uses for the Poisson Distribution:
#-Modeling count data,
#-Modeling event-time or survival data (censoring)
#-Modeling contingency tables
#-Approximating binomials when n is large and p is small (lambda = n*p)
pbinom(2,size=1000,prob=0.0001)
ppois(2,lambda=1000*0.0001)
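#a small worked example of the likelihood ratio identity in (1), with hypothetical test
#characteristics (my numbers, not from the notes): sensitivity P(+|D) = 0.997,
#specificity P(-|D_c) = 0.985, prevalence P(D) = 0.001
sens <- 0.997; spec <- 0.985; prev <- 0.001
dlr_pos <- sens / (1 - spec) #diagnostic likelihood ratio of a positive test, P(+|D)/P(+|D_c)
post_odds <- dlr_pos * prev / (1 - prev) #posterior odds = DLR+ * prior odds
post_odds / (1 + post_odds) #P(D|+) is only about 0.062 despite the accurate test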
#(3)law of large numbers
n<-1000
means<-cumsum(rnorm(n))/(1:n) #sample mean of standard normals converges to the true mean 0
plot(1:n,means)
abline(h=0)
#law of large numbers for coin flips; the sample mean converges to the true mean 0.5
means<-cumsum(sample(0:1,n,replace=TRUE))/(1:n)
plot(1:n,means)
abline(h=0.5)
#(4) central limit theorem -- the distribution of the average of iid variables (properly normalized)
#approaches that of a standard normal as the sample size increases
0.56+c(-1,1)*qnorm(0.975)*sqrt(0.56*0.44/100) #Wald interval for p_hat = 0.56 with n = 100
binom.test(56,100)$conf.int
#binom.test(x, n, p = 0.5, alternative = c("two.sided", "less", "greater"), conf.level = 0.95)
#This guarantees that the confidence level is at least conf.level, but in general does not give the shortest-length confidence intervals.
#conf.int:a confidence interval for the probability of success.
binom.test(c(682, 243), p = 3/4)$conf.int
binom.test(682, 682 + 243, p = 3/4) # The same.
#For binomial intervals, the Agresti/Coull interval should generally be used instead of the Wald interval
#simulation
n<-20
pvals<-seq(0.1,0.9,by=0.05)
nosim<-1000
#wald interval
coverage<-sapply(pvals,function(p){
phats<-rbinom(nosim,prob=p,size=n)/n #simulate 1000 sample proportions, each from n=20 coin flips
ll<-phats-qnorm(0.975)*sqrt(phats*(1-phats)/n)
ul<-phats+qnorm(0.975)*sqrt(phats*(1-phats)/n)
mean(ll<p & ul>p) #proportion of intervals that contain the true p
})
plot(pvals,coverage,type="l")
abline(h=0.95)
#when the true value of p is near 0.5, the coverage is actually a little better than 95%.
#when p is away from 0.5, the nominal 95% confidence interval gives coverage below 95%.
#it's simply because the CLT isn't accurate enough when n is only 20 and p is near 0 or 1
#Quick fix 1:
n<-2000
coverage_largern<-sapply(pvals,function(p){
phats<-rbinom(nosim,prob=p,size=n)/n #simulate 1000 sample proportions, each from n=2000 coin flips
ll<-phats-qnorm(0.975)*sqrt(phats*(1-phats)/n)
ul<-phats+qnorm(0.975)*sqrt(phats*(1-phats)/n)
mean(ll<p & ul>p) #proportion of intervals that contain the true p
})
plot(pvals,coverage_largern,type="l")
abline(h=0.95)
#Quick fix 2:
#form the interval with (X+2)/(n+4) (add two successes and two failures: the Agresti/Coull interval)
n<-20
coverage_AC<-sapply(pvals,function(p){
phats<-(rbinom(nosim,prob=p,size=n)+2)/(n+4)
ll<-phats-qnorm(0.975)*sqrt(phats*(1-phats)/n)
ul<-phats+qnorm(0.975)*sqrt(phats*(1-phats)/n)
mean(ll<p & ul>p) #proportion of intervals that contain the true p
})
plot(pvals,coverage_AC,type="l")
abline(h=0.95)
#so in practice, use the Agresti/Coull interval rather than the Wald interval for binomial data
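#for comparison, a direct Agresti/Coull interval for the earlier 56/100 example (a quick sketch;
#note the textbook form uses n+4 in the standard error, unlike the simulation shortcut above)
ptilde <- (56 + 2) / (100 + 4)
ptilde + c(-1, 1) * qnorm(0.975) * sqrt(ptilde * (1 - ptilde) / (100 + 4))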
#Poisson interval
#X~Poisson(lambda*t)
#estimate lambda = X/t (X is the observed event count, t the total monitoring time)
#Variance estimate: lambda/t.
x<-5
t<-94.32
lambda<-x/t #lambda is the expected count per unit of time
round(lambda+c(-1,1)*qnorm(0.975)*sqrt(lambda/t),3)
poisson.test(x,T=t)$conf
poisson.test(x,T=t)$p.value
lambdavals<-seq(0.005,0.1,by=0.01)
nosim<-1000
t<-100 #t is total monitoring time, so if t is larger, more events would happen, and inference would be more accurate
coverage<-sapply(lambdavals,function(lambda){
lhats<-rpois(nosim,lambda=lambda*t)/t
ll<-lhats-qnorm(0.975)*sqrt(lhats/t)
ul<-lhats+qnorm(0.975)*sqrt(lhats/t)
mean(ll<lambda & ul>lambda)
})
plot(lambdavals,coverage,type="l") #gets really bad for small lambda values
abline(h=0.95)
#if t=10000
t<-10000
coverage<-sapply(lambdavals,function(lambda){
lhats<-rpois(nosim,lambda=lambda*t)/t
ll<-lhats-qnorm(0.975)*sqrt(lhats/t)
ul<-lhats+qnorm(0.975)*sqrt(lhats/t)
mean(ll<lambda & ul>lambda)
})
plot(lambdavals,coverage,type="l") #with the much larger t, coverage is close to 95% even for small lambda
abline(h=0.95)
#2.T-distribution
#(X_bar-mu)/[S/sqrt(n)] follows Gosset's t distribution with n-1 degrees of freedom
#the interval is X_bar + c(-1,1)*t_{n-1} * S/sqrt(n) #wider than the corresponding normal interval
#the t interval technically assumes that the data are iid normal, though it is robust to this assumption
#Paired Observations are often analyzed using the t interval by taking differences.
#e.g. when you measure something once and the same unit a few days later for the second measurement
#you can use t interval to analyze this kind of data by taking differences or differences on the log scale.
#For skewed distributions, consider taking logs or using a different summary like the median,
#or use other procedures, for example bootstrap confidence intervals;
#for clearly skewed data it doesn't make a lot of sense to use t intervals.
#For highly discrete data, like binary or poisson data, other intervals are available.
#Paired T confidence interval
#d_bar + c(-1,1)*t_{1-alpha/2,n-1} * sd(d)/sqrt(n), where d = y - x are the paired differences
#data(sleep) shows the increase in hours of sleep for 10 patients on two soporific drugs (labeled as groups)
data(sleep)
head(sleep)
dim(sleep) #20 * 3
library(ggplot2)
g <- ggplot(sleep, aes(x = group, y = extra, group = factor(ID)))
g <- g + geom_line(size = 1, aes(colour = ID))
g<-g + geom_point(size =10, pch = 21, fill = "salmon", alpha = .5)
g
g1 <- sleep$extra[1 : 10]
g2 <- sleep$extra[11 : 20]
difference <- g2 - g1
mn <- mean(difference);
s <- sd(difference);
n <- 10
mn + c(-1, 1) * qt(.975, n-1) * s / sqrt(n) #note qt(0, n-1) = -Inf and qt(1, n-1) = Inf
#dt(x, df, ncp, log = FALSE)
#pt(q, df, ncp, lower.tail = TRUE, log.p = FALSE)
#qt(p, df, ncp, lower.tail = TRUE, log.p = FALSE)
#rt(n, df, ncp)
#dt gives the density, pt gives the distribution function, qt gives the quantile function, and rt generates random deviates.
t.test(difference) #the same as above
#t.test(x, y = NULL, alternative = c("two.sided", "less", "greater"), #default "two.sided"
#mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95, ...)
t.test(g2, g1, paired = TRUE) #the same as above
t.test(extra ~ I(relevel(group, 2)), paired = TRUE, data = sleep) #same as above
#t.test(formula, data, subset, na.action, ...)
#formula: a formula of the form lhs ~ rhs where lhs is a numeric variable giving the data
#values and rhs a factor with two levels giving the corresponding groups.
#relevel(x, ref, ...): the levels of a factor are re-ordered so that the level specified by ref is first and the others are moved down,
#which changes the contrast from g1 - g2 to g2 - g1
#I(): inside a formula, I() inhibits the interpretation of operators
#such as "+", "-", "*" and "^" as formula operators, so they are used as arithmetical operators.
#The results
rbind(
mn + c(-1, 1) * qt(.975, n-1) * s / sqrt(n),
as.vector(t.test(difference)$conf.int), #class is numeric
as.vector(t.test(g2, g1, paired = TRUE)$conf.int),
as.vector(t.test(extra ~ I(relevel(group, 2)), paired = TRUE, data = sleep)$conf.int)
)
#Independent group t confidence intervals (A/B testing, randomized trial/experiment)
#equal variance -- pooled variance
#Suppose we want to compare the mean blood pressure between two groups in a randomized trial: those
#who received the treatment to those who received a placebo
#We cannot use the paired t test because the groups are independent and may have different sample sizes
#Therefore, a (1-alpha)*100% confidence interval for miu_y - miu_z is:
#y_bar - x_bar + c(-1,1)*t_{alpha/2,n_x+n_y-2} * Sample_Std(p) * sqrt(1/n_x + 1/n_y)
#where (Sample_std(p))^2 = Var(p) = {(n_x-1)*Var(x)+(n_y-1)*Var(y)}/(n_x+n_y-2) #Pooled Variance
#Because it's a randomized trial, it is reasonable to assume the variance is the same in the two groups;
#then our estimate of the variance should at some level be an average of the variance estimates from the
#two groups -- we pool the variance.
#if n_x = n_y, then the pooled variance is the simple average of the variance from the x group
#and the variance from y group.
#remember, this interval is assuming a constant variance across the two groups.
#If there is some doubt, assume a different variance per group. #will be covered later.
#e.g.
#Comparing SBP for 8 oral contraceptive users versus 21 controls
#$\bar X_{OC} = 132.86$ mmHg with $s_{OC} = 15.34$ mmHg
#$\bar X_{C} = 127.44$ mmHg with $s_{C} = 18.23$ mmHg
#Pooled variance estimate
sp <- sqrt((7 * 15.34^2 + 20 * 18.23^2) / (8 + 21 - 2))
132.86 - 127.44 + c(-1, 1) * qt(.975, 27) * sp * (1 / 8 + 1 / 21)^.5
#whenever you're doing an independent group interval, you always mentally think which one is the
#first part of the subtraction. In this case, oral contraceptive users are the first part.
#Mistakenly treating the sleep data as grouped
n1 <- length(g1)
n2 <- length(g2)
sp <- sqrt( ((n1 - 1) * sd(g1)^2 + (n2-1) * sd(g2)^2) / (n1 + n2-2))
md <- mean(g2) - mean(g1)
semd <- sp * sqrt(1 / n1 + 1/n2)
rbind(
md + c(-1, 1) * qt(.975, n1 + n2 - 2) * semd,
t.test(g2, g1, paired = FALSE, var.equal = TRUE)$conf, #same as above
t.test(g2, g1, paired = TRUE)$conf #different with above
)
#the pooled-variance independent-group t confidence interval is wider than the paired t confidence interval;
#if you disregard the pairing, the interval actually contains 0.
#if you look at the plot of the sleep data, it's quite clear why the paired t interval doesn't include 0.
library(ggplot2)
g <- ggplot(sleep, aes(x = group, y = extra, group = factor(ID)))
g <- g + geom_line(size = 1, aes(colour = ID)) + geom_point(size =10, pch = 21, fill = "salmon", alpha = .5)
g
#if you compare the variation on the left-hand side to the variation on the right-hand side, there's a lot of
#variability in the two groups. However, when you match up the subjects and look at
#the variability in the differences, much of that variability is explained by
#inter-subject variability, so the resulting interval is not so wide as to include 0.
#ChickWeight data in R
library(datasets)
data(ChickWeight)
dim(ChickWeight) #578 * 4
head(ChickWeight) #Chick and Diet are factor variables
library(reshape2)
##define weight gain or loss
wideCW <- dcast(ChickWeight, Diet + Chick ~ Time, value.var = "weight")
#cast the Time variable from long format to wide format (one column per time point)
names(wideCW)[-(1 : 2)] <- paste("time", names(wideCW)[-(1 : 2)], sep = "")
library(dplyr)
wideCW <- mutate(wideCW,gain = time21 - time0)
#mutate: Mutate a data frame by adding new or replacing existing columns.
#Plotting the raw data
g <- ggplot(ChickWeight, aes(x = Time, y = weight, colour = Diet, group = Chick))
g <- g + geom_line() #geom_line(): Connect observations, ordered by x value.
g <- g + stat_summary(aes(group = 1), geom = "line", fun.y = mean, size = 1, col = "black") #fun.y was renamed to fun in ggplot2 3.3.0
g <- g + facet_grid(. ~ Diet)
g
#the first diet has a lot more chicks and a lot more variation than the fourth diet.
#from the average (black) line, the first diet's weight gain appears a little slower than the fourth diet's.
#Weight gain by diet
g <- ggplot(wideCW, aes(x = factor(Diet), y = gain, fill = factor(Diet)))
g <- g + geom_violin(col = "black", size = 2)
g #gain is the final weight minus the baseline weight for each diet. We can see that the assumption
#of equal variances appears suspect here.
#The violin plot is similar to a box plot, except that it also shows the probability density of the
#data at different values --> so the width of each shape reflects the variance.
#Let's do a t interval
wideCW14 <- subset(wideCW, Diet %in% c(1, 4))
rbind(
t.test(gain ~ Diet, paired = FALSE, var.equal = TRUE, data = wideCW14)$conf,
t.test(gain ~ Diet, paired = FALSE, var.equal = FALSE, data = wideCW14)$conf
)
#Unequal variances -- If there is some doubt, assume a different variance per group.
#Assume the x and y observations are iid normal with different means and different variances; then the relevant
#statistic does not follow a t distribution, but can be approximated by a t distribution:
#Y_bar - X_bar + c(-1,1) * t_df * sqrt(S_x^2/n_x + S_y^2/n_y)
#where df = (S_x^2/n_x + S_y^2/n_y)^2 / [(S_x^2/n_x)^2/(n_x-1) + (S_y^2/n_y)^2/(n_y-1)] #no need to memorize this
#Comparing SBP for 8 oral contraceptive users versus 21 controls
#$\bar X_{OC} = 132.86$ mmHg with $s_{OC} = 15.34$ mmHg
#$\bar X_{C} = 127.44$ mmHg with $s_{C} = 18.23$ mmHg
#$df=15.04$, $t_{15.04, .975} = 2.13$
#Interval $$ 132.86 - 127.44 \pm 2.13 \left(\frac{15.34^2}{8} + \frac{18.23^2}{21} \right)^{1/2} = [-8.91, 19.75] $$
#t.test(..., var.equal = FALSE)
#equal variance:
sp <- sqrt((7 * 15.34^2 + 20 * 18.23^2) / (8 + 21 - 2))
132.86 - 127.44 + c(-1, 1) * qt(.975, 27) * sp * (1 / 8 + 1 / 21)^.5 #-9.521097 20.361097
#unequal variance
#df = 15.04, t_{15.04,0.975} = 2.13
132.86 - 127.44 + c(-1, 1) * 2.13 * sqrt(15.34^2/8 + 18.23^2/21) #-8.906499 19.746499
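#sketch: the Welch df quoted above can be recomputed from the summary statistics
sx <- 15.34; sy <- 18.23; nx <- 8; ny <- 21
(sx^2/nx + sy^2/ny)^2 / ((sx^2/nx)^2 / (nx - 1) + (sy^2/ny)^2 / (ny - 1)) #about 15.04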
#neither the equal-variance nor the unequal-variance interval is necessarily wider than the other.
#Comparing other kinds of data
#For binomial data, there's lots of ways to compare two groups
#Relative risk, risk difference, odds ratio.
#Chi-squared tests, normal approximations, exact tests.
#For count data, there's also Chi-squared tests and exact tests.
#We'll leave the discussions for comparing groups of data for binary and count data until
#covering glms in the regression class.
#3. Hypothesis tests
#(X_bar - mu_0) / (s/sqrt(n)) follows a t distribution with n-1 df under H0; s is the sample standard deviation
#two-sided test:
#we want the probability of rejecting under the null to be 5%, split equally as 2.5% in the upper tail
#and 2.5% in the lower tail.
#T test
if(!require("UsingR")){install.packages("UsingR")}
library(UsingR) #this guy includes almost everything I need in R.
data(father.son)
dim(father.son) #1078 * 2
t.test(father.son$sheight - father.son$fheight)
#if you fail to reject the one-sided test, you also fail to reject the two-sided test (its p-value is twice the one-sided one)
#p-value
#binomial example:
#Suppose a friend has 8 children, 7 of which are girls and none are twins.
#If each gender has an independent 50% probability for each birth, what's the probability of getting
#7 or more girls out of 8 births?
choose(8, 7) * .5 ^ 8 + choose(8, 8) * .5 ^ 8 #choose(n, k) is the binomial coefficient ("n choose k")
#The binomial distribution with size = n and prob = p has density: p(x) = choose(n, x) * p^x * (1-p)^(n-x)
pbinom(6, size = 8, prob = .5, lower.tail = FALSE) #same as above
#lower.tail: logical; if TRUE (default), probabilities are P[X <= x], otherwise, P[X > x].
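#binom.test gives the same exact one-sided p-value directly
binom.test(7, 8, p = 0.5, alternative = "greater")$p.value #0.03516, same as above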
#Poisson example:
#p(x) = lambda^x exp(-lambda)/x!
#Suppose that a hospital has an infection rate of 10 infections per 100 person-days at risk (a rate of
#0.1) during the last monitoring period.
#Assume that an infection rate of 0.05 is an important benchmark.
#Given the model, could the observed rate being larger than 0.05 be attributed to chance?
#H0: lambda = 0.05; Ha: lambda > 0.05
#lambda = 0.05, then the lambda plugged in ppois function should be 0.05*100 = 5
ppois(9, 5, lower.tail = FALSE) #p-value 0.03182806
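#poisson.test gives the same exact one-sided p-value
poisson.test(10, T = 100, r = 0.05, alternative = "greater")$p.value #0.03182806, same as above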
#4. Power: the probability of rejecting the null hypothesis when it is false (the analogue of sensitivity)
#the probability of a type II error is usually called beta, so power = 1 - beta
#Consider our previous example involving RDI
#H_0: miu = 30 versus H_a: miu > 30
#Then power is Pr((X_bar - 30)/(s/sqrt(n)) > t_{1-alpha,n-1}; mu = mu_a)
#Note that this is a function that depends on the specific value of mu_a!
#Notice that as mu_a approaches 30, the power approaches alpha
#Calculating power for Gaussian data
#We reject if (X_bar - 30)/(sigma/sqrt(n)) > z_{1-alpha}
#Equivalently if X_bar > 30 + Z_{1-alpha} * sigma/sqrt(n)
#Under H_0: X_bar ~ N(mu_0, sigma^2 / n)
#Under H_a: X_bar ~ N(mu_a, sigma^2 / n)
alpha = 0.05
z = qnorm(1 - alpha)
mu0=30
mua=32
sigma=4
n=c(8,16,32,64,128)
pnorm(mu0 + z * sigma / sqrt(n), mean = mu0, sd = sigma / sqrt(n), lower.tail = FALSE) #under H0: equals alpha = 0.05 for every n
pnorm(mu0 + z * sigma / sqrt(n), mean = mua, sd = sigma / sqrt(n), lower.tail = FALSE) #power under mua = 32 (upper tail probability), increasing with n
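#sanity check of the note above: as mua approaches mu0, the power approaches alpha
pnorm(mu0 + z * sigma / sqrt(16), mean = mu0 + 0.01, sd = sigma / sqrt(16), lower.tail = FALSE) #just above 0.05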
library(ggplot2)
nseq = c(8, 16, 32, 64, 128)
mua = seq(30, 35, by = 0.1)
z = qnorm(.95)
power = sapply(nseq, function(n) pnorm(mu0 + z * sigma / sqrt(n), mean = mua, sd = sigma / sqrt(n), lower.tail = FALSE))
colnames(power) <- paste("n", nseq, sep = "")
d <- data.frame(mua, power)
library(reshape2)
d2 <- melt(d, id.vars = "mua")
names(d2) <- c("mua", "n", "power")
g <- ggplot(d2, aes(x = mua, y = power, col = n)) + geom_line(size = 2)
g #power increases as mu_a gets larger -- we are more likely to detect a difference if the difference
#we want to detect is big; higher n gives us higher power earlier (at smaller effect sizes).
#
library(manipulate)
library(ggplot2)
mu0 = 30
myplot <- function(sigma, mua, n, alpha){
g = ggplot(data.frame(mu = c(27, 36)), aes(x = mu))
g = g + stat_function(fun=dnorm, geom = "line",
args = list(mean = mu0, sd = sigma / sqrt(n)),
size = 2, col = "red")
g = g + stat_function(fun=dnorm, geom = "line",
args = list(mean = mua, sd = sigma / sqrt(n)),
size = 2, col = "blue")
xitc = mu0 + qnorm(1 - alpha) * sigma / sqrt(n)
g = g + geom_vline(xintercept=xitc, size = 3)
g
}
#manipulate: The manipulate function accepts a plotting expression and a set of controls (e.g. slider, picker,
#checkbox, or button) which are used to dynamically change values within the expression. When a
#value is changed using its corresponding control the expression is automatically re-executed and
#the plot is redrawn.
#slider: Create a slider control (reached via the small gear in the top-left corner of the plot) to allow manipulation of a plot variable along a numeric range.
#slider(min, max, initial = min, label = NULL, step = NULL, ticks = TRUE)
manipulate(
myplot(sigma, mua, n, alpha),
sigma = slider(1, 10, step = 1, initial = 4),
mua = slider(30, 35, step = 1, initial = 32),
n = slider(1, 50, step = 1, initial = 16),
alpha = slider(0.01, 0.1, step = 0.01, initial = 0.05)
)
#power is nothing other than the probability of getting larger than the black line (the area between the black line and blue curve).
#The probability that we reject if in fact the blue curve is true.
#In a power calculation, the unknowns are mu_a, sigma, n, beta, and the knowns are mu_0, alpha.
#Specify any 3 of the unknowns and you can solve for the remaining one.
#for example: for the particular power I want, what n (how many trials) do I need?
#Power goes up as alpha gets larger. (the black line moves left.)
#Power of a one sided test is greater than the power of the associated two sided test
#Power goes up as miu_a gets further away from miu_0
#Power goes up as n goes up
#Power doesn't depend on mu_a, sigma and n individually, but only through sqrt(n)*(mu_a-mu_0)/sigma
#The quantity (mu_a-mu_0)/sigma is called the effect size, the difference in the means in standard
#deviation units. Being unit free, it has some hope of interpretability across settings.
#t-test power
#The power is P((X_bar - mu_0)/(S/sqrt(n)) > t_{1-alpha, n-1}; mu = mu_a) #just modify the z-quantile to a t-quantile
#(X_bar - mu_0)/(S/sqrt(n)) doesn't follow a t distribution if the true mean is not mu_0;
#its distribution is called the non-central t distribution.
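#the power can be computed from the non-central t distribution directly (a sketch;
#this is essentially what power.t.test below does for a one-sided one-sample test)
n <- 16; delta <- 2; sigma <- 4; alpha <- 0.05
ncp <- sqrt(n) * delta / sigma #non-centrality parameter
pt(qt(1 - alpha, n - 1), df = n - 1, ncp = ncp, lower.tail = FALSE) #about 0.604, matching the call below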
?power.t.test
#power.t.test(n = NULL, delta = NULL, sd = 1, sig.level = 0.05, power = NULL, type = c("two.sample",
#"one.sample", "paired"), alternative = c("two.sided", "one.sided"),strict = FALSE)
#delta: true difference in means, i.e. mu_a - mu_0
#among mu_a (via delta), sigma, n, and beta/power: omit exactly one of the arguments and it solves for it
power.t.test(n = 16, delta = 2 / 4, sd=1, type = "one.sample", alt = "one.sided")$power
power.t.test(n = 16, delta = 2, sd=4, type = "one.sample", alt = "one.sided")$power
power.t.test(n = 16, delta = 100, sd=200, type = "one.sample", alt = "one.sided")$power
#all three calls share the same effect size (mu_a-mu_0)/sigma = 0.5, so the power is identical
power.t.test(power = .8, delta = 2 / 4, sd=1, type = "one.sample", alt = "one.sided")$n
power.t.test(power = .8, delta = 2, sd=4, type = "one.sample", alt = "one.sided")$n
power.t.test(power = .8, delta = 100, sd=200, type = "one.sample", alt = "one.sided")$n
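#omitting delta instead solves for the smallest detectable effect size; with the n found above
#(about 26), this recovers a delta of about 0.5
power.t.test(power = .8, n = 26, sd = 1, type = "one.sample", alt = "one.sided")$delta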
#I always use power.t.test as my first attack on a power calculation. One of the main reasons is that
#power calculations have a lot of knobs and dials you can turn, and it's very easy to trip yourself up into
#thinking that you have better power than you do, or that you need a smaller sample size than you do.
#So, when in doubt, make your power calculation as simple as possible (try to reduce the question
#you're asking to a t test or a binomial test).
#You might want to move on to much more complex power calculations later, but as a first pass always
#do the t-test power, the normal-approximation power, or the basic binomial power calculation.
#5. Multiple testing
#when you do more than one hypothesis test, you have to do some sort of correction to make sure that
#you're not fooling yourself -- overuse of hypothesis testing and false positives or discoveries.
#This lecture is about how to do those corrections. #Two key components: Error Measure, Correction
#Classifications for m hypothesis tests
#http://en.wikipedia.org/wiki/False_positive_rate
#                       claim beta = 0   claim beta != 0   total
#truth beta = 0               U                V             m_0
#truth beta != 0              T                S            m - m_0
#total                      m - R              R              m
###error: V is the number of false positives (Type I errors), also called "false discoveries"
###error: T is the number of false negatives (Type II errors)
#S is the number of true positives (also called "true discoveries"); U is the number of true negatives
#R is the number of times we claim beta != 0 (discoveries); m_0 is the number of true nulls (beta = 0)
#Types of errors:
#Type I error or false positive (V) Say that the parameter does not equal zero when it does
#Type II error or false negative (T) Say that the parameter equals zero when it doesn't
#Error Rates:
#False positive rate - The rate at which true nulls (beta = 0) are called significant: E[V/m_0]
#m_0 is the total number of true-null variables; V is the total number of Type I errors
#If P-values are correctly calculated, calling all P < alpha significant will control the false
#positive rate at level alpha on average. (But this gives no family-wise control: suppose you
#perform 10,000 tests and you call all P < 0.05 significant, then the expected number of
#false positives is 10,000 * 0.05 = 500 -- too many false positives)
#While the false positive rate is mathematically equal to the type I error rate, it is viewed as a separate term because:
#the type I error rate is associated with the a priori setting of the significance level,
#whereas the false positive rate is a post hoc result: it is not a parameter controlled by the researcher,
#so it cannot be identified with the significance level.
#"false positive rate" is usually used of a medical test or diagnostic device
#(i.e. "the false positive rate of a certain diagnostic device is 1%"), while "type I error" is a term associated with statistical tests.
#Family wise error rate (FWER) - The probability of at least one false positive Pr(V >= 1)
#The Bonferroni correction (very conservative) is the oldest multiple testing correction. Basic idea:
#Suppose you do m tests and you want to control FWER at level alpha so Pr(V>=1) < alpha
#Calculate P-values normally
#Set alpha_{FWER} = alpha/m
#Call all P-values less than alpha_{FWER} significant
#adjusted p-value approach - a direct calculation in R
#Suppose the P-values are P_1,...,P_m. You can adjust them by taking P_i_{fwer} = min(m * P_i, 1) for each P-value;
#then if you call all P_i_{fwer} < alpha significant, you will control the FWER.
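#a quick sketch of the Bonferroni adjustment on made-up p-values (illustration only)
p <- c(0.001, 0.01, 0.04, 0.20)
pmin(length(p) * p, 1) #manual adjustment: min(m * P_i, 1)
p.adjust(p, method = "bonferroni") #identical values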
#False discovery rate (FDR) - the rate at which claims of significance are false: E[V/R]
#R is the number of times we claim beta != 0 (the total number of discoveries);
#so the FDR is the expected proportion of false discoveries among all discoveries
#Controlling false discovery rate is the most popular correction when performing lots of tests
#say in genomics, imaging, astronomy, or other signal-processing disciplines.
#Basic idea:
#Suppose you do m tests and want to control the FDR at level alpha, i.e. guarantee E[V/R] <= alpha
#(define V/R = 0 where R = 0)
#Calculate P-values normally, Order the P-values from smallest to largest P_1,...,P_m
#Call any P_i <= alpha * (i/m) significant
#same as FWER, we could also use adjusted p-value approach - a direct calculation in R
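#the BH step-up on the same made-up p-values: find the largest i with P_(i) <= alpha*(i/m)
#and call tests 1..i significant (a sketch, not from the notes)
p <- sort(c(0.001, 0.01, 0.04, 0.20)); m <- length(p); alpha <- 0.05
ok <- p <= alpha * (1:m) / m
if (any(ok)) p[1 : max(which(ok))] #p-values called significant: 0.001 and 0.01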
#q-value approach:
#you could also directly estimate q-values rather than fixing a level alpha at which to control the FDR.
#Calculate P-values normally, Order the P-values from smallest to largest P_1,...,P_m
#define q_k = p_k * (m/k)
#Let q_i be the FDR-adjusted value for p_i; its value is the smallest q_k over k >= i
#Estimate the q-values for a given set of p-values: library(qvalue) #now distributed via Bioconductor rather than CRAN
#qvalue(p=NULL, lambda=seq(0,0.90,0.05), pi0.method="smoother", fdr.level=NULL, robust=FALSE,
#gui=FALSE, smooth.df=3, smooth.log.pi0=FALSE)
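#hand-rolled q-values per the definition above; with pi0 fixed at 1 they reduce to
#p.adjust's BH values (the qvalue package additionally estimates pi0)
p <- c(0.04, 0.001, 0.20, 0.01) #deliberately unsorted
o <- order(p); m <- length(p)
qk <- p[o] * m / (1:m) #q_k = p_(k) * m/k on the sorted p-values
q <- rev(cummin(rev(qk)))[order(o)] #smallest q_k over k >= i, mapped back to the input order
all.equal(q, p.adjust(p, method = "BH")) #TRUE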
#Case study I: no true positives
set.seed(1010093)
pValues <- rep(NA, 1000)
for (i in 1:1000) {
y <- rnorm(20)
x <- rnorm(20)
pValues[i] <- summary(lm(y ~ x))$coeff[2, 4] #four columns: Estimate, Std. Error, t value, Pr(>|t|)
}
# Controls false positive rate
#no control
sum(pValues < 0.05) #even though in none of the 1000 simulated cases is there actually a relationship between
#x and y, we still get 51, about 5% of the tests performed, called significant.
#Use controls:
# Controls FWER
sum(p.adjust(pValues, method = "bonferroni") < 0.05)
## [1] 0
# Controls FDR -- method = "BH" (same as fdr)
sum(p.adjust(pValues, method = "BH") < 0.05)
## [1] 0
all(p.adjust(pValues, method = "fdr") == p.adjust(pValues, method = "BH")) #TRUE: "fdr" is an alias for "BH"
#Case study II: 50% true positives
set.seed(1010093)
pValues <- rep(NA, 1000)
for (i in 1:1000) {
x <- rnorm(20)
# First 500 beta=0, last 500 beta=2
if (i <= 500) {
y <- rnorm(20)
} else {
y <- rnorm(20, mean = 2 * x)
}
pValues[i] <- summary(lm(y ~ x))$coeff[2, 4]
}
trueStatus <- rep(c("zero", "not zero"), each = 500)
table(pValues < 0.05, trueStatus)
# Controls FWER
table(p.adjust(pValues, method = "bonferroni") < 0.05, trueStatus)
# Controls FDR
table(p.adjust(pValues, method = "BH") < 0.05, trueStatus)
#P-values versus adjusted P-values
par(mfrow = c(1, 2))
plot(pValues, p.adjust(pValues, method = "bonferroni"), pch = 19)
plot(pValues, p.adjust(pValues, method = "BH"), pch = 19)
par(mfrow = c(1, 1))
#A basic Bonferroni/BH correction is usually enough
#If there is strong dependence between tests there may be problems, consider method="BY" (BH may perform strangely)
#6. Bootstrap
#The bootstrap is a tremendously useful tool for constructing confidence intervals and calculating standard errors for difficult statistics
#Sample of 50 die rolls
library(ggplot2)
library(gridExtra)
nosim <- 1000
cfunc <- function(x, n) mean(x)
g1 = ggplot(data.frame(y = rep(1/6, 6), x = 1 : 6), aes(y = y, x = x))
g1 = g1 + geom_bar(stat = "identity", fill = "lightblue", colour = "black")
dat <- data.frame(x = apply(matrix(sample(1 : 6, nosim * 50, replace = TRUE), nosim), 1, mean))
g2 <- ggplot(dat, aes(x = x)) + geom_histogram(binwidth=.2, colour = "black", fill = "salmon", aes(y = ..density..))
grid.arrange(g1, g2, ncol = 2) #g1 is the population distribution, g2 is the distribution of the sample mean
#imagine you don't know whether the die is fair or not.
#What if we only had one sample of size 50?
n = 50
B = 1000
## our data
x = sample(1 : 6, n, replace = TRUE)
## bootstrap resamples
resamples = matrix(sample(x,n * B,replace = TRUE),B, n) #resampling from the left blue bars.
resampledMeans = apply(resamples, 1, mean)
g1 <- ggplot(as.data.frame(prop.table(table(x))), aes(x = x, y = Freq)) + geom_bar(colour = "black", fill = "lightblue", stat = "identity")
g2 <- ggplot(data.frame(x = resampledMeans), aes(x = x)) + geom_histogram(binwidth=.2, colour = "black", fill = "salmon", aes(y = ..density..))
grid.arrange(g1, g2, ncol = 2)
#Now I can't evaluate the behavior of the average of 50 die rolls from the population, because I don't
#know what the population is or what die it was rolled from. Bootstrapping tells me about
#the distribution of averages even though I only get to observe one real average from the true population.
#Consider a data set
library(UsingR)
data(father.son)
x <- father.son$sheight
n <- length(x)
B <- 10000
resamples <- matrix(sample(x,n * B,replace = TRUE),B, n)
resampledMedians <- apply(resamples, 1, median)
g = ggplot(data.frame(x = resampledMedians), aes(x = x))
g = g + geom_density(size = 2, fill = "red")
#g = g + geom_histogram(alpha = .20, binwidth=.3, colour = "black", fill = "blue", aes(y = ..density..))
g = g + geom_vline(xintercept = median(x), size = 2)
g
#Nonparametric bootstrap algorithm
#Bootstrapped confidence interval for mean or median
#(1)simulate complete data sets from the observed data by sampling with replacement
#(2)calculate the statistic for each simulated data set
#(3)use the simulated statistics to either define a confidence interval or take their standard
#deviation to estimate a standard error
#i. Sample n observations with replacement from the observed data, resulting in one simulated
#complete data set
#ii. Take the median of the simulated data set
#iii. Repeat these two steps B times, resulting in B simulated medians
#iv. These medians are approximately drawn from the sampling distribution of the median of n
#observations; therefore we can:
#Draw a histogram of them
#Calculate their standard deviation to estimate the standard error of the median
#Take the 2.5th and 97.5th percentiles as a confidence interval for the median
#example
#n<-50
#x<-rnorm(50)
x <- father.son$sheight
n <- length(x)
B <- 10000
resamples <- matrix(sample(x,n * B, replace = TRUE), B, n)
medians <- apply(resamples, 1, median)
sd(medians)
quantile(medians, c(.025, .975))
g = ggplot(data.frame(medians = medians), aes(x = medians))
g = g + geom_histogram(color = "black", fill = "lightblue", binwidth = 0.05)
g
#JackKnife Resampling -- especially useful for variance and bias estimation
#http://en.wikipedia.org/wiki/Jackknife_resampling
#The jackknife estimate of a parameter is found by systematically leaving each observation out of
#the dataset, calculating the estimate each time, and then averaging these calculations:
#theta_bar_jack = (1/n) * sum_{i=1~n} theta_bar_i
#where theta_bar_i is the parameter estimate based on leaving out the ith observation
#Variance estimation (an estimate of the variance of an estimator, via the jackknife):
#Var_jack(theta_hat) = [(n-1)/n] * sum_{i=1~n}(theta_bar_i - theta_bar_jack)^2
#(for the sample mean this reduces to the usual variance estimate s^2/n)
#Bias estimation and correction:
#theta_BiasCorrected = n * theta_hat - (n-1) * theta_bar_jack, where theta_hat is the full-sample estimate
#This reduces bias by an order of magnitude, from O(n^-1) to O(n^-2)
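#a minimal jackknife sketch for the median of the son heights (my own illustration; x and n are
#still the father.son values from the bootstrap example above)
theta_hat <- median(x)
theta_i <- sapply(1 : n, function(i) median(x[-i])) #leave-one-out estimates theta_bar_i
theta_jack <- mean(theta_i)
se_jack <- sqrt((n - 1) / n * sum((theta_i - theta_jack)^2)) #jackknife standard error
c(estimate = theta_hat, biasCorrected = n * theta_hat - (n - 1) * theta_jack, se = se_jack)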
#7. Permutation test -- used for group comparisons
#e.g. comparing sprays B and C
data(InsectSprays)
g = ggplot(InsectSprays, aes(spray, count, fill = spray))
g = g + geom_boxplot()
g
#Permutation tests
#Consider the null hypothesis that the distribution of the observations from each group is the same
#Then, the group labels are irrelevant
#Consider a data frame with count and spray
#Permute the spray (group) labels and Recalculate the statistic: Mean difference in counts, Geometric means, T statistic, etc.
#Calculate the percentage of simulations where the simulated statistic was more extreme (toward the alternative) than the observed
#Variations on Permutation testing
#rank sum test
#Fisher's exact test (hypergeometric prob)
#also, so-called randomization tests are exactly permutation tests, with a different motivation.
#For matched data, one can randomize the signs; for ranks, this results in the signed rank test
#Permutation strategies work for regression as well
#Permutation tests work very well in multivariate settings, because you can calculate maximum
#statistics that control the FWER (family-wise error rate)
#Permutation test B v C
subdata <- InsectSprays[InsectSprays$spray %in% c("B", "C"),]
y <- subdata$count
group <- as.character(subdata$spray)
testStat <- function(w, g) mean(w[g == "B"]) - mean(w[g == "C"])
observedStat <- testStat(y, group)
permutations <- sapply(1 : 10000, function(i) testStat(y, sample(group))) #we could use permutation in this way!!!
observedStat
mean(permutations > observedStat)
#from the result, we can see that in 10000 permutations we couldn't find a reconfiguration of
#the group labels that led to a more extreme value of the test statistic than the observed statistic.
#(More formally, the p-value is very small, close to zero.) So we reject the null hypothesis.
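#a two-sided permutation p-value would compare absolute values instead
mean(abs(permutations) > abs(observedStat))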
g = ggplot(data.frame(permutations = permutations),aes(permutations))
g = g + geom_histogram(fill = "lightblue", color = "black", binwidth = 1)
g = g + geom_vline(xintercept = observedStat, size = 2)
g #you could see that the observed statistic is way far out in the tail of our null distribution.