-
Notifications
You must be signed in to change notification settings - Fork 93
/
04_观数以形.R
1360 lines (1182 loc) · 35.6 KB
/
04_观数以形.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# 04_观数以形 -----------------------------------------------------------
#了解一个人,可能是先看其长相,了解其言谈举止
#而后再逐渐深入
#了解一份数据,首先也是要看数据的高矮胖瘦
#然后再去深入了解内在的关系结构
#要初步了解数据,大部分教材上用的是以下两种方法:
#1、用少量数字描述数据
#2、用图形直观刻画数据
#机器学习之要义在于“关系结构”
#尤其是变量之间的关系和数据空间的结构
#认识数据:
#同样是刻画数据空间的形态和变量之间的关系
# Data Import -------------------------------------------------------------
#为保持课程的一致性,减少小伙伴们熟悉业务背景的成本
#本次课程同样采用前述的《学生文理分科》的数据
#采用真实数据的目的很简单:所得结果是鲜活的
#书接前文,在观测数据的外表之前,首先还是将数据读入
library(tidyverse)
cjb_url <- "data/cjb.csv"
cjb <- read.csv(cjb_url,
stringsAsFactors = FALSE,
encoding = "CP936")
cjb %<>%
mutate(zcj = rowSums(.[, 4:12])) %>%
mutate_at(vars(xb, bj, wlfk), factor) %>%
dplyr::filter(zcj != 0)
#可能有小伙伴觉得,一次次从网络读取数据很麻烦
#好吧,那咱们就把它存到本地
#实际上,在大部分数据分析项目中
#我们都可以把清理好的数据存为rda格式放在本地
View(cjb)
save(cjb, file = "data/cjb.rda")
#文件存储在当前工作路径的data子目录之下
#当前工作路径可通过getwd()/setwd()来查询和设置
rm(list = ls())
load("data/cjb.rda",
verbose = TRUE)
#也可以采用下边这种方式选取
#load(file.choose())
# 1D-univariate -----------------------------------------------------------
#机器学习的核心任务
#是揭示变量之间的关系和数据空间的结构
#变量之间的关系,自然是一个变量变化或若干变量变化之后,
#另外变量随之产生变化;
#数据空间的形态,
#则主要是数据点在数据空间的散布所呈现的结构
#要考察变量之间的依存/随动关系,
#自然首先要看单个变量的分布情况
#若某个变量取值不变,退化为常量,则几乎是不被作为特征的
#我们要考查的,恰恰就是数据本身的变化或者说分布情况
#同样,要考查数据空间的形态,
#当然也可以从单个维度的形态着手
#因此,我们要认识数据,做的第一件事情,
#往往就是单变量的分布情况的描述
#一维数据空间,毫无疑问就是一条直线
# Stem --------------------------------------------------------------------
#一维空间的数据形态,可以通过茎叶图或是Wikinson点图
#来直观表示
library(tidyverse)
cjb %>%
dplyr::filter(bj == "1101") %>%
pull(sx) %>%
sort() -> sx_1101
cjb %>%
dplyr::filter(bj == "1110") %>%
pull(sx) %>%
sort() -> sx_1110
stem(sx_1101, scale = 0.5)
table(cjb$bj)
set.seed(2012)
tmp_x <- 10 * round(sx_1101 + rnorm(length(sx_1101)), digits = 2)
sort(tmp_x)
# [1] 542.2 564.2 590.9 593.7 596.6 606.0 612.6 637.7 643.7 654.1 672.8 673.1 681.6 688.2 691.8 692.5 703.7 718.9 719.3
# [20] 724.3 724.4 729.8 731.4 731.5 736.3 742.4 749.4 760.3 767.9 778.6 779.8 789.4 789.9 797.7 801.4 801.8 812.5 819.5
# [39] 822.6 825.4 825.6 828.6 831.8 840.0 840.4 845.5 853.5 865.7 891.0 912.6 915.9 950.0
stem(tmp_x, scale = 0.5)
#1101班数学成绩茎叶图
cjb %>%
dplyr::filter(bj == "1101") %>%
select(sx) %>%
as_vector() %>%
stem(scale = 0.5)
#1110班数学成绩茎叶图
cjb %>%
dplyr::filter(bj == "1110") %>%
select(sx) %>%
as_vector() %>%
stem(scale = 2)
stem(sx_1101, scale = 0.5)
stem(sx_1110, scale = 2)
# Histogram ---------------------------------------------------------------
results <- hist(cjb$zz,
breaks = "Sturges")
results <- hist(cjb$zz,
breaks = "Sturges",
plot = FALSE)
results$breaks
(max(cjb$zz) - min(cjb$zz)) /
(ceiling(log2(nrow(cjb))) + 1)
nclass.Sturges(cjb$yw)
nclass.Sturges(cjb$zz)
#ggplot2的基本绘图模板template是:
# ggplot(data = <DATA>) +
# <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>),
# stat = <STAT>,
# position = <POSITION>) +
# <COORDINATE_FUNCTION> +
# <FACET_FUNCTION>
#虽然绘图框架很直观明了,但是真正要精通各类图形绘制,
#仍然需要日积月累
#在初学ggplot2的过程中,除了图形语法之外,
#还有一个难点,那就是数据的转换的过程,
#比如数据的长宽变换等
#在接下来的具体代码演示过程中,
#这一点小伙伴们也许多加留意
#看一看数据分布的形状
sx_hist_results <- hist(cjb$sx,
plot = FALSE)
? hist
#查看sx_hist_results的类型
typeof(sx_hist_results)
#> [1] "list"
#查看列表的组成
names(sx_hist_results)
#> [1] "breaks" "counts" "density" "mids" "xname" "equidist"
sx_hist_results$density
sx_hist_results$counts / length(cjb$sx) * 2 - sx_hist_results$density
library(tidyverse)
ggplot(data = cjb, mapping = aes(sx)) +
geom_histogram(breaks = sx_hist_results$breaks,
color = "darkgray",
fill = "white") +
stat_bin(breaks = sx_hist_results$breaks,
geom = "text",
aes(label = ..count..)) +
coord_flip()
ggplot(data = cjb, mapping = aes(sx)) +
geom_histogram(breaks = sx_hist_results$breaks,
color = "darkgray",
fill = "white") +
stat_bin(breaks = sx_hist_results$breaks,
geom = "text",
aes(label = ..count..)) +
coord_flip()
ggsave("histogram1.png", dpi = 600)
#dpi一般设为300,就可以达到印刷要求了
#换言之,发表论文时,dpi设置为300也就足够了
#当然,dpi值越高、质量越好
ggplot(data = cjb, mapping = aes(sx)) +
geom_histogram(breaks = sx_hist_results$breaks,
color = "darkgray",
fill = "white") +
stat_bin(breaks = sx_hist_results$breaks,
geom = "text",
aes(label = ..count..))
ggsave("histogram2.png", dpi = 600)
# Density -----------------------------------------------------------------
#获取直方图相关参数
sx_hist_results <- hist(cjb$sx,
plot = FALSE)
#绘制直方图
ggplot(data = cjb, mapping = aes(sx)) +
geom_histogram(
aes(y = ..count..),
breaks = sx_hist_results$breaks,
color = "darkgray",
fill = "white"
) +
#绘制概率密度曲线
geom_density(colour = "red")
ggplot(data = cjb, mapping = aes(sx)) +
geom_histogram(
aes(y = ..density..),
breaks = sx_hist_results$breaks,
color = "darkgray",
fill = "white"
) +
#绘制概率密度曲线
geom_density(colour = "red")
ggsave("histogram2+density.png", dpi = 600)
#概率密度图
data_points <- head(cjb$sx, n = 10)
sx_density <- density(data_points)
gaussian_kernel <- function(X, x, h) {
u <- (X - x) / h
return(1 / sqrt(2 * pi) * exp(-0.5 * (u ^ 2)))
}
X <- sx_density$x
h <- sx_density$bw
n <- length(data_points)
components <- lapply(data_points, function(y) {
gaussian_kernel(X, y, h)
})
plot(sx_density)
for (i in seq_along(components)) {
lines(X, components[[i]] / (n * h), col = "grey")
}
points(data_points, rep(0, n), col = "red")
#不同的核函数
#方窗
u <- c(-2.5, -2.5, 2.5, 2.5)
kernel_2 <- c(0, 0.5, 0.5, 0)
plot(
u,
kernel_2,
type = "l",
xaxt = "n",
yaxt = "n",
axes = F,
xlim = c(-5, 5),
ylim = c(0, 1)
)
arrows(0, 0, 0, 0.7, length = 0.1)
arrows(-5, 0, 5, 0, length = 0.1)
#三角窗
u <- c(-2.5, 0, 2.5)
kernel_2 <- c(0, 0.5, 0)
plot(
u,
kernel_2,
type = "l",
xaxt = "n",
yaxt = "n",
axes = F,
xlim = c(-5, 5),
ylim = c(0, 1)
)
arrows(0, 0, 0, 0.7, length = 0.1)
arrows(-5, 0, 5, 0, length = 0.1)
#正态窗
u <- seq(-4, 4, len = 10000)
kernel_2 <- 1 / sqrt(2 * pi) * exp(-0.5 * (u ^ 2))
plot(
u,
kernel_2,
type = "l",
xaxt = "n",
yaxt = "n",
axes = F,
xlim = c(-5, 5),
ylim = c(0, 1)
)
arrows(0, 0, 0, 0.6, length = 0.1)
arrows(-5, 0, 5, 0, length = 0.1)
#指数窗
u <- seq(-4, 4, len = 10000)
kernel_2 <- 1 / 2 * exp(-abs(u))
plot(
u,
kernel_2,
type = "l",
xaxt = "n",
yaxt = "n",
axes = F,
xlim = c(-5, 5),
ylim = c(0, 1)
)
arrows(0, 0, 0, 0.6, length = 0.1)
arrows(-5, 0, 5, 0, length = 0.1)
# Violin ------------------------------------------------------------------
#绘制小提琴图
ggplot(cjb, aes(x = factor(0), y = sx)) +
geom_violin(fill = "orange", alpha = 0.2) +
coord_flip()
ggsave("violin.png", dpi = 600)
# Boxplot -----------------------------------------------------------------
ggplot(cjb, aes(x = factor(0), y = sx)) +
geom_violin(fill = "orange", alpha = 0.2) +
geom_boxplot(width = 0.25,
fill = "blue",
alpha = 0.2) +
coord_flip()
#箱线图
set.seed(2012)
sample_data <- rnorm(100000)
op <- par(mfrow = c(2, 1))
boxplot(sample_data,
horizontal = TRUE)
plot(density(sample_data), main = NA)
par(op)
op <- par(mfrow = c(2, 3))
library(skewt)
set.seed(2012)
rt1 <- rskt(5000, 12, 10)
rt2 <- rskt(5000, 12, 1)
rt3 <- rskt(5000, 12, 0.2)
plot(density(rt1), col = "red", lwd = 2)
plot(density(rt2), col = "red", lwd = 2)
plot(density(rt3), col = "red", lwd = 2)
boxplot(rt1, horizontal = TRUE)
boxplot(rt2, horizontal = TRUE)
boxplot(rt3, horizontal = TRUE)
par(op)
cjb %>%
ggplot(aes(x = factor(0), y = sx)) +
geom_violin(fill = "#56B4E9", width = 0.75) +
geom_boxplot(
width = 0.25,
fill = "#E69F00",
outlier.colour = "red",
outlier.shape = 1,
outlier.size = 2
) +
geom_rug(position = "jitter",
size = 0.1,
sides = "b") +
coord_flip() +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()
)
cjb %>%
ggplot(aes(x = factor(0), y = sx)) +
geom_violin(fill = "#56B4E9", width = 0.75) +
geom_boxplot(
width = 0.25,
fill = "#E69F00",
outlier.colour = "red",
outlier.shape = 1,
outlier.size = 2
) +
geom_rug(position = "jitter",
size = 0.1,
sides = "b") +
coord_flip()
ggsave("boxplot1.png", dpi = 600)
cjb %>%
ggplot(aes(x = factor(0), y = sx)) +
geom_boxplot(
width = 0.25,
fill = "#E69F00",
outlier.colour = "red",
outlier.shape = 1,
outlier.size = 2
) +
geom_rug(position = "jitter",
size = 0.1,
sides = "b") +
coord_flip()
ggsave("boxplot2.png", dpi = 600)
boxplot_results <- boxplot.stats(cjb$sx)
# $`stats`
# [1] 60 81 89 95 100
#
# $n
# [1] 774
#
# $conf
# [1] 88.20491 89.79509
#
# $out
# [1] 55 59 57 59 58 51 56 55 59 26 58 46 59 59
sort(boxplot_results$out)
typeof(boxplot_results)
names(boxplot_results)
boxplot_results$stats
fivenum(cjb$sx)
# Location ----------------------------------------------------------------
#前边是数据形态的直观展示
#我们需要有一些定量指标来对数据形态进行刻画
#数据的集中趋势
#均值
cjb %>%
group_by(wlfk) %>% #按文理分科分组统计
summarise(
count = n(),
#各组人数
sx_max = max(sx),
#最大值
sx_Q3 = quantile(sx, 0.75),
#第三分位数
sx_median = median(sx),
#中位数
sx_mean = mean(sx),
#均值
sx_Q1 = quantile(sx, 0.25),
#第一分位数
sx_iqr = IQR(sx),
#四分位距
sx_min = min(sx),
#最小值
sx_range = max(sx) - min(sx)#极差
)
cjb %>%
group_by(wlfk) %>% #按文理分科分组统计
summarise(count = n(),
#各组人数
sx_median = median(sx),
#中位数
sx_mean = mean(sx))#均值
# Scale -------------------------------------------------------------------
cjb %>%
group_by(wlfk) %>% #按文理分科分组统计
summarise(
sx_max = max(sx),
#最大值
sx_min = min(sx),
#最小值
sx_range = max(sx) - min(sx)
)#极差
cjb %>%
group_by(wlfk) %>% #按文理分科分组统计
summarise(
sx_Q3 = quantile(sx, 3 / 4),
#第三分位数
sx_Q1 = quantile(sx, 1 / 4),
#第一分位数
sx_iqr = IQR(sx)
)#四分位距
#查看各科情况
round(apply(cjb[, 4:12], 2, function(x) {
c(
mean = mean(x),
median = median(x),
range = diff(range(x)),
IQR = IQR(x)
)
}))
# More Plots --------------------------------------------------------------
cjb %>%
dplyr::filter(bj == "1110") %>%
select(xm, sx) %>%
mutate(sx_z = (sx - mean(sx)) / sd(sx),
sx_type = ifelse(sx_z >= 0, "above", "below")) %>%
arrange(sx_z) %>%
ggplot(aes(x = fct_inorder(xm), y = sx_z, label = sx_z)) +
geom_bar(stat = 'identity', aes(fill = sx_type), width = .5) +
scale_fill_manual(
name = "Math Score",
labels = c("Above Average", "Below Average"),
values = c("above" = "#00ba38", "below" = "#f8766d")
) +
coord_flip()
cjb %>%
dplyr::filter(bj == "1110" & xb == "男") %>%
select(xm, sx) %>%
mutate(sx_z = (sx - mean(sx)) / sd(sx),
sx_type = ifelse(sx_z >= 0, "above", "below")) %>%
arrange(sx_z) %>%
ggplot(aes(x = fct_inorder(xm), y = sx_z, label = sx_z)) +
geom_bar(stat = 'identity', aes(fill = sx_type), width = 0.5) +
scale_fill_manual(
name = "Math Score",
labels = c("Above Average", "Below Average"),
values = c("above" = "#00ba38", "below" = "#f8766d")
) +
coord_flip() +
theme_bw()
g <- ggplot(cjb, aes(sx))
sx_hist_results <- hist(cjb$sx,
plot = FALSE)
library(tidyverse)
g + geom_histogram(aes(fill = bj),
breaks = sx_hist_results$breaks,
col = "black",
size = .1) + scale_fill_brewer(palette = "Spectral")
# 2D-Two Variables --------------------------------------------------------
# Treemap-Cat vs cat ------------------------------------------------------
#离散变量vs离散变量
#形式可以有很多种,比如马赛克图
#本实验中推荐的是矩形树图
library(tidyverse)
library(treemap)
cjb_sum <- cjb %>%
group_by(wlfk) %>%
summarise(count = n())
cjb_sum <- cjb %>%
group_by(wlfk, xb) %>%
summarise(count = n())
treemap(
as.data.frame(cjb_sum) ,
index = c("wlfk", "xb"),
vSize = "count",
vColor = "xb",
type = "categorical",
fontsize.labels = 20,
lowerbound.cex.labels = 0.6
)
#更改其中的字体
library(showtext)
font_add("fzqt", regular = "D://tools/fonts/FZQTJW.TTF")
showtext_begin()
treemap(
as.data.frame(cjb_sum) ,
index = c("wlfk", "xb"),
vSize = "count",
vColor = "xb",
type = "categorical",
fontsize.labels = 20,
fontfamily.title = "fzqt",
fontfamily.labels = "fzqt",
fontfamily.legend = "fzqt",
lowerbound.cex.labels = 0.6
)
showtext_end()
View(cjb_sum)
nrow(cjb)
library(treemap)
cjb %>%
group_by(wlfk, bj, xb) %>%
summarise(count = n()) %>%
as.data.frame() %>%
treemap(
index = c("wlfk", "bj", "xb"),
vSize = "count",
vColor = "count",
type = "value"
)
cjb %>%
group_by(bj, wlfk) %>%
summarise(count = n())
# Numeric vs numeric ------------------------------------------------------
#连续变量vs连续变量
#散点图是最常见、但同时也应该是最有用的图之一
#散点图可用来观察变量之间可能存在的模式
#同时也是二位数据空间形态的最直接的体现
library(ggplot2)
ggplot(cjb,
aes(
x = sx,
y = sw,
shape = wlfk,
colour = wlfk
)) +
geom_point(size = 2) +
labs(
x = "数学",
y = "生物",
colour = "文理分科",
shape = "文理分科"
)
#散点图矩阵
GGally::ggpairs(cjb, columns = 4:12)
ggsave("scatter_pairs.png", dpi = 600)
View(cjb)
# cor and cov -------------------------------------------------------------
#协方差以及内积的含义
#请大家进一步思考加减乘除的物理含义
#内积在某种程度上讲,也是在衡量相似性
set.seed(2012)
X <- rnorm(100)
Y <- rnorm(100)
sum(X * Y)
#> [1] 16.52361
#相关系数矩阵
sum(sort(X) * sort(Y))
#> [1] 113.6489
sum(sort(X) * rev(sort(Y)))
#> [1] -112.7025
cov(X, Y)
sum((X - mean(X)) * (Y - mean(Y))) / 99
library(animation)
saveGIF(
expr = {
Xs <- seq(-2, 2, len = 100)
Ys <- seq(-2, 2, len = 100)
area <- 4
Wx <- seq(2, -2, len = 20)
Wy <- sqrt(area - Wx ^ 2)
Wx <- c(Wx, rev(Wx))
Wy <- c(Wy, -rev(Wy))
Wx <- rev(Wx)
Wy <- rev(Wy)
W <- cbind(Wx, Wy)
W <- t(apply(W, 1, function(x) {
x / sqrt(x[1] ^ 2 + x[2] ^ 2)
}))
XY <- expand.grid(Xs, Ys)
names(XY) <- c("Xs", "Ys")
XY_bak <- XY
XY <- as.data.frame(XY)
b <- 1 / 2
i <- 5
for (i in 1:nrow(W)) {
w <- W[i, ]
XY <- XY_bak
XY$Inner_Product <- apply(XY, 1, function(x) {
sum(x * w)
})
names(XY) <- c("x", "y", "Inner_Product")
w_label <- paste0("(",
round(w[1], digits = 2),
",",
round(w[2], digits = 2),
")")
library(ggplot2)
p <- ggplot(XY, aes(
x = x,
y = y,
colour = Inner_Product
)) +
geom_point(size = 0.5) +
geom_segment(
aes(
x = 0,
y = 0,
xend = w[1],
yend = w[2]
),
colour = "red",
size = 1.2,
arrow = arrow(length = unit(0.03, "npc"))
) +
geom_text(aes(
x = w[1],
y = w[2],
label = w_label
),
colour = "blue") +
#scale_colour_gradient2(low="#22FF00", mid="white", high="#FF0000", midpoint=0) +
scale_colour_gradient2(
low = "red",
mid = "white",
high = "blue",
midpoint = 0
) +
coord_fixed()
print(p)
}
},
movie.name = "animation5.gif",
convert = "gm convert",
interval = 1
)
library(animation)
saveGIF(
expr = {
Xs <- seq(-2, 2, len = 100)
Ys <- seq(-2, 2, len = 100)
area <- 4
Wx <- seq(2, -2, len = 20)
Wy <- sqrt(area - Wx ^ 2)
Wx <- c(Wx, rev(Wx))
Wy <- c(Wy, -rev(Wy))
Wx <- rev(Wx)
Wy <- rev(Wy)
W <- cbind(Wx, Wy)
W <- t(apply(W, 1, function(x) {
x / sqrt(x[1] ^ 2 + x[2] ^ 2)
}))
XY <- expand.grid(Xs, Ys)
names(XY) <- c("Xs", "Ys")
XY_bak <- XY
XY <- as.data.frame(XY)
b <- 1
for (i in 1:nrow(W)) {
w <- W[i, ]
XY <- XY_bak
XY$Inner_Product <- apply(XY, 1, function(x) {
sum(x * w) > b
})
names(XY) <- c("x", "y", "Inner_Product")
w_label <- paste0("(",
round(w[1], digits = 2),
",",
round(w[2], digits = 2),
")")
library(ggplot2)
p <- ggplot(XY, aes(
x = x,
y = y,
colour = Inner_Product
)) +
geom_point(size = 0.5) +
geom_segment(
aes(
x = 0,
y = 0,
xend = w[1],
yend = w[2]
),
colour = "red",
size = 1.2,
arrow = arrow(length = unit(0.03, "npc"))
) +
geom_text(aes(
x = w[1],
y = w[2],
label = w_label
),
colour = "blue") +
coord_fixed()
print(p)
}
},
movie.name = "animation6.gif",
convert = "gm convert",
interval = 1
)
library(animation)
saveGIF(
expr = {
Xs <- seq(-2, 2, len = 100)
Ys <- seq(-2, 2, len = 100)
area <- 4
Wx <- seq(2, -2, len = 20)
Wy <- sqrt(area - Wx ^ 2)
Wx <- c(Wx, rev(Wx))
Wy <- c(Wy, -rev(Wy))
Wx <- rev(Wx)
Wy <- rev(Wy)
W <- cbind(Wx, Wy)
W <- t(apply(W, 1, function(x) {
x / sqrt(x[1] ^ 2 + x[2] ^ 2)
}))
XY <- expand.grid(Xs, Ys)
names(XY) <- c("Xs", "Ys")
XY_bak <- XY
XY <- as.data.frame(XY)
b <- 1 / 2
for (i in 1:nrow(W)) {
w <- W[i, ]
XY <- XY_bak
XY$Inner_Product <- apply(XY, 1, function(x) {
sum(x * w) > b
})
names(XY) <- c("x", "y", "Inner_Product")
w_label <- paste0("(",
round(w[1], digits = 2),
",",
round(w[2], digits = 2),
")")
library(ggplot2)
p <- ggplot(XY, aes(
x = x,
y = y,
colour = Inner_Product
)) +
geom_point(size = 0.5) +
geom_segment(
aes(
x = 0,
y = 0,
xend = w[1],
yend = w[2]
),
colour = "red",
size = 1.2,
arrow = arrow(length = unit(0.03, "npc"))
) +
geom_text(aes(
x = w[1],
y = w[2],
label = w_label
),
colour = "blue") +
coord_fixed()
print(p)
}
},
movie.name = "animation7.gif",
convert = "gm convert",
interval = 1
)
set.seed(2012)
X <- rnorm(100)
Y <- rnorm(100)
inner_prod <- sapply(1:1000000, function(x) {
sum(sample(X) * sample(Y))
})
#上边的代码,当然也可以用replicate改写
sum(sort(X) * sort(Y))
range(inner_prod)
hist(inner_prod)
cor_coef <- cjb %>%
select(yw:sw) %>%
cor() %>%
round(digits = 2)
#> yw sx wy zz ls dl wl hx sw
#> yw 1.00 0.46 0.54 0.47 0.38 0.37 0.30 0.38 0.40
#> sx 0.46 1.00 0.55 0.39 0.37 0.45 0.57 0.59 0.60
#> wy 0.54 0.55 1.00 0.37 0.28 0.38 0.47 0.44 0.44
#> zz 0.47 0.39 0.37 1.00 0.39 0.32 0.20 0.28 0.28
#> ls 0.38 0.37 0.28 0.39 1.00 0.41 0.31 0.33 0.38
#> dl 0.37 0.45 0.38 0.32 0.41 1.00 0.39 0.44 0.46
#> wl 0.30 0.57 0.47 0.20 0.31 0.39 1.00 0.62 0.65
#> hx 0.38 0.59 0.44 0.28 0.33 0.44 0.62 1.00 0.69
#> sw 0.40 0.60 0.44 0.28 0.38 0.46 0.65 0.69 1.00
# Tile for cor ------------------------------------------------------------
#HOW TO INTERPRET A CORRELATION COEFFICIENT R By Deborah J. Rumsey
library(ggplot2)
library(tidyverse)
cor_coef %>%
as.data.frame() %>%
rownames_to_column(var = "km1") %>%
gather(key = km2, value = cor_num, -km1) %>%
mutate(cor_level = cut(
cor_num,
breaks = c(0, 0.3, 0.5, 0.7, 1),
right = FALSE
)) %>%
ggplot(aes(
x = fct_inorder(km1),
y = fct_inorder(km2),
fill = cor_level
)) +
geom_tile(colour = "white", size = 1.5) +
geom_text(aes(label = format(cor_num, digits = 2))) +
scale_fill_brewer(palette = "YlGn", name = "相关系数区间")
ggsave("cor_coef.png", dpi = 600)
#区间划分方法并不唯一:
#Hinkle DE, Wiersma W, Jurs SG (2003). Applied Statistics for the Behavioral Sciences 5th ed. Boston: Houghton Mifflin
#0~0.3negligible correlation
#0.3~0.5low correlation
#0.5~0.7moderate correlation
#0.7~0.9high correlation
#0.9~1very high correlation
# Catetorial vs Numeric ---------------------------------------------------
#离散变量vs连续变量
#主要是分组绘图
#对不同的组别进行比较
# Grouped boxplots --------------------------------------------------------
#分组绘制箱线图
#看看不同班级数学成绩的分布
library(ggplot2)
ggplot(cjb, aes(x = bj,
y = sx,
fill = bj)) +
geom_boxplot(
outlier.colour = "red",
outlier.shape = 3,
outlier.size = 1
) +
labs(x = "班级", y = "数学成绩") +
theme(legend.position = "none")
ggsave("grouped_boxplots.png", dpi = 600)
#其余图形如直方图、概率密度图等,请自行练习
# Grouped density plots ---------------------------------------------------
#当然,如果分组太多,显然不适合全都叠加在一起
#可以采用以下方式
library(ggridges)#绘制层峦叠嶂图
library(viridis)#采用其中的颜色
ggplot(cjb, aes(x = sx, y = bj, fill = ..x..)) +
geom_density_ridges_gradient(scale = 2,
rel_min_height = 0.01,
gradient_lwd = 1) +
scale_fill_viridis(name = "数学成绩",
option = "C") +
labs(x = "数学", y = "班级")
ggsave("density_ridges.png", dpi = 600)
# featurePlot -------------------------------------------------------------
#对于分类问题而言,在进行数据描述时
#最关键的,当属因变量vs自变量了
library(caret)
featurePlot(
x = cjb[, 4:12],
y = cjb$wlfk,
plot = "density",
scales = list(
x = list(relation = "free"),
y = list(relation = "free")
),
adjust = 1.5,
pch = "|",
auto.key = list(columns = 2)
)
#从上图可以看出,数学/生物最优辨识度
#而语文,几乎文理科生没有什么区别
#变量之间的依存关系
#可以通过信息增益来度量
#也就是说,当我们知道某个自变量时
#有助于因变量不确定性的减少
library(infotheo)
condentropy(cjb$wlfk) -
condentropy(cjb$wlfk, cjb$xb)