# analysis_for_article_R.r
# tidyverse attaches dplyr, ggplot2, stringr, tidyr, and readr (and re-exports
# the magrittr pipe), so those packages do not need to be loaded separately
library(tidyverse)
# The relationship between the number of syntactic errors and the values of the syntactic complexity metrics, colored by task type
df <- read_csv("data.csv")
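# Assumed input layout (inferred from the calls below, not documented in the
# original script): one row per essay, with a `name` column holding the source
# file name, the syntactic complexity metrics in the columns
# av_depth:lemma_sim_all, and a count of syntactic errors in `errors`.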
df %>%
  mutate(task_type = str_extract(name, "[12]\\.txt"),   # escape the dot so it matches a literal "."
         task_type = str_remove(task_type, "\\.txt")) %>%
  pivot_longer(names_to = "syntactic_complexity", values_to = "values",
               av_depth:lemma_sim_all) %>%
  group_by(syntactic_complexity, task_type) %>%
  mutate(average = mean(values)) %>%
  ggplot(aes(values, errors, color = task_type)) +
  geom_point(size = 0.1) +
  labs(col = "Task type") +
  geom_smooth(method = "glm",
              method.args = list(family = poisson),
              aes(group = task_type), se = FALSE) +
  geom_rug(aes(x = average), sides = "b") +
  facet_wrap(~syntactic_complexity, scales = "free_x") +
  scale_colour_grey() +
  theme_bw()
ggsave("figure_7.png", device = "png", width = 9, height = 7)
# FOR TYPE 1 ESSAYS
df1 <- read_csv("data_1.csv")
colnames(df1)
M1 <- glm(errors ~ .,
          family = "poisson",
          data = df1[, -c(1, 22, 24)])  # drop the id, type, and name columns
summary(M1)
# Find aliases (linearly dependent terms)
alias(glm(errors ~ ., family = "poisson", data = df1[, -c(1, 22, 24)]))  # num_compl_tu is aliased and must be removed
M1 <- glm(errors ~ ., family = "poisson", data = df1[, -c(1, 22, 24, 13)])
summary(M1)
# Calculate the variance inflation factors of all predictors
car::vif(M1)  # remove pos_sim_nei, lemma_sim_nei, pos_sim_all, lemma_sim_all
M1 <- glm(errors ~ ., family = "poisson", data = df1[, -c(1, 22, 24, 13, 18:21)])
summary(M1)
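# Optional sanity check (not part of the original script): a Poisson model
# assumes the conditional variance equals the mean, so a residual deviance
# far above the residual degrees of freedom would hint at overdispersion.
deviance(M1) / df.residual(M1)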
# Keep the predictors whose confidence intervals exclude zero (i.e., significant in M1)
data_result <- df1 %>% select(max_depth, num_advcl, num_sent, num_adj_noun, num_noun_inf)
data_result
# Standardize the selected metrics (scale() returns matrix columns, which rowMeans() handles)
scaled_data <- data_result %>%
  mutate(across(everything(), scale))
scaled_data
# Take the average for each row for the selected metrics
rowMeans(scaled_data)
df1$mean_value <- rowMeans(scaled_data)
# The relationship between the number of errors and the average of the transformed metrics
df1 %>%
  ggplot(aes(mean_value, errors)) +
  geom_point(size = 0.1) +
  xlab("generalized complexity metric") +
  geom_smooth(method = "glm",
              method.args = list(family = poisson),
              se = TRUE) +
  theme_bw()
ggsave("figure_2_type1.png", device = "png", width = 9, height = 7)
# The relationship between the number of syntactic errors and the values of syntactic complexity metrics separately
final_fit <- df1 %>%
  select(max_depth, num_advcl, num_sent, num_adj_noun, num_noun_inf, errors) %>%
  glm(errors ~ ., family = "poisson", data = .)
summary(final_fit)
library(ggeffects)
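# ggpredict() without a `terms` argument returns marginal predictions for each
# predictor in turn; plot() on that object yields a list of ggplots, one per predictor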
p <- ggpredict(final_fit) %>%
plot()
library(ggpubr)
ggarrange(p[[1]], p[[2]], p[[3]],p[[4]], p[[5]])
ggsave("figure_4_type1.png", device = "png", width = 9, height = 7)
# FOR TYPE 2 ESSAYS
df2 <- read_csv("data_2.csv")
colnames(df2)
M2 <- glm(errors ~ .,
          family = "poisson",
          data = df2[, -c(1, 22, 24)])  # drop the id, type, and name columns
summary(M2)
# Find aliases (linearly dependent terms)
alias(glm(errors ~ ., family = "poisson", data = df2[, -c(1, 22, 24)]))  # num_compl_tu is aliased and must be removed
M2 <- glm(errors ~ ., family = "poisson", data = df2[, -c(1, 22, 24, 13)])
summary(M2)
# Calculate the variance inflation factors of all predictors
car::vif(M2)
M2 <- glm(errors ~ ., family = "poisson", data = df2[, -c(1, 22, 24, 13, 18:21)])
summary(M2)
# Keep the predictors whose confidence intervals exclude zero (i.e., significant in M2)
data_result <- df2 %>%
  select(num_advcl, num_sent, num_tok, av_len_sent, num_cl, num_tu, num_coord, num_part_noun)
data_result
# Standardize the selected metrics
scaled_data <- data_result %>%
  mutate(across(everything(), scale))
scaled_data
# Take the average for each row for the selected metrics
rowMeans(scaled_data)
df2$mean_value <- rowMeans(scaled_data)
# The relationship between the number of errors and the average of the transformed metrics
df2 %>%
  ggplot(aes(mean_value, errors)) +
  geom_point(size = 0.1) +
  xlab("generalized complexity metric") +
  geom_smooth(method = "glm",
              method.args = list(family = poisson),
              se = TRUE) +
  theme_bw()
ggsave("figure_3_type2.png", device = "png", width = 9, height = 7)
# The relationship between the number of syntactic errors and the values of syntactic complexity metrics separately
final_fit <- df2 %>%
  select(num_advcl, num_sent, num_tok, av_len_sent, num_cl, num_tu, num_coord, num_part_noun, errors) %>%
  glm(errors ~ ., family = "poisson", data = .)
summary(final_fit)
# ggeffects and ggpubr are already attached above
p <- ggpredict(final_fit) %>%
  plot()
ggarrange(p[[1]], p[[2]], p[[3]], p[[4]], p[[5]], p[[6]], p[[7]], p[[8]], ncol=4, nrow=2)
ggsave("figure_5_type2.png", device = "png", width = 9, height = 7)
# The density distribution of syntactic complexity metrics for two types of essays
df_all <- df %>%
  mutate(task_type = str_extract(name, "[12]\\.txt"),
         task_type = str_remove(task_type, "\\.txt")) %>%
  pivot_longer(names_to = "syntactic_complexity", values_to = "values",
               av_depth:lemma_sim_all) %>%
  group_by(syntactic_complexity, task_type) %>%
  mutate(average = mean(values))
ggp2 <- ggplot(df_all, aes(x = values, fill = task_type)) +  # one density per metric, split by task type
  geom_density(alpha = 0.6) +
  facet_wrap(~ syntactic_complexity, scales = "free") +
  scale_fill_grey() + theme_classic()
ggp2
ggsave("figure_6.png", device = "png", width = 9, height = 7)
# Kendall rank correlation coefficients across Task 1 essays
library(corrplot)
df1 <- read_csv("data_1.csv")
cc <- cor(df1[, 2:21], method = "kendall")
# corrplot() draws with base graphics, so ggsave() cannot capture it;
# open a png device explicitly instead
png("figure_8_type1.png", width = 9, height = 7, units = "in", res = 300)
corrplot(cc, method = "color")
dev.off()
# Kendall rank correlation coefficients across Task 2 essays
df2 <- read_csv("data_2.csv")
cc <- cor(df2[, 2:21], method = "kendall")
png("figure_9_type2.png", width = 9, height = 7, units = "in", res = 300)
corrplot(cc, method = "color")
dev.off()
# The mean value of syntactic complexity metrics by quantiles for two types of essays
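# NOTE: `error_bars` is never constructed in the script as published. A minimal
# sketch of how it could be derived from `df_all` (assumptions: quartile bins of
# each metric, per-bin mean and sd by task type; the column names `feature`,
# `type`, `quantile`, `mean`, and `sd` are what the plot below expects):
error_bars <- df_all %>%
  group_by(syntactic_complexity) %>%
  mutate(quantile = ntile(values, 4)) %>%                # quartile bin within each metric
  group_by(feature = syntactic_complexity,
           type = task_type, quantile) %>%
  summarise(mean = mean(values), sd = sd(values), .groups = "drop")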
pd <- position_dodge(.15)
ggp2 <- ggplot(error_bars, aes(x = quantile, y = mean, group = type, color = type)) +
  geom_line(position = pd) +
  geom_point(position = pd) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2,
                position = pd) +
  facet_wrap(~ feature, scales = "free") +
  scale_colour_grey() +
  theme_linedraw()
ggp2
ggsave("figure_1.png", device = "png", width = 9, height = 7)