-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexplore_data.R
490 lines (400 loc) · 15 KB
/
explore_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
#**************************************************************************************
#Multivariate Analysis
#Spring Term 2017
#Final Project
#Lecturer: Tomas Aluja
#
# Date: 01.04.2017
# Jakob Gerstenlauer
###############################################################################################
#remove old objects for safety resons
rm(list=ls(all=TRUE))
#utility function
glue<-function(...){paste(...,sep="")}
#define path of standard directories
source("workingDir.R")
setwd(trainDir)
#read the inputs (features/predictors)
d<-read.table("X_train.txt")
dim(d)
#[1] 7352 561
#Are there any missing values in the inputs?
table(complete.cases(d))
#TRUE
#7352
#How many input variables are there?
(maxColumns<-dim(d)[2])
# 561
#read the outputs
dy<-read.table("y_train.txt")
dim(dy)
#[1] 7352 1
#Are there any missing values in the outputs?
table(complete.cases(dy))
#TRUE
#7352
#Let's have a look at the distribution of labels:
table(dy$V1)
# 1 2 3 4 5 6
# 1226 1073 986 1286 1374 1407
#Maybe we have to weight the predictions for label 3 more strongly,
#because there are less observations.
#If we don't do so, the final model might be less accurate in correctly
#predicting label 3.
includeColum<-vector()
#Test for all input variables if they are significantly related to the labels.
for(numVar in 1:maxColumns){
d2 <- data.frame(dy$V1, d[,numVar])
names(d2)<-c("y","x")
m1.aov<-aov(as.numeric(x)~as.factor(y), data=d2)
m0.aov<-aov(as.numeric(x)~1, data=d2)
p.value<-anova(m1.aov,m0.aov)[["Pr(>F)"]][2]
if(p.value<0.05){
includeColum<-c(includeColum,TRUE)
}else{
includeColum<-c(includeColum,FALSE)
}
}
#remove columns without effect
table(includeColum)
#includeColum
# FALSE TRUE
# 6 555
d<-d[,includeColum]
(input.means<-apply(d,2,mean))
#the inputs are not centered!
(input.sd<-apply(d,2,sd))
hist(input.sd)
#The standard deviation of inputs is bounded between 0.0 and 0.8.
#As we know from the data description:
#"Features are normalized and bounded within [-1,1]".
#Nevertheless, we should scale the inputs because we do not know if the
#variance of measurement inputs has any meaning.
#Probably measurement with higher variance are even less reliable.
Xs <- as.matrix(scale(d, center = TRUE, scale = TRUE))
#We have to convert Y into a matrix with 6 vectors,
#each vector being a dummy variable for one of the possible labels.
Y<-model.matrix( ~ as.factor(dy$V1) - 1)
dim(Y)
#[1] 7352 6
subjects <- as.factor(readLines("subject_train.txt"))
table(subjects)
# 1 11 14 15 16 17 19 21 22 23 25 26 27 28 29 3 30 5 6 7 8
# 347 316 323 328 366 368 360 408 321 372 409 392 376 382 344 341 383 302 325 308 281
#library("DMwR")
#outlier_scores<- lofactor(Xs, k=5)
#setwd(plotDir)
#jpeg("Outliers.jpg")
#plot(density(outlier_scores))
#dev.off()
#higher than 1.7 seem to be the real bad guys!
# pick top x outliers
#outliers <- outlier_scores[outlier_scores > 1.7]
#hist(outliers)
#Which are the outliers?
#print(outliers)
#[1] 1.745032 2.057507 1.721322 1.805697
#We excluded the top 4 outliers.
# hist(outlier_scores)
# which(outlier_scores > 1.7)
# [1] 71 1905 3935 5067
indeces.to.exclude<-c(71, 1905, 3935, 5067)
#Here we exclude these 4 outliers from both Xs and Y
Xs <- Xs[-indeces.to.exclude,]
Y <- Y[-indeces.to.exclude,]
subjects <- subjects[-indeces.to.exclude]
#Conclusion: All variables show an effect on the output!
library(FactoMineR)
pca1.fm <- PCA(as.data.frame(Xs),
#a boolean, if TRUE (value set by default) then data are scaled to unit variance
scale.unit = TRUE,
#number of dimensions kept in the results (by default 5)
ncp = 100,
graph = FALSE
)
plot(pca1.fm)
summary(pca1.fm)
str(pca1.fm)
barplot(pca1.fm$eig$`cumulative percentage of variance`)
pca1.fm$eig$`cumulative percentage of variance`[1:59]
#Based on the 90% rule, I would need the first 59 principal components!
nc<-59
#perform a varimax rotation
pca1.varimax<-varimax(pca1.fm$var$cor[,1:nc])
#extract the loadings of the variables on the nc significant principal components:
loadings<-pca1.varimax$loadings
dim(loadings)
#[1] 555 59
#Feature extraction:
#For each significant component do:
cut.off<-0.8
#Lower threshold if no variables were selected for default cut-off
cut.off.2<-0.7
for(i in 1:nc){
loading<-loadings[,i]
#1. Select all variables above a certain threshold for the absolute value of the loading.
index<-abs(loading)>cut.off
if(length(index)==0){
index<-abs(loading)>cut.off.2
}
if(length(index)==0){
next;
}
m<- as.matrix(Xs[,index])
#2. If the loading is negative, multiply the value of the variable with -1.
b<-ifelse(loading[index]<0, -1, 1)
#3. Calculate the mean value of al selected variables and store it as a new feature.
c<-apply(m %*% b, 1, mean)
if(exists("d.pc")){
d.pc<-cbind(d.pc,c)
}else{
d.pc<-c
}
rm(c)
}
dim(d.pc)
#[1] 7348 59
#*******************************************************************
#Instead of using that many principal components,
#it's better to use partial least squares regression.
#The advantage of PLSR is that the selected components
#are also correlated with the response labels.
#*******************************************************************
library(pls)
#Warning: This call can take a long, long time!
#m1.pls2 <- plsr(Y ~ Xs, validation = "LOO")
#fit model without validation:
m1.pls2 <- plsr(Y ~ Xs, validation = "none")
#TODO Try out parallelised cross-validation:
# ## Parallelised cross-validation, using persistent cluster:
# library(parallel)
# ## This creates the cluster:
# pls.options(parallel = makeCluster(4, type = "PSOCK"))
# ## The cluster can be used several times:
# yarn.pls <- plsr(density ~ NIR, 6, data = yarn, validation = "CV")
summary(m1.pls2)
# Plot of R2 for each class
plot(R2(m1.pls2), legendpos = "bottomright")
#calculate the mean R2 over all classes for an increasing number of components:
r2.mean<-apply(R2(m1.pls2)$val[1,,],2,mean)
#plot the mean R2 for the first 50 components:
plot(1:50,r2.mean[1:50],pch="+")
# Calculate the coefficient of determination
# based on generalized cross-validation:
n <- nrow(Xs)
#p <- ncol(Xs)
p<-50
q <- ncol(Y)
R2cv<-rep(-1,p)
for (i in 1:p) {
lmY <- lm(Y~m1.pls2$scores[,1:i])
PRESS <- apply((lmY$residuals/(1-ls.diag(lmY)$hat))^2,2,sum)
R2cv[i] <- mean(1 - PRESS/(sd(Y)^2*(n-1)));
}
setwd(plotDir)
jpeg("NrOfPLSIIComponents.jpeg")
#plot generalized CV estimate first
plot(1:p,R2cv,type="l",
xlab="number of components",
ylab="coefficient of determination")
points(1:p,r2.mean[1:p],type="l",col="red")
dev.off()
#Conclusion: Based on generalized CV 30 components are sufficient.
nc<-30
###################################################################################################################
# Cluster analysis
###################################################################################################################
#the loadings of the inputs on the components
Psi<-m1.pls2$loadings[,1:nc]
#loadings of the individuals
scores<-m1.pls2$scores[,1:nc]
#... is transformed into a distance matrix.
dist.matrix<-dist(scores, method = "euclidean")
#Then I perform hierarchical clustering based on this distance matrix
#and the corrected Ward algorithm:
clusters <- hclust(dist.matrix, method = "ward.D2")
setwd(plotDir)
jpeg("hierarchical_clustering_WARD.jpg")
plot(clusters)
dev.off()
jpeg("hierarchical_clustering_WARD_inertia_explained.jpg")
barplot(clusters$height[1:10])
dev.off()
#Let's use 5 splits / 6 clusters, which also corresponds to our 6 classes!
cl <- cutree(clusters, 6)
table(cl)
# cl
# 1 2 3 4 5 6
# 2621 1445 845 1100 173 1168
setwd(plotDir)
jpeg("hierarchical_clustering_ward_6classes_PC1_PC2.jpeg")
plot(Psi[,1],Psi[,2],type="n",main="Clustering of observations into 4 classes")
points(Psi[,1],Psi[,2],col=cl,pch=dy$V1, cex = 0.6)
abline(h=0,v=0,col="gray")
legend("topleft",c("c1","c2","c3","c4","c5","c6"),pch=20,col=c(1:6))
dev.off()
setwd(plotDir)
jpeg("hierarchical_clustering_ward_6classes_PC1_PC3.jpeg")
plot(Psi[,1],Psi[,3],type="n",main="Clustering of observations into 4 classes")
points(Psi[,1],Psi[,3],col=cl,pch=dy$V1, cex = 0.6)
abline(h=0,v=0,col="gray")
legend("topleft",c("c1","c2","c3","c4","c5","c6"),pch=20,col=c(1:6))
dev.off()
#Is there any correspondence between the clusters and the labels?
ct<-data.frame(labels=dy$V1,clusters=cl)
table(ct)
# clusters
# labels 1 2 3 4 5 6
# 1 0 0 818 3 23 382
# 2 0 0 27 316 2 728
# 3 0 0 0 781 148 57
# 4 1215 70 0 0 0 1
# 5 1369 5 0 0 0 0
# 6 37 1370 0 0 0 0
# cluster 1 -> label 4+5
# cluster 2 -> label 6
# cluster 3 -> label 1
# cluster 4 -> label (2)+3
# cluster 5 -> label 3
# cluster 6 -> label (1)+2
#Conclusion:
#There is a lot of overlap between labels 1+2+3 and 4+5!
#Now let's run a consolidated cluster analysis using the centroids of this partitioning:
#Consolidation of the partition:
#I use the centroids of the 6 clusters found with hierarchical clustering (WARD)
#as starting point for k-means:
#Calculate the centroids of the 6 clusters
#in the nc significant dimensions (principal components)
cdg <- aggregate(as.data.frame(scores),list(cl),mean)[,2:(nc+1)]
k6 <- kmeans(scores, centers=cdg)
Bss <- sum(rowSums(k6$centers^2)*k6$size) # = k5$betweenss
Wss <- sum(k6$withinss) # = k5$tot.withinss
(Ib <- 100*Bss/(Bss+Wss))
#[1] 75.88375
#Is there any correspondence between the clusters and the labels?
ct2<-data.frame(labels=dy$V1,clusters=k6$cluster)
table(ct2)
# clusters
# labels 1 2 3 4 5 6
# 1 0 0 812 151 62 201
# 2 0 0 47 176 3 847
# 3 0 0 40 634 159 153
# 4 1189 89 0 0 0 8
# 5 1372 0 0 0 0 2
# 6 21 1372 0 0 0 14
#****************************************************************************************
# Next step:
# a) Use a regression tree to predict the label using the 25 significant components as input!
# b) Use a random forrest to predict the label using the 25 significant components as input!
# c) Use relevance vector machine or support vector machine using the 25 significant components as input!
#****************************************************************************************
Psi<-m1.pls2$Yscores[,1:nc]
training.data <- data.frame(Psi)
training.data$subjects<-subjects
dim(training.data)
#[1] 7348 31
# choosing the training and test data
setwd(trainDir)
Y_train <- as.factor(dy$V1)
#append the class labels, exclude outliers
training.data$y <- Y_train[-indeces.to.exclude]
setwd(testDir)
Y_test <- read.table("y_test.txt")
Y_test$V1<-as.factor(Y_test$V1)
X_test <- read.table("X_test.txt")
#The test inputs have to be transformed in the same way as the training inputs!
#First we have to exclude 6 columns which we excluded for the training set
X_test <- X_test[,includeColum]
N <- nrow(X_test)
p <- ncol(X_test)
############# Subtract the mean values of the columns of the training data ############
correction.matrix<-as.matrix(rep(1,N)) %*% t(as.matrix(input.means))
dim(correction.matrix)
dim(as.matrix(X_test))
#check if the mean of the columns is identical to column means
table(input.means == apply(correction.matrix,2,mean))
#check if the median of the columns is identical to column means
table(input.means == apply(correction.matrix,2,median))
#check if the min of the columns is identical to column means
table(input.means == apply(correction.matrix,2,min))
#subtract the means of the inputs in the training set
X_test<-X_test - correction.matrix
############# Divide by the standard deviation of the columns of the training data ############
correction.matrix<-as.matrix(rep(1,N) %*% t(input.sd))
#check if the mean of the columns is identical to column sd
table(input.sd == apply(correction.matrix,2,mean))
#divide by the standard deviation of the inputs in the training set
X_test<-X_test / correction.matrix
############# regression tree ############
library(rpart)
set.seed(567)
#Use Gini index as impurity criterion:
m1.rp <- rpart(Adjusted ~ ., method="class", data=d.train, control=rpart.control(cp=0.001, xval=10))
printcp(m1.rp)
############# random forest ############
set.seed(9019)
#install.packages("randomForest")
library(randomForest)
#Possible hyperparameter:
# ntree: Number of trees to grow. This should not be set to too small a number, to ensure that every input row gets predicted at least a few times.
# mtry: Number of variables randomly sampled as candidates at each split. Note that the default values are different for classification (sqrt(p) where p is number of variables in x) and regression (p/3)
# classwt: Priors of the classes. Need not add up to one. Ignored for regression.
# strata: Maybe define the subject (the person) as strata
#A (factor) variable that is used for stratified sampling.
# sampsize
# Size(s) of sample to draw. For classification, if sampsize is a vector of the length
# the number of strata, then sampling is stratified by strata,
# and the elements of sampsize indicate the numbers to be drawn from the strata.
# nodesize: use 10!
# Minimum size of terminal nodes. Setting this number larger causes smaller trees to be grown (and thus take less time). Note that the default values are different for classification (1) and regression (5).
# maxnodes
m1.rf <- randomForest(y~.,
ntree=1000,
mtry=10,
classwt= rep(1/6,6),
importance=TRUE,
data=training.data,
xtest=X_test,
ytest=Y_test,
nodesize=10,
maxnodes=10)
#Plot the error rate for an increasing number of trees:
setwd(plotDir)
jpeg("ErrorRate_NrOfTrees.jpeg")
plot(m1.rf)
dev.off()
#The error rate does not decrease above 200 trees.
pred_rf_data <- predict(rf_data,newdata = X_test)
table(pred_rf_data,Y_test)
plot(rf_data)
#Step 1: define the LH scheme
require("lhs")
#number of samples from the LHC
SampleSize<-16;
NumVariables<-2;
#Now define the ranges for all four parameters of the LHC:
#V1: mtry
low_V1= 5;
high_V1= nc;
#V2: number of trees
low_V2 = 200;
high_V2 = 600;
#V3: nodesize
low_V3 = 1;
high_V3 = 8;
#set-up the Latin Hypercube sampling scheme
LHS<-improvedLHS(n=SampleSize, k=NumVariables, dup=1)
for (simulation in seq(1,dim(LHS)[1]))
{
for (arguments in seq(1,NumVariables))
{
#Here we use the quantile function for the uniform distribution to "translate" from the standard uniform distribution to the respective trait range
eval(parse(text=paste(
'A',arguments,'<-round(qunif(LHS[simulation,',arguments,'], min=low_V',arguments,', max=high_V',arguments,'),digits=3)'
,sep="")));
m1.rf <- randomForest(y~., ntree=A2, mtry=A1, nodesize = A3, classwt= rep(1/6,6), strata=subject, importance=TRUE, data=training.data)
#TODO: Evaluate model quality based on prediction error! Remember best model!
}
}
#################finish of random forest######################