# logit_spls_example.R: how to use logit.spls (sparse PLS for logistic regression)
library(plsgenomics)
### generate a data sample
## X: matrix n x p with observations in rows and covariates in columns
## Y: vector of length n with binary values (0/1)
n <- 200
p <- 100
kstar <- 10
lstar <- 2
beta.min <- 5
beta.max <- 10
mean.H <- 0
sigma.H <- 10
sigma.F <- 5
sample1 <- sample.bin(n, p, kstar, lstar, beta.min, beta.max,
                      mean.H, sigma.H, sigma.F, seed = NULL)
X <- sample1$X
Y <- sample1$Y
print(table(Y))
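### indices of the truly relevant covariates, stored by sample.bin in
### sample1$sel (they are compared with the selected variables further down)
sample1$sel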
### split into training and test sets (70% / 30%)
index.train <- sort(sample(1:n, size = round(0.7*n)))
index.test <- (1:n)[-index.train]
Xtrain <- X[index.train,]
Ytrain <- Y[index.train] # Y is a vector, so no comma when subsetting
Xtest <- X[index.test,]
Ytest <- Y[index.test]
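### sanity check: the random split should leave both classes represented
### in the training and test subsets
print(table(Ytrain))
print(table(Ytest))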
######## tuning hyper-parameters with cross-validation
### hyper-parameter values to test
lambda.l1.range <- seq(0.05, 0.95, by = 0.1) # between 0 and 1
ncomp.range <- 1:5
# log-linear range between 0.01 and 1e+05 for lambda.ridge.range
logspace <- function(d1, d2, n) exp(log(10)*seq(d1, d2, length.out = n))
lambda.ridge.range <- signif(logspace(d1 = -2, d2 = 5, n = 11), digits = 3)
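### the cross-validation explores the full grid of candidate triplets
### (lambda.ridge, lambda.l1, ncomp): 11 x 10 x 5 = 550 combinations here
length(lambda.ridge.range) * length(lambda.l1.range) * length(ncomp.range)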
### tuning the hyper-parameters
## (/!\ uses 8 cores; set ncores to match your machine)
cv1 <- logit.spls.cv(X=Xtrain, Y=Ytrain, lambda.ridge.range=lambda.ridge.range,
                     lambda.l1.range=lambda.l1.range,
                     ncomp.range=ncomp.range, adapt=TRUE, maxIter=100,
                     svd.decompose=TRUE, return.grid=TRUE,
                     ncores=8, nfolds=5, nrun=1, center.X=TRUE, scale.X=FALSE,
                     weighted.center=TRUE)
str(cv1)
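### since return.grid=TRUE, the grid of averaged cross-validation error
### rates should be returned in cv1$cv.grid (check str(cv1) above for the
### exact element name); inspecting it helps to verify that the optimum
### does not lie on the boundary of the explored ranges
head(cv1$cv.grid)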
### optimal values
lambda.l1 <- cv1$lambda.l1.opt
lambda.ridge <- cv1$lambda.ridge.opt
ncomp <- cv1$ncomp.opt
######## fitting the model and predicting new observations
model1 <- logit.spls(Xtrain=Xtrain, Ytrain=Ytrain, Xtest=Xtest,
                     lambda.ridge=lambda.ridge, lambda.l1=lambda.l1, ncomp=ncomp,
                     adapt=TRUE, maxIter=100, svd.decompose=TRUE,
                     center.X=TRUE, scale.X=FALSE, weighted.center=TRUE)
str(model1)
### training (estimation) error rate
sum(Ytrain != model1$hatY) / length(Ytrain)
### test (prediction) error rate
sum(Ytest != model1$hatYtest) / length(Ytest)
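### confusion matrix on the test set (rows: observed, columns: predicted)
table(observed = Ytest, predicted = model1$hatYtest)
### (optional) ROC curve and AUC, assuming the fitted object exposes the
### predicted test probabilities in model1$proba.test (check str(model1));
### requires the pROC package
# library(pROC)
# roc1 <- roc(response = Ytest, predictor = as.numeric(model1$proba.test))
# auc(roc1)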
### plotting the predictor coefficients
plot(model1$Coefficients, xlab="variable index", ylab="coeff")
### low-dimensional representation (first two components, colored by class)
plot(model1$X.score[,1:2], xlab="comp1", ylab="comp2", col=Ytrain+1)
### selected variables
model1$A
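### overlap between the selected variables and the truly relevant ones
intersect(model1$A, sample1$sel)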
### plotting the PLS outputs with ggplot2
library(ggplot2)
## PLS scores
pls_score <- data.frame(
    pls1 = model1$X.score[,1],
    pls2 = model1$X.score[,2]
)
g1 <- ggplot(data = pls_score, aes(x = pls1, y = pls2)) +
    geom_hline(yintercept = 0, lty = 2) +
    geom_vline(xintercept = 0, lty = 2) +
    geom_point(alpha = 0.8) +
    theme_bw()
g1
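## same scatter plot colored by observed class, mirroring the base-R plot
## above (Ytrain is recoded as a factor to get a discrete color scale)
pls_score$class <- factor(Ytrain)
g1b <- ggplot(data = pls_score, aes(x = pls1, y = pls2, color = class)) +
    geom_hline(yintercept = 0, lty = 2) +
    geom_vline(xintercept = 0, lty = 2) +
    geom_point(alpha = 0.8) +
    theme_bw()
g1b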
## correlation circle (correlations between covariates and PLS components)
circleFun <- function(center = c(0,0), diameter = 1, npoints = 100){
    r <- diameter / 2
    tt <- seq(0, 2*pi, length.out = npoints)
    xx <- center[1] + r * cos(tt)
    yy <- center[2] + r * sin(tt)
    return(data.frame(x = xx, y = yy))
}
circ <- circleFun(c(0,0), 2, npoints = 500)
pls_cor <- as.data.frame(cor(Xtrain, model1$X.score))
colnames(pls_cor) <- paste0("pls", 1:ncol(model1$X.score)) # pls components
row.names(pls_cor) <- paste0("X", 1:ncol(X)) # covariate names (edit accordingly)
pls_cor <- pls_cor[model1$A,] # keep only selected variables
g2 <- ggplot() +
    geom_path(data = circ, aes(x, y), lty = 2, color = "grey", alpha = 0.7) +
    geom_hline(yintercept = 0, lty = 2, color = "grey", alpha = 0.9) +
    geom_vline(xintercept = 0, lty = 2, color = "grey", alpha = 0.9) +
    geom_segment(
        data = pls_cor, aes(x = 0, xend = pls1, y = 0, yend = pls2),
        arrow = arrow(length = unit(0.025, "npc"), type = "open"),
        lwd = 1) +
    geom_text(
        data = pls_cor,
        aes(x = pls1*1.15, y = pls2*1.15,
            label = row.names(pls_cor)),
        check_overlap = FALSE, size = 3) +
    xlab("PLS1") +
    ylab("PLS2") +
    coord_equal() +
    theme_minimal() +
    theme(panel.grid = element_blank(),
          panel.border = element_rect(fill = "transparent"))
g2
######## stability selection procedure
## (/!\ uses 8 cores; set ncores to match your machine)
stab1 <- logit.spls.stab(X=Xtrain, Y=Ytrain, lambda.ridge.range=lambda.ridge.range,
                         lambda.l1.range=lambda.l1.range,
                         ncomp.range=ncomp.range,
                         adapt=TRUE, maxIter=100, svd.decompose=TRUE,
                         ncores=8, nresamp=50,
                         center.X=TRUE, scale.X=FALSE, weighted.center=TRUE,
                         seed=NULL, verbose=TRUE)
str(stab1)
### heatmap of estimated probabilities
stability.selection.heatmap(stab1)
### selected covariates
tmp <- stability.selection(stab1, piThreshold=0.75, rhoError=10)
tmp
### "true" pertinent covariates vs selected ones
sample1$sel
tmp$selected.predictors
# effect of the probability threshold piThreshold
tmp <- sapply(seq(0.55, 0.95, 0.05), function(prob) {
    return(length(stability.selection(stab1, piThreshold=prob, rhoError=10)$selected.predictors))
})
plot(seq(0.55, 0.95, 0.05), tmp,
     xlab="piThreshold", ylab="number of selected variables")
# effect of restricting the hyper-parameter grid via rhoError
tmp <- sapply(seq(1, 100, 1), function(r) {
    return(length(stability.selection(stab1, piThreshold=0.75, rhoError=r)$selected.predictors))
})
plot(seq(1, 100, 1), tmp,
     xlab="rhoError", ylab="number of selected variables")
# number of truly relevant variables vs numbers of selected variables
# over the rhoError grid
length(sample1$sel)
tmp