-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyses_1.R
283 lines (232 loc) · 8.45 KB
/
analyses_1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
# set a seed to make sure we don't get ridiculous simulated values
set.seed(2019-11-27)
# simulate some data with the following attributes
n_sites <- 200
n_species <- 3
# we need some predictor variables
predictors <- data.frame(
intercept = rep(1, n_sites),
temperature_c = rnorm(n_sites, mean = 25, sd = 3),
precipitation_mm = rnorm(n_sites, mean = 500, sd = 30),
tree_cover_pc = runif(n_sites, min = 20, max = 80),
flowering_pc = runif(n_sites, min = 0, max = 45)
)
# simulate some species-level associations with covariates
n_covariates <- ncol(predictors)
covariate_effects <- matrix(
rnorm(n_species * n_covariates,
mean = 0, sd = (1 / apply(predictors, 2, mean))),
nrow = n_covariates,
ncol = n_species
)
# we can use these to create a `linear_predictor` that gives the mean
# probability of occurrence for each species at each site (assuming
# a logit link function)
linear_predictor <- plogis(as.matrix(predictors) %*% covariate_effects)
# we can use these to simulate species occurrences
species_occurrences <- matrix(
rbinom(n = (n_sites * n_species), size = 1, prob = linear_predictor),
nrow = n_sites,
ncol = n_species
)
# now we want to fit a GLM to these data to estimate probabilities of
# occurrence as as function of the environmental variables
# start by rescaling the environmental variables so they're all centered
# and have variance = 1
predictors$temperature_std <-
(predictors$temperature_c - mean(predictors$temperature_c)) /
sd(predictors$temperature_c)
predictors$precipitation_std <-
(predictors$precipitation_mm - mean(predictors$precipitation_mm)) /
sd(predictors$precipitation_mm)
predictors$tree_cover_std <-
(predictors$tree_cover_pc - mean(predictors$tree_cover_pc)) /
sd(predictors$tree_cover_pc)
predictors$flowering_std <-
(predictors$flowering_pc - mean(predictors$flowering_pc)) /
sd(predictors$flowering_pc)
rescale <- function (x){
(x - mean(x)) /
sd(x)
}
predictors$temperature_std <-rescale(predictors$temperature_c)
predictors$precipitation_std <-rescale(predictors$precipitation_mm)
predictors$tree_cover_std <-rescale(predictors$tree_cover_pc)
predictors$flowering_std <- rescale(predictors$flowering_pc)
# create a data set with everything includes
data_set <- data.frame(
occ_sp1 = species_occurrences[, 1],
occ_sp2 = species_occurrences[, 2],
occ_sp3 = species_occurrences[, 3],
predictors
)
# fit a GLM model for the first species
model_sp1 <- glm(
occ_sp1 ~ temperature_std +
precipitation_std +
tree_cover_std +
flowering_std,
data = data_set,
family = binomial()
)
# repeat for species 2 and 3
model_sp2 <- glm(
occ_sp2 ~ temperature_std +
precipitation_std +
tree_cover_std +
flowering_std,
data = data_set,
family = binomial()
)
model_sp3 <- glm(
occ_sp3 ~ temperature_std +
precipitation_std +
tree_cover_std +
flowering_std,
data = data_set,
family = binomial()
)
mods <- apply(data_set[,c("occ_sp1", "occ_sp2", "occ_sp3")], 2, function (mycol)
glm( mycol~ temperature_std +
precipitation_std +
tree_cover_std +
flowering_std,
data = data_set,
family = binomial()
))
mods[[1]]
this_models <- function (mycol) {
glm( mycol~ temperature_std +
precipitation_std +
tree_cover_std +
flowering_std,
data = data_set,
family = binomial()
)
}
model_sp1 <- this_models(data_set$occ_sp1)
model_sp2 <- this_models(data_set$occ_sp2)
model_sp3 <- this_models(data_set$occ_sp3)
# let's assume the models are fitted perfectly and we don't need
# to look at model diagnostics. Then we can use these perfect
# models to make some plots of occurrence probabilities against
# predictors
# how many values should we use to draw our fitted association?
n_plot <- 100
# we need a sequence of predictor values (temperature in this case)
# so we can predict occurrence probabilities along a gradient
temp_seq <- seq(min(data_set$temperature_std),
max(data_set$temperature_std),
length = n_plot)
# now we make a new data.frame, with the predictor sequence used
# for the variable of interest, and zeros otherwise
# (holding all other predictors at their mean)
plot_data <- data.frame(
temperature_std = temp_seq,
precipitation_std = rep(0, n_plot),
tree_cover_std = rep(0, n_plot),
flowering_std = rep(0, n_plot)
)
# we can use this new data.frame to predict the probability of
# occurrence at each value along the predictor sequence
plot_values <- predict(model_sp1, newdata = plot_data, type = "response")
# now plot it
plot(plot_values ~ temp_seq, type = "l")
# there's a few things we might want to fix with this plot:
# 1. the x-axis has the standardised values but ideally would
# have the true temperature values
# 2. we could add some informative axis labels
# 3. we could make some formatting changes based on our own preferences
# change the axis range
temp_actual <- temp_seq * sd(data_set$temperature_c) +
mean(data_set$temperature_c)
plot(plot_values ~ temp_actual, type = "l")
# change the axis labels
plot(plot_values ~ temp_actual,
type = "l",
xlab = "Temperature (C)",
ylab = "Probabiliy of occurrence")
# other random things we can change
plot(plot_values ~ temp_actual,
type = "l",
bty = "l",
las = 1,
lwd = 2.5,
ylim = c(0, 1),
xlab = "Temperature (C)",
ylab = "Probabiliy of occurrence")
# could even add a label to the plot
mtext("Species 1", side = 3, line = 0.5, adj = 0.02, cex = 1.5)
# that was easy enough. But now we want to repeat this processs for other
# predictors and species
# let's start with precipitation for species 1
# (note: using almost the same code as above)
# generate a sequence of precipitation values
precip_seq <- seq(min(data_set$precipitation_std), max(data_set$precipitation_std), length = n_plot)
# create a new data.frame with this precipitation sequence,
# holding all other variables at zero (their mean)
plot_data <- data.frame(
temperature_std = rep(0, n_plot),
precipitation_std = precip_seq,
tree_cover_std = rep(0, n_plot),
flowering_std = rep(0, n_plot)
)
# predict occurrence probabilities at each value of precipitation
plot_values <- predict(model_sp1, newdata = plot_data, type = "response")
# rescale the x-axis values to be actual precipitation rather than
# standardised values
precip_actual <- precip_seq * sd(data_set$precipitation_mm) +
mean(data_set$precipitation_mm)
# plot it, with your preferred plot settings
plot(plot_values ~ precip_actual,
type = "l",
bty = "l",
las = 1,
lwd = 2.5,
ylim = c(0, 1),
xlab = "Precipitation (mm)",
ylab = "Probabiliy of occurrence")
# add a label
mtext("Species 1", side = 3, line = 0.5, adj = 0.02, cex = 1.5)
# what if we want to plot precipitation for species 2?
# repeat the above code, swapping out species 1 for species 2
precip_seq <- seq(min(data_set$precipitation_std), max(data_set$precipitation_std), length = n_plot)
plot_data <- data.frame(
temperature_std = rep(0, n_plot),
precipitation_std = precip_seq,
tree_cover_std = rep(0, n_plot),
flowering_std = rep(0, n_plot)
)
plot_values <- predict(model_sp2, newdata = plot_data, type = "response")
precip_actual <- precip_seq * sd(data_set$precipitation_mm) + mean(data_set$precipitation_mm)
plot(plot_values ~ precip_actual,
type = "l",
bty = "l",
las = 1,
lwd = 2.5,
ylim = c(0, 1),
xlab = "Precipitation (mm)",
ylab = "Probabiliy of occurrence")
mtext("Species 2", side = 3, line = 0.5, adj = 0.02, cex = 1.5)
# great, but this seems like a huge amount of copy-paste and a lot of
# opportunities for mistakes if we want to plot three species by
# four predictors. What if we had more species or more predictors?
#--- Here are my functions
plot_models <- function (model_name, species_no, ... ) {
plot_values <- predict(model_name, newdata = plot_data, type = "response")
precip_actual <- precip_seq * sd(data_set$precipitation_mm) + mean(data_set$precipitation_mm)
plot(plot_values ~ precip_actual,
type = "l",
bty = "l",
las = 1,
lwd = 2.5,
ylim = c(0, 1),
xlab = "Precipitation (mm)",
ylab = "Probabiliy of occurrence",
...)
species_label <- paste("Species", species_no, sep=" ")
mtext(species_label, side = 3, line = 0.5, adj = 0.02, cex = 1.5)
}
plot_models(model_sp1, species_no=1, col="red", ylim=c(0,2))
plot_models(model_sp2, species_no=2, col="blue")
plot_models(model_sp3, species_no=3, col="green")