-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01-EDA.R
197 lines (159 loc) · 8.01 KB
/
01-EDA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
library(tidyverse)
library(rgeco)
ggplot2::theme_set(ggplot2::theme_minimal())
# data downloaded from: https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/#turbofan
DATA_DIR <- here::here('data')
DATA_PATH <- file.path(DATA_DIR, 'train_FD001.txt')
DATA_NAME <- basename(DATA_PATH)
# ---- Load data ----
# this is a dataset of simulated turbofan sensor readings over the lifetime of 100 turbofans
# In this training set, each machine is followed up to the failure time.
d <- readr::read_delim(DATA_PATH, delim = ' ',
col_names = c('id', 'time',
stringr::str_c('setting', seq_len(3)),
stringr::str_c('sensor', seq_len(21)),
'empty'),
trim_ws = TRUE
) %>%
dplyr::select(-empty)
# ---- Format as time-to-event ----
## Let's encode the explicit time-to-failure for each machine.
## each machine will have an "event" code of 1 since the failure is observed in this dataset
events <- d %>%
dplyr::group_by(id) %>%
dplyr::summarize(start = min(time)-1,
end = max(time),
event = TRUE,
group = 'training') %>%
dplyr::ungroup()
# we can use this `events` dataframe to plot the survival time over this set of machines
km_data <- rgeco::prep_km_data(data = events, formula = survival::Surv(origin = start, time = end, event = event) ~ group)
km_data %>%
ggplot(aes(x = time, y = estimate)) +
geom_step() +
scale_y_continuous('Machines Running (%)', labels = scales::percent) +
scale_x_continuous('Time (cycles)') +
ggtitle('Kaplan-Meier Estimates of Turbofan Lifetimes') +
labs(caption = glue::glue('For the dataset: {DATA_NAME}'))
# compute RMST for these data at different cutoff times tau
rmst_data <- tibble(tau = seq(from = 0, to = max(km_data$time), by = 10)) %>%
dplyr::mutate(
rmst = purrr::map_dbl(tau,
~ RISCA::rmst(times = km_data$time, surv.rates = km_data$estimate, max.time = .x)))
# ---- EDA of settings data ----
## This dataset records each of 3 operational settings, for each fan
## The operational settings are assumed to be exogenous, that is: they influence the failure rate for each machine, but are not influenced by it.
d %>%
tidyr::gather(setting_name, setting_value, dplyr::starts_with('setting')) %>%
ggplot(., aes(x = time, y = setting_value, group = id)) +
geom_line(alpha = 0.1, size = 0.1) +
facet_wrap(~ setting_name, scale = 'free_y') +
scale_x_continuous('Time (cycles') +
scale_y_continuous('Setting value') +
ggtitle('Operational settings over time for each machine') +
labs(caption = glue::glue('For the dataset: {DATA_NAME}'))
# We learn from this that the `setting3` value is 100 for all machines.
# Also, setting1 & setting2 are on different scales.
# let's plot as percent change from baseline
d %>%
tidyr::gather(setting_name, setting_value, dplyr::starts_with('setting')) %>%
dplyr::group_by(id, setting_name) %>%
dplyr::mutate(percent_change = (setting_value - dplyr::first(setting_value, order_by=time)) / dplyr::first(setting_value, order_by=time)) %>%
dplyr::ungroup() %>%
ggplot(., aes(x = time, y = percent_change, group = id)) +
geom_line(alpha = 0.1, size = 0.1) +
facet_wrap(~ setting_name, scale = 'free_y') +
scale_x_continuous('Time (cycles') +
scale_y_continuous('Setting value (% change from baseline)', labels = scales::percent) +
ggtitle('Operational settings over time for each machine') +
labs(caption = glue::glue('For the dataset: {DATA_NAME}'))
# both setting1 & setting2 have a reasonable frequency of extreme values.
# also, It looks like setting2 has discrete values whereas setting1 is continuous.
# ---- EDA: Setting1 ----
d %>%
dplyr::filter(time == 1) %>%
ggplot(aes(x = setting1)) +
geom_histogram() +
scale_x_continuous('Setting1 value at baseline') +
ggtitle('Distribution of Setting1 values at baseline') +
labs(caption = glue::glue('For the dataset: {DATA_NAME}'))
# At baseline, this setting shows a skewed distribution
d %>%
ggplot(aes(x = setting1, group = time, fill = time)) +
geom_histogram(position = 'stack') +
scale_x_continuous('Setting1 value') +
ggtitle('Distribution of Setting1 values over all times') +
labs(caption = glue::glue('For the dataset: {DATA_NAME}'))
# Over all time, this shows a distribution centered at 0
# with very little evidence of being skewed
d %>%
dplyr::mutate(setting1_normalized = (setting1 - mean(setting1))/sd(setting1)) %>%
ggplot(aes(x = setting1_normalized, group = time, fill = time)) +
geom_histogram() +
scale_x_continuous('[Setting1 - mean(Setting1)] / sd(Setting1)') +
ggtitle('Distribution of Normalized Setting1 values') +
labs(caption = glue::glue('For the dataset: {DATA_NAME}'))
# these data, like those of setting2, appear well-behaved when normalized
# ---- EDA: Setting2 ----
d %>%
dplyr::filter(time == 1) %>%
ggplot(aes(x = setting2)) +
geom_histogram() +
scale_x_continuous('Setting2 value at baseline') +
ggtitle('Distribution of Setting2 values at baseline') +
labs(caption = glue::glue('For the dataset: {DATA_NAME}'))
d %>%
ggplot(aes(x = setting2, group = time, fill = time)) +
geom_histogram() +
scale_x_continuous('Setting2 value') +
ggtitle('Distribution of Setting2 values') +
labs(caption = glue::glue('For the dataset: {DATA_NAME}'))
d %>%
dplyr::mutate(setting2_normalized = (setting2 - mean(setting2))/sd(setting2)) %>%
ggplot(aes(x = setting2_normalized, group = time, fill = time)) +
geom_histogram() +
scale_x_continuous('[Setting2 - mean(Setting2)] / sd(Setting2)') +
ggtitle('Distribution of Normalized Setting2 values') +
labs(caption = glue::glue('For the dataset: {DATA_NAME}'))
# ---- EDA: correlation of settings ----
## Are setting1 & setting2 correlated?
d %>%
dplyr::mutate(setting1_normalized = (setting1 - mean(setting1))/sd(setting1),
setting2_normalized = (setting2 - mean(setting2))/sd(setting2)) %>%
ggplot(aes(x = setting1_normalized, y = setting2_normalized)) +
geom_jitter(alpha = 0.1) +
scale_x_continuous('Setting1 (normalized)') +
scale_y_continuous('Setting2 (normalized)') +
ggtitle('Correlation of Setting1 and Setting2 values') +
labs(caption = glue::glue('For the dataset: {DATA_NAME}'))
# There is almost no evidence of correlation between these two values
## Are setting1 & setting2 correlated with baseline values?
d %>%
dplyr::mutate(setting1_normalized = (setting1 - mean(setting1))/sd(setting1),
setting2_normalized = (setting2 - mean(setting2))/sd(setting2)) %>%
dplyr::group_by(id) %>%
dplyr::mutate(first_setting1 = dplyr::first(setting1_normalized, order_by = time)) %>%
dplyr::ungroup() %>%
ggplot(aes(x = first_setting1, y = setting2_normalized)) +
geom_jitter(alpha = 0.2) +
scale_x_continuous('First Setting1 (normalized)') +
scale_y_continuous('Setting2 (normalized)') +
ggtitle('Correlation of First Setting1 and all Setting2 values') +
labs(caption = glue::glue('For the dataset: {DATA_NAME}'))
d %>%
dplyr::mutate(setting1_normalized = (setting1 - mean(setting1))/sd(setting1),
setting2_normalized = (setting2 - mean(setting2))/sd(setting2)) %>%
dplyr::group_by(id) %>%
dplyr::mutate(first_setting1 = dplyr::first(setting1_normalized, order_by = time),
first_setting2 = dplyr::first(setting2_normalized, order_by = time)) %>%
dplyr::ungroup() %>%
ggplot(aes(x = first_setting2, y = setting1_normalized)) +
geom_jitter(alpha = 0.2) +
scale_x_continuous('First Setting2 (normalized)') +
scale_y_continuous('Setting1 (normalized)') +
ggtitle('Correlation of First Setting2 and all Setting1 values') +
labs(caption = glue::glue('For the dataset: {DATA_NAME}'))
# It appears that setting1 & setting2 are set randomly for all machines
# irrespective of the initial setting or the other setting values
# In other words, we do not have a scenario where there are combinations of settings
# that are more likely than others.