Tidymodels update 2021 #2

Open · wants to merge 6 commits into base: master
Makefile (4 changes: 2 additions & 2 deletions)
@@ -6,15 +6,15 @@ all: results/final_model.rds results/accuracy_vs_k.png results/predictor_distrib

# download data
data/raw/wdbc.feather: src/download_data.py
python src/download_data.py --out_type=feather --url=http://mlr.cs.umass.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data --out_file=data/raw/wdbc.feather
python src/download_data.py --out_type=feather --url=https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data --out_file=data/raw/wdbc.feather

# pre-process data (e.g., scale and split into train & test)
data/processed/training.feather data/processed/test.feather scale_factor.rds: src/pre_process_wisc.r data/raw/wdbc.feather
Rscript src/pre_process_wisc.r --input=data/raw/wdbc.feather --out_dir=data/processed

# exploratory data analysis - visualize predictor distributions across classes
results/predictor_distributions_across_class.png: src/eda_wisc.r data/processed/training.feather
Rscript src/eda_wisc.r --train=data/processed/training.feather --out_dir=results
Rscript src/eda_wisc.r --train=data/processed/training.feather --out_dir=src/breast_cancer_eda_files/figure-gfm

# tune model (here, find K for k-nn using 30 fold cv with Cohen's Kappa)
results/final_model.rds results/accuracy_vs_k.png: src/fit_breast_cancer_predict_model.r data/processed/training.feather
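Note: the Makefile update points the download step at the UCI archive instead of the old mlr.cs.umass.edu mirror and redirects the EDA figures to src/breast_cancer_eda_files/figure-gfm. A minimal R sketch to sanity-check the downloaded file once the target has been built (assumes the arrow package; exact dimensions depend on how download_data.py writes the feather file):

library(arrow)
wdbc <- arrow::read_feather("data/raw/wdbc.feather")
# the WDBC source data has 569 rows: an id, a diagnosis label, and 30 numeric features per row
dim(wdbc)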
src/breast_cancer_test_results.r (8 changes: 4 additions & 4 deletions)
@@ -22,11 +22,11 @@ main <- function(test, out_dir) {

# Load and wrangle test data ----------------------------------------------
test_data <- read_feather(test)
x_test <- test_data %>%
x_test <- test_data |>
select(-class, -se_fractal_dimension, -se_smoothness, -se_symmetry, -se_texture)
y_test <- test_data %>%
select(class) %>%
mutate(class = as.factor(class)) %>%
y_test <- test_data |>
select(class) |>
mutate(class = as.factor(class)) |>
pull()

# Load model and predict --------------------------------------------------
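The only substantive change in this file is replacing magrittr's %>% with the base pipe |>, which requires R >= 4.1.0. A toy illustration of the equivalence (hypothetical data frame, not part of the PR):

library(dplyr)
toy <- data.frame(class = c("B", "M"), mean_radius = c(12.3, 20.1))

# magrittr pipe, as used before this PR
toy %>% select(class) %>% pull()

# base pipe, as used after this PR; identical result
toy |> select(class) |> pull()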
src/eda_wisc.r (9 changes: 4 additions & 5 deletions)
@@ -11,9 +11,8 @@ Options:
--out_dir=<out_dir> Path to directory where the plots should be saved
" -> doc

library(feather)
library(arrow)
library(tidyverse)
library(caret)
library(docopt)
library(ggridges)
library(ggthemes)
@@ -24,9 +23,9 @@ opt <- docopt(doc)
main <- function(train, out_dir) {

# visualize predictor distributions by class
train_data <- read_feather(train) %>%
gather(key = predictor, value = value, -class) %>%
mutate(predictor = str_replace_all(predictor, "_", " ")) %>%
train_data <- arrow::read_feather(train) |>
pivot_longer(names_to = "predictor", values_to = "value", -class) |>
mutate(predictor = str_replace_all(predictor, "_", " ")) |>
ggplot(aes(x = value, y = class, colour = class, fill = class)) +
facet_wrap(. ~ predictor, scale = "free", ncol = 4) +
geom_density_ridges(alpha = 0.8) +
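Besides dropping the caret dependency, swapping feather for arrow, and moving to the base pipe, this script replaces the superseded gather() with pivot_longer(). The two calls below are equivalent on a toy frame (column names hypothetical):

library(tidyr)
toy <- data.frame(class = "B", mean_radius = 12.3, mean_texture = 14.2)

# superseded verb used before this PR
gather(toy, key = predictor, value = value, -class)

# current equivalent used in the updated script
pivot_longer(toy, cols = -class, names_to = "predictor", values_to = "value")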
src/fit_breast_cancer_predict_model.r (10 changes: 5 additions & 5 deletions)
@@ -24,11 +24,11 @@ main <- function(train, out_dir) {
# Tune hyperparameters ----------------------------------------------------

train_data <- read_feather(train)
x_train <- train_data %>%
x_train <- train_data |>
select(-class, -se_fractal_dimension, -se_smoothness, -se_symmetry, -se_texture)
y_train <- train_data %>%
select(class) %>%
mutate(class = as.factor(class)) %>%
y_train <- train_data |>
select(class) |>
mutate(class = as.factor(class)) |>
pull()
k = data.frame(k = seq(1, 100, by = 2))
cross_val <- trainControl(method="cv", number = 30)
@@ -37,7 +37,7 @@ main <- function(train, out_dir) {

# Visualize kappa for K's ----------------------------------------------

kappa_vs_k <- model_cv_30fold$results %>%
kappa_vs_k <- model_cv_30fold$results |>
ggplot(aes(x = k, y = Kappa)) +
geom_point() +
geom_errorbar(aes(ymin = Kappa - (KappaSD/sqrt(30)), ymax = Kappa + (KappaSD/sqrt(30)))) +
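For context, this hunk sits around a caret k-NN tuning step; the train() call itself falls outside the visible diff. A self-contained sketch of that step on toy data (the real script builds x_train and y_train from the training feather file and uses 30-fold CV):

library(caret)
set.seed(2020)
toy_x <- data.frame(a = rnorm(100), b = rnorm(100))
toy_y <- factor(sample(c("B", "M"), 100, replace = TRUE))
k <- data.frame(k = seq(1, 25, by = 2))
cross_val <- trainControl(method = "cv", number = 5)
model_cv <- train(toy_x, toy_y, method = "knn", tuneGrid = k,
                  trControl = cross_val, metric = "Kappa")
# results holds one row per k, including the Kappa and KappaSD columns used for the error bars above
model_cv$results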
src/pre_process_wisc.r (114 changes: 56 additions & 58 deletions)
@@ -11,82 +11,80 @@ Options:
--out_dir=<out_dir> Path to directory where the processed data should be written
" -> doc

library(feather)

library(tidyverse)
library(caret)
library(tidymodels)
library(docopt)
library(arrow)
set.seed(2020)

opt <- docopt(doc)
main <- function(input, out_dir){
# read data and convert class to factor
raw_data <- read_feather(input)
raw_data <- arrow::read_feather(input)
colnames(raw_data) <- c("id",
"class",
"mean_radius",
"mean_texture",
"mean_perimeter",
"mean_area",
"mean_smoothness",
"mean_compactness",
"mean_concavity",
"mean_concave_points",
"mean_symmetry",
"mean_fractal_dimension",
"se_radius",
"se_texture",
"se_perimeter",
"se_area",
"se_smoothness",
"se_compactness",
"se_concavity",
"se_concave_points",
"se_symmetry",
"se_fractal_dimension",
"max_radius",
"max_texture",
"max_perimeter",
"max_area",
"max_smoothness",
"max_compactness",
"max_concavity",
"max_concave_points",
"max_symmetry",
"max_fractal_dimension")
raw_data <- raw_data %>%
select(-id) %>%
"class",
"mean_radius",
"mean_texture",
"mean_perimeter",
"mean_area",
"mean_smoothness",
"mean_compactness",
"mean_concavity",
"mean_concave_points",
"mean_symmetry",
"mean_fractal_dimension",
"se_radius",
"se_texture",
"se_perimeter",
"se_area",
"se_smoothness",
"se_compactness",
"se_concavity",
"se_concave_points",
"se_symmetry",
"se_fractal_dimension",
"max_radius",
"max_texture",
"max_perimeter",
"max_area",
"max_smoothness",
"max_compactness",
"max_concavity",
"max_concave_points",
"max_symmetry",
"max_fractal_dimension")
raw_data <- raw_data |>
select(-id) |>
mutate(class = as.factor(class))

# split into training and test data sets
training_rows <- raw_data %>%
select(class) %>%
pull() %>%
createDataPartition(p = 0.75, list = FALSE)
training_data <- raw_data %>% slice(training_rows)
test_data <- raw_data %>% slice(-training_rows)
# split into training and test datasets with rsample package
split <- rsample::initial_split(raw_data, prop = 0.75)
training_data <- rsample::training(split)
testing_data <- rsample::testing(split)

# calculate the required statistics into the recipe
class_rec <- recipes::recipe(class ~ ., data = training_data) |>
recipes::step_scale(all_predictors()) |>
recipes::step_center(all_predictors()) |>
recipes::prep()

# "baking" the recipe: scaling and centering the data
training_scaled <- class_rec |>
recipes::bake(new_data = NULL)

# scale test data using scale factor
x_train <- training_data %>%
select(-class)
x_test <- test_data %>%
select(-class)
pre_process_scaler <- preProcess(x_train, method = c("center", "scale"))
x_train_scaled <- predict(pre_process_scaler, x_train)
x_test_scaled <- predict(pre_process_scaler, x_test)
training_scaled <- x_train_scaled %>%
mutate(class = training_data %>% select(class) %>% pull())
test_scaled <- x_test_scaled %>%
mutate(class = test_data %>% select(class) %>% pull())
test_scaled <- class_rec |>
recipes::bake(new_data = testing_data)

# write scale factor to a file
# write the recipe to a file
try({
dir.create(out_dir)
})
saveRDS(pre_process_scaler, file = paste0(out_dir, "/scale_factor.rds"))
saveRDS(class_rec, file = paste0(out_dir, "/scale_factor.rds"))

# write training and test data to feather files
write_feather(training_scaled, paste0(out_dir, "/training.feather"))
write_feather(test_scaled, paste0(out_dir, "/test.feather"))
}

main(opt[["--input"]], opt[["--out_dir"]])
main(opt[["--input"]], opt[["--out_dir"]])
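Because the prepped recipe is now what gets saved to scale_factor.rds, downstream scripts would reload it and bake() new observations rather than calling predict() on a caret preProcess object. A sketch of that downstream use (the path follows the Makefile's --out_dir=data/processed; new_observations is a placeholder):

library(recipes)
class_rec <- readRDS("data/processed/scale_factor.rds")

# apply the stored centering and scaling to unseen data
new_scaled <- bake(class_rec, new_data = new_observations)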
src/reticulate_fit_breast_cancer_predict_model.R (20 changes: 10 additions & 10 deletions)
@@ -30,16 +30,16 @@ main <- function(train, out_dir) {
# Load and wrangle data ---------------------------------------------------

# load training data as an R data frame
train_data <- read_feather("data/processed/training.feather") %>%
train_data <- read_feather("data/processed/training.feather") |>
mutate(class = as.integer(as.factor(class)) )

# create X and Y
X_train <- train_data %>%
X_train <- train_data |>
select(-class)
y_train <- train_data %>%
select(class) %>%
pull(class) %>%
as.factor() %>%
y_train <- train_data |>
select(class) |>
pull(class) |>
as.factor() |>
as.integer()
y_train <- y_train - 1

@@ -77,10 +77,10 @@ main <- function(train, out_dir) {
ggsave(paste0(out_dir, "/kappa_vs_k.png"), kappa_vs_k, width = 5, height = 3)

# Fit final model ---------------------------------------------------------
best_k <- cv_results %>%
filter(cv_mean == max(cv_mean)) %>%
select(k) %>%
pull() %>%
best_k <- cv_results |>
filter(cv_mean == max(cv_mean)) |>
select(k) |>
pull() |>
as.integer()
knn_final_model <- sklearn_neighbours$KNeighborsClassifier(n_neighbors = best_k)
knn_final_model_fit <- knn_final_model$fit(X_train, y_train)
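The sklearn_neighbours object used at the end of this file is presumably created earlier with reticulate::import(); that line falls outside the visible hunk. A minimal sketch of the setup and the final-model step (requires a Python environment with scikit-learn; toy data, not the PR's):

library(reticulate)
sklearn_neighbours <- import("sklearn.neighbors")

X <- matrix(rnorm(40), ncol = 2)
y <- as.integer(rbinom(20, 1, 0.5))

# mirror the KNeighborsClassifier fit at the end of the script
knn <- sklearn_neighbours$KNeighborsClassifier(n_neighbors = 5L)
knn_fit <- knn$fit(X, y)
knn_fit$predict(X)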