-
Notifications
You must be signed in to change notification settings - Fork 176
/
train_model.py
87 lines (75 loc) · 2.51 KB
/
train_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from ml.data import process_data
from ml.model import (
compute_model_metrics,
inference,
load_model,
performance_on_categorical_slice,
save_model,
train_model,
)
# Load the census.csv data.
# NOTE(review): assumes this script sits in the project root, next to the
# "data" and "model" directories -- adjust project_path if it does not.
project_path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(project_path, "data", "census.csv")
print(data_path)
data = pd.read_csv(data_path)

# Split the data into a train dataset and a test dataset.
# A fixed random_state keeps the split, and therefore the reported metrics,
# reproducible from run to run.
# Optional enhancement: use K-fold cross validation instead of a train-test split.
train, test = train_test_split(data, test_size=0.20, random_state=42)
# Names of the categorical columns. The list is handed to process_data as
# `categorical_features` and is iterated when computing per-slice metrics.
# DO NOT MODIFY
cat_features = [
"workclass",
"education",
"marital-status",
"occupation",
"relationship",
"race",
"sex",
"native-country",
]
# Process the training data.
# training=True: no encoder or lb is passed in; the ones returned by this
# call are reused for the test data below.
X_train, y_train, encoder, lb = process_data(
    train,
    categorical_features=cat_features,
    label="salary",
    training=True,
)

# Process the test data with the encoder and lb obtained from the training
# call, so both datasets go through the same transformation.
X_test, y_test, _, _ = process_data(
    test,
    categorical_features=cat_features,
    label="salary",
    training=False,
    encoder=encoder,
    lb=lb,
)
# Train the model on the training dataset.
# NOTE(review): assumes train_model(X_train, y_train) returns the trained
# model -- confirm against ml/model.py.
model = train_model(X_train, y_train)

# Save the model and the encoder.
model_dir = os.path.join(project_path, "model")
# Make sure the target directory exists so saving does not fail on a fresh
# checkout.
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, "model.pkl")
save_model(model, model_path)
encoder_path = os.path.join(model_dir, "encoder.pkl")
save_model(encoder, encoder_path)

# Load the model back from disk so the metrics below are computed with the
# persisted artifact rather than the in-memory object.
model = load_model(model_path)

# Run the model inferences on the test dataset.
# NOTE(review): assumes inference(model, X) returns the predictions for X --
# confirm against ml/model.py.
preds = inference(model, X_test)

# Calculate and print the metrics.
p, r, fb = compute_model_metrics(y_test, preds)
print(f"Precision: {p:.4f} | Recall: {r:.4f} | F1: {fb:.4f}")
# Compute the performance on model slices using the
# performance_on_categorical_slice function.
# The report file is opened once for the whole loop instead of once per
# slice. Append mode is kept, so results from earlier runs stay in the file.
with open("slice_output.txt", "a") as f:
    # Iterate through the categorical features.
    for col in cat_features:
        # Iterate through the unique values in one categorical feature.
        for slicevalue in sorted(test[col].unique()):
            # Number of test rows that fall into this slice.
            count = test[test[col] == slicevalue].shape[0]
            # NOTE(review): the keyword names below assume the helper's
            # signature in ml/model.py -- confirm before relying on them.
            p, r, fb = performance_on_categorical_slice(
                test,
                col,
                slicevalue,
                categorical_features=cat_features,
                label="salary",
                encoder=encoder,
                lb=lb,
                model=model,
            )
            print(f"{col}: {slicevalue}, Count: {count:,}", file=f)
            print(f"Precision: {p:.4f} | Recall: {r:.4f} | F1: {fb:.4f}", file=f)