test_col.py
import pickle
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from data_prep import StacsTransformer
# --- Config ---
# Target to predict: 'color' or 'texture'
target = 'color'
# Load data using pandas
df = pd.read_csv('data/data_train.csv')
# Drop the target we are not predicting
if target == 'color':
    df = df.drop(columns=['texture'])
else:
    df = df.drop(columns=['color'])
# Drop one row with missing values
df.dropna(inplace=True)
# Pop the target column out of the dataframe
y = df.pop(target)
# The remaining columns are the input features
X = df
# Split into training and validation sets, stratified on the target
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
# Use the 'id' column as the index; it is not informative as a feature
X_train = X_train.set_index('id')
X_valid = X_valid.set_index('id')
# Build transformer for categorical features
cat_transformer = Pipeline([
    ('onehot', OneHotEncoder())
])
# Build transformer for numeric features
num_transformer = Pipeline([
    ('scaler', StandardScaler())
])
# Combine transformers
preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_transformer, make_column_selector(dtype_include='object')),
    ('num', num_transformer, make_column_selector(dtype_include=['float64', 'int64']))
])
# Classifier identifier, used to locate the saved parameter file
classifier = 'rf'
# Build the model
model = RandomForestClassifier()
# Combine preprocessor and model into one pipeline
clf = Pipeline(steps=[
    ('prepper', StacsTransformer()),
    ('preprocessor', preprocessor),
    ('ftselector', SelectKBest(f_classif)),
    ('classifier', model)
])
# Load optimal pipeline parameters
path = 'optimal-params/' + classifier + '_' + target + '.pkl'
with open(path, 'rb') as f:
    optimal_params = pickle.load(f)
# Set optimal pipeline parameters
clf.set_params(**optimal_params)
# Fit classifier on training set
clf_fit = clf.fit(X_train, y_train)
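# The validation split created above is not otherwise used in this script; as a
# hedged sketch (assuming plain accuracy is the metric of interest), it could be
# scored like this:
valid_accuracy = clf_fit.score(X_valid, y_valid)
print(f'Validation accuracy for target "{target}": {valid_accuracy:.3f}')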
# Load test data using pandas
X_test = pd.read_csv('data/data_test.csv')
# Drop one row with missing values
X_test.dropna(inplace=True)
# Use the 'id' column as the index; it is not informative as a feature
X_test = X_test.set_index('id')
# Predict labels for the test set
y_hat = clf_fit.predict(X_test)
if target == 'color':
    path = 'output/colour_test.csv'
else:
    path = 'output/texture_test.csv'
# Output corresponding predictions to a text file
np.savetxt(path, y_hat, fmt='%s')