
Commit bdbeaa8

Author: Michael Klear, committed Aug 5, 2021
add save/load fts; fix logger bug; add inference mode
1 parent 3818344 commit bdbeaa8

File tree

6 files changed: +411 -218 lines changed

 

demo_data/demo.ipynb  (+246 -184)
Large diffs are not rendered by default.

demo_data/saving_and_loading.ipynb  (+18 -33)
Large diffs are not rendered by default.

dfencoder/autoencoder.py  (+97)

@@ -5,11 +5,25 @@
 import numpy as np
 import torch
 import tqdm
+import dill
+import json
 
 from .dataframe import EncoderDataFrame
 from .logging import BasicLogger, IpynbLogger, TensorboardXLogger
 from .scalers import StandardScaler, NullScaler, GaussRankScaler
 
+
+
+
+def load_model(path):
+    """
+    Loads serialized model from input path.
+    """
+    with open(path, 'rb') as f:
+        loaded_serialized_model = f.read()
+    loaded_model = dill.loads(loaded_serialized_model)
+    return loaded_model
+
 def ohe(input_vector, dim, device="cpu"):
     """Does one-hot encoding of input vector."""
     batch_size = len(input_vector)
@@ -54,6 +68,7 @@ def transform(self, df):
         return df
 
 
+
 class CompleteLayer(torch.nn.Module):
     """
     Impliments a layer with linear transformation
@@ -854,6 +869,80 @@ def get_deep_stack_features(self, df):
         result = torch.cat(result, dim=0)
         return result
 
+    def _deserialize_json(self, data):
+        """
+        encodes json data into appropriate features
+        for inference.
+        "data" should be a string.
+        """
+        data = json.loads(data)
+        return data
+        row = pd.DataFrame()
+        for item in data:
+            row[item] = [data[item]]
+        return row
+
+
+    def compute_targets_dict(self, data):
+        numeric = []
+        for num_name in self.num_names:
+            raw_value = data[num_name]
+            trans_value = self.numeric_fts[num_name]['scaler'].transform(np.array([raw_value]))
+            numeric.append(trans_value)
+        num = torch.tensor(numeric).reshape(1, -1).float().to(self.device)
+
+        binary = []
+        for bin_name in self.bin_names:
+            value = data[bin_name]
+            code = self.binary_fts[bin_name][value]
+            binary.append(int(code))
+        bin = torch.tensor(binary).reshape(1, -1).float().to(self.device)
+        codes = []
+        for ft in self.categorical_fts:
+            category = data[ft]
+            code = self.categorical_fts[ft]['cats'].index(category)
+            code = torch.tensor(code).to(self.device)
+            codes.append(code)
+        return num, bin, codes
+
+    def encode_input_dict(self, data):
+        """
+        Handles raw df inputs.
+        Passes categories through embedding layers.
+        """
+        num, bin, codes = self.compute_targets_dict(data)
+        embeddings = []
+        for i, ft in enumerate(self.categorical_fts):
+            feature = self.categorical_fts[ft]
+            emb = feature['embedding'](codes[i]).reshape(1, -1)
+            embeddings.append(emb)
+        return [num], [bin], embeddings
+
+    def get_deep_stack_features_json(self, data):
+        """
+        gets "deep stack" features for a single record;
+        intended for executing "inference" logic for a
+        network request.
+        data can either be a json string or a dict.
+        """
+        if isinstance(data, str):
+            data = self._deserialize_json(data)
+
+        self.eval()
+
+        with torch.no_grad():
+            this_batch = []
+            num, bin, embeddings = self.encode_input_dict(data)
+            x = torch.cat(num + bin + embeddings, dim=1)
+            for layer in self.encoder:
+                x = layer(x)
+                this_batch.append(x)
+            for layer in self.decoder:
+                x = layer(x)
+                this_batch.append(x)
+            z = torch.cat(this_batch, dim=1)
+        return z
+
     def get_anomaly_score(self, df):
         """
         Returns a per-row loss of the input dataframe.
@@ -957,3 +1046,11 @@ def df_predict(self, df):
         output_df = self.decode_to_df(x, df=df)
 
         return output_df
+
+    def save(self, path):
+        """
+        Saves serialized model to input path.
+        """
+        with open(path, 'wb') as f:
+            serialized_model = dill.dumps(self)
+            f.write(serialized_model)
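
Taken together, the additions above give dfencoder a dill-based save/load path and a JSON inference entry point. A minimal usage sketch, not code from this commit: the fit() call with epochs=1, the 'model.dill' path, and adult.csv as input are assumptions borrowed from the repo's tests and demo notebooks.

from dfencoder.autoencoder import AutoEncoder, load_model
import pandas as pd

df = pd.read_csv('adult.csv')        # assumed training frame, as used in test.py
model = AutoEncoder()
model.fit(df, epochs=1)              # train as usual

model.save('model.dill')             # new: dill-serializes the whole fitted model to disk
restored = load_model('model.dill')  # new: module-level loader added in this commit

# new inference mode: pass a single record as a JSON string (or a dict)
record_json = df.sample().iloc[0].to_json()
z = restored.get_deep_stack_features_json(record_json)

Because save() pickles the entire fitted object (weights, scalers, and category encodings together), load_model() returns a model that can run this JSON path immediately; the same call also accepts the already-parsed dict, which is the equivalence that test_inference below checks.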

dfencoder/logging.py  (+3)

@@ -49,6 +49,9 @@ def end_epoch(self):
            self.id_val_fts[ft][1].append(mean)
            #reset id_val_fts log
            self.id_val_fts[ft][0] = []
+
+    def show_embeddings(self, categories):
+        pass
 
 class IpynbLogger(BasicLogger):
     """Plots Logging Data in jupyter notebook"""

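The "fix logger bug" part of the commit is the three-line stub above: show_embeddings appears to be part of the interface the training code invokes on whatever logger is configured, and BasicLogger previously did not define it, so it now provides a safe no-op. A hedged sketch of how a custom logger can rely on that hook; the PrintLogger subclass and its print body are invented for illustration.

from dfencoder.logging import BasicLogger

class PrintLogger(BasicLogger):
    def show_embeddings(self, categories):
        # Override the hook added in this commit; BasicLogger's own version
        # is a no-op, so subclasses that do not visualize embeddings need
        # no override at all.
        print('embedding update for:', list(categories))
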
setup.py  (+2 -1)

@@ -15,7 +15,8 @@
     'scikit-learn',
     'tensorboardX',
     'matplotlib',
-    'wheel'
+    'wheel',
+    'dill'
 ]
 version = '0.0.36'
 

test.py  (+45)

@@ -2,6 +2,7 @@
 import time
 import os
 import shutil
+import json
 from collections import OrderedDict
 
 import pandas as pd
@@ -22,6 +23,37 @@ def tearDown(self):
         t = time.time() - self.startTime
         print("%s: %.3f seconds" % (self.id(), t))
 
+class ModelBuilder(object):
+
+    def __init__(self):
+        self.model = None
+        self.out_df = None
+
+    def build_model(self):
+        if self.model is None:
+            encoder = AutoEncoder(
+                encoder_layers=[32, 32],
+                decoder_layers=[32, 32],
+                encoder_dropout=.5,
+                decoder_dropout=[.2, None],
+                activation='tanh',
+                swap_p=.2,
+                batch_size=123,
+                optimizer='sgd',
+                lr_decay=.95
+            )
+            encoder.build_model(df)
+            out_df = encoder.prepare_df(df)
+            assert not out_df.isna().any().any()
+            layers_count = 0
+            for prm in encoder.parameters():
+                layers_count += 1
+            assert layers_count == 33
+            self.model, self.out_df = encoder, out_df
+            return encoder, out_df
+        else:
+            return self.model, self.out_df
+
 class TestCompleteLayer(TimedCase):
     def test_init(self):
         layer = CompleteLayer(12, 5, activation='sigmoid', dropout=.2)
@@ -165,6 +197,17 @@ def test_fit(self):
         assert data.shape == sample.shape
         return encoder
 
+    def test_inference(self):
+        record = df.sample()
+        js = record.iloc[0].to_json()
+        output = model._deserialize_json(js)
+        z_json = model.get_deep_stack_features_json(js)
+        dct = json.loads(js)
+        z_dict = model.get_deep_stack_features_json(dct)
+        z = model.get_deep_stack_features(record)
+        assert (z_json == z).all()
+        assert (z_json == z_dict).all()
+
     def test_get_representation(self):
         encoder = AutoEncoder()
         sample = df.sample(1025)
@@ -314,6 +357,8 @@ def test_null_indicator(self):
 if __name__ == '__main__':
     os.mkdir('_testlog')
     df = pd.read_csv('adult.csv')
+    b = ModelBuilder()
+    model, _ = b.build_model()
     unittest.main(exit=False)
     shutil.rmtree('_testlog')
     quit()
