Skip to content

Commit ccdc75f

Browse files
committed
updated training file and engine impl
1 parent 8444b8a commit ccdc75f

File tree

2 files changed

+6
-14
lines changed

2 files changed

+6
-14
lines changed

constant.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,12 @@
1515

1616

1717
# contains the first 161060 lines of sample data
18-
# TRAINING_DATA_PATH = "data/train_sample_161060.csv"
18+
TRAINING_DATA_PATH = "data/train_sample_161060.csv"
1919

2020
# contains the first 8241 lines of sample data
2121
# TRAINING_DATA_PATH = "data/train_sample_8241.csv"
2222

2323
# The data file on the following line is huge: the Python process may be killed while it is running, or training may take forever with partial_fit.
2424
# Two other training sample files ship with the project: data/train_sample_161060.csv and data/train_sample_8241.csv.
2525
# The numeric suffix is the number of lines of training data in the file. Switch to a smaller file to see how Detox behaves with different training sizes/sets.
26-
TRAINING_DATA_PATH = "data/train.csv"
26+
# TRAINING_DATA_PATH = "data/train.csv"

detox_engine.py

+4-12
Original file line numberDiff line numberDiff line change
@@ -95,14 +95,6 @@ def __init__(self):
9595
training = self.vectorizer.fit_transform(df['comment_text'])
9696
del df
9797
gc.collect()
98-
#first_run = False
99-
#else:
100-
# training = self.vectorizer.transform(df['comment_text'])
101-
102-
# if you want to check what the transformed chat log matrix looks like, uncomment the lines below.
103-
#print("looking at the shape of the training set...")
104-
#print(training.shape)
105-
#print(training)
10698

10799
print("Initiating training...")
108100
train_x, test_x, train_y, test_y = train_test_split(training.toarray(), toxic.values, test_size=0.2)
@@ -113,13 +105,13 @@ def __init__(self):
113105
del train_x, train_y
114106
gc.collect()
115107

116-
#print("Completed training. Generating classification result...")
117-
#pred_y = self.classifier.predict(test_x)
108+
print("Completed training. Generating classification result...")
109+
pred_y = self.classifier.predict(test_x)
118110

119-
del test_x, test_y
111+
del test_x
120112
gc.collect()
121113

122-
#print(classification_report(test_y, pred_y))
114+
print(classification_report(test_y, pred_y))
123115

124116
# store the classifier and vectorizer so it can be used later
125117
print("Storing classifier and vectorizer into disk...")

0 commit comments

Comments
 (0)