Switch training data to the 161060-line sample; re-enable classification report and remove dead commented-out code in the engine

freesoft · freesoft · commit ccdc75f6df76 · 2018-12-16T03:07:42.000-08:00
diff --git a/constant.py b/constant.py
@@ -15,12 +15,12 @@
 
 
 # has first 161060 sample data(lines)
-# TRAINING_DATA_PATH = "data/train_sample_161060.csv"
+TRAINING_DATA_PATH = "data/train_sample_161060.csv"
 
 # has first 8241 sample data(lines)
 # TRAINING_DATA_PATH = "data/train_sample_8241.csv" 
 
 # Following line's datafile is huge and python process will be killed while it is running.. or maybe taking forever with partial_fit
 # there are two other training sample data with project, which is data/train_sample_161060.csv and data/train_sample_8241.csv
 # that suffix means the line of traning data in the file. Change it to smaller one to see how Detox behaves with different training size/set.
-TRAINING_DATA_PATH = "data/train.csv"
+# TRAINING_DATA_PATH = "data/train.csv"
diff --git a/detox_engine.py b/detox_engine.py
@@ -95,14 +95,6 @@ def __init__(self):
             training = self.vectorizer.fit_transform(df['comment_text'])  
             del df
             gc.collect()
-                #first_run = False
-            #else:
-            #    training = self.vectorizer.transform(df['comment_text'])  
-
-            # if you want to check what's the transformed chat log matrix look like, uncomment below lines.
-            #print("looking at the shape of the training set...")
-            #print(training.shape)
-            #print(training)
 
             print("Initiating training...")
             train_x, test_x, train_y, test_y = train_test_split(training.toarray(), toxic.values, test_size=0.2)
@@ -113,13 +105,13 @@ def __init__(self):
             del train_x, train_y
             gc.collect()
                 
-            #print("Completed training. Generating classification result...")
-            #pred_y = self.classifier.predict(test_x)
+            print("Completed training. Generating classification result...")
+            pred_y = self.classifier.predict(test_x)
 
-            del test_x, test_y
+            del test_x
             gc.collect()
 
-            #print(classification_report(test_y, pred_y))
+            print(classification_report(test_y, pred_y))
 
             # store the classifier and vectorizer so it can be used later    
             print("Storing classifier and vectorizer into disk...")