Create classifier and cleaning script

Jamesm4 · Apr 29, 2018 · 365e3a1 · 365e3a1
1 parent 826c628
commit 365e3a1
Show file tree

Hide file tree

Showing 8 changed files with 6,135 additions and 6 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
 __pycache__/*
+*.swp
diff --git a/__pycache__/sentiment.cpython-35.pyc b/__pycache__/sentiment.cpython-35.pyc
diff --git a/classifier.py b/classifier.py
@@ -0,0 +1,52 @@
+import json
+import pickle
+from textblob import TextBlob
+from textblob.classifiers import NaiveBayesClassifier
+from textblob.classifiers import MaxEntClassifier
+
+class Classifier:
+
+  def __init__(self, train=True):
+    if train:
+      fp = open("./data/train.csv")
+      self.cl = NaiveBayesClassifier(fp, format="csv")
+      # self.cl = MaxEntClassifier(fp, format="csv")
+      fp.close()
+
+      # fp = open("./data/classifier.pickle", "wb")
+      # pickle.dump(self.cl, fp, -1)
+      # fp.close()
+    else:
+      fp = open("./data/classifier.pickle", "rb")
+      self.cl = pickle.load(fp)
+      fp.close()
+
+  def test(self):
+    return self.cl.classify("This is a test sentence")
+
+  def classify(self, text):
+    return self.cl.classify(text)
+
+  def n_classify(self, text):
+    dist = self.cl.prob_classify(text)
+
+    probs = {"sentiments": []}
+    for s in dist.samples():
+      if dist.prob(s) >= .10:
+        probs["sentiments"].append({s: dist.prob(s)})
+
+    return json.dumps(probs)
+
+  def accuracy(self):
+    fp = open('./data/test.csv')
+    test_accuracy = self.cl.accuracy(fp, format="csv")
+    fp.close()
+    return test_accuracy
+
+def main():
+  cl = Classifier(train=True)
+  print(cl.test())
+
+if __name__ == "__main__":
+    main()
+
diff --git a/clean.py b/clean.py
@@ -0,0 +1,48 @@
+import sys
+
+# Number of leading data rows written to ./data/train.csv.
+TRAIN_N = 5000
+# Number of rows following those that are written to ./data/test.csv.
+TEST_N = 1000
+
+def removeEntities(text):
+
+  def clean(word):
+    if "@" in word: return "NAME"
+    elif "http:" in word: return "LINK"
+    elif "#" in word: return "HASHTAG"
+    else: return word
+
+  return ' '.join(map(clean, text.split()))
+
+class Data:
+
+  def __init__(self, row):
+
+    split = row.split(",")
+
+    self.tweet_id = split[0]
+    self.sentiment = split[1].replace('"', '')
+    self.author = split[2].replace('"', '')
+    self.content = ' '.join(split[3:]).replace('"', '').replace("\n", "").lower()
+    self.content = removeEntities(self.content)
+
+  def write(self, fp):
+    fp.write(self.content + "," + self.sentiment + "\n")
+
+dirty = open("./data/text_emotion.csv")
+next(dirty)
+train = open("./data/train.csv", "w")
+test = open("./data/test.csv", "w")
+
+for i, line in enumerate(dirty):
+  if i == TRAIN_N + TEST_N: break
+
+  d = Data(line)
+  out = train if i < TRAIN_N else test
+  d.write(out)
+
+print("Cleaned " + str(TRAIN_N + TEST_N) + " rows of data.")
+
+dirty.close()
+train.close()
+test.close()
+
diff --git a/data/test.csv b/data/test.csv
diff --git a/data/train.csv b/data/train.csv
diff --git a/sentiment.py b/sentiment.py
@@ -1,11 +1,23 @@
+import classifier
 from flask import Flask
 from flask import request
+
+# Built once when this module is imported; train=True makes the Classifier
+# train from ./data/train.csv. The route handlers in this module all use
+# this single instance.
+cl = classifier.Classifier(train=True)
+
 app = Flask(__name__)
 
 @app.route("/")
-def hello():
-    return "Hello World!"
+def test():
+  """Return the classifier's label for its built-in test sentence."""
+  return cl.test()
 
 @app.route("/sentiment", methods=["POST"])
 def sentiment():
-    return request.form["text"]
+  """Classify the posted "text" form field and return the resulting label."""
+  return cl.classify(request.form["text"])
+
+@app.route("/sentiments", methods=["POST"])
+def sentiments():
+  return cl.n_classify(request.form["text"])
+
+@app.route("/accuracy")
+def accuracy():
+  return str(round(cl.accuracy(), 3))
diff --git a/test.py b/test.py
@@ -1,9 +1,25 @@
 import requests
 
-URL = "http://localhost:5000/sentiment"
+# Root URL of the server under test; endpoint paths are appended per request.
+BASE_URL = "http://localhost:5000/"
 
+print("Beginning test suite.\n")
+
+# Test 1: POST text to /sentiment and print the response body.
+print("Test 1")
 payload = {"text": "I'm so sad so very very sad"}
+req = requests.post(BASE_URL + "sentiment", data=payload)
+print("Classification: " + req.text + "\n")
+
+# Test 2: POST text to /sentiments and print the decoded JSON response.
+print("Test 2")
+payload = {"text": "These shoes are fabulous!"}
+req = requests.post(BASE_URL + "sentiments", data=payload)
+print("Probabilities:")
+print(req.json())
+print("\n")
 
-req = requests.post(URL, data=payload)
+# Test 3: GET /accuracy and print the response body.
+print("Test 3")
+req = requests.get(BASE_URL + "accuracy")
+print("Accuracy on test set: " + req.text + "\n")
 
-print(req.text)