Skip to content

Commit

Permalink
Create classifier and cleaning script
Browse files Browse the repository at this point in the history
  • Loading branch information
Marshall James committed Apr 29, 2018
1 parent 826c628 commit 365e3a1
Show file tree
Hide file tree
Showing 8 changed files with 6,135 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
__pycache__/*
*.swp
Binary file modified __pycache__/sentiment.cpython-35.pyc
Binary file not shown.
52 changes: 52 additions & 0 deletions classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import json
import pickle
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
from textblob.classifiers import MaxEntClassifier

class Classifier:
    """Thin wrapper around a TextBlob text classifier.

    The wrapped classifier object is stored on ``self.cl`` and every public
    method delegates to it.
    """

    def __init__(self, train=True):
        """Create the underlying classifier.

        Args:
            train: When true, fit a ``NaiveBayesClassifier`` from
                ``./data/train.csv``. When false, unpickle a previously
                saved classifier from ``./data/classifier.pickle``.

        Raises:
            OSError: If the required data file cannot be opened.
        """
        if train:
            # The context manager closes the file even if training raises.
            with open("./data/train.csv") as fp:
                self.cl = NaiveBayesClassifier(fp, format="csv")
        else:
            # NOTE: nothing in this class writes the pickle; it has to be
            # produced separately, e.g. with pickle.dump(self.cl, fp, -1).
            # SECURITY: pickle.load can execute arbitrary code, so only load
            # a file this project produced itself.
            with open("./data/classifier.pickle", "rb") as fp:
                self.cl = pickle.load(fp)

    def test(self):
        """Classify a fixed sample sentence and return its label."""
        return self.cl.classify("This is a test sentence")

    def classify(self, text):
        """Return the single label the classifier assigns to *text*."""
        return self.cl.classify(text)

    def n_classify(self, text):
        """Return every label whose probability for *text* is at least 0.10.

        Returns:
            A JSON string of the form
            ``{"sentiments": [{label: probability}, ...]}``, with entries in
            the order the probability distribution yields its samples.
        """
        dist = self.cl.prob_classify(text)

        probs = {"sentiments": []}
        for label in dist.samples():
            # Look the probability up once and reuse it for test and output.
            probability = dist.prob(label)
            if probability >= .10:
                probs["sentiments"].append({label: probability})

        return json.dumps(probs)

    def accuracy(self):
        """Return the classifier's accuracy on ``./data/test.csv``.

        Raises:
            OSError: If the test file cannot be opened.
        """
        with open('./data/test.csv') as fp:
            return self.cl.accuracy(fp, format="csv")

def main():
    """Train a classifier and print its label for the built-in test sentence."""
    classifier = Classifier(train=True)
    label = classifier.test()
    print(label)


if __name__ == "__main__":
    main()

48 changes: 48 additions & 0 deletions clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import sys

# Number of leading data rows that are written to ./data/train.csv.
TRAIN_N = 5000
# Number of rows after the training rows that are written to ./data/test.csv.
TEST_N = 1000

def removeEntities(text):
    """Replace tweet entities in *text* with placeholder tokens.

    The text is split on whitespace and each word is checked in this order:
    a word containing ``@`` becomes ``NAME``, a word containing ``http:`` or
    ``https:`` becomes ``LINK``, and a word containing ``#`` becomes
    ``HASHTAG``. All other words are kept unchanged.

    Args:
        text: The text to clean. Matching is case-sensitive.

    Returns:
        The cleaned words joined by single spaces, so runs of whitespace
        are collapsed and leading/trailing whitespace is dropped.
    """

    def placeholder(word):
        if "@" in word:
            return "NAME"
        # Check both schemes: testing for "http:" alone lets https links
        # through, where they either stay verbatim or turn into HASHTAG
        # when the URL has a fragment.
        if "http:" in word or "https:" in word:
            return "LINK"
        if "#" in word:
            return "HASHTAG"
        return word

    return ' '.join(placeholder(word) for word in text.split())

class Data:
    """One row of the raw CSV, split into fields with cleaned content."""

    def __init__(self, row):
        """Parse a raw comma-separated *row*.

        The first three comma-separated pieces become ``tweet_id``,
        ``sentiment`` and ``author``. Everything after them is joined back
        together with spaces, stripped of double quotes and newlines,
        lower-cased, and passed through ``removeEntities``.
        """
        fields = row.split(",")

        self.tweet_id = fields[0]
        self.sentiment = fields[1].replace('"', '')
        self.author = fields[2].replace('"', '')

        # Commas inside the tweet text split it into several pieces; joining
        # with spaces keeps the output free of extra commas.
        raw_content = ' '.join(fields[3:])
        normalized = raw_content.replace('"', '').replace("\n", "").lower()
        self.content = removeEntities(normalized)

    def write(self, fp):
        """Write ``content,sentiment`` followed by a newline to *fp*."""
        record = ",".join((self.content, self.sentiment)) + "\n"
        fp.write(record)

# Split the raw dataset into a training file and a test file.
# Context managers make sure every file is closed even if a row fails to parse.
with open("./data/text_emotion.csv") as dirty:
    # Skip the first line (presumably the CSV header).
    next(dirty)

    with open("./data/train.csv", "w") as train, \
            open("./data/test.csv", "w") as test:
        rows_written = 0

        for i, line in enumerate(dirty):
            if i == TRAIN_N + TEST_N:
                break

            d = Data(line)
            # The first TRAIN_N rows are training data, the rest test data.
            out = train if i < TRAIN_N else test
            d.write(out)
            rows_written += 1

# Report what was actually written; the input may hold fewer rows than asked.
print("Cleaned " + str(rows_written) + " rows of data.")

1,000 changes: 1,000 additions & 0 deletions data/test.csv

Large diffs are not rendered by default.

5,000 changes: 5,000 additions & 0 deletions data/train.csv

Large diffs are not rendered by default.

18 changes: 15 additions & 3 deletions sentiment.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,23 @@
import classifier
from flask import Flask
from flask import request

# Shared classifier, built once when this module is imported; train=True
# fits it from ./data/train.csv rather than loading a pickle.
cl = classifier.Classifier(train=True)

# Flask application object on which the route handlers below are registered.
app = Flask(__name__)

@app.route("/")
def test():
    """Smoke-test endpoint: classify the classifier's fixed sample sentence.

    Returns:
        The label produced by ``cl.test()``.
    """
    return cl.test()

@app.route("/sentiment", methods=["POST"])
def sentiment():
    """Classify the ``text`` form field of the POST request.

    Returns:
        The single label the classifier assigns to the submitted text.
    """
    # Return the classification, not an echo of the submitted text.
    return cl.classify(request.form["text"])

@app.route("/sentiments", methods=["POST"])
def sentiments():
    """Return the JSON list of likely labels for the ``text`` form field."""
    submitted_text = request.form["text"]
    return cl.n_classify(submitted_text)

@app.route("/accuracy")
def accuracy():
    """Return the classifier's test-set accuracy, rounded to 3 decimals, as text."""
    score = cl.accuracy()
    rounded = round(score, 3)
    return str(rounded)
22 changes: 19 additions & 3 deletions test.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,25 @@
import requests

# Manual smoke tests against a locally running instance of the service.
BASE_URL = "http://localhost:5000/"

print("Beginning test suite.\n")

# Test 1: single-label classification via POST /sentiment.
print("Test 1")
payload = {"text": "I'm so sad so very very sad"}
req = requests.post(BASE_URL + "sentiment", data=payload)
print("Classification: " + req.text + "\n")

# Test 2: per-label probabilities via POST /sentiments (JSON response).
print("Test 2")
payload = {"text": "These shoes are fabulous!"}
req = requests.post(BASE_URL + "sentiments", data=payload)
print("Probabilities:")
print(req.json())
print("\n")

# Test 3: accuracy on the held-out test set via GET /accuracy.
print("Test 3")
req = requests.get(BASE_URL + "accuracy")
print("Accuracy on test set: " + req.text + "\n")

0 comments on commit 365e3a1

Please sign in to comment.