-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlogreg_classifier.py
66 lines (46 loc) · 2.52 KB
/
logreg_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Load the raw reviews, one review per line, stripping surrounding whitespace.
# The `with` blocks guarantee each file handle is closed once it has been read
# (the bare `open(...)` in a for-loop header never closes the file explicitly).
with open('data/aclImdb/movie_data/full_train.txt', encoding="utf8") as train_file:
    reviews_train = [line.strip() for line in train_file]
with open('data/aclImdb/movie_data/full_test.txt', encoding="utf8") as test_file:
    reviews_test = [line.strip() for line in test_file]

print("An example from train set of review:\n{}\n".format(reviews_train[0]))
# Text clean-up patterns. They are written as raw strings so that regex escapes
# such as `\s` and `\[` reach the `re` module untouched instead of being parsed
# as (invalid) Python string escapes, which newer interpreters warn about.
# The compiled patterns are character-for-character the same as before.

# Punctuation that is deleted outright.
REPLACE_NO_SPACE = re.compile(r'''[.;:!'?,"()\[\]]''')
# Doubled HTML line breaks, hyphens and slashes, each replaced by one space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")


def preprocess_reviews(reviews):
    """Return a lower-cased, punctuation-normalised copy of each review.

    For every string in *reviews* the text is lower-cased, every match of
    ``REPLACE_NO_SPACE`` is removed, and then every match of
    ``REPLACE_WITH_SPACE`` is replaced by a single space.

    Args:
        reviews: An iterable of review strings.

    Returns:
        A new list with one cleaned string per input review, in input order.
    """
    return [
        REPLACE_WITH_SPACE.sub(" ", REPLACE_NO_SPACE.sub("", review.lower()))
        for review in reviews
    ]
# Clean both splits with the same preprocessing function.
reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

print('---------------------------------------------------------\n')
# Print the *cleaned* first review; printing reviews_train[0] here would just
# repeat the raw text and hide the effect of preprocessing.
print("An example from train set of review after processing:\n{}\n".format(reviews_train_clean[0]))
# Turn the cleaned reviews into a document-term matrix. With binary=True each
# entry records only whether a token occurs in a review, not how many times.
# Alternative configuration left by the original author for reference:
#   CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
# The vocabulary is learned from the training reviews only; fit() returns the
# vectorizer itself, so it can be chained onto the constructor.
cv = CountVectorizer(binary=True).fit(reviews_train_clean)

# Encode both splits with that same fitted vocabulary.
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)
# Labels follow the file layout this script relies on: the first 12,500
# reviews are labelled 1 and the remaining 12,500 are labelled 0
# (25,000 labels in total).
target = [1] * 12500 + [0] * 12500

# Hold out 25% of the training data for validation (75% is used for fitting).
X_train, X_val, y_train, y_val = train_test_split(X, target, train_size=0.75)
# Sweep the regularisation hyper-parameter C and report the validation
# accuracy obtained for each candidate value.
for c in (0.01, 0.05, 0.25, 0.5, 1):
    lr = LogisticRegression(C=c, max_iter=200)
    lr.fit(X_train, y_train)
    val_predictions = lr.predict(X_val)
    val_accuracy = accuracy_score(y_val, val_predictions)
    print("Accuracy for C=%s: %s" % (c, val_accuracy))
# Refit on the complete training matrix using C=0.05, then score on the test set.
final_model = LogisticRegression(C=0.05, max_iter=200)
final_model.fit(X, target)

# NOTE(review): the test predictions are scored against `target`, the label
# list built for the training data. This presumably relies on the test file
# having the same size and class ordering as the training file; confirm.
test_predictions = final_model.predict(X_test)
final_accuracy = accuracy_score(target, test_predictions)
print("\nFinal Accuracy: %s\n" % final_accuracy)
# Result recorded by the original author: Final Accuracy: 0.88128
# Map every vocabulary token to its learned coefficient in the final model.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old name only on versions that
# predate it.
_get_feature_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
feature_to_coef = dict(zip(_get_feature_names(), final_model.coef_[0]))

# The five tokens with the largest coefficients.
for best_positive in sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(best_positive)

print('\n---------------------------------------------------------\n')

# The five tokens with the smallest (most negative) coefficients.
for best_negative in sorted(feature_to_coef.items(), key=lambda x: x[1])[:5]:
    print(best_negative)