diff --git a/Twitter_Sentiment_Analysis.ipynb b/Twitter_Sentiment_Analysis.ipynb
index 7a3345e..716a085 100644
--- a/Twitter_Sentiment_Analysis.ipynb
+++ b/Twitter_Sentiment_Analysis.ipynb
@@ -107,10 +107,10 @@
"metadata": {
"id": "pXSrmmZr7orS",
"colab_type": "code",
- "outputId": "0528d36c-3b50-4ce8-dce4-a5eda7b27679",
+ "outputId": "4d3612ec-f2ba-4e61-9703-eef93a53265a",
"colab": {
"base_uri": "https://localhost:8080/",
- "height": 377
+ "height": 394
}
},
"source": [
@@ -163,33 +163,40 @@
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.externals import joblib"
],
- "execution_count": 3,
+ "execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"Requirement already satisfied: tweepy in /usr/local/lib/python3.6/dist-packages (3.6.0)\n",
- "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tweepy) (1.12.0)\n",
- "Requirement already satisfied: PySocks>=1.5.7 in /usr/local/lib/python3.6/dist-packages (from tweepy) (1.7.1)\n",
"Requirement already satisfied: requests>=2.11.1 in /usr/local/lib/python3.6/dist-packages (from tweepy) (2.23.0)\n",
"Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tweepy) (1.3.0)\n",
- "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.11.1->tweepy) (2.9)\n",
+ "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tweepy) (1.12.0)\n",
+ "Requirement already satisfied: PySocks>=1.5.7 in /usr/local/lib/python3.6/dist-packages (from tweepy) (1.7.1)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.11.1->tweepy) (2020.4.5.1)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.11.1->tweepy) (3.0.4)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.11.1->tweepy) (1.24.3)\n",
+ "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.11.1->tweepy) (2.9)\n",
"Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->tweepy) (3.1.0)\n",
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
- "[nltk_data] Package punkt is already up-to-date!\n",
+ "[nltk_data] Unzipping tokenizers/punkt.zip.\n",
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
- "[nltk_data] Package stopwords is already up-to-date!\n",
+ "[nltk_data] Unzipping corpora/stopwords.zip.\n",
"[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
- "[nltk_data] Package wordnet is already up-to-date!\n",
+ "[nltk_data] Unzipping corpora/wordnet.zip.\n",
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] /root/nltk_data...\n",
- "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
- "[nltk_data] date!\n"
+ "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n"
],
"name": "stdout"
+ },
+ {
+ "output_type": "stream",
+ "text": [
+ "/usr/local/lib/python3.6/dist-packages/sklearn/externals/joblib/__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n",
+ " warnings.warn(msg, category=FutureWarning)\n"
+ ],
+ "name": "stderr"
}
]
},
@@ -317,17 +324,17 @@
"metadata": {
"id": "hQDprSKllSNL",
"colab_type": "code",
- "outputId": "e0447214-0427-4a1a-b526-4bacb6f0274d",
+ "outputId": "f3cb8325-5421-45c0-f9cc-60f0a23e3278",
"colab": {
"base_uri": "https://localhost:8080/",
- "height": 391
+ "height": 204
}
},
"source": [
"data1 = pd.read_csv('https://github.com/TharinduMunasinge/Twitter-Sentiment-Analysis/raw/master/DataSet/FinalizedFull.csv').rename(columns={'tweet':'text'})\n",
"data1.head()"
],
- "execution_count": 0,
+ "execution_count": 3,
"outputs": [
{
"output_type": "execute_result",
@@ -397,77 +404,7 @@
"metadata": {
"tags": []
},
- "execution_count": 23
- },
- {
- "output_type": "execute_result",
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " text | \n",
- " senti | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " @united UA5396 can wait for me. I'm on the gro... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " I hate Time Warner! Soooo wish I had Vios. Can... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " @united Oh, we are sure it's not planned, but ... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Tom Shanahan's latest column on SDSU and its N... | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Found the self driving car!! /IWo3QSvdu2 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " text senti\n",
- "0 @united UA5396 can wait for me. I'm on the gro... 0\n",
- "1 I hate Time Warner! Soooo wish I had Vios. Can... 0\n",
- "2 @united Oh, we are sure it's not planned, but ... 0\n",
- "3 Tom Shanahan's latest column on SDSU and its N... 2\n",
- "4 Found the self driving car!! /IWo3QSvdu2 2"
- ]
- },
- "metadata": {
- "tags": []
- },
- "execution_count": 34
+ "execution_count": 3
}
]
},
@@ -476,24 +413,21 @@
"metadata": {
"id": "sOCWOfJFPLUd",
"colab_type": "code",
- "outputId": "d5c4726b-abde-40fb-cebe-64de9bdedaf3",
+ "outputId": "a904e268-1d20-44d7-ac94-c09b648335a0",
"colab": {
"base_uri": "https://localhost:8080/",
- "height": 119
+ "height": 68
}
},
"source": [
"# Check the balance of the dataset\n",
"counts1 = print_balance(data1)"
],
- "execution_count": 0,
+ "execution_count": 4,
"outputs": [
{
"output_type": "stream",
"text": [
- "This dataset contains 377 negative tweets (37.8%)\n",
- "This dataset contains 239 neutral tweets (24.0%)\n",
- "This dataset contains 381 positive tweets (38.2%)\n",
"This dataset contains 377 negative tweets (37.8%)\n",
"This dataset contains 239 neutral tweets (24.0%)\n",
"This dataset contains 381 positive tweets (38.2%)\n"
@@ -527,10 +461,10 @@
"metadata": {
"id": "Ks9cEN71BvyK",
"colab_type": "code",
- "outputId": "29eaf93c-b1a0-42fe-e0c5-d2118ff47f5e",
+ "outputId": "fc799ee0-2da7-4b3d-82fb-38f6066fbee5",
"colab": {
"base_uri": "https://localhost:8080/",
- "height": 391
+ "height": 204
}
},
"source": [
@@ -545,7 +479,7 @@
"\n",
"data2.head()"
],
- "execution_count": 0,
+ "execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
@@ -615,77 +549,7 @@
"metadata": {
"tags": []
},
- "execution_count": 25
- },
- {
- "output_type": "execute_result",
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " senti | \n",
- " text | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 4 | \n",
- " Now all @Apple has to do is get swype on the i... | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 4 | \n",
- " @Apple will be adding more carrier support to ... | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 4 | \n",
- " Hilarious @youtube video - guy does a duet wit... | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 4 | \n",
- " @RIM you made it too easy for me to switch to ... | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 4 | \n",
- " I just realized that the reason I got into twi... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " senti text\n",
- "0 4 Now all @Apple has to do is get swype on the i...\n",
- "1 4 @Apple will be adding more carrier support to ...\n",
- "2 4 Hilarious @youtube video - guy does a duet wit...\n",
- "3 4 @RIM you made it too easy for me to switch to ...\n",
- "4 4 I just realized that the reason I got into twi..."
- ]
- },
- "metadata": {
- "tags": []
- },
- "execution_count": 36
+ "execution_count": 5
}
]
},
@@ -694,24 +558,21 @@
"metadata": {
"id": "xHT5w5apSblQ",
"colab_type": "code",
- "outputId": "c0c924e6-b291-41ae-c4f1-e751972ae8d4",
+ "outputId": "50d3f8c1-6699-4ebd-e27a-8ac3e8c218f7",
"colab": {
"base_uri": "https://localhost:8080/",
- "height": 119
+ "height": 68
}
},
"source": [
"# Check the balance of the dataset\n",
"counts2 = print_balance(data2)"
],
- "execution_count": 0,
+ "execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": [
- "This dataset contains 572 negative tweets (16.7%)\n",
- "This dataset contains 2333 neutral tweets (68.1%)\n",
- "This dataset contains 519 positive tweets (15.2%)\n",
"This dataset contains 572 negative tweets (16.7%)\n",
"This dataset contains 2333 neutral tweets (68.1%)\n",
"This dataset contains 519 positive tweets (15.2%)\n"
@@ -737,10 +598,10 @@
"metadata": {
"id": "nILYW_U3UgY0",
"colab_type": "code",
- "outputId": "bbe4274d-bbe4-45ed-b98b-73ae65ad23cf",
+ "outputId": "028ef27e-03ef-45d5-95d9-d36ee3ce4977",
"colab": {
"base_uri": "https://localhost:8080/",
- "height": 119
+ "height": 68
}
},
"source": [
@@ -751,14 +612,11 @@
"# Check the balance of the dataset\n",
"train_counts = print_balance(train)"
],
- "execution_count": 0,
+ "execution_count": 7,
"outputs": [
{
"output_type": "stream",
"text": [
- "This dataset contains 949 negative tweets (21.5%)\n",
- "This dataset contains 2572 neutral tweets (58.2%)\n",
- "This dataset contains 900 positive tweets (20.4%)\n",
"This dataset contains 949 negative tweets (21.5%)\n",
"This dataset contains 2572 neutral tweets (58.2%)\n",
"This dataset contains 900 positive tweets (20.4%)\n"
@@ -796,10 +654,10 @@
"metadata": {
"id": "-xAwNoKD8okU",
"colab_type": "code",
- "outputId": "842f5601-13f7-4bcd-9d63-47beb54e7671",
+ "outputId": "8bf2683d-91af-4ec8-d2a8-56ce5bd4d0da",
"colab": {
"base_uri": "https://localhost:8080/",
- "height": 119
+ "height": 68
}
},
"source": [
@@ -812,14 +670,11 @@
"# Create train and test sets\n",
"X_train, X_test, y_train, y_test = train_test_split(train_adj.text.values, train_adj.senti.values, random_state=637)"
],
- "execution_count": 0,
+ "execution_count": 8,
"outputs": [
{
"output_type": "stream",
"text": [
- "This dataset contains 900 negative tweets (28.6%)\n",
- "This dataset contains 1350 neutral tweets (42.9%)\n",
- "This dataset contains 900 positive tweets (28.6%)\n",
"This dataset contains 900 negative tweets (28.6%)\n",
"This dataset contains 1350 neutral tweets (42.9%)\n",
"This dataset contains 900 positive tweets (28.6%)\n"
@@ -857,10 +712,10 @@
"metadata": {
"id": "zwvToCqPIX7M",
"colab_type": "code",
- "outputId": "a2ceee8c-df8d-4216-90e1-fc8400b544b2",
+ "outputId": "1ba76319-d6c7-4abb-c6a5-2931e8abd500",
"colab": {
"base_uri": "https://localhost:8080/",
- "height": 323
+ "height": 170
}
},
"source": [
@@ -880,20 +735,11 @@
"X_test_bal = test_bal.text.values\n",
"y_test_bal = test_bal.senti.values"
],
- "execution_count": 0,
+ "execution_count": 10,
"outputs": [
{
"output_type": "stream",
"text": [
- "\u001b[1mComposition of unbalanced test set:\u001b[0m\n",
- "This dataset contains 111 negative tweets (20.2%)\n",
- "This dataset contains 327 neutral tweets (59.1%)\n",
- "This dataset contains 114 positive tweets (20.7%)\n",
- "\n",
- "\u001b[1mComposition of balanced test set:\u001b[0m\n",
- "This dataset contains 114 negative tweets (33.3%)\n",
- "This dataset contains 114 neutral tweets (33.3%)\n",
- "This dataset contains 114 positive tweets (33.3%)\n",
"\u001b[1mComposition of unbalanced test set:\u001b[0m\n",
"This dataset contains 111 negative tweets (20.2%)\n",
"This dataset contains 327 neutral tweets (59.1%)\n",
@@ -1247,7 +1093,7 @@
" print('Calculating uncertainty for aggregate sentiment scores...')\n",
" self.agg_errs, self.agg_true_means, self.agg_pred_means = self.agg_sent_score(X, y,\n",
" n_bootstraps=1000,\n",
- " bootstrap_size=0.5,\n",
+ " bootstrap_size=1000,\n",
" verbose=True)\n",
" self.agg_me = self.agg_errs.mean() # Mean average error in aggregated sentiment scores\n",
" self.agg_unc = model.agg_errs.std()*2 # Uncertainty in aggregated sentiment scores (95% confidence)\n",
@@ -1275,7 +1121,8 @@
"\n",
" def predict_agg(self, X, verbose=False):\n",
" agg_sent = ((self.grid.predict(X) - 2) / 2).mean()\n",
- " if verbose: print('\\n(scale from -1 to 1) Aggregated sentiment score: {0:.2f} \\u00B1 {1:.2f}\\n'.format(agg_sent, self.agg_unc))\n",
- " return agg_sent, self.agg_unc\n",
+ " agg_unc = self.agg_unc * np.sqrt(1000 / len(X)) # self.agg_unc was calibrated with bootstrap_size=1000; rescale it to this sample size\n",
+ " if verbose: print('(scale from -1 to 1) Aggregated sentiment score: {0:.2f} \\u00B1 {1:.2f}\\n'.format(agg_sent, agg_unc))\n",
+ " return agg_sent, agg_unc # return the same rescaled uncertainty that is printed, not the unscaled self.agg_unc\n",
"\n",
"\n",
@@ -1377,14 +1224,13 @@
" return fig\n",
"\n",
"\n",
- " def agg_sent_score(self, X, y, n_bootstraps=1000, bootstrap_size=0.5, verbose=False):\n",
+ " def agg_sent_score(self, X, y, n_bootstraps=1000, bootstrap_size=1000, verbose=False):\n",
" np.random.seed(637)\n",
- " size = int(len(X) * bootstrap_size)\n",
" true_means, pred_means, errs = np.ones(n_bootstraps), np.ones(n_bootstraps), np.ones(n_bootstraps)\n",
" for i in range(n_bootstraps):\n",
" if verbose:\n",
" if not (i+1)%(n_bootstraps//5): print('{0:.0f}% done'.format((i+1)/(n_bootstraps//100)))\n",
- " ind = np.random.randint(0, len(X), size)\n",
+ " ind = np.random.randint(0, len(X), int(bootstrap_size))\n",
" X_boot = X[ind]\n",
" y_boot = y[ind]\n",
" y_pred = self.grid.predict(X_boot)\n",
@@ -1401,7 +1247,7 @@
" return errs, true_means, pred_means\n",
"\n",
"\n",
- " def agg_sent_hist(self, X, y, n_bootstraps=1000, bootstrap_size=0.5, verbose=False):\n",
+ " def agg_sent_hist(self, X, y, n_bootstraps=1000, bootstrap_size=1000, verbose=False):\n",
" errs, true_means, pred_means = self.agg_sent_score(X, y, n_bootstraps, bootstrap_size, verbose)\n",
" fig, ax = plt.subplots(figsize=(7, 7))\n",
" ax.hist(errs)\n",
@@ -1412,7 +1258,7 @@
" return fig\n",
"\n",
"\n",
- " def agg_sent_plot(self, X, y, n_bootstraps=1000, bootstrap_size=0.5, verbose=False):\n",
+ " def agg_sent_plot(self, X, y, n_bootstraps=1000, bootstrap_size=1000, verbose=False):\n",
" errs, true_means, pred_means = self.agg_sent_score(X, y, n_bootstraps, bootstrap_size, verbose)\n",
" fig, ax = plt.subplots(figsize=(7, 7))\n",
" ax.scatter(true_means, pred_means)\n",
@@ -1484,26 +1330,27 @@
"metadata": {
"id": "qsH0VP_pW-OR",
"colab_type": "code",
- "outputId": "a467e4ea-af07-4377-d4d0-3b9200a9352d",
+ "outputId": "ecfcdea1-60e9-46ec-cec8-b503a753e907",
"colab": {
"base_uri": "https://localhost:8080/",
- "height": 445
+ "height": 394
}
},
"source": [
- "model = TwitterSentimentModel()\n",
- "model.fit(X_train, y_train, classifier='all')\n",
- "model.set_uncertainty(X_test, y_test)\n",
+ "if False:\n",
+ " model = TwitterSentimentModel()\n",
+ " model.fit(X_train, y_train, classifier='SGD')\n",
+ " model.set_uncertainty(X_test, y_test)\n",
"\n",
- "with open(\"trained_model.pickle\", \"wb\") as f:\n",
- " pickle.dump(model, f)"
+ " with open(\"trained_model.pickle\", \"wb\") as f:\n",
+ " pickle.dump(model, f)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
- "Fitting 10 folds for each of 132 candidates, totalling 1320 fits\n"
+ "Fitting 10 folds for each of 24 candidates, totalling 240 fits\n"
],
"name": "stdout"
},
@@ -1511,12 +1358,9 @@
"output_type": "stream",
"text": [
"[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n",
- "[Parallel(n_jobs=-1)]: Done 46 tasks | elapsed: 55.7s\n",
- "[Parallel(n_jobs=-1)]: Done 196 tasks | elapsed: 2.3min\n",
- "[Parallel(n_jobs=-1)]: Done 446 tasks | elapsed: 2.8min\n",
- "[Parallel(n_jobs=-1)]: Done 796 tasks | elapsed: 3.4min\n",
- "[Parallel(n_jobs=-1)]: Done 1246 tasks | elapsed: 5.7min\n",
- "[Parallel(n_jobs=-1)]: Done 1320 out of 1320 | elapsed: 6.0min finished\n"
+ "[Parallel(n_jobs=-1)]: Done 46 tasks | elapsed: 6.8s\n",
+ "[Parallel(n_jobs=-1)]: Done 196 tasks | elapsed: 28.8s\n",
+ "[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 35.5s finished\n"
],
"name": "stderr"
},
@@ -1538,7 +1382,7 @@
"60% done\n",
"80% done\n",
"100% done\n",
- "Done! Aggregate sentiment uncertainty = 0.0494\n"
+ "Done! Aggregate sentiment uncertainty = 0.0353\n"
],
"name": "stdout"
}
@@ -1573,7 +1417,7 @@
"metadata": {
"id": "WTrBk5ecFw4N",
"colab_type": "code",
- "outputId": "f7e00bd4-2f0d-412b-f53d-127c8e2692dd",
+ "outputId": "ea182b06-2b89-44c3-864d-13ab768a313b",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
@@ -1601,7 +1445,7 @@
"metadata": {
"id": "_1skobezDS2r",
"colab_type": "code",
- "outputId": "b21d7ab2-802d-4088-f43c-c60f881e8401",
+ "outputId": "b8b94f47-e497-452d-9236-72f74f92f76f",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 466
@@ -1633,7 +1477,7 @@
"metadata": {
"id": "s5X5AWId2jWp",
"colab_type": "code",
- "outputId": "c21ad1d3-a520-48ce-fc2a-b6530941e990",
+ "outputId": "4cbf5fb3-5e2d-4d4e-a3ae-dd1d70f8263d",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 487
@@ -1649,14 +1493,14 @@
{
"output_type": "stream",
"text": [
- "Mean aggregate score error: 0.00407\n"
+ "Mean aggregate score error: 0.00379\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
- "image/png": "\n",
+ "image/png": "\n",
"text/plain": [
"