diff --git a/GetTweets.py b/GetTweets.py
new file mode 100644
index 0000000..76a61bd
--- /dev/null
+++ b/GetTweets.py
@@ -0,0 +1,24 @@
+import twitter
+from TwitterKeys import *
+from Pickling import PickleBuddy, unPickleBuddy
+
+def cleanTweet(unsorted):
+    textList = []
+    cleanList = []
+    for tweet in unsorted:
+        textList.append(tweet.text)
+    for text in textList:
+        if text[0] == 'R':  # crude retweet filter: retweet text starts with "RT @user: ..."
+            cleaned = (text.split(':')[1])  # keep the text after the "RT @user:" prefix
+            if '#' in cleaned:
+                cleaned = cleaned.split("#")[0]  # drop any trailing hashtags
+            if cleaned != '':
+                cleanList.append(cleaned)
+    return cleanList
+
+
+def getTweets():
+    PickleBuddy()
+    unfilteredTweet = unPickleBuddy()
+    niceTweet = cleanTweet(unfilteredTweet)
+    return niceTweet
diff --git a/Pickling.py b/Pickling.py
new file mode 100644
index 0000000..cfff58d
--- /dev/null
+++ b/Pickling.py
@@ -0,0 +1,25 @@
+filePath = "C:/Users/sokuno/TextMining"
+import os
+import pickle
+import twitter
+from TwitterKeys import *
+
+
+def PickleBuddy():
+    files = os.listdir(filePath) # make a list of all the files that already exist
+    if "cachedData.pickle" not in files:
+        # setting up the api using twitter key info from TwitterKeys.py
+        api = twitter.Api(consumer_key=CONSUMER_KEY,
+                          consumer_secret=CONSUMER_SECRET,
+                          access_token_key=ACCESS_TOKEN_KEY,
+                          access_token_secret=ACCESS_TOKEN_SECRET)
+        fullTweet = api.GetSearch(term='#inspirationalquotes', raw_query=None, geocode=None, since_id=None, max_id=None, until=None, since=None, count=100, lang=None, locale=None, result_type='recent', include_entities=None)
+        pickles = open(filePath+'/cachedData.pickle', 'wb')  # write the cache where unPickleBuddy reads it
+        pickle.dump(fullTweet, pickles)
+        pickles.close()
+
+def unPickleBuddy():
+    ultimateUnPickle = []
+    unpickles = open(filePath+'/cachedData.pickle', 'rb')
+    untest = pickle.load(unpickles)
+    return untest
diff --git a/README.md b/README.md
index 8cce527..faafe20 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,15 @@
 # TextMining
 
 This is the base repo for the text mining and analysis project for Software Design at Olin College.
+
+First, make sure the following modules are available (only twitter needs to be installed; the rest are part of the Python standard library):
+- re
+- random
+- sys
+- twitter (the python-twitter package)
+- os
+- pickle
+
+You will also need a TwitterKeys.py file that defines CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN_KEY, and ACCESS_TOKEN_SECRET, and the filePath variable at the top of Pickling.py should point at your local copy of this repository.
+
+Then run the project using the TwitterQuoteBot.py file.
diff --git a/TextMining_reflection.pdf b/TextMining_reflection.pdf
new file mode 100644
index 0000000..0d10a01
Binary files /dev/null and b/TextMining_reflection.pdf differ
diff --git a/TwitterQuoteBot.py b/TwitterQuoteBot.py
new file mode 100644
index 0000000..b54dc3b
--- /dev/null
+++ b/TwitterQuoteBot.py
@@ -0,0 +1,6 @@
+import re
+from GetTweets import getTweets
+from sentence_generator import buildMapping, genSentence, main
+words = "".join(getTweets())
+words = words.replace('\r', '').replace('\n', '').replace('T.','').replace('D.','')  # strip line breaks and the 'T.'/'D.' initials whose periods would end sentences early
+main(words,1)
diff --git a/cachedData.pickle b/cachedData.pickle
new file mode 100644
index 0000000..2a0c21e
Binary files /dev/null and b/cachedData.pickle differ
diff --git a/cachedData1.pickle b/cachedData1.pickle
new file mode 100644
index 0000000..a33c80a
Binary files /dev/null and b/cachedData1.pickle differ
diff --git a/cachedData2.pickle b/cachedData2.pickle
new file mode 100644
index 0000000..bc43f36
Binary files /dev/null and b/cachedData2.pickle differ
diff --git a/sentence_generator.py b/sentence_generator.py
new file mode 100644
index 0000000..dc543f3
--- /dev/null
+++ b/sentence_generator.py
@@ -0,0 +1,131 @@
+#!/usr/bin/python
+
+import re
+import random
+import sys
+
+# These mappings can get fairly large -- they're stored globally to
+# save copying time.
+
+# (tuple of words) -> {dict: word -> number of times the word appears following the tuple}
+# Example entry:
+#   ('eyes', 'turned') => {'to': 2.0, 'from': 1.0}
+# Used briefly while first constructing the normalized mapping
+tempMapping = {}
+
+# (tuple of words) -> {dict: word -> *normalized* number of times the word appears following the tuple}
+# Example entry:
+#   ('eyes', 'turned') => {'to': 0.66666666, 'from': 0.33333333}
+mapping = {}
+
+# Contains the set of words that can start sentences
+starts = []
+
+# We want to be able to compare words independent of their capitalization.
+def fixCaps(word):
+    # Ex: "FOO" -> "foo"
+    if word.isupper() and word != "I":
+        word = word.lower()
+    # Ex: "LaTeX" => "Latex"
+    elif word[0].isupper():
+        word = word.lower().capitalize()
+    # Ex: "wOOt" -> "woot"
+    else:
+        word = word.lower()
+    return word
+
+# Tuples can be hashed; lists can't. We need hashable values for dict keys.
+# This looks like a hack (and it is, a little) but in practice it doesn't
+# affect processing time too negatively.
+def toHashKey(lst):
+    return tuple(lst)
+
+# Returns the contents of the file, split into a list of words and
+# (some) punctuation.
+def wordlist(words):
+    # f = open(filename, 'r')
+    wordlist = [fixCaps(w) for w in re.findall(r"[\w']+|[.,!?;]", words)]
+    # f.close()
+    return wordlist
+
+# Self-explanatory -- adds "word" to the "tempMapping" dict under "history".
+# tempMapping (and mapping) both match each word to a list of possible next
+# words.
+# Given history = ["the", "rain", "in"] and word = "Spain", we add "Spain" to
+# the entries for ["the", "rain", "in"], ["rain", "in"], and ["in"].
+def addItemToTempMapping(history, word):
+    global tempMapping
+    while len(history) > 0:
+        first = toHashKey(history)
+        if first in tempMapping:
+            if word in tempMapping[first]:
+                tempMapping[first][word] += 1.0
+            else:
+                tempMapping[first][word] = 1.0
+        else:
+            tempMapping[first] = {}
+            tempMapping[first][word] = 1.0
+        history = history[1:]
+
+# Building and normalizing the mapping.
+def buildMapping(wordlist, markovLength):
+    global tempMapping
+    starts.append(wordlist[0])
+    for i in range(1, len(wordlist) - 1):
+        if i <= markovLength:
+            history = wordlist[: i + 1]
+        else:
+            history = wordlist[i - markovLength + 1 : i + 1]
+        follow = wordlist[i + 1]
+        # if the last elt was a period, add the next word to the start list
+        if history[-1] == "." and follow not in ".,!?;":
+            starts.append(follow)
+        addItemToTempMapping(history, follow)
+    # Normalize the values in tempMapping, put them into mapping
+    for first, followset in tempMapping.items():
+        total = sum(followset.values())
+        # Normalizing here:
+        mapping[first] = dict([(k, v / total) for k, v in followset.items()])
+
+# Returns the next word in the sentence (chosen randomly),
+# given the previous ones.
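+# For example (illustrative numbers only): if mapping[('rain',)] were {'in': 0.75, 'stops': 0.25}
+# and random.random() returned 0.6, the running sum below reaches 0.75 at 'in' (0.75 >= 0.6),
+# so next(['rain']) would return 'in' -- over many calls 'in' is chosen about 75% of the time.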
+def next(prevList):
+    sum = 0.0
+    retval = ""
+    index = random.random()
+    # Shorten prevList until it's in mapping
+    while toHashKey(prevList) not in mapping:
+        prevList.pop(0)
+    # Get a random word from the mapping, given prevList
+    for k, v in mapping[toHashKey(prevList)].items():
+        sum += v
+        if sum >= index and retval == "":
+            retval = k
+    return retval
+
+def genSentence(markovLength):
+    # Start with a random "starting word"
+    curr = random.choice(starts)
+    sent = curr.capitalize()
+    prevList = [curr]
+    # Keep adding words until we hit a period
+    while (curr not in "."):
+        curr = next(prevList)
+        prevList.append(curr)
+        # if the prevList has gotten too long, trim it
+        if len(prevList) > markovLength:
+            prevList.pop(0)
+        if (curr not in ".,!?;"):
+            sent += " " # Add spaces between words (but not punctuation)
+        sent += curr
+    return sent
+
+def main(words,markov):
+    buildMapping(wordlist(words), markov)
+    print(genSentence(markov))
+
+if __name__ == "__main__":
+    main(open(sys.argv[1]).read(), 1)  # when run directly, read the source text from a file named on the command line
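
Note: TwitterKeys.py is imported by Pickling.py and GetTweets.py but is not included in this diff (it holds the Twitter API credentials). A minimal sketch of what it is assumed to contain, based on the names the code imports; the placeholder values are hypothetical and must be replaced with your own app's credentials:

    # TwitterKeys.py -- assumed layout, not part of this change
    CONSUMER_KEY = "your-consumer-key"
    CONSUMER_SECRET = "your-consumer-secret"
    ACCESS_TOKEN_KEY = "your-access-token-key"
    ACCESS_TOKEN_SECRET = "your-access-token-secret"

With that file in place (and filePath in Pickling.py pointing at your checkout), running python TwitterQuoteBot.py caches a batch of #inspirationalquotes search results via Pickling.py, cleans them in GetTweets.py, and prints one Markov-generated sentence from sentence_generator.py.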