-
Notifications
You must be signed in to change notification settings - Fork 16
Add files via upload #4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
""" | ||
John Wen - Text Mining assignment | ||
|
||
""" | ||
|
||
import requests | ||
import random | ||
|
||
def gettext(url):
    """Download *url* and return the response body as text.

    Raises requests.HTTPError on a non-2xx status instead of silently
    returning an error page, and gives up after 30 seconds rather than
    hanging forever (requests.get has no default timeout).
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text
|
||
Christmas = gettext('http://www.gutenberg.org/cache/epub/46/pg46.txt') | ||
#OliverTwist = gettext('http://www.gutenberg.org/ebooks/730.txt.utf-8') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please remove comments for your final code |
||
#DavidCopperfield = gettext('http://www.gutenberg.org/files/766/766-0.txt') | ||
#GreatExpectations = gettext('http://www.gutenberg.org/files/1400/1400-0.txt') | ||
#ATaleofTwoCities = gettext('http://www.gutenberg.org/files/98/98-0.txt') | ||
|
||
def cleanuplist(textlist):
    """Lowercase *textlist*, strip punctuation symbols, and return the words.

    Splits on whitespace, deletes every character in the symbol set from
    each word, and drops words that become empty.

    >>> cleanuplist('This project is so hard!')
    ['this', 'project', 'is', 'so', 'hard']
    >>> cleanuplist('I need, a bunch, of !? doctest?')
    ['i', 'need', 'a', 'bunch', 'of', 'doctest']
    """
    # Build one translation table (each symbol -> deleted) instead of
    # running ~30 str.replace calls per word; str.translate does the
    # whole strip in a single C-level pass.
    symbols = "-_=+[}{]:;?/.>,<?!@#$%^&*()|'"
    table = str.maketrans('', '', symbols)
    words = textlist.lower().split()
    return [stripped for stripped in (w.translate(table) for w in words) if stripped]
|
||
#print(len(cleanuplist(ATaleofTwoCities))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove comments |
||
|
||
|
||
def wordcounter(text):
    """Return a dict mapping each cleaned word of *text* to its count.

    Words are normalized by cleanuplist (lowercased, punctuation removed),
    so 'Test' and 'test!' count as the same word.

    >>> wordcounter('This This This is is Another Docstring Test Test Is!!!!!!!!!')
    {'this': 3, 'is': 3, 'another': 1, 'docstring': 1, 'test': 2}
    """
    # collections.Counter tallies in C; converting back to a plain dict
    # keeps the return type (and first-seen key order) unchanged.
    import collections
    return dict(collections.Counter(cleanuplist(text)))
|
||
|
||
def highestfreqword(d):
    """Return the (word, count) pair with the highest count in *d*.

    Ties break by insertion order: the first key holding the maximum
    count wins, just as in the original list-index implementation.

    >>> highestfreqword({'the': 5, 'apple': 2, 'is' : 3, 'large' : 15})
    ('large', 15)
    """
    # One scan with max(); the original built two parallel lists and then
    # searched them three more times (max, index, and a dict lookup).
    return max(d.items(), key=lambda pair: pair[1])
|
||
|
||
def topNvalues(d, n):
    """Return the n highest-count (word, count) pairs, highest first.

    Bug fix: the original aliased the input (`newdictionary = d`) and then
    popped each selected key, destroying the caller's dictionary as a side
    effect; it also raised once n exceeded len(d). This version leaves *d*
    untouched and simply returns fewer pairs when n is too large.

    >>> topNvalues({'the': 5, 'apple': 2, 'is' : 3, 'large' : 15},2)
    [('large', 15), ('the', 5)]
    """
    # sorted() is stable, so equal counts keep insertion order -- the same
    # tie-break the old repeated pop-the-max loop produced.
    return sorted(d.items(), key=lambda pair: pair[1], reverse=True)[:n]
|
||
#print(topNvalues(wordcounter(ATaleofTwoCities),5)) | ||
|
||
def uniquewordsused(s):
    """Return the number of distinct cleaned words in the string *s*.

    >>> uniquewordsused('There should be six words used')
    6
    >>> uniquewordsused('There should be seven words used here')
    7
    """
    # A set counts distinct words in O(n); the old double `not in` scan
    # over list slices was O(n^2) -- the reason it took "a lot of
    # computational power LIKE A LOT" on full novels.
    return len(set(cleanuplist(s)))
|
||
def dictionaryofprefixes(s):
    """Map every distinct cleaned word of *s* to a fresh empty list.

    Serves as the prefix index that suffixdictionary fills in; each word
    appears exactly once, in order of first appearance.

    >>> dictionaryofprefixes('This This This is another doctring test')
    {'this': [], 'is': [], 'another': [], 'doctring': [], 'test': []}
    """
    # Duplicate words just overwrite their entry with another empty list,
    # so the comprehension keeps first-seen key order like the old loop.
    return {word: [] for word in cleanuplist(s)}
|
||
|
||
|
||
def suffixdictionary(s):
    """Build a prefix->suffix index for the Markov sentence generator.

    Each cleaned word of *s* maps to the list of distinct words that ever
    directly follow it; the text's final word keeps an empty list.

    >>> suffixdictionary('This cat is this cars best friend. This docstring is this assignments')
    {'this': ['cat', 'cars', 'docstring', 'assignments'], 'cat': ['is'], 'is': ['this'], 'cars': ['best'], 'best': ['friend'], 'friend': ['this'], 'docstring': ['is'], 'assignments': []}
    """
    suffix_index = dictionaryofprefixes(s)
    words = cleanuplist(s)
    # zip pairs every word with its immediate follower; the last word has
    # no follower and so is never a prefix here.
    for prefix, suffix in zip(words, words[1:]):
        if suffix not in suffix_index[prefix]:
            suffix_index[prefix].append(suffix)
    return suffix_index
|
||
#print(suffixdictionary('This is a practice for a dictionary prefixes example so a cat can compute this program!!!')) | ||
|
||
|
||
def sentencegenerator(text, startword, length=10):
    """Generate a sentence of up to *length* words following *startword*,
    each chosen at random from the recorded suffixes of the previous word.

    Bug fix: the original crashed with IndexError (random.choice on an
    empty list) whenever the chain reached a word with no recorded suffix,
    such as the final word of the text; this version stops early instead.
    A startword absent from the text still raises KeyError, as before.
    """
    suffix_index = suffixdictionary(text)
    sentence = [startword]
    for _ in range(length):
        followers = suffix_index[sentence[-1]]
        if not followers:
            break  # dead end: this word was never followed by another
        sentence.append(random.choice(followers))
    return ' '.join(sentence)
|
||
|
||
print(sentencegenerator((Christmas),'the',20)) | ||
|
||
|
||
if __name__ == "__main__": | ||
import doctest | ||
doctest.testmod(verbose=False) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,194 @@ | ||
""" | ||
John Wen - Text Mining assignment | ||
|
||
""" | ||
|
||
import requests | ||
#requests used to obtain text from urls. | ||
import random | ||
#random function used to generate sentences based on prefix-suffix | ||
|
||
def gettext(url):
    """Download *url* and return the response body as a long string.

    Raises requests.HTTPError on a non-2xx status instead of silently
    returning an error page, and gives up after 30 seconds rather than
    hanging forever (requests.get has no default timeout).
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text
|
||
"""Top 5 Charles Dickens Books as Text Files""" | ||
|
||
Christmas = gettext('http://www.gutenberg.org/cache/epub/46/pg46.txt') | ||
OliverTwist = gettext('http://www.gutenberg.org/ebooks/730.txt.utf-8') | ||
DavidCopperfield = gettext('http://www.gutenberg.org/files/766/766-0.txt') | ||
GreatExpectations = gettext('http://www.gutenberg.org/files/1400/1400-0.txt') | ||
ATaleofTwoCities = gettext('http://www.gutenberg.org/files/98/98-0.txt') | ||
|
||
def cleanuplist(textlist):
    """Lowercase *textlist*, strip punctuation symbols, and return the words.

    Splits on whitespace, deletes every character in the symbol set from
    each word, and drops words that become empty.

    >>> cleanuplist('This project is so hard!')
    ['this', 'project', 'is', 'so', 'hard']
    >>> cleanuplist('I need, a bunch, of !? doctest?')
    ['i', 'need', 'a', 'bunch', 'of', 'doctest']
    """
    # Build one translation table (each symbol -> deleted) instead of
    # running ~30 str.replace calls per word; str.translate does the
    # whole strip in a single C-level pass.
    symbols = "-_=+[}{]:;?/.>,<?!@#$%^&*()|'"
    table = str.maketrans('', '', symbols)
    words = textlist.lower().split()
    return [stripped for stripped in (w.translate(table) for w in words) if stripped]
|
||
def wordcounter(text):
    """Return a dict mapping each cleaned word of *text* to its count.

    Words are normalized by cleanuplist (lowercased, punctuation removed),
    so 'Test' and 'test!' count as the same word.

    >>> wordcounter('This This This is is Another Docstring Test Test Is!!!!!!!!!')
    {'this': 3, 'is': 3, 'another': 1, 'docstring': 1, 'test': 2}
    >>> wordcounter('Test Test and way more test')
    {'test': 3, 'and': 1, 'way': 1, 'more': 1}
    """
    # collections.Counter tallies in C; converting back to a plain dict
    # keeps the return type (and first-seen key order) unchanged.
    import collections
    return dict(collections.Counter(cleanuplist(text)))
|
||
|
||
def highestfreqword(d):
    """Return the (word, count) pair with the highest count in *d*.

    Ties break by insertion order: the first key holding the maximum
    count wins, just as in the original list-index implementation.

    >>> highestfreqword({'the': 5, 'apple': 2, 'is' : 3, 'large' : 15})
    ('large', 15)
    >>> highestfreqword({'the': 5, 'apple': 8, 'is' : 3, 'large' : 2})
    ('apple', 8)
    """
    # One scan with max(); the original built two parallel lists and then
    # searched them three more times (max, index, and a dict lookup).
    return max(d.items(), key=lambda pair: pair[1])
|
||
|
||
def topNvalues(d, n):
    """Return the n highest-count (word, count) pairs, highest first.

    Bug fix: the original aliased the input (`newdictionary = d`) and then
    popped each selected key, destroying the caller's dictionary as a side
    effect; it also raised once n exceeded len(d). This version leaves *d*
    untouched and simply returns fewer pairs when n is too large.

    >>> topNvalues({'the': 5, 'apple': 2, 'is' : 3, 'large' : 15}, 2)
    [('large', 15), ('the', 5)]
    >>> topNvalues({'cars': 51, 'bananas': 252, 'pies' : 33, 'bread' : 153}, 3)
    [('bananas', 252), ('bread', 153), ('cars', 51)]
    """
    # sorted() is stable, so equal counts keep insertion order -- the same
    # tie-break the old repeated pop-the-max loop produced.
    return sorted(d.items(), key=lambda pair: pair[1], reverse=True)[:n]
|
||
def uniquewordsused(s):
    """Return the number of distinct cleaned words in the string *s*.

    >>> uniquewordsused('There should be six words used')
    6
    >>> uniquewordsused('There should be seven words used here')
    7
    """
    # A set counts distinct words in O(n); the old double `not in` scan
    # over list slices was O(n^2) -- the reason it took "a lot of
    # computational power LIKE A LOT" on full novels.
    return len(set(cleanuplist(s)))
|
||
def dictionaryofprefixes(s):
    """Map every distinct cleaned word of *s* to a fresh empty list.

    Serves as the prefix index that suffixdictionary fills in; each word
    appears exactly once, in order of first appearance.

    >>> dictionaryofprefixes('This This This is another doctring test')
    {'this': [], 'is': [], 'another': [], 'doctring': [], 'test': []}
    >>> dictionaryofprefixes('More docstring tests')
    {'more': [], 'docstring': [], 'tests': []}
    """
    # Duplicate words just overwrite their entry with another empty list,
    # so the comprehension keeps first-seen key order like the old loop.
    return {word: [] for word in cleanuplist(s)}
|
||
|
||
|
||
def suffixdictionary(s):
    """Build a prefix->suffix index for the Markov sentence generator.

    Each cleaned word of *s* maps to the list of distinct words that ever
    directly follow it; the text's final word keeps an empty list.

    >>> suffixdictionary('This cat is this cars best friend. This docstring is this assignments')
    {'this': ['cat', 'cars', 'docstring', 'assignments'], 'cat': ['is'], 'is': ['this'], 'cars': ['best'], 'best': ['friend'], 'friend': ['this'], 'docstring': ['is'], 'assignments': []}
    >>> suffixdictionary('Maybe if I create more docstrings, I will get a 5')
    {'maybe': ['if'], 'if': ['i'], 'i': ['create', 'will'], 'create': ['more'], 'more': ['docstrings'], 'docstrings': ['i'], 'will': ['get'], 'get': ['a'], 'a': ['5'], '5': []}
    """
    suffix_index = dictionaryofprefixes(s)
    words = cleanuplist(s)
    # zip pairs every word with its immediate follower; the last word has
    # no follower and so is never a prefix here.
    for prefix, suffix in zip(words, words[1:]):
        if suffix not in suffix_index[prefix]:
            suffix_index[prefix].append(suffix)
    return suffix_index
|
||
def sentencegenerator(text, startword, length=10):
    """Generate a sentence of up to *length* words following *startword*,
    each chosen at random from the recorded suffixes of the previous word.

    Bug fixes over the original:
    - removed the hardcoded random.seed(1), which made every call return
      the same "random" sentence (the author's own comment said to comment
      it out "to actually have the function run");
    - the original crashed with IndexError (random.choice on an empty list)
      whenever the chain reached a word with no recorded suffix, such as
      the final word of the text; this version stops early instead.
    A startword absent from the text still raises KeyError, as before.
    """
    suffix_index = suffixdictionary(text)
    sentence = [startword]
    for _ in range(length):
        followers = suffix_index[sentence[-1]]
        if not followers:
            break  # dead end: this word was never followed by another
        sentence.append(random.choice(followers))
    return ' '.join(sentence)
|
||
"""Results of sentence generator | ||
|
||
print(sentencegenerator((Christmas),'the',20)) | ||
Results1: the shop but to express the upper portion of tank his feet observable beneath the habit with my dears god bless | ||
Results2: the children in too well in carriages and lying down for" muttered with christmas past relenting" said so irresistibly contagious as | ||
Results3: the burialplace of chestnuts on cornhill at midnight hark the quick wheels of addressing mr scrooges keyhole and tender and his | ||
""" | ||
|
||
if __name__ == "__main__": | ||
import doctest | ||
doctest.testmod(verbose=False) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This docstring does not explain what this function is about. Please refer to Oliver's GeneFinder solution. https://github.com/sd17fall/GeneFinder/blob/formatted/gene_finder.py