-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
corpcmp.py
executable file
·59 lines (45 loc) · 1.32 KB
/
corpcmp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python3
#
# Read a text file and print all words that are not in the dictionary
# (and not in the missing list). This is useful for finding words that
# are missing from the dictionary.
# TODO: Work directly on epubs instead of text files
#
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from db import EnskDatabase
from util import read_wordlist
# Lemmatizer used to reduce each token to a base form before the lookup.
lemmatizer = WordNetLemmatizer()

# Words listed in missing.txt are excluded from the report.
missing = read_wordlist("missing.txt")

# Read the whole corpus into memory. The encoding is explicit so decoding does
# not depend on the platform's default locale (the loop below matches the
# non-ASCII character "…"). NOTE(review): assumes the corpus is UTF-8 — confirm.
with open("texts/quine.txt", "r", encoding="utf-8") as f:
    corpus = f.read()

# Initialize database singleton
e = EnskDatabase()

# Read all dictionary entries into memory
res = e.read_all_entries()

# Lower-cased dictionary words, kept in a set so that each membership test in
# the loop is a hash lookup rather than a scan over every entry.
dict_words = {entry["word"].lower() for entry in res}

words = word_tokenize(corpus)

# NOTE(review): instantiated but not used by anything below.
ps = PorterStemmer()

# Matches a run of digits at the end of a token; compiled once, outside the loop.
TRAILING_DIGITS = re.compile(r"\d+$")

for w in words:
    # Strip trailing digits, then one trailing period, then one trailing
    # ellipsis character.
    w = TRAILING_DIGITS.sub("", w)
    if w.endswith("."):
        w = w[:-1]
    if w.endswith("…"):
        w = w[:-1]

    # Nothing is left of tokens such as "." or "1984"; skip them instead of
    # printing a blank line.
    if not w:
        continue

    # Tokens ending in "ing" or "ed" are never reported.
    if w.endswith(("ing", "ed")):
        continue

    # For tokens ending in "s", skip when the token without that final "s" is
    # in the dictionary. Every entry in dict_words is lower-cased, so checking
    # the lower-cased form alone is sufficient.
    if w.endswith("s") and w[:-1].lower() in dict_words:
        continue

    lemma = lemmatizer.lemmatize(w)
    llow = lemma.lower()

    # Print the lemma only when it is neither in the dictionary nor in the
    # missing list. The missing list is checked with both the original and the
    # lower-cased form because its casing is not established here.
    if (
        llow not in dict_words
        and lemma not in missing
        and llow not in missing
    ):
        print(lemma)