Skip to content

Commit c2b6e44

Browse files
committed
Add typing and fix set/list logical bug.
1 parent a4546e1 commit c2b6e44

File tree

5 files changed

+165
-60
lines changed

5 files changed

+165
-60
lines changed

.pre-commit-config.yaml

+4
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,7 @@ repos:
2424
hooks:
2525
- id: black
2626
args: [--line-length=120, --skip-string-normalization]
27+
- repo: https://github.com/pre-commit/mirrors-mypy
28+
rev: v0.900
29+
hooks:
30+
- id: mypy

rake_nltk/rake.py

+89-43
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,16 @@
99
from collections import Counter, defaultdict
1010
from enum import Enum
1111
from itertools import chain, groupby, product
12+
from typing import DefaultDict, Dict, List, Optional, Set, Tuple
1213

1314
import nltk
1415
from nltk.tokenize import wordpunct_tokenize
1516

17+
# Readability type definitions.
18+
Word = str
19+
Sentence = str
20+
Phrase = Tuple[str, ...]
21+
1622

1723
class Metric(Enum):
1824
"""Different metrics that can be used for ranking."""
@@ -22,27 +28,45 @@ class Metric(Enum):
2228
WORD_FREQUENCY = 2 # Uses f(w) alone as the metric
2329

2430

25-
class Rake(object):
31+
class Rake:
2632
"""Rapid Automatic Keyword Extraction Algorithm."""
2733

2834
def __init__(
2935
self,
30-
stopwords=None,
31-
punctuations=None,
32-
language='english',
33-
ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO,
34-
max_length=100000,
35-
min_length=1,
36+
stopwords: Optional[Set[str]] = None,
37+
punctuations: Optional[Set[str]] = None,
38+
language: str = 'english',
39+
ranking_metric: Metric = Metric.DEGREE_TO_FREQUENCY_RATIO,
40+
max_length: int = 100000,
41+
min_length: int = 1,
42+
include_repeated_phrases: bool = True,
3643
):
3744
"""Constructor.
3845
39-
:param stopwords: List of Words to be ignored for keyword extraction.
46+
:param stopwords: Words to be ignored for keyword extraction.
4047
:param punctuations: Punctuations to be ignored for keyword extraction.
41-
:param language: Language to be used for stopwords
48+
:param language: Language to be used for stopwords.
4249
:param max_length: Maximum limit on the number of words in a phrase
4350
(Inclusive. Defaults to 100000)
4451
:param min_length: Minimum limit on the number of words in a phrase
4552
(Inclusive. Defaults to 1)
53+
:param include_repeated_phrases: If phrases repeat in phrase list consider
54+
them as is without dropping any phrases for future
55+
calculations. (Defaults to True) Ex: "Magic systems is
56+
a company. Magic systems was founded by Raul".
57+
58+
If repeated phrases are allowed phrase list would be
59+
[
60+
(magic, systems), (company,), (magic, systems),
61+
(founded,), (raul,)
62+
]
63+
64+
If they aren't allowed phrase list would be
65+
[
66+
(magic, systems), (company,),
67+
(founded,), (raul,)
68+
]
69+
4670
"""
4771
# By default use degree to frequency ratio as the metric.
4872
if isinstance(ranking_metric, Metric):
@@ -51,71 +75,78 @@ def __init__(
5175
self.metric = Metric.DEGREE_TO_FREQUENCY_RATIO
5276

5377
# If stopwords not provided we use language stopwords by default.
54-
self.stopwords = stopwords
55-
if self.stopwords is None:
56-
self.stopwords = nltk.corpus.stopwords.words(language)
78+
self.stopwords: Set[str]
79+
if stopwords:
80+
self.stopwords = stopwords
81+
else:
82+
self.stopwords = set(nltk.corpus.stopwords.words(language))
5783

5884
# If punctuations are not provided we ignore all punctuation symbols.
59-
self.punctuations = punctuations
60-
if self.punctuations is None:
61-
self.punctuations = string.punctuation
85+
self.punctuations: Set[str]
86+
if punctuations:
87+
self.punctuations = punctuations
88+
else:
89+
self.punctuations = set(string.punctuation)
6290

6391
# All things which act as sentence breaks during keyword extraction.
64-
self.to_ignore = set(chain(self.stopwords, self.punctuations))
92+
self.to_ignore: Set[str] = set(chain(self.stopwords, self.punctuations))
6593

6694
# Assign min or max length to the attributes
67-
self.min_length = min_length
68-
self.max_length = max_length
95+
self.min_length: int = min_length
96+
self.max_length: int = max_length
97+
98+
# Whether we should include repeated phrases in the computation or not.
99+
self.include_repeated_phrases: bool = include_repeated_phrases
69100

70101
# Stuff to be extracted from the provided text.
71-
self.frequency_dist = None
72-
self.degree = None
73-
self.rank_list = None
74-
self.ranked_phrases = None
102+
self.frequency_dist: Dict[Word, int]
103+
self.degree: Dict[Word, int]
104+
self.rank_list: List[Tuple[float, Sentence]]
105+
self.ranked_phrases: List[Sentence]
75106

76-
def extract_keywords_from_text(self, text):
107+
def extract_keywords_from_text(self, text: str):
77108
"""Method to extract keywords from the text provided.
78109
79110
:param text: Text to extract keywords from, provided as a string.
80111
"""
81-
sentences = nltk.tokenize.sent_tokenize(text)
112+
sentences: List[Sentence] = nltk.tokenize.sent_tokenize(text)
82113
self.extract_keywords_from_sentences(sentences)
83114

84-
def extract_keywords_from_sentences(self, sentences):
115+
def extract_keywords_from_sentences(self, sentences: List[Sentence]):
85116
"""Method to extract keywords from the list of sentences provided.
86117
87118
:param sentences: Text to extract keywords from, provided as a list
88119
of strings, where each string is a sentence.
89120
"""
90-
phrase_list = self._generate_phrases(sentences)
121+
phrase_list: List[Phrase] = self._generate_phrases(sentences)
91122
self._build_frequency_dist(phrase_list)
92123
self._build_word_co_occurance_graph(phrase_list)
93124
self._build_ranklist(phrase_list)
94125

95-
def get_ranked_phrases(self):
126+
def get_ranked_phrases(self) -> List[Sentence]:
96127
"""Method to fetch ranked keyword strings.
97128
98129
:return: List of strings where each string represents an extracted
99130
keyword string.
100131
"""
101132
return self.ranked_phrases
102133

103-
def get_ranked_phrases_with_scores(self):
134+
def get_ranked_phrases_with_scores(self) -> List[Tuple[float, Sentence]]:
104135
"""Method to fetch ranked keyword strings along with their scores.
105136
106137
:return: List of tuples where each tuple is formed of an extracted
107138
keyword string and its score. Ex: (5.68, 'Four Scoures')
108139
"""
109140
return self.rank_list
110141

111-
def get_word_frequency_distribution(self):
142+
def get_word_frequency_distribution(self) -> Dict[Word, int]:
112143
"""Method to fetch the word frequency distribution in the given text.
113144
114145
:return: Dictionary (defaultdict) of the format `word -> frequency`.
115146
"""
116147
return self.frequency_dist
117148

118-
def get_word_degrees(self):
149+
def get_word_degrees(self) -> Dict[Word, int]:
119150
"""Method to fetch the degree of words in the given text. Degree can be
120151
defined as sum of co-occurances of the word with other words in the
121152
given text.
@@ -124,22 +155,22 @@ def get_word_degrees(self):
124155
"""
125156
return self.degree
126157

127-
def _build_frequency_dist(self, phrase_list):
158+
def _build_frequency_dist(self, phrase_list: List[Phrase]) -> None:
128159
"""Builds frequency distribution of the words in the given body of text.
129160
130161
:param phrase_list: List of List of strings where each sublist is a
131162
collection of words which form a contender phrase.
132163
"""
133164
self.frequency_dist = Counter(chain.from_iterable(phrase_list))
134165

135-
def _build_word_co_occurance_graph(self, phrase_list):
166+
def _build_word_co_occurance_graph(self, phrase_list: List[Phrase]) -> None:
136167
"""Builds the co-occurance graph of words in the given body of text to
137168
compute degree of each word.
138169
139170
:param phrase_list: List of List of strings where each sublist is a
140171
collection of words which form a contender phrase.
141172
"""
142-
co_occurance_graph = defaultdict(lambda: defaultdict(lambda: 0))
173+
co_occurance_graph: DefaultDict[Word, DefaultDict[Word, int]] = defaultdict(lambda: defaultdict(lambda: 0))
143174
for phrase in phrase_list:
144175
# For each phrase in the phrase list, count co-occurances of the
145176
# word with other words in the phrase.
@@ -152,11 +183,12 @@ def _build_word_co_occurance_graph(self, phrase_list):
152183
for key in co_occurance_graph:
153184
self.degree[key] = sum(co_occurance_graph[key].values())
154185

155-
def _build_ranklist(self, phrase_list):
186+
def _build_ranklist(self, phrase_list: List[Phrase]):
156187
"""Method to rank each contender phrase using the formula
157188
158189
phrase_score = sum of scores of words in the phrase.
159-
word_score = d(w)/f(w) where d is degree and f is frequency.
190+
word_score = d(w) or f(w) or d(w)/f(w) where d is degree
191+
and f is frequency.
160192
161193
:param phrase_list: List of List of strings where each sublist is a
162194
collection of words which form a contender phrase.
@@ -175,7 +207,7 @@ def _build_ranklist(self, phrase_list):
175207
self.rank_list.sort(reverse=True)
176208
self.ranked_phrases = [ph[1] for ph in self.rank_list]
177209

178-
def _generate_phrases(self, sentences):
210+
def _generate_phrases(self, sentences: List[Sentence]) -> List[Phrase]:
179211
"""Method to generate contender phrases given the sentences of the text
180212
document.
181213
@@ -184,14 +216,28 @@ def _generate_phrases(self, sentences):
184216
:return: Set of string tuples where each tuple is a collection
185217
of words forming a contender phrase.
186218
"""
187-
phrase_list = set()
219+
phrase_list: List[Phrase] = []
188220
# Create contender phrases from sentences.
189221
for sentence in sentences:
190-
word_list = [word.lower() for word in wordpunct_tokenize(sentence)]
191-
phrase_list.update(self._get_phrase_list_from_words(word_list))
222+
word_list: List[Word] = [word.lower() for word in wordpunct_tokenize(sentence)]
223+
phrase_list.extend(self._get_phrase_list_from_words(word_list))
224+
225+
# Based on user's choice to include or not include repeated phrases
226+
# we compute the phrase list and return it. If not including repeated
227+
phrases, we only include the first occurrence of the phrase and drop
228+
# the rest.
229+
if not self.include_repeated_phrases:
230+
unique_phrase_tracker: Set[Phrase] = set()
231+
non_repeated_phrase_list: List[Phrase] = []
232+
for phrase in phrase_list:
233+
if phrase not in unique_phrase_tracker:
234+
unique_phrase_tracker.add(phrase)
235+
non_repeated_phrase_list.append(phrase)
236+
return non_repeated_phrase_list
237+
192238
return phrase_list
193239

194-
def _get_phrase_list_from_words(self, word_list):
240+
def _get_phrase_list_from_words(self, word_list: List[Word]) -> List[Phrase]:
195241
"""Method to create contender phrases from the list of words that form
196242
a sentence by dropping stopwords and punctuations and grouping the left
197243
words into phrases. Only phrases in the given length range (both limits
@@ -210,9 +256,9 @@ def _get_phrase_list_from_words(self, word_list):
210256
211257
:param word_list: List of words which form a sentence when joined in
212258
the same order.
213-
:return: List of contender phrases that are formed after dropping
214-
stopwords and punctuations.
259+
:return: List of contender phrases honouring phrase length requirements
260+
that are formed after dropping stopwords and punctuations.
215261
"""
216262
groups = groupby(word_list, lambda x: x not in self.to_ignore)
217-
phrases = [tuple(group[1]) for group in groups if group[0]]
263+
phrases: List[Phrase] = [tuple(group[1]) for group in groups if group[0]]
218264
return list(filter(lambda x: self.min_length <= len(x) <= self.max_length, phrases))

setup.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#!/usr/bin/env python
22
from os import path
3+
from typing import Dict
34

45
from setuptools import setup
56
from setuptools.command.develop import develop
@@ -37,7 +38,7 @@ def run(self):
3738

3839

3940
# Get package and author details.
40-
about = {}
41+
about: Dict[str, str] = {}
4142
with open(path.join(here, 'rake_nltk', '__version__.py')) as f:
4243
exec(f.read(), about)
4344

0 commit comments

Comments
 (0)