from collections import Counter, defaultdict
from enum import Enum
from itertools import chain, groupby, product
+from typing import DefaultDict, Dict, List, Optional, Set, Tuple

import nltk
from nltk.tokenize import wordpunct_tokenize

+# Readability type definitions.
+Word = str
+Sentence = str
+Phrase = Tuple[str, ...]
+

class Metric(Enum):
    """Different metrics that can be used for ranking."""
@@ -22,27 +28,45 @@ class Metric(Enum):
    WORD_FREQUENCY = 2  # Uses f(w) alone as the metric


-class Rake(object):
+class Rake:
    """Rapid Automatic Keyword Extraction Algorithm."""

    def __init__(
        self,
-        stopwords=None,
-        punctuations=None,
-        language='english',
-        ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO,
-        max_length=100000,
-        min_length=1,
+        stopwords: Optional[Set[str]] = None,
+        punctuations: Optional[Set[str]] = None,
+        language: str = 'english',
+        ranking_metric: Metric = Metric.DEGREE_TO_FREQUENCY_RATIO,
+        max_length: int = 100000,
+        min_length: int = 1,
+        include_repeated_phrases: bool = True,
    ):
        """Constructor.

-        :param stopwords: List of Words to be ignored for keyword extraction.
+        :param stopwords: Words to be ignored for keyword extraction.
        :param punctuations: Punctuations to be ignored for keyword extraction.
-        :param language: Language to be used for stopwords
+        :param language: Language to be used for stopwords.
        :param max_length: Maximum limit on the number of words in a phrase
                           (Inclusive. Defaults to 100000)
        :param min_length: Minimum limit on the number of words in a phrase
                           (Inclusive. Defaults to 1)
+        :param include_repeated_phrases: If phrases repeat in the phrase list,
+                consider them as is, without dropping any of them from future
+                calculations. (Defaults to True) Ex: "Magic systems is
+                a company. Magic systems was founded by Raul".
+
+                If repeated phrases are allowed, the phrase list would be
+                [
+                    (magic, systems), (company,), (magic, systems),
+                    (founded,), (raul,)
+                ]
+
+                If they aren't allowed, the phrase list would be
+                [
+                    (magic, systems), (company,),
+                    (founded,), (raul,)
+                ]
+
        """
        # By default use degree to frequency ratio as the metric.
        if isinstance(ranking_metric, Metric):
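
A minimal usage sketch of the new include_repeated_phrases flag, not part of this diff; it assumes the class is importable as rake_nltk.Rake and that the required NLTK data (stopwords, punkt) is already installed:

from rake_nltk import Rake

text = "Magic systems is a company. Magic systems was founded by Raul."

# Default behaviour: the repeated (magic, systems) phrase is kept and counted twice.
r = Rake(include_repeated_phrases=True)
r.extract_keywords_from_text(text)
print(r.get_ranked_phrases_with_scores())

# With the flag off, only the first occurrence of each phrase is used.
r = Rake(include_repeated_phrases=False)
r.extract_keywords_from_text(text)
print(r.get_ranked_phrases_with_scores())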
@@ -51,71 +75,78 @@ def __init__(
            self.metric = Metric.DEGREE_TO_FREQUENCY_RATIO

        # If stopwords not provided we use language stopwords by default.
-        self.stopwords = stopwords
-        if self.stopwords is None:
-            self.stopwords = nltk.corpus.stopwords.words(language)
+        self.stopwords: Set[str]
+        if stopwords:
+            self.stopwords = stopwords
+        else:
+            self.stopwords = set(nltk.corpus.stopwords.words(language))

        # If punctuations are not provided we ignore all punctuation symbols.
-        self.punctuations = punctuations
-        if self.punctuations is None:
-            self.punctuations = string.punctuation
+        self.punctuations: Set[str]
+        if punctuations:
+            self.punctuations = punctuations
+        else:
+            self.punctuations = set(string.punctuation)

        # All things which act as sentence breaks during keyword extraction.
-        self.to_ignore = set(chain(self.stopwords, self.punctuations))
+        self.to_ignore: Set[str] = set(chain(self.stopwords, self.punctuations))

        # Assign min or max length to the attributes
-        self.min_length = min_length
-        self.max_length = max_length
+        self.min_length: int = min_length
+        self.max_length: int = max_length
+
+        # Whether we should include repeated phrases in the computation or not.
+        self.include_repeated_phrases: bool = include_repeated_phrases

        # Stuff to be extracted from the provided text.
-        self.frequency_dist = None
-        self.degree = None
-        self.rank_list = None
-        self.ranked_phrases = None
+        self.frequency_dist: Dict[Word, int]
+        self.degree: Dict[Word, int]
+        self.rank_list: List[Tuple[float, Sentence]]
+        self.ranked_phrases: List[Sentence]

-    def extract_keywords_from_text(self, text):
+    def extract_keywords_from_text(self, text: str):
        """Method to extract keywords from the text provided.

        :param text: Text to extract keywords from, provided as a string.
        """
-        sentences = nltk.tokenize.sent_tokenize(text)
+        sentences: List[Sentence] = nltk.tokenize.sent_tokenize(text)
        self.extract_keywords_from_sentences(sentences)

-    def extract_keywords_from_sentences(self, sentences):
+    def extract_keywords_from_sentences(self, sentences: List[Sentence]):
        """Method to extract keywords from the list of sentences provided.

        :param sentences: Text to extract keywords from, provided as a list
                          of strings, where each string is a sentence.
        """
-        phrase_list = self._generate_phrases(sentences)
+        phrase_list: List[Phrase] = self._generate_phrases(sentences)
        self._build_frequency_dist(phrase_list)
        self._build_word_co_occurance_graph(phrase_list)
        self._build_ranklist(phrase_list)

-    def get_ranked_phrases(self):
+    def get_ranked_phrases(self) -> List[Sentence]:
        """Method to fetch ranked keyword strings.

        :return: List of strings where each string represents an extracted
                 keyword string.
        """
        return self.ranked_phrases

-    def get_ranked_phrases_with_scores(self):
+    def get_ranked_phrases_with_scores(self) -> List[Tuple[float, Sentence]]:
        """Method to fetch ranked keyword strings along with their scores.

        :return: List of tuples where each tuple is formed of an extracted
                 keyword string and its score. Ex: (5.68, 'Four Scoures')
        """
        return self.rank_list

-    def get_word_frequency_distribution(self):
+    def get_word_frequency_distribution(self) -> Dict[Word, int]:
        """Method to fetch the word frequency distribution in the given text.

        :return: Dictionary (defaultdict) of the format `word -> frequency`.
        """
        return self.frequency_dist

-    def get_word_degrees(self):
+    def get_word_degrees(self) -> Dict[Word, int]:
        """Method to fetch the degree of words in the given text. Degree can be
        defined as sum of co-occurances of the word with other words in the
        given text.
@@ -124,22 +155,22 @@ def get_word_degrees(self):
        """
        return self.degree

-    def _build_frequency_dist(self, phrase_list):
+    def _build_frequency_dist(self, phrase_list: List[Phrase]) -> None:
        """Builds frequency distribution of the words in the given body of text.

        :param phrase_list: List of List of strings where each sublist is a
                            collection of words which form a contender phrase.
        """
        self.frequency_dist = Counter(chain.from_iterable(phrase_list))
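
As a quick illustration of what the Counter(chain.from_iterable(...)) call above produces, here is a standalone sketch with a made-up phrase list (values are illustrative only):

from collections import Counter
from itertools import chain

phrase_list = [("magic", "systems"), ("company",), ("magic", "systems"), ("founded",), ("raul",)]
print(Counter(chain.from_iterable(phrase_list)))
# Counter({'magic': 2, 'systems': 2, 'company': 1, 'founded': 1, 'raul': 1})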

-    def _build_word_co_occurance_graph(self, phrase_list):
+    def _build_word_co_occurance_graph(self, phrase_list: List[Phrase]) -> None:
        """Builds the co-occurance graph of words in the given body of text to
        compute degree of each word.

        :param phrase_list: List of List of strings where each sublist is a
                            collection of words which form a contender phrase.
        """
-        co_occurance_graph = defaultdict(lambda: defaultdict(lambda: 0))
+        co_occurance_graph: DefaultDict[Word, DefaultDict[Word, int]] = defaultdict(lambda: defaultdict(lambda: 0))
        for phrase in phrase_list:
            # For each phrase in the phrase list, count co-occurances of the
            # word with other words in the phrase.
@@ -152,11 +183,12 @@ def _build_word_co_occurance_graph(self, phrase_list):
        for key in co_occurance_graph:
            self.degree[key] = sum(co_occurance_graph[key].values())
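
The hunk above elides the inner pair loop, but a self-contained sketch of how such a co-occurrence graph and the word degrees can be built (assuming word pairs come from itertools.product, which the module imports; the phrase list is made up) would be:

from collections import defaultdict
from itertools import product

phrase_list = [("magic", "systems"), ("company",)]
graph = defaultdict(lambda: defaultdict(lambda: 0))
for phrase in phrase_list:
    # Every word in a phrase co-occurs with every word in that phrase, itself included.
    for word, coword in product(phrase, phrase):
        graph[word][coword] += 1

degree = {word: sum(cowords.values()) for word, cowords in graph.items()}
print(degree)  # {'magic': 2, 'systems': 2, 'company': 1}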

-    def _build_ranklist(self, phrase_list):
+    def _build_ranklist(self, phrase_list: List[Phrase]):
        """Method to rank each contender phrase using the formula

              phrase_score = sum of scores of words in the phrase.
-             word_score = d(w)/f(w) where d is degree and f is frequency.
+             word_score = d(w) or f(w) or d(w)/f(w) where d is degree
+                          and f is frequency.

        :param phrase_list: List of List of strings where each sublist is a
                            collection of words which form a contender phrase.
@@ -175,7 +207,7 @@ def _build_ranklist(self, phrase_list):
        self.rank_list.sort(reverse=True)
        self.ranked_phrases = [ph[1] for ph in self.rank_list]
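
A small worked example of the ranking formula in the docstring above; this is a sketch, not the library's code. With f(w) and d(w) taken as given (hypothetical values) and the default DEGREE_TO_FREQUENCY_RATIO metric, each phrase scores the sum of d(w)/f(w) over its words:

freq = {"magic": 2, "systems": 2, "company": 1}    # f(w)
degree = {"magic": 4, "systems": 4, "company": 1}  # d(w)
phrases = [("magic", "systems"), ("company",)]

rank_list = sorted(
    ((sum(degree[w] / freq[w] for w in p), " ".join(p)) for p in phrases),
    reverse=True,
)
print(rank_list)  # [(4.0, 'magic systems'), (1.0, 'company')]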

-    def _generate_phrases(self, sentences):
+    def _generate_phrases(self, sentences: List[Sentence]) -> List[Phrase]:
        """Method to generate contender phrases given the sentences of the text
        document.

@@ -184,14 +216,28 @@ def _generate_phrases(self, sentences):
        :return: Set of string tuples where each tuple is a collection
                 of words forming a contender phrase.
        """
-        phrase_list = set()
+        phrase_list: List[Phrase] = []
        # Create contender phrases from sentences.
        for sentence in sentences:
-            word_list = [word.lower() for word in wordpunct_tokenize(sentence)]
-            phrase_list.update(self._get_phrase_list_from_words(word_list))
+            word_list: List[Word] = [word.lower() for word in wordpunct_tokenize(sentence)]
+            phrase_list.extend(self._get_phrase_list_from_words(word_list))
+
+        # Based on the user's choice to include or not include repeated phrases,
+        # we compute the phrase list and return it. If not including repeated
+        # phrases, we only include the first occurrence of each phrase and drop
+        # the rest.
+        if not self.include_repeated_phrases:
+            unique_phrase_tracker: Set[Phrase] = set()
+            non_repeated_phrase_list: List[Phrase] = []
+            for phrase in phrase_list:
+                if phrase not in unique_phrase_tracker:
+                    unique_phrase_tracker.add(phrase)
+                    non_repeated_phrase_list.append(phrase)
+            return non_repeated_phrase_list
+
        return phrase_list
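
For context on the tokenization step above, a quick sketch of what wordpunct_tokenize plus lowercasing yields for one sentence (assumes NLTK is installed; the sentence is just an example):

from nltk.tokenize import wordpunct_tokenize

sentence = "Magic systems was founded by Raul."
word_list = [word.lower() for word in wordpunct_tokenize(sentence)]
print(word_list)  # ['magic', 'systems', 'was', 'founded', 'by', 'raul', '.']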

-    def _get_phrase_list_from_words(self, word_list):
+    def _get_phrase_list_from_words(self, word_list: List[Word]) -> List[Phrase]:
        """Method to create contender phrases from the list of words that form
        a sentence by dropping stopwords and punctuations and grouping the left
        words into phrases. Only phrases in the given length range (both limits
@@ -210,9 +256,9 @@ def _get_phrase_list_from_words(self, word_list):

        :param word_list: List of words which form a sentence when joined in
                          the same order.
-        :return: List of contender phrases that are formed after dropping
-                 stopwords and punctuations.
+        :return: List of contender phrases honouring phrase length requirements
+                 that are formed after dropping stopwords and punctuations.
        """
        groups = groupby(word_list, lambda x: x not in self.to_ignore)
-        phrases = [tuple(group[1]) for group in groups if group[0]]
+        phrases: List[Phrase] = [tuple(group[1]) for group in groups if group[0]]
        return list(filter(lambda x: self.min_length <= len(x) <= self.max_length, phrases))
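
For reference, a self-contained illustration of the groupby idiom used in _get_phrase_list_from_words: consecutive words that are neither stopwords nor punctuation form one candidate phrase (the to_ignore set here is a tiny example, not the real stopword list):

from itertools import groupby

to_ignore = {"is", "a", "was", "by", "."}
word_list = ["magic", "systems", "is", "a", "company", "."]

groups = groupby(word_list, lambda x: x not in to_ignore)
phrases = [tuple(group) for keep, group in groups if keep]
print(phrases)  # [('magic', 'systems'), ('company',)]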