5
5
< head >
6
6
< meta charset ="utf-8 " />
7
7
< meta name ="viewport " content ="width=device-width, initial-scale=1.0 " />
8
- < title > rake_nltk.rake — rake-nltk 1.0.5 documentation</ title >
8
+ < title > rake_nltk.rake — rake-nltk 1.0.6 documentation</ title >
9
9
< link rel ="stylesheet " type ="text/css " href ="../../_static/pygments.css " />
10
10
< link rel ="stylesheet " type ="text/css " href ="../../_static/alabaster.css " />
11
11
< script data-url_root ="../../ " id ="documentation_options " src ="../../_static/documentation_options.js "> </ script >
@@ -78,10 +78,9 @@ <h1>Source code for rake_nltk.rake</h1><div class="highlight"><pre>
78
78
< span class ="kn "> from</ span > < span class ="nn "> collections</ span > < span class ="kn "> import</ span > < span class ="n "> Counter</ span > < span class ="p "> ,</ span > < span class ="n "> defaultdict</ span >
79
79
< span class ="kn "> from</ span > < span class ="nn "> enum</ span > < span class ="kn "> import</ span > < span class ="n "> Enum</ span >
80
80
< span class ="kn "> from</ span > < span class ="nn "> itertools</ span > < span class ="kn "> import</ span > < span class ="n "> chain</ span > < span class ="p "> ,</ span > < span class ="n "> groupby</ span > < span class ="p "> ,</ span > < span class ="n "> product</ span >
81
- < span class ="kn "> from</ span > < span class ="nn "> typing</ span > < span class ="kn "> import</ span > < span class ="n "> DefaultDict</ span > < span class ="p "> ,</ span > < span class ="n "> Dict</ span > < span class ="p "> ,</ span > < span class ="n "> List</ span > < span class ="p "> ,</ span > < span class ="n "> Optional</ span > < span class ="p "> ,</ span > < span class ="n "> Set</ span > < span class ="p "> ,</ span > < span class ="n "> Tuple</ span >
81
+ < span class ="kn "> from</ span > < span class ="nn "> typing</ span > < span class ="kn "> import</ span > < span class ="n "> Callable </ span > < span class =" p " > , </ span > < span class =" n " > DefaultDict</ span > < span class ="p "> ,</ span > < span class ="n "> Dict</ span > < span class ="p "> ,</ span > < span class ="n "> List</ span > < span class ="p "> ,</ span > < span class ="n "> Optional</ span > < span class ="p "> ,</ span > < span class ="n "> Set</ span > < span class ="p "> ,</ span > < span class ="n "> Tuple</ span >
82
82
83
83
< span class ="kn "> import</ span > < span class ="nn "> nltk</ span >
84
- < span class ="kn "> from</ span > < span class ="nn "> nltk.tokenize</ span > < span class ="kn "> import</ span > < span class ="n "> wordpunct_tokenize</ span >
85
84
86
85
< span class ="c1 "> # Readability type definitions.</ span >
87
86
< span class ="n "> Word</ span > < span class ="o "> =</ span > < span class ="nb "> str</ span >
@@ -109,6 +108,8 @@ <h1>Source code for rake_nltk.rake</h1><div class="highlight"><pre>
109
108
< span class ="n "> max_length</ span > < span class ="p "> :</ span > < span class ="nb "> int</ span > < span class ="o "> =</ span > < span class ="mi "> 100000</ span > < span class ="p "> ,</ span >
110
109
< span class ="n "> min_length</ span > < span class ="p "> :</ span > < span class ="nb "> int</ span > < span class ="o "> =</ span > < span class ="mi "> 1</ span > < span class ="p "> ,</ span >
111
110
< span class ="n "> include_repeated_phrases</ span > < span class ="p "> :</ span > < span class ="nb "> bool</ span > < span class ="o "> =</ span > < span class ="kc "> True</ span > < span class ="p "> ,</ span >
111
+ < span class ="n "> sentence_tokenizer</ span > < span class ="p "> :</ span > < span class ="n "> Optional</ span > < span class ="p "> [</ span > < span class ="n "> Callable</ span > < span class ="p "> [[</ span > < span class ="nb "> str</ span > < span class ="p "> ],</ span > < span class ="n "> List</ span > < span class ="p "> [</ span > < span class ="nb "> str</ span > < span class ="p "> ]]]</ span > < span class ="o "> =</ span > < span class ="kc "> None</ span > < span class ="p "> ,</ span >
112
+ < span class ="n "> word_tokenizer</ span > < span class ="p "> :</ span > < span class ="n "> Optional</ span > < span class ="p "> [</ span > < span class ="n "> Callable</ span > < span class ="p "> [[</ span > < span class ="nb "> str</ span > < span class ="p "> ],</ span > < span class ="n "> List</ span > < span class ="p "> [</ span > < span class ="nb "> str</ span > < span class ="p "> ]]]</ span > < span class ="o "> =</ span > < span class ="kc "> None</ span > < span class ="p "> ,</ span >
112
113
< span class ="p "> ):</ span >
113
114
< span class ="sd "> """Constructor.</ span >
114
115
@@ -135,7 +136,8 @@ <h1>Source code for rake_nltk.rake</h1><div class="highlight"><pre>
135
136
< span class ="sd "> (magic, systems), (company,),</ span >
136
137
< span class ="sd "> (founded,), (raul,)</ span >
137
138
< span class ="sd "> ]</ span >
138
-
139
+ < span class ="sd "> :param sentence_tokenizer: Tokenizer used to tokenize the text string into sentences.</ span >
140
+ < span class ="sd "> :param word_tokenizer: Tokenizer used to tokenize the sentence string into words.</ span >
139
141
< span class ="sd "> """</ span >
140
142
< span class ="c1 "> # By default use degree to frequency ratio as the metric.</ span >
141
143
< span class ="k "> if</ span > < span class ="nb "> isinstance</ span > < span class ="p "> (</ span > < span class ="n "> ranking_metric</ span > < span class ="p "> ,</ span > < span class ="n "> Metric</ span > < span class ="p "> ):</ span >
@@ -167,6 +169,18 @@ <h1>Source code for rake_nltk.rake</h1><div class="highlight"><pre>
167
169
< span class ="c1 "> # Whether we should include repeated phreases in the computation or not.</ span >
168
170
< span class ="bp "> self</ span > < span class ="o "> .</ span > < span class ="n "> include_repeated_phrases</ span > < span class ="p "> :</ span > < span class ="nb "> bool</ span > < span class ="o "> =</ span > < span class ="n "> include_repeated_phrases</ span >
169
171
172
+ < span class ="c1 "> # Tokenizers.</ span >
173
+ < span class ="bp "> self</ span > < span class ="o "> .</ span > < span class ="n "> sentence_tokenizer</ span > < span class ="p "> :</ span > < span class ="n "> Callable</ span > < span class ="p "> [[</ span > < span class ="nb "> str</ span > < span class ="p "> ],</ span > < span class ="n "> List</ span > < span class ="p "> [</ span > < span class ="nb "> str</ span > < span class ="p "> ]]</ span >
174
+ < span class ="k "> if</ span > < span class ="n "> sentence_tokenizer</ span > < span class ="p "> :</ span >
175
+ < span class ="bp "> self</ span > < span class ="o "> .</ span > < span class ="n "> sentence_tokenizer</ span > < span class ="o "> =</ span > < span class ="n "> sentence_tokenizer</ span >
176
+ < span class ="k "> else</ span > < span class ="p "> :</ span >
177
+ < span class ="bp "> self</ span > < span class ="o "> .</ span > < span class ="n "> sentence_tokenizer</ span > < span class ="o "> =</ span > < span class ="n "> nltk</ span > < span class ="o "> .</ span > < span class ="n "> tokenize</ span > < span class ="o "> .</ span > < span class ="n "> sent_tokenize</ span >
178
+ < span class ="bp "> self</ span > < span class ="o "> .</ span > < span class ="n "> word_tokenizer</ span > < span class ="p "> :</ span > < span class ="n "> Callable</ span > < span class ="p "> [[</ span > < span class ="nb "> str</ span > < span class ="p "> ],</ span > < span class ="n "> List</ span > < span class ="p "> [</ span > < span class ="nb "> str</ span > < span class ="p "> ]]</ span >
179
+ < span class ="k "> if</ span > < span class ="n "> word_tokenizer</ span > < span class ="p "> :</ span >
180
+ < span class ="bp "> self</ span > < span class ="o "> .</ span > < span class ="n "> word_tokenizer</ span > < span class ="o "> =</ span > < span class ="n "> word_tokenizer</ span >
181
+ < span class ="k "> else</ span > < span class ="p "> :</ span >
182
+ < span class ="bp "> self</ span > < span class ="o "> .</ span > < span class ="n "> word_tokenizer</ span > < span class ="o "> =</ span > < span class ="n "> nltk</ span > < span class ="o "> .</ span > < span class ="n "> tokenize</ span > < span class ="o "> .</ span > < span class ="n "> wordpunct_tokenize</ span >
183
+
170
184
< span class ="c1 "> # Stuff to be extracted from the provided text.</ span >
171
185
< span class ="bp "> self</ span > < span class ="o "> .</ span > < span class ="n "> frequency_dist</ span > < span class ="p "> :</ span > < span class ="n "> Dict</ span > < span class ="p "> [</ span > < span class ="n "> Word</ span > < span class ="p "> ,</ span > < span class ="nb "> int</ span > < span class ="p "> ]</ span >
172
186
< span class ="bp "> self</ span > < span class ="o "> .</ span > < span class ="n "> degree</ span > < span class ="p "> :</ span > < span class ="n "> Dict</ span > < span class ="p "> [</ span > < span class ="n "> Word</ span > < span class ="p "> ,</ span > < span class ="nb "> int</ span > < span class ="p "> ]</ span >
@@ -178,7 +192,7 @@ <h1>Source code for rake_nltk.rake</h1><div class="highlight"><pre>
178
192
179
193
< span class ="sd "> :param text: Text to extract keywords from, provided as a string.</ span >
180
194
< span class ="sd "> """</ span >
181
- < span class ="n "> sentences</ span > < span class ="p "> :</ span > < span class ="n "> List</ span > < span class ="p "> [</ span > < span class ="n "> Sentence</ span > < span class ="p "> ]</ span > < span class ="o "> =</ span > < span class ="n " > nltk </ span > < span class ="o "> .</ span > < span class ="n "> tokenize </ span > < span class =" o " > . </ span > < span class =" n " > sent_tokenize </ span > < span class ="p "> (</ span > < span class ="n "> text</ span > < span class ="p "> )</ span >
195
+ < span class ="n "> sentences</ span > < span class ="p "> :</ span > < span class ="n "> List</ span > < span class ="p "> [</ span > < span class ="n "> Sentence</ span > < span class ="p "> ]</ span > < span class ="o "> =</ span > < span class ="bp " > self </ span > < span class ="o "> .</ span > < span class ="n "> _tokenize_text_to_sentences </ span > < span class ="p "> (</ span > < span class ="n "> text</ span > < span class ="p "> )</ span >
182
196
< span class ="bp "> self</ span > < span class ="o "> .</ span > < span class ="n "> extract_keywords_from_sentences</ span > < span class ="p "> (</ span > < span class ="n "> sentences</ span > < span class ="p "> )</ span > </ div >
183
197
184
198
< div class ="viewcode-block " id ="Rake.extract_keywords_from_sentences "> < a class ="viewcode-back " href ="../../api.html#rake_nltk.Rake.extract_keywords_from_sentences "> [docs]</ a > < span class ="k "> def</ span > < span class ="nf "> extract_keywords_from_sentences</ span > < span class ="p "> (</ span > < span class ="bp "> self</ span > < span class ="p "> ,</ span > < span class ="n "> sentences</ span > < span class ="p "> :</ span > < span class ="n "> List</ span > < span class ="p "> [</ span > < span class ="n "> Sentence</ span > < span class ="p "> ]):</ span >
@@ -224,6 +238,26 @@ <h1>Source code for rake_nltk.rake</h1><div class="highlight"><pre>
224
238
< span class ="sd "> """</ span >
225
239
< span class ="k "> return</ span > < span class ="bp "> self</ span > < span class ="o "> .</ span > < span class ="n "> degree</ span > </ div >
226
240
241
+ < span class ="k "> def</ span > < span class ="nf "> _tokenize_text_to_sentences</ span > < span class ="p "> (</ span > < span class ="bp "> self</ span > < span class ="p "> ,</ span > < span class ="n "> text</ span > < span class ="p "> :</ span > < span class ="nb "> str</ span > < span class ="p "> )</ span > < span class ="o "> -></ span > < span class ="n "> List</ span > < span class ="p "> [</ span > < span class ="n "> Sentence</ span > < span class ="p "> ]:</ span >
242
+ < span class ="sd "> """Tokenizes the given text string into sentences using the configured</ span >
243
+ < span class ="sd "> sentence tokenizer. Configuration uses `nltk.tokenize.sent_tokenize`</ span >
244
+ < span class ="sd "> by default.</ span >
245
+
246
+ < span class ="sd "> :param text: String text to tokenize into sentences.</ span >
247
+ < span class ="sd "> :return: List of sentences as per the tokenizer used.</ span >
248
+ < span class ="sd "> """</ span >
249
+ < span class ="k "> return</ span > < span class ="bp "> self</ span > < span class ="o "> .</ span > < span class ="n "> sentence_tokenizer</ span > < span class ="p "> (</ span > < span class ="n "> text</ span > < span class ="p "> )</ span >
250
+
251
+ < span class ="k "> def</ span > < span class ="nf "> _tokenize_sentence_to_words</ span > < span class ="p "> (</ span > < span class ="bp "> self</ span > < span class ="p "> ,</ span > < span class ="n "> sentence</ span > < span class ="p "> :</ span > < span class ="n "> Sentence</ span > < span class ="p "> )</ span > < span class ="o "> -></ span > < span class ="n "> List</ span > < span class ="p "> [</ span > < span class ="n "> Word</ span > < span class ="p "> ]:</ span >
252
+ < span class ="sd "> """Tokenizes the given sentence string into words using the configured</ span >
253
+ < span class ="sd "> word tokenizer. Configuration uses `nltk.tokenize.wordpunct_tokenize`</ span >
254
+ < span class ="sd "> by default.</ span >
255
+
256
+ < span class ="sd "> :param sentence: String sentence to tokenize into words.</ span >
257
+ < span class ="sd "> :return: List of words as per the tokenizer used.</ span >
258
+ < span class ="sd "> """</ span >
259
+ < span class ="k "> return</ span > < span class ="bp "> self</ span > < span class ="o "> .</ span > < span class ="n "> word_tokenizer</ span > < span class ="p "> (</ span > < span class ="n "> sentence</ span > < span class ="p "> )</ span >
260
+
227
261
< span class ="k "> def</ span > < span class ="nf "> _build_frequency_dist</ span > < span class ="p "> (</ span > < span class ="bp "> self</ span > < span class ="p "> ,</ span > < span class ="n "> phrase_list</ span > < span class ="p "> :</ span > < span class ="n "> List</ span > < span class ="p "> [</ span > < span class ="n "> Phrase</ span > < span class ="p "> ])</ span > < span class ="o "> -></ span > < span class ="kc "> None</ span > < span class ="p "> :</ span >
228
262
< span class ="sd "> """Builds frequency distribution of the words in the given body of text.</ span >
229
263
@@ -288,7 +322,7 @@ <h1>Source code for rake_nltk.rake</h1><div class="highlight"><pre>
288
322
< span class ="n "> phrase_list</ span > < span class ="p "> :</ span > < span class ="n "> List</ span > < span class ="p "> [</ span > < span class ="n "> Phrase</ span > < span class ="p "> ]</ span > < span class ="o "> =</ span > < span class ="p "> []</ span >
289
323
< span class ="c1 "> # Create contender phrases from sentences.</ span >
290
324
< span class ="k "> for</ span > < span class ="n "> sentence</ span > < span class ="ow "> in</ span > < span class ="n "> sentences</ span > < span class ="p "> :</ span >
291
- < span class ="n "> word_list</ span > < span class ="p "> :</ span > < span class ="n "> List</ span > < span class ="p "> [</ span > < span class ="n "> Word</ span > < span class ="p "> ]</ span > < span class ="o "> =</ span > < span class ="p "> [</ span > < span class ="n "> word</ span > < span class ="o "> .</ span > < span class ="n "> lower</ span > < span class ="p "> ()</ span > < span class ="k "> for</ span > < span class ="n "> word</ span > < span class ="ow "> in</ span > < span class ="n "> wordpunct_tokenize </ span > < span class ="p "> (</ span > < span class ="n "> sentence</ span > < span class ="p "> )]</ span >
325
+ < span class ="n "> word_list</ span > < span class ="p "> :</ span > < span class ="n "> List</ span > < span class ="p "> [</ span > < span class ="n "> Word</ span > < span class ="p "> ]</ span > < span class ="o "> =</ span > < span class ="p "> [</ span > < span class ="n "> word</ span > < span class ="o "> .</ span > < span class ="n "> lower</ span > < span class ="p "> ()</ span > < span class ="k "> for</ span > < span class ="n "> word</ span > < span class ="ow "> in</ span > < span class ="bp " > self </ span > < span class =" o " > . </ span > < span class =" n "> _tokenize_sentence_to_words </ span > < span class ="p "> (</ span > < span class ="n "> sentence</ span > < span class ="p "> )]</ span >
292
326
< span class ="n "> phrase_list</ span > < span class ="o "> .</ span > < span class ="n "> extend</ span > < span class ="p "> (</ span > < span class ="bp "> self</ span > < span class ="o "> .</ span > < span class ="n "> _get_phrase_list_from_words</ span > < span class ="p "> (</ span > < span class ="n "> word_list</ span > < span class ="p "> ))</ span >
293
327
294
328
< span class ="c1 "> # Based on user's choice to include or not include repeated phrases</ span >
0 commit comments