Skip to content

Commit a1aaaef

Browse files
committed
Add changes to allow for custom tokenizers.
1 parent fd6e5ca commit a1aaaef

28 files changed

+321
-26
lines changed

CHANGELOG.rst

+4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
Release History
22
===============
33

4+
v1.0.6
5+
------
6+
* Allowing usage of custom word and sentence tokenizers.
7+
48
v1.0.5
59
------
610
* Adding Python typing for better/clearer interfaces.

README.md

+6
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@ RAKE short for Rapid Automatic Keyword Extraction algorithm, is a domain indepen
1010

1111
![Demo](http://i.imgur.com/wVOzU7y.gif)
1212

13+
## Features
14+
15+
* Ridiculously simple interface.
16+
* Configurable word and sentence tokenizers, language-based stop words, etc.
17+
* Configurable ranking metric.
18+
1319
## Setup
1420

1521
### Using pip

README.rst

+7
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,13 @@ and its co-occurance with other words in the text.
1010

1111
|Demo|
1212

13+
Features
14+
--------
15+
16+
* Ridiculously simple interface.
17+
* Configurable word and sentence tokenizers, language-based stop words, etc.
18+
* Configurable ranking metric.
19+
1320
Setup
1421
-----
1522

docs/_build/html/.buildinfo

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# Sphinx build info version 1
22
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3-
config: 8552c64675e11083f929cafe599db8ba
3+
config: bf6aed3296a8c66404883d828afe2007
44
tags: 645f666f9bcd5a90fca523b33c5a78b7
2.56 KB
Binary file not shown.
5.4 KB
Binary file not shown.
2.95 KB
Binary file not shown.
1.12 KB
Binary file not shown.

docs/_build/html/_modules/index.html

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<head>
66
<meta charset="utf-8" />
77
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
8-
<title>Overview: module code &#8212; rake-nltk 1.0.5 documentation</title>
8+
<title>Overview: module code &#8212; rake-nltk 1.0.6 documentation</title>
99
<link rel="stylesheet" type="text/css" href="../_static/pygments.css" />
1010
<link rel="stylesheet" type="text/css" href="../_static/alabaster.css" />
1111
<script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>

docs/_build/html/_modules/rake_nltk/rake.html

+40-6
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<head>
66
<meta charset="utf-8" />
77
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
8-
<title>rake_nltk.rake &#8212; rake-nltk 1.0.5 documentation</title>
8+
<title>rake_nltk.rake &#8212; rake-nltk 1.0.6 documentation</title>
99
<link rel="stylesheet" type="text/css" href="../../_static/pygments.css" />
1010
<link rel="stylesheet" type="text/css" href="../../_static/alabaster.css" />
1111
<script data-url_root="../../" id="documentation_options" src="../../_static/documentation_options.js"></script>
@@ -78,10 +78,9 @@ <h1>Source code for rake_nltk.rake</h1><div class="highlight"><pre>
7878
<span class="kn">from</span> <span class="nn">collections</span> <span class="kn">import</span> <span class="n">Counter</span><span class="p">,</span> <span class="n">defaultdict</span>
7979
<span class="kn">from</span> <span class="nn">enum</span> <span class="kn">import</span> <span class="n">Enum</span>
8080
<span class="kn">from</span> <span class="nn">itertools</span> <span class="kn">import</span> <span class="n">chain</span><span class="p">,</span> <span class="n">groupby</span><span class="p">,</span> <span class="n">product</span>
81-
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">DefaultDict</span><span class="p">,</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Set</span><span class="p">,</span> <span class="n">Tuple</span>
81+
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Callable</span><span class="p">,</span> <span class="n">DefaultDict</span><span class="p">,</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Set</span><span class="p">,</span> <span class="n">Tuple</span>
8282

8383
<span class="kn">import</span> <span class="nn">nltk</span>
84-
<span class="kn">from</span> <span class="nn">nltk.tokenize</span> <span class="kn">import</span> <span class="n">wordpunct_tokenize</span>
8584

8685
<span class="c1"># Readability type definitions.</span>
8786
<span class="n">Word</span> <span class="o">=</span> <span class="nb">str</span>
@@ -109,6 +108,8 @@ <h1>Source code for rake_nltk.rake</h1><div class="highlight"><pre>
109108
<span class="n">max_length</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">100000</span><span class="p">,</span>
110109
<span class="n">min_length</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
111110
<span class="n">include_repeated_phrases</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
111+
<span class="n">sentence_tokenizer</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="nb">str</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
112+
<span class="n">word_tokenizer</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="nb">str</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
112113
<span class="p">):</span>
113114
<span class="sd">&quot;&quot;&quot;Constructor.</span>
114115

@@ -135,7 +136,8 @@ <h1>Source code for rake_nltk.rake</h1><div class="highlight"><pre>
135136
<span class="sd"> (magic, systems), (company,),</span>
136137
<span class="sd"> (founded,), (raul,)</span>
137138
<span class="sd"> ]</span>
138-
139+
<span class="sd"> :param sentence_tokenizer: Tokenizer used to tokenize the text string into sentences.</span>
140+
<span class="sd"> :param word_tokenizer: Tokenizer used to tokenize the sentence string into words.</span>
139141
<span class="sd"> &quot;&quot;&quot;</span>
140142
<span class="c1"># By default use degree to frequency ratio as the metric.</span>
141143
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ranking_metric</span><span class="p">,</span> <span class="n">Metric</span><span class="p">):</span>
@@ -167,6 +169,18 @@ <h1>Source code for rake_nltk.rake</h1><div class="highlight"><pre>
167169
<span class="c1"># Whether we should include repeated phrases in the computation or not.</span>
168170
<span class="bp">self</span><span class="o">.</span><span class="n">include_repeated_phrases</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">include_repeated_phrases</span>
169171

172+
<span class="c1"># Tokenizers.</span>
173+
<span class="bp">self</span><span class="o">.</span><span class="n">sentence_tokenizer</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="nb">str</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span>
174+
<span class="k">if</span> <span class="n">sentence_tokenizer</span><span class="p">:</span>
175+
<span class="bp">self</span><span class="o">.</span><span class="n">sentence_tokenizer</span> <span class="o">=</span> <span class="n">sentence_tokenizer</span>
176+
<span class="k">else</span><span class="p">:</span>
177+
<span class="bp">self</span><span class="o">.</span><span class="n">sentence_tokenizer</span> <span class="o">=</span> <span class="n">nltk</span><span class="o">.</span><span class="n">tokenize</span><span class="o">.</span><span class="n">sent_tokenize</span>
178+
<span class="bp">self</span><span class="o">.</span><span class="n">word_tokenizer</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="nb">str</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span>
179+
<span class="k">if</span> <span class="n">word_tokenizer</span><span class="p">:</span>
180+
<span class="bp">self</span><span class="o">.</span><span class="n">word_tokenizer</span> <span class="o">=</span> <span class="n">word_tokenizer</span>
181+
<span class="k">else</span><span class="p">:</span>
182+
<span class="bp">self</span><span class="o">.</span><span class="n">word_tokenizer</span> <span class="o">=</span> <span class="n">nltk</span><span class="o">.</span><span class="n">tokenize</span><span class="o">.</span><span class="n">wordpunct_tokenize</span>
183+
170184
<span class="c1"># Stuff to be extracted from the provided text.</span>
171185
<span class="bp">self</span><span class="o">.</span><span class="n">frequency_dist</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Word</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span>
172186
<span class="bp">self</span><span class="o">.</span><span class="n">degree</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Word</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span>
@@ -178,7 +192,7 @@ <h1>Source code for rake_nltk.rake</h1><div class="highlight"><pre>
178192

179193
<span class="sd"> :param text: Text to extract keywords from, provided as a string.</span>
180194
<span class="sd"> &quot;&quot;&quot;</span>
181-
<span class="n">sentences</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Sentence</span><span class="p">]</span> <span class="o">=</span> <span class="n">nltk</span><span class="o">.</span><span class="n">tokenize</span><span class="o">.</span><span class="n">sent_tokenize</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
195+
<span class="n">sentences</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Sentence</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_tokenize_text_to_sentences</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
182196
<span class="bp">self</span><span class="o">.</span><span class="n">extract_keywords_from_sentences</span><span class="p">(</span><span class="n">sentences</span><span class="p">)</span></div>
183197

184198
<div class="viewcode-block" id="Rake.extract_keywords_from_sentences"><a class="viewcode-back" href="../../api.html#rake_nltk.Rake.extract_keywords_from_sentences">[docs]</a> <span class="k">def</span> <span class="nf">extract_keywords_from_sentences</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">sentences</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Sentence</span><span class="p">]):</span>
@@ -224,6 +238,26 @@ <h1>Source code for rake_nltk.rake</h1><div class="highlight"><pre>
224238
<span class="sd"> &quot;&quot;&quot;</span>
225239
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">degree</span></div>
226240

241+
<span class="k">def</span> <span class="nf">_tokenize_text_to_sentences</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Sentence</span><span class="p">]:</span>
242+
<span class="sd">&quot;&quot;&quot;Tokenizes the given text string into sentences using the configured</span>
243+
<span class="sd"> sentence tokenizer. Configuration uses `nltk.tokenize.sent_tokenize`</span>
244+
<span class="sd"> by default.</span>
245+
246+
<span class="sd"> :param text: String text to tokenize into sentences.</span>
247+
<span class="sd"> :return: List of sentences as per the tokenizer used.</span>
248+
<span class="sd"> &quot;&quot;&quot;</span>
249+
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">sentence_tokenizer</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
250+
251+
<span class="k">def</span> <span class="nf">_tokenize_sentence_to_words</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">sentence</span><span class="p">:</span> <span class="n">Sentence</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Word</span><span class="p">]:</span>
252+
<span class="sd">&quot;&quot;&quot;Tokenizes the given sentence string into words using the configured</span>
253+
<span class="sd"> word tokenizer. Configuration uses `nltk.tokenize.wordpunct_tokenize`</span>
254+
<span class="sd"> by default.</span>
255+
256+
<span class="sd"> :param sentence: String sentence to tokenize into words.</span>
257+
<span class="sd"> :return: List of words as per the tokenizer used.</span>
258+
<span class="sd"> &quot;&quot;&quot;</span>
259+
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">word_tokenizer</span><span class="p">(</span><span class="n">sentence</span><span class="p">)</span>
260+
227261
<span class="k">def</span> <span class="nf">_build_frequency_dist</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">phrase_list</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Phrase</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
228262
<span class="sd">&quot;&quot;&quot;Builds frequency distribution of the words in the given body of text.</span>
229263

@@ -288,7 +322,7 @@ <h1>Source code for rake_nltk.rake</h1><div class="highlight"><pre>
288322
<span class="n">phrase_list</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Phrase</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
289323
<span class="c1"># Create contender phrases from sentences.</span>
290324
<span class="k">for</span> <span class="n">sentence</span> <span class="ow">in</span> <span class="n">sentences</span><span class="p">:</span>
291-
<span class="n">word_list</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Word</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="n">word</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="k">for</span> <span class="n">word</span> <span class="ow">in</span> <span class="n">wordpunct_tokenize</span><span class="p">(</span><span class="n">sentence</span><span class="p">)]</span>
325+
<span class="n">word_list</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Word</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="n">word</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="k">for</span> <span class="n">word</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_tokenize_sentence_to_words</span><span class="p">(</span><span class="n">sentence</span><span class="p">)]</span>
292326
<span class="n">phrase_list</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_get_phrase_list_from_words</span><span class="p">(</span><span class="n">word_list</span><span class="p">))</span>
293327

294328
<span class="c1"># Based on user&#39;s choice to include or not include repeated phrases</span>

docs/_build/html/_sources/advanced.rst.txt

+34
Original file line numberDiff line numberDiff line change
@@ -89,3 +89,37 @@ was founded in a garage" has the phrase (magic, systems) occuring twice.
8989
9090
# To include all phrases only once and ignore the repetitions
9191
r = Rake(include_repeated_phrases=False)
92+
93+
to control the sentence tokenizer
94+
---------------------------------
95+
96+
Lets the user choose the sentence tokenizer they want to use.
97+
98+
.. code:: python
99+
100+
from rake_nltk import Rake
101+
102+
# To use default `nltk.tokenize.sent_tokenize` tokenizer.
103+
r = Rake() # Equivalent to Rake(sentence_tokenizer=nltk.tokenize.sent_tokenize)
104+
105+
# To use a custom tokenizer.
106+
def custom_tokenizer(text: str) -> List[str]:
107+
...
108+
r = Rake(sentence_tokenizer=custom_tokenizer)
109+
110+
to control the word tokenizer
111+
---------------------------------
112+
113+
Lets the user choose the word tokenizer they want to use.
114+
115+
.. code:: python
116+
117+
from rake_nltk import Rake
118+
119+
# To use default `nltk.tokenize.wordpunct_tokenize` tokenizer.
120+
r = Rake() # Equivalent to Rake(word_tokenizer=nltk.tokenize.wordpunct_tokenize)
121+
122+
# To use a custom tokenizer.
123+
def custom_tokenizer(text: str) -> List[str]:
124+
...
125+
r = Rake(word_tokenizer=custom_tokenizer)

docs/_build/html/_sources/index.rst.txt

+7
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@ and its co-occurance with other words in the text.
1919

2020
|Demo|
2121

22+
Features
23+
--------
24+
25+
* Ridiculously simple interface.
26+
* Configurable word and sentence tokenizers, language-based stop words, etc.
27+
* Configurable ranking metric.
28+
2229
Setup
2330
-----
2431

docs/_build/html/_static/documentation_options.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
var DOCUMENTATION_OPTIONS = {
22
URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'),
3-
VERSION: '1.0.5',
3+
VERSION: '1.0.6',
44
LANGUAGE: 'None',
55
COLLAPSE_INDEX: false,
66
BUILDER: 'html',

0 commit comments

Comments
 (0)