diff --git a/README.md b/README.md
index 01e31cc..0018270 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ This plugin enables URL tokenization and token filtering by URL part.
| Elasticsearch Version | Plugin Version |
|-----------------------|----------------|
-| 2.3.4 | 2.3.4.2 |
+| 2.3.4 | 2.3.4.3 |
| 2.3.3 | 2.3.3.5 |
| 2.3.2 | 2.3.2.1 |
| 2.3.1 | 2.3.1.1 |
@@ -27,7 +27,7 @@ This plugin enables URL tokenization and token filtering by URL part.
## Installation
```bash
-bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.4.2/elasticsearch-analysis-url-2.3.4.2.zip
+bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.4.3/elasticsearch-analysis-url-2.3.4.3.zip
```
## Usage
@@ -112,7 +112,8 @@ Set up your index like so:
"url_host": {
"type": "url",
"part": "host",
- "url_decode": true
+ "url_decode": true,
+ "tokenize_host": false
}
},
"analyzer": {
diff --git a/pom.xml b/pom.xml
index 45fc797..a0bbdf8 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
org.elasticsearch
elasticsearch-analysis-url
- 2.3.4.2
+ 2.3.4.3
jar
Elasticsearch URL token filter plugin
diff --git a/src/main/java/org/elasticsearch/index/analysis/url/Token.java b/src/main/java/org/elasticsearch/index/analysis/url/Token.java
new file mode 100644
index 0000000..deed5f7
--- /dev/null
+++ b/src/main/java/org/elasticsearch/index/analysis/url/Token.java
@@ -0,0 +1,60 @@
+package org.elasticsearch.index.analysis.url;
+
+import com.google.common.base.Objects;
+import org.elasticsearch.index.analysis.URLPart;
+
+/**
+ * @author Joe Linn
+ * 8/14/2016
+ */
+class Token {
+ private final String token;
+ private final URLPart part;
+ private final int start;
+ private final int end;
+
+ public Token(String token, URLPart part, int start, int end) {
+ this.token = token;
+ this.part = part;
+ this.start = start;
+ this.end = end;
+ }
+
+ public String getToken() {
+ return token;
+ }
+
+ public URLPart getPart() {
+ return part;
+ }
+
+ public int getStart() {
+ return start;
+ }
+
+ public int getEnd() {
+ return end;
+ }
+
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null || !(obj instanceof Token)) {
+ return false;
+ }
+ Token that = (Token) obj;
+ return this.start == that.start
+ && this.end == that.end
+ && Objects.equal(this.token, that.token)
+ && Objects.equal(this.part, that.part);
+ }
+
+ @Override
+ public int hashCode() {
+ int result = token != null ? token.hashCode() : 0;
+ result = 31 * result + part.hashCode();
+ result = 31 * result + start;
+ result = 31 * result + end;
+ return result;
+ }
+}
diff --git a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java
index 863058a..481c201 100644
--- a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java
+++ b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java
@@ -6,6 +6,8 @@
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.common.Strings;
import org.elasticsearch.index.analysis.URLPart;
@@ -27,7 +29,7 @@ public final class URLTokenFilter extends TokenFilter {
private List<URLPart> parts;
- private final boolean urlDeocde;
+ private boolean urlDeocde;
/**
* If true, the url's host will be tokenized using a {@link ReversePathHierarchyTokenizer}
@@ -45,6 +47,8 @@ public final class URLTokenFilter extends TokenFilter {
private boolean tokenizeQuery = true;
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+ private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
+ private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final boolean allowMalformed;
@@ -52,8 +56,8 @@ public final class URLTokenFilter extends TokenFilter {
private boolean passthrough;
- private List<String> tokens;
- private Iterator<String> iterator;
+ private List<Token> tokens;
+ private Iterator<Token> iterator;
public URLTokenFilter(TokenStream input, URLPart part) {
this(input, part, false);
@@ -106,6 +110,12 @@ public URLTokenFilter setTokenizeMalformed(boolean tokenizeMalformed) {
return this;
}
+ public URLTokenFilter setUrlDeocde(boolean urlDeocde) {
+ this.urlDeocde = urlDeocde;
+ return this;
+ }
+
+
@Override
public boolean incrementToken() throws IOException {
if (iterator == null || !iterator.hasNext()) {
@@ -114,11 +124,10 @@ public boolean incrementToken() throws IOException {
}
}
clearAttributes();
- String next = iterator.next();
- if (allowMalformed) {
- next = parseMalformed(next);
- }
- termAttribute.append(next);
+ Token next = iterator.next();
+ termAttribute.append(next.getToken());
+ typeAttribute.setType(next.getPart().name().toLowerCase());
+ offsetAttribute.setOffset(next.getStart(), next.getEnd());
return true;
}
@@ -139,7 +148,7 @@ private boolean advance() throws IOException {
} catch (IOException e) {
if (e.getMessage().contains("Malformed URL")) {
if (allowMalformed) {
- tokens = ImmutableList.of(urlString);
+ tokens = ImmutableList.of(new Token(urlString, URLPart.WHOLE, 0, urlString.length()));
} else {
throw new MalformedURLException("Malformed URL: " + urlString);
}
@@ -164,8 +173,8 @@ private boolean advance() throws IOException {
* @return a list of tokens extracted from the input string
* @throws IOException
*/
- private List<String> tokenize(String input) throws IOException {
- List<String> tokens = new ArrayList<>();
+ private List<Token> tokenize(String input) throws IOException {
+ List<Token> tokens = new ArrayList<>();
URLTokenizer tokenizer = new URLTokenizer();
tokenizer.setParts(parts);
tokenizer.setUrlDecode(urlDeocde);
@@ -176,8 +185,15 @@ private List<String> tokenize(String input) throws IOException {
tokenizer.setTokenizeMalformed(tokenizeMalformed);
tokenizer.setReader(new StringReader(input));
tokenizer.reset();
+
+ String term;
+ URLPart part;
+ OffsetAttribute offset;
while (tokenizer.incrementToken()) {
- tokens.add(tokenizer.getAttribute(CharTermAttribute.class).toString());
+ term = tokenizer.getAttribute(CharTermAttribute.class).toString();
+ part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type());
+ offset = tokenizer.getAttribute(OffsetAttribute.class);
+ tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset()));
}
return tokens;
}
@@ -198,6 +214,7 @@ public void reset() throws IOException {
* Attempt to parse a malformed url string
* @param urlString the malformed url string
* @return the url part if it can be parsed, null otherwise
+ * @deprecated parsing of malformed URLs is now delegated to {@link URLTokenizer}
*/
private String parseMalformed(String urlString) {
if (parts != null && !parts.isEmpty()) {
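
Since `incrementToken()` now populates `TypeAttribute` and `OffsetAttribute` alongside the term, downstream consumers can tell which URL part a token came from and where it sits in the original string. A minimal consumer sketch, assuming a `KeywordTokenizer` feeding the whole URL; the sample URL and class name are illustrative, not part of this patch:

```java
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.index.analysis.URLPart;
import org.elasticsearch.index.analysis.url.URLTokenFilter;

import java.io.StringReader;

public class URLTokenFilterAttributesExample {
    public static void main(String[] args) throws Exception {
        KeywordTokenizer input = new KeywordTokenizer();
        input.setReader(new StringReader("http://www.foo.bar.com:9200/index_name/type_name/_search.html"));
        URLTokenFilter filter = new URLTokenFilter(input, URLPart.HOST);

        CharTermAttribute term = filter.getAttribute(CharTermAttribute.class);
        TypeAttribute type = filter.getAttribute(TypeAttribute.class);       // lower-cased URLPart name, e.g. "host"
        OffsetAttribute offset = filter.getAttribute(OffsetAttribute.class); // offsets into the original URL string

        filter.reset();
        while (filter.incrementToken()) {
            // e.g. "www.foo.bar.com" [host] 7..22, "foo.bar.com" [host] 11..22, ...
            System.out.println(term + " [" + type.type() + "] "
                    + offset.startOffset() + ".." + offset.endOffset());
        }
        filter.end();
        filter.close();
    }
}
```
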
diff --git a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
index 64a0f94..5b56c43 100644
--- a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
+++ b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
@@ -1,6 +1,5 @@
package org.elasticsearch.index.analysis.url;
-import com.google.common.base.Objects;
import com.google.common.base.Optional;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
@@ -511,47 +510,4 @@ private List<Token> tokenizeSpecial(URL url) {
}
- private class Token {
- private final String token;
- private final URLPart part;
- private final int start;
- private final int end;
-
- public Token(String token, URLPart part, int start, int end) {
- this.token = token;
- this.part = part;
- this.start = start;
- this.end = end;
- }
-
- public String getToken() { return token; }
-
- public URLPart getPart() { return part; }
-
- public int getStart() { return start; }
-
- public int getEnd() { return end; }
-
-
- @Override
- public boolean equals(Object obj) {
- if (obj == null || !(obj instanceof Token)) {
- return false;
- }
- Token that = (Token) obj;
- return this.start == that.start
- && this.end == that.end
- && Objects.equal(this.token, that.token)
- && Objects.equal(this.part, that.part);
- }
-
- @Override
- public int hashCode() {
- int result = token != null ? token.hashCode() : 0;
- result = 31 * result + part.hashCode();
- result = 31 * result + start;
- result = 31 * result + end;
- return result;
- }
- }
}
diff --git a/src/test/java/org/elasticsearch/index/analysis/url/IsTokenizerWithTokenAndPosition.java b/src/test/java/org/elasticsearch/index/analysis/url/IsTokenStreamWithTokenAndPosition.java
similarity index 79%
rename from src/test/java/org/elasticsearch/index/analysis/url/IsTokenizerWithTokenAndPosition.java
rename to src/test/java/org/elasticsearch/index/analysis/url/IsTokenStreamWithTokenAndPosition.java
index 6c7cc13..11631c4 100644
--- a/src/test/java/org/elasticsearch/index/analysis/url/IsTokenizerWithTokenAndPosition.java
+++ b/src/test/java/org/elasticsearch/index/analysis/url/IsTokenStreamWithTokenAndPosition.java
@@ -1,7 +1,7 @@
package org.elasticsearch.index.analysis.url;
import org.apache.log4j.Logger;
-import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.hamcrest.Description;
@@ -14,8 +14,8 @@
* Joe Linn
* 8/2/2015
*/
-public class IsTokenizerWithTokenAndPosition extends TypeSafeMatcher<Tokenizer> {
- private static final Logger log = Logger.getLogger(IsTokenizerWithTokenAndPosition.class);
+public class IsTokenStreamWithTokenAndPosition extends TypeSafeMatcher<TokenStream> {
+ private static final Logger log = Logger.getLogger(IsTokenStreamWithTokenAndPosition.class);
private final String token;
private final int start;
@@ -25,14 +25,14 @@ public class IsTokenizerWithTokenAndPosition extends TypeSafeMatcher
private int actualStart;
private int actualEnd;
- public IsTokenizerWithTokenAndPosition(String token, int start, int end) {
+ public IsTokenStreamWithTokenAndPosition(String token, int start, int end) {
this.token = token;
this.start = start;
this.end = end;
}
@Override
- protected boolean matchesSafely(Tokenizer tokenizer) {
+ protected boolean matchesSafely(TokenStream tokenizer) {
CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class);
OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);
try {
@@ -71,7 +71,7 @@ public void describeTo(Description description) {
@Override
- protected void describeMismatchSafely(Tokenizer item, Description mismatchDescription) {
+ protected void describeMismatchSafely(TokenStream item, Description mismatchDescription) {
if(!foundToken){
mismatchDescription.appendText("tokenizer which did not contain token ").appendValue(token);
} else {
@@ -85,7 +85,7 @@ protected void describeMismatchSafely(Tokenizer item, Description mismatchDescri
}
@Factory
- public static IsTokenizerWithTokenAndPosition hasTokenAtOffset(String token, int start, int end) {
- return new IsTokenizerWithTokenAndPosition(token, start, end);
+ public static IsTokenStreamWithTokenAndPosition hasTokenAtOffset(String token, int start, int end) {
+ return new IsTokenStreamWithTokenAndPosition(token, start, end);
}
}
diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java
index 16bbae4..a6a3e9b 100644
--- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java
+++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java
@@ -10,6 +10,8 @@
import java.io.IOException;
import java.net.MalformedURLException;
+import static org.elasticsearch.index.analysis.url.IsTokenStreamWithTokenAndPosition.hasTokenAtOffset;
+
public class URLTokenFilterTest extends BaseTokenStreamTestCase {
public static final String TEST_HTTP_URL = "http://www.foo.bar.com:9200/index_name/type_name/_search.html?foo=bar&baz=bat#tag";
public static final String TEST_HTTP_URL2 = "http://www.foo.bar.com";
@@ -27,6 +29,13 @@ public void testFilterProtocol() throws IOException {
@Test
public void testFilterHost() throws IOException {
assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.HOST).setTokenizeHost(false), "www.foo.bar.com");
+
+ URLTokenFilter filter = createFilter(TEST_HTTP_URL, URLPart.HOST)
+ .setUrlDeocde(false);
+ assertThat(filter, hasTokenAtOffset("www.foo.bar.com", 7, 22));
+ assertThat(filter, hasTokenAtOffset("foo.bar.com", 11, 22));
+ assertThat(filter, hasTokenAtOffset("bar.com", 15, 22));
+ assertThat(filter, hasTokenAtOffset("com", 19, 22));
}
@Test
@@ -77,12 +86,15 @@ public void testInferPort() throws IOException {
@Test
public void testMalformed() throws IOException {
URLTokenFilter filter = createFilter("http://:::::::/baz", URLPart.PROTOCOL, false, true);
+ filter.setTokenizeMalformed(true);
assertTokenStreamContents(filter, "http");
filter = createFilter("foo.com/bar?baz=bat", URLPart.QUERY, false, true);
+ filter.setTokenizeMalformed(true);
assertTokenStreamContents(filter, "baz=bat");
filter = createFilter("baz.com:3456/foo", URLPart.PORT, false, true);
+ filter.setTokenizeMalformed(true);
assertTokenStreamContents(filter, "3456");
}
diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
index e663c04..cb295aa 100644
--- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
+++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
@@ -12,7 +12,7 @@
import java.util.ArrayList;
import java.util.List;
-import static org.elasticsearch.index.analysis.url.IsTokenizerWithTokenAndPosition.hasTokenAtOffset;
+import static org.elasticsearch.index.analysis.url.IsTokenStreamWithTokenAndPosition.hasTokenAtOffset;
import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.CoreMatchers.hasItem;
diff --git a/src/test/resources/test-settings.json b/src/test/resources/test-settings.json
index f10b40a..529c9fa 100644
--- a/src/test/resources/test-settings.json
+++ b/src/test/resources/test-settings.json
@@ -50,7 +50,8 @@
"url_port_malformed": {
"type": "url",
"part": "port",
- "allow_malformed": true
+ "allow_malformed": true,
+ "tokenize_malformed": true
},
"url_host_passthrough": {
"type": "url",