From 6e4591ff765f43a3d7c0509f3effed61a2765e51 Mon Sep 17 00:00:00 2001 From: jlinn Date: Sun, 14 Aug 2016 18:07:18 -0700 Subject: [PATCH] Propagate token offsets and types through UrlTokenFilter --- README.md | 7 ++- pom.xml | 2 +- .../index/analysis/url/Token.java | 60 +++++++++++++++++++ .../index/analysis/url/URLTokenFilter.java | 41 +++++++++---- .../index/analysis/url/URLTokenizer.java | 44 -------------- ...=> IsTokenStreamWithTokenAndPosition.java} | 16 ++--- .../analysis/url/URLTokenFilterTest.java | 12 ++++ .../index/analysis/url/URLTokenizerTest.java | 2 +- src/test/resources/test-settings.json | 3 +- 9 files changed, 117 insertions(+), 70 deletions(-) create mode 100644 src/main/java/org/elasticsearch/index/analysis/url/Token.java rename src/test/java/org/elasticsearch/index/analysis/url/{IsTokenizerWithTokenAndPosition.java => IsTokenStreamWithTokenAndPosition.java} (79%) diff --git a/README.md b/README.md index 01e31cc..0018270 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ This plugin enables URL tokenization and token filtering by URL part. | Elasticsearch Version | Plugin Version | |-----------------------|----------------| -| 2.3.4 | 2.3.4.2 | +| 2.3.4 | 2.3.4.3 | | 2.3.3 | 2.3.3.5 | | 2.3.2 | 2.3.2.1 | | 2.3.1 | 2.3.1.1 | @@ -27,7 +27,7 @@ This plugin enables URL tokenization and token filtering by URL part. ## Installation ```bash -bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.4.2/elasticsearch-analysis-url-2.3.4.2.zip +bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.4.3/elasticsearch-analysis-url-2.3.4.3.zip ``` ## Usage @@ -112,7 +112,8 @@ Set up your index like so: "url_host": { "type": "url", "part": "host", - "url_decode": true + "url_decode": true, + "tokenize_host": false } }, "analyzer": { diff --git a/pom.xml b/pom.xml index 45fc797..a0bbdf8 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ org.elasticsearch elasticsearch-analysis-url - 2.3.4.2 + 2.3.4.3 jar Elasticsearch URL token filter plugin diff --git a/src/main/java/org/elasticsearch/index/analysis/url/Token.java b/src/main/java/org/elasticsearch/index/analysis/url/Token.java new file mode 100644 index 0000000..deed5f7 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/url/Token.java @@ -0,0 +1,60 @@ +package org.elasticsearch.index.analysis.url; + +import com.google.common.base.Objects; +import org.elasticsearch.index.analysis.URLPart; + +/** + * @author Joe Linn + * 8/14/2016 + */ +class Token { + private final String token; + private final URLPart part; + private final int start; + private final int end; + + public Token(String token, URLPart part, int start, int end) { + this.token = token; + this.part = part; + this.start = start; + this.end = end; + } + + public String getToken() { + return token; + } + + public URLPart getPart() { + return part; + } + + public int getStart() { + return start; + } + + public int getEnd() { + return end; + } + + + @Override + public boolean equals(Object obj) { + if (obj == null || !(obj instanceof Token)) { + return false; + } + Token that = (Token) obj; + return this.start == that.start + && this.end == that.end + && Objects.equal(this.token, that.token) + && Objects.equal(this.part, that.part); + } + + @Override + public int hashCode() { + int result = token != null ? token.hashCode() : 0; + result = 31 * result + part.hashCode(); + result = 31 * result + start; + result = 31 * result + end; + return result; + } +} diff --git a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java index 863058a..481c201 100644 --- a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java +++ b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java @@ -6,6 +6,8 @@ import org.apache.lucene.analysis.path.PathHierarchyTokenizer; import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.elasticsearch.common.Strings; import org.elasticsearch.index.analysis.URLPart; @@ -27,7 +29,7 @@ public final class URLTokenFilter extends TokenFilter { private List parts; - private final boolean urlDeocde; + private boolean urlDeocde; /** * If true, the url's host will be tokenized using a {@link ReversePathHierarchyTokenizer} @@ -45,6 +47,8 @@ public final class URLTokenFilter extends TokenFilter { private boolean tokenizeQuery = true; private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); + private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); + private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); private final boolean allowMalformed; @@ -52,8 +56,8 @@ public final class URLTokenFilter extends TokenFilter { private boolean passthrough; - private List tokens; - private Iterator iterator; + private List tokens; + private Iterator iterator; public URLTokenFilter(TokenStream input, URLPart part) { this(input, part, false); @@ -106,6 +110,12 @@ public URLTokenFilter setTokenizeMalformed(boolean tokenizeMalformed) { return this; } + public URLTokenFilter setUrlDeocde(boolean urlDeocde) { + this.urlDeocde = urlDeocde; + return this; + } + + @Override public boolean incrementToken() throws IOException { if (iterator == null || !iterator.hasNext()) { @@ -114,11 +124,10 @@ public boolean incrementToken() throws IOException { } } clearAttributes(); - String next = iterator.next(); - if (allowMalformed) { - next = parseMalformed(next); - } - termAttribute.append(next); + Token next = iterator.next(); + termAttribute.append(next.getToken()); + typeAttribute.setType(next.getPart().name().toLowerCase()); + offsetAttribute.setOffset(next.getStart(), next.getEnd()); return true; } @@ -139,7 +148,7 @@ private boolean advance() throws IOException { } catch (IOException e) { if (e.getMessage().contains("Malformed URL")) { if (allowMalformed) { - tokens = ImmutableList.of(urlString); + tokens = ImmutableList.of(new Token(urlString, URLPart.WHOLE, 0, urlString.length())); } else { throw new MalformedURLException("Malformed URL: " + urlString); } @@ -164,8 +173,8 @@ private boolean advance() throws IOException { * @return a list of tokens extracted from the input string * @throws IOException */ - private List tokenize(String input) throws IOException { - List tokens = new ArrayList<>(); + private List tokenize(String input) throws IOException { + List tokens = new ArrayList<>(); URLTokenizer tokenizer = new URLTokenizer(); tokenizer.setParts(parts); tokenizer.setUrlDecode(urlDeocde); @@ -176,8 +185,15 @@ private List tokenize(String input) throws IOException { tokenizer.setTokenizeMalformed(tokenizeMalformed); tokenizer.setReader(new StringReader(input)); tokenizer.reset(); + + String term; + URLPart part; + OffsetAttribute offset; while (tokenizer.incrementToken()) { - tokens.add(tokenizer.getAttribute(CharTermAttribute.class).toString()); + term = tokenizer.getAttribute(CharTermAttribute.class).toString(); + part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type()); + offset = tokenizer.getAttribute(OffsetAttribute.class); + tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset())); } return tokens; } @@ -198,6 +214,7 @@ public void reset() throws IOException { * Attempt to parse a malformed url string * @param urlString the malformed url string * @return the url part if it can be parsed, null otherwise + * @deprecated parsing of malformed URLs is now delegated to {@link URLTokenizer} */ private String parseMalformed(String urlString) { if (parts != null && !parts.isEmpty()) { diff --git a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java index 64a0f94..5b56c43 100644 --- a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java +++ b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java @@ -1,6 +1,5 @@ package org.elasticsearch.index.analysis.url; -import com.google.common.base.Objects; import com.google.common.base.Optional; import com.google.common.base.Strings; import com.google.common.collect.ImmutableList; @@ -511,47 +510,4 @@ private List tokenizeSpecial(URL url) { } - private class Token { - private final String token; - private final URLPart part; - private final int start; - private final int end; - - public Token(String token, URLPart part, int start, int end) { - this.token = token; - this.part = part; - this.start = start; - this.end = end; - } - - public String getToken() { return token; } - - public URLPart getPart() { return part; } - - public int getStart() { return start; } - - public int getEnd() { return end; } - - - @Override - public boolean equals(Object obj) { - if (obj == null || !(obj instanceof Token)) { - return false; - } - Token that = (Token) obj; - return this.start == that.start - && this.end == that.end - && Objects.equal(this.token, that.token) - && Objects.equal(this.part, that.part); - } - - @Override - public int hashCode() { - int result = token != null ? token.hashCode() : 0; - result = 31 * result + part.hashCode(); - result = 31 * result + start; - result = 31 * result + end; - return result; - } - } } diff --git a/src/test/java/org/elasticsearch/index/analysis/url/IsTokenizerWithTokenAndPosition.java b/src/test/java/org/elasticsearch/index/analysis/url/IsTokenStreamWithTokenAndPosition.java similarity index 79% rename from src/test/java/org/elasticsearch/index/analysis/url/IsTokenizerWithTokenAndPosition.java rename to src/test/java/org/elasticsearch/index/analysis/url/IsTokenStreamWithTokenAndPosition.java index 6c7cc13..11631c4 100644 --- a/src/test/java/org/elasticsearch/index/analysis/url/IsTokenizerWithTokenAndPosition.java +++ b/src/test/java/org/elasticsearch/index/analysis/url/IsTokenStreamWithTokenAndPosition.java @@ -1,7 +1,7 @@ package org.elasticsearch.index.analysis.url; import org.apache.log4j.Logger; -import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.hamcrest.Description; @@ -14,8 +14,8 @@ * Joe Linn * 8/2/2015 */ -public class IsTokenizerWithTokenAndPosition extends TypeSafeMatcher { - private static final Logger log = Logger.getLogger(IsTokenizerWithTokenAndPosition.class); +public class IsTokenStreamWithTokenAndPosition extends TypeSafeMatcher { + private static final Logger log = Logger.getLogger(IsTokenStreamWithTokenAndPosition.class); private final String token; private final int start; @@ -25,14 +25,14 @@ public class IsTokenizerWithTokenAndPosition extends TypeSafeMatcher private int actualStart; private int actualEnd; - public IsTokenizerWithTokenAndPosition(String token, int start, int end) { + public IsTokenStreamWithTokenAndPosition(String token, int start, int end) { this.token = token; this.start = start; this.end = end; } @Override - protected boolean matchesSafely(Tokenizer tokenizer) { + protected boolean matchesSafely(TokenStream tokenizer) { CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class); OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class); try { @@ -71,7 +71,7 @@ public void describeTo(Description description) { @Override - protected void describeMismatchSafely(Tokenizer item, Description mismatchDescription) { + protected void describeMismatchSafely(TokenStream item, Description mismatchDescription) { if(!foundToken){ mismatchDescription.appendText("tokenizer which did not contain token ").appendValue(token); } else { @@ -85,7 +85,7 @@ protected void describeMismatchSafely(Tokenizer item, Description mismatchDescri } @Factory - public static IsTokenizerWithTokenAndPosition hasTokenAtOffset(String token, int start, int end) { - return new IsTokenizerWithTokenAndPosition(token, start, end); + public static IsTokenStreamWithTokenAndPosition hasTokenAtOffset(String token, int start, int end) { + return new IsTokenStreamWithTokenAndPosition(token, start, end); } } diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java index 16bbae4..a6a3e9b 100644 --- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java +++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java @@ -10,6 +10,8 @@ import java.io.IOException; import java.net.MalformedURLException; +import static org.elasticsearch.index.analysis.url.IsTokenStreamWithTokenAndPosition.hasTokenAtOffset; + public class URLTokenFilterTest extends BaseTokenStreamTestCase { public static final String TEST_HTTP_URL = "http://www.foo.bar.com:9200/index_name/type_name/_search.html?foo=bar&baz=bat#tag"; public static final String TEST_HTTP_URL2 = "http://www.foo.bar.com"; @@ -27,6 +29,13 @@ public void testFilterProtocol() throws IOException { @Test public void testFilterHost() throws IOException { assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.HOST).setTokenizeHost(false), "www.foo.bar.com"); + + URLTokenFilter filter = createFilter(TEST_HTTP_URL, URLPart.HOST) + .setUrlDeocde(false); + assertThat(filter, hasTokenAtOffset("www.foo.bar.com", 7, 22)); + assertThat(filter, hasTokenAtOffset("foo.bar.com", 11, 22)); + assertThat(filter, hasTokenAtOffset("bar.com", 15, 22)); + assertThat(filter, hasTokenAtOffset("com", 19, 22)); } @Test @@ -77,12 +86,15 @@ public void testInferPort() throws IOException { @Test public void testMalformed() throws IOException { URLTokenFilter filter = createFilter("http://:::::::/baz", URLPart.PROTOCOL, false, true); + filter.setTokenizeMalformed(true); assertTokenStreamContents(filter, "http"); filter = createFilter("foo.com/bar?baz=bat", URLPart.QUERY, false, true); + filter.setTokenizeMalformed(true); assertTokenStreamContents(filter, "baz=bat"); filter = createFilter("baz.com:3456/foo", URLPart.PORT, false, true); + filter.setTokenizeMalformed(true); assertTokenStreamContents(filter, "3456"); } diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java index e663c04..cb295aa 100644 --- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java +++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java @@ -12,7 +12,7 @@ import java.util.ArrayList; import java.util.List; -import static org.elasticsearch.index.analysis.url.IsTokenizerWithTokenAndPosition.hasTokenAtOffset; +import static org.elasticsearch.index.analysis.url.IsTokenStreamWithTokenAndPosition.hasTokenAtOffset; import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.CoreMatchers.hasItem; diff --git a/src/test/resources/test-settings.json b/src/test/resources/test-settings.json index f10b40a..529c9fa 100644 --- a/src/test/resources/test-settings.json +++ b/src/test/resources/test-settings.json @@ -50,7 +50,8 @@ "url_port_malformed": { "type": "url", "part": "port", - "allow_malformed": true + "allow_malformed": true, + "tokenize_malformed": true }, "url_host_passthrough": { "type": "url",