Propagate token offsets and types through UrlTokenFilter
jlinn committed Aug 15, 2016
1 parent: a0088d9 · commit: 6e4591f
Showing 9 changed files with 117 additions and 70 deletions.
README.md (7 changes: 4 additions & 3 deletions)
@@ -9,7 +9,7 @@ This plugin enables URL tokenization and token filtering by URL part.

| Elasticsearch Version | Plugin Version |
|-----------------------|----------------|
| 2.3.4 | 2.3.4.2 |
| 2.3.4 | 2.3.4.3 |
| 2.3.3 | 2.3.3.5 |
| 2.3.2 | 2.3.2.1 |
| 2.3.1 | 2.3.1.1 |
@@ -27,7 +27,7 @@ This plugin enables URL tokenization and token filtering by URL part.

## Installation
```bash
bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.4.2/elasticsearch-analysis-url-2.3.4.2.zip
bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.4.3/elasticsearch-analysis-url-2.3.4.3.zip
```

## Usage
@@ -112,7 +112,8 @@ Set up your index like so:
"url_host": {
"type": "url",
"part": "host",
"url_decode": true
"url_decode": true,
"tokenize_host": false
}
},
"analyzer": {
pom.xml (2 changes: 1 addition & 1 deletion)
@@ -6,7 +6,7 @@

<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-analysis-url</artifactId>
<version>2.3.4.2</version>
<version>2.3.4.3</version>
<packaging>jar</packaging>
<description>Elasticsearch URL token filter plugin</description>

src/main/java/org/elasticsearch/index/analysis/url/Token.java (60 changes: 60 additions & 0 deletions)
@@ -0,0 +1,60 @@
package org.elasticsearch.index.analysis.url;

import com.google.common.base.Objects;
import org.elasticsearch.index.analysis.URLPart;

/**
* @author Joe Linn
* 8/14/2016
*/
class Token {
private final String token;
private final URLPart part;
private final int start;
private final int end;

public Token(String token, URLPart part, int start, int end) {
this.token = token;
this.part = part;
this.start = start;
this.end = end;
}

public String getToken() {
return token;
}

public URLPart getPart() {
return part;
}

public int getStart() {
return start;
}

public int getEnd() {
return end;
}


@Override
public boolean equals(Object obj) {
if (obj == null || !(obj instanceof Token)) {
return false;
}
Token that = (Token) obj;
return this.start == that.start
&& this.end == that.end
&& Objects.equal(this.token, that.token)
&& Objects.equal(this.part, that.part);
}

@Override
public int hashCode() {
int result = token != null ? token.hashCode() : 0;
result = 31 * result + part.hashCode();
result = 31 * result + start;
result = 31 * result + end;
return result;
}
}
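
Promoting Token to its own top-level class (it was previously a private inner class of URLTokenizer, removed later in this diff) lets URLTokenFilter share the same value object. Below is a minimal sketch, not part of the commit, of the equality contract this class provides; the demo class is hypothetical and sits in the same package because Token is package-private:

```java
package org.elasticsearch.index.analysis.url;

import org.elasticsearch.index.analysis.URLPart;

// Hypothetical demo, not from the repository: tokens are equal only when
// term, URL part, and both offsets all match.
class TokenEqualityDemo {
    public static void main(String[] args) {
        Token host = new Token("foo.bar.com", URLPart.HOST, 11, 22);
        Token same = new Token("foo.bar.com", URLPart.HOST, 11, 22);
        Token shifted = new Token("foo.bar.com", URLPart.HOST, 15, 26);

        System.out.println(host.equals(same));    // true: all four fields match
        System.out.println(host.equals(shifted)); // false: offsets differ
        // equal tokens share a hash code, so duplicates collapse in hash-based sets
        System.out.println(host.hashCode() == same.hashCode()); // true
    }
}
```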
src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java
@@ -6,6 +6,8 @@
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.common.Strings;
import org.elasticsearch.index.analysis.URLPart;

@@ -27,7 +29,7 @@ public final class URLTokenFilter extends TokenFilter {

private List<URLPart> parts;

private final boolean urlDeocde;
private boolean urlDeocde;

/**
* If true, the url's host will be tokenized using a {@link ReversePathHierarchyTokenizer}
@@ -45,15 +47,17 @@ public final class URLTokenFilter extends TokenFilter {
private boolean tokenizeQuery = true;

private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);

private final boolean allowMalformed;

private boolean tokenizeMalformed;

private boolean passthrough;

private List<String> tokens;
private Iterator<String> iterator;
private List<Token> tokens;
private Iterator<Token> iterator;

public URLTokenFilter(TokenStream input, URLPart part) {
this(input, part, false);
@@ -106,6 +110,12 @@ public URLTokenFilter setTokenizeMalformed(boolean tokenizeMalformed) {
return this;
}

public URLTokenFilter setUrlDeocde(boolean urlDeocde) {
this.urlDeocde = urlDeocde;
return this;
}


@Override
public boolean incrementToken() throws IOException {
if (iterator == null || !iterator.hasNext()) {
@@ -114,11 +124,10 @@ public boolean incrementToken() throws IOException {
}
}
clearAttributes();
String next = iterator.next();
if (allowMalformed) {
next = parseMalformed(next);
}
termAttribute.append(next);
Token next = iterator.next();
termAttribute.append(next.getToken());
typeAttribute.setType(next.getPart().name().toLowerCase());
offsetAttribute.setOffset(next.getStart(), next.getEnd());
return true;
}

@@ -139,7 +148,7 @@ private boolean advance() throws IOException {
} catch (IOException e) {
if (e.getMessage().contains("Malformed URL")) {
if (allowMalformed) {
tokens = ImmutableList.of(urlString);
tokens = ImmutableList.of(new Token(urlString, URLPart.WHOLE, 0, urlString.length()));
} else {
throw new MalformedURLException("Malformed URL: " + urlString);
}
@@ -164,8 +173,8 @@
* @return a list of tokens extracted from the input string
* @throws IOException
*/
private List<String> tokenize(String input) throws IOException {
List<String> tokens = new ArrayList<>();
private List<Token> tokenize(String input) throws IOException {
List<Token> tokens = new ArrayList<>();
URLTokenizer tokenizer = new URLTokenizer();
tokenizer.setParts(parts);
tokenizer.setUrlDecode(urlDeocde);
@@ -176,8 +185,15 @@
tokenizer.setTokenizeMalformed(tokenizeMalformed);
tokenizer.setReader(new StringReader(input));
tokenizer.reset();

String term;
URLPart part;
OffsetAttribute offset;
while (tokenizer.incrementToken()) {
tokens.add(tokenizer.getAttribute(CharTermAttribute.class).toString());
term = tokenizer.getAttribute(CharTermAttribute.class).toString();
part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type());
offset = tokenizer.getAttribute(OffsetAttribute.class);
tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset()));
}
return tokens;
}
@@ -198,6 +214,7 @@ public void reset() throws IOException {
* Attempt to parse a malformed url string
* @param urlString the malformed url string
* @return the url part if it can be parsed, null otherwise
* @deprecated parsing of malformed URLs is now delegated to {@link URLTokenizer}
*/
private String parseMalformed(String urlString) {
if (parts != null && !parts.isEmpty()) {
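
After this change, a consumer of URLTokenFilter can read the URL part name from each token's type attribute and the token's position in the original URL from its offset attribute. Below is a minimal sketch, not from the repository, of driving the filter directly; it assumes a Lucene KeywordTokenizer feeding the raw URL to the filter as a single token, and the commented output follows the testFilterHost assertions shown later in this diff:

```java
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.index.analysis.URLPart;
import org.elasticsearch.index.analysis.url.URLTokenFilter;

class URLTokenFilterDemo {
    public static void main(String[] args) throws Exception {
        // Feed the whole URL to the filter as a single token.
        Tokenizer source = new KeywordTokenizer();
        source.setReader(new StringReader("http://www.foo.bar.com:9200/index_name"));
        URLTokenFilter filter = new URLTokenFilter(source, URLPart.HOST);

        CharTermAttribute term = filter.getAttribute(CharTermAttribute.class);
        TypeAttribute type = filter.getAttribute(TypeAttribute.class);
        OffsetAttribute offset = filter.getAttribute(OffsetAttribute.class);

        filter.reset();
        while (filter.incrementToken()) {
            // First token: www.foo.bar.com / host / [7, 22]
            System.out.printf("%s / %s / [%d, %d]%n",
                    term, type.type(), offset.startOffset(), offset.endOffset());
        }
        filter.end();
        filter.close();
    }
}
```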
src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
@@ -1,6 +1,5 @@
package org.elasticsearch.index.analysis.url;

import com.google.common.base.Objects;
import com.google.common.base.Optional;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
@@ -511,47 +510,4 @@ private List<Token> tokenizeSpecial(URL url) {
}


private class Token {
private final String token;
private final URLPart part;
private final int start;
private final int end;

public Token(String token, URLPart part, int start, int end) {
this.token = token;
this.part = part;
this.start = start;
this.end = end;
}

public String getToken() { return token; }

public URLPart getPart() { return part; }

public int getStart() { return start; }

public int getEnd() { return end; }


@Override
public boolean equals(Object obj) {
if (obj == null || !(obj instanceof Token)) {
return false;
}
Token that = (Token) obj;
return this.start == that.start
&& this.end == that.end
&& Objects.equal(this.token, that.token)
&& Objects.equal(this.part, that.part);
}

@Override
public int hashCode() {
int result = token != null ? token.hashCode() : 0;
result = 31 * result + part.hashCode();
result = 31 * result + start;
result = 31 * result + end;
return result;
}
}
}
src/test/java/org/elasticsearch/index/analysis/url/IsTokenizerWithTokenAndPosition.java → IsTokenStreamWithTokenAndPosition.java
@@ -1,7 +1,7 @@
package org.elasticsearch.index.analysis.url;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.hamcrest.Description;
@@ -14,8 +14,8 @@
* Joe Linn
* 8/2/2015
*/
public class IsTokenizerWithTokenAndPosition extends TypeSafeMatcher<Tokenizer> {
private static final Logger log = Logger.getLogger(IsTokenizerWithTokenAndPosition.class);
public class IsTokenStreamWithTokenAndPosition extends TypeSafeMatcher<TokenStream> {
private static final Logger log = Logger.getLogger(IsTokenStreamWithTokenAndPosition.class);

private final String token;
private final int start;
@@ -25,14 +25,14 @@ public class IsTokenizerWithTokenAndPosition extends TypeSafeMatcher<Tokenizer>
private int actualStart;
private int actualEnd;

public IsTokenizerWithTokenAndPosition(String token, int start, int end) {
public IsTokenStreamWithTokenAndPosition(String token, int start, int end) {
this.token = token;
this.start = start;
this.end = end;
}

@Override
protected boolean matchesSafely(Tokenizer tokenizer) {
protected boolean matchesSafely(TokenStream tokenizer) {
CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class);
OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);
try {
@@ -71,7 +71,7 @@ public void describeTo(Description description) {


@Override
protected void describeMismatchSafely(Tokenizer item, Description mismatchDescription) {
protected void describeMismatchSafely(TokenStream item, Description mismatchDescription) {
if(!foundToken){
mismatchDescription.appendText("tokenizer which did not contain token ").appendValue(token);
} else {
@@ -85,7 +85,7 @@ protected void describeMismatchSafely(TokenStream item, Description mismatchDescription) {
}

@Factory
public static IsTokenizerWithTokenAndPosition hasTokenAtOffset(String token, int start, int end) {
return new IsTokenizerWithTokenAndPosition(token, start, end);
public static IsTokenStreamWithTokenAndPosition hasTokenAtOffset(String token, int start, int end) {
return new IsTokenStreamWithTokenAndPosition(token, start, end);
}
}
src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java
@@ -10,6 +10,8 @@
import java.io.IOException;
import java.net.MalformedURLException;

import static org.elasticsearch.index.analysis.url.IsTokenStreamWithTokenAndPosition.hasTokenAtOffset;

public class URLTokenFilterTest extends BaseTokenStreamTestCase {
public static final String TEST_HTTP_URL = "http://www.foo.bar.com:9200/index_name/type_name/_search.html?foo=bar&baz=bat#tag";
public static final String TEST_HTTP_URL2 = "http://www.foo.bar.com";
@@ -27,6 +29,13 @@ public void testFilterProtocol() throws IOException {
@Test
public void testFilterHost() throws IOException {
assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.HOST).setTokenizeHost(false), "www.foo.bar.com");

URLTokenFilter filter = createFilter(TEST_HTTP_URL, URLPart.HOST)
.setUrlDeocde(false);
assertThat(filter, hasTokenAtOffset("www.foo.bar.com", 7, 22));
assertThat(filter, hasTokenAtOffset("foo.bar.com", 11, 22));
assertThat(filter, hasTokenAtOffset("bar.com", 15, 22));
assertThat(filter, hasTokenAtOffset("com", 19, 22));
}

@Test
@@ -77,12 +86,15 @@ public void testInferPort() throws IOException {
@Test
public void testMalformed() throws IOException {
URLTokenFilter filter = createFilter("http://:::::::/baz", URLPart.PROTOCOL, false, true);
filter.setTokenizeMalformed(true);
assertTokenStreamContents(filter, "http");

filter = createFilter("foo.com/bar?baz=bat", URLPart.QUERY, false, true);
filter.setTokenizeMalformed(true);
assertTokenStreamContents(filter, "baz=bat");

filter = createFilter("baz.com:3456/foo", URLPart.PORT, false, true);
filter.setTokenizeMalformed(true);
assertTokenStreamContents(filter, "3456");
}

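
The createFilter helper these tests call is defined elsewhere in the test class and is not part of this diff. Below is a hypothetical reconstruction, under the assumption that URLTokenFilter has a four-argument constructor taking the url-decode and allow-malformed flags in that order:

```java
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.elasticsearch.index.analysis.URLPart;
import org.elasticsearch.index.analysis.url.URLTokenFilter;

class URLTokenFilterTestSupport {
    // Hypothetical shape of the test helper (its real definition is not shown
    // in this diff): wrap the raw URL in a single-token stream, then build the filter.
    static URLTokenFilter createFilter(String url, URLPart part,
                                       boolean urlDecode, boolean allowMalformed) throws IOException {
        Tokenizer source = new KeywordTokenizer();
        source.setReader(new StringReader(url));
        return new URLTokenFilter(source, part, urlDecode, allowMalformed);
    }
}
```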
src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
@@ -12,7 +12,7 @@
import java.util.ArrayList;
import java.util.List;

import static org.elasticsearch.index.analysis.url.IsTokenizerWithTokenAndPosition.hasTokenAtOffset;
import static org.elasticsearch.index.analysis.url.IsTokenStreamWithTokenAndPosition.hasTokenAtOffset;
import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.CoreMatchers.hasItem;

src/test/resources/test-settings.json (3 changes: 2 additions & 1 deletion)
@@ -50,7 +50,8 @@
"url_port_malformed": {
"type": "url",
"part": "port",
"allow_malformed": true
"allow_malformed": true,
"tokenize_malformed": true
},
"url_host_passthrough": {
"type": "url",
