Propagate token offsets and types through UrlTokenFilter
jlinn committed Aug 15, 2016
1 parent: a0088d9 · commit: 6e4591f
Showing 9 changed files with 117 additions and 70 deletions.
README.md (7 changes: 4 additions & 3 deletions)
@@ -9,7 +9,7 @@ This plugin enables URL tokenization and token filtering by URL part.

| Elasticsearch Version | Plugin Version |
|-----------------------|----------------|
| 2.3.4 | 2.3.4.2 |
| 2.3.4 | 2.3.4.3 |
| 2.3.3 | 2.3.3.5 |
| 2.3.2 | 2.3.2.1 |
| 2.3.1 | 2.3.1.1 |
@@ -27,7 +27,7 @@ This plugin enables URL tokenization and token filtering by URL part.

## Installation
```bash
bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.4.2/elasticsearch-analysis-url-2.3.4.2.zip
bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.4.3/elasticsearch-analysis-url-2.3.4.3.zip
```

## Usage
@@ -112,7 +112,8 @@ Set up your index like so:
"url_host": {
"type": "url",
"part": "host",
"url_decode": true
"url_decode": true,
"tokenize_host": false
}
},
"analyzer": {
pom.xml (2 changes: 1 addition & 1 deletion)
@@ -6,7 +6,7 @@

<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-analysis-url</artifactId>
<version>2.3.4.2</version>
<version>2.3.4.3</version>
<packaging>jar</packaging>
<description>Elasticsearch URL token filter plugin</description>

src/main/java/org/elasticsearch/index/analysis/url/Token.java (60 changes: 60 additions & 0 deletions)
@@ -0,0 +1,60 @@
package org.elasticsearch.index.analysis.url;

import com.google.common.base.Objects;
import org.elasticsearch.index.analysis.URLPart;

/**
* @author Joe Linn
* 8/14/2016
*/
class Token {
private final String token;
private final URLPart part;
private final int start;
private final int end;

public Token(String token, URLPart part, int start, int end) {
this.token = token;
this.part = part;
this.start = start;
this.end = end;
}

public String getToken() {
return token;
}

public URLPart getPart() {
return part;
}

public int getStart() {
return start;
}

public int getEnd() {
return end;
}


@Override
public boolean equals(Object obj) {
if (obj == null || !(obj instanceof Token)) {
return false;
}
Token that = (Token) obj;
return this.start == that.start
&& this.end == that.end
&& Objects.equal(this.token, that.token)
&& Objects.equal(this.part, that.part);
}

@Override
public int hashCode() {
int result = token != null ? token.hashCode() : 0;
result = 31 * result + part.hashCode();
result = 31 * result + start;
result = 31 * result + end;
return result;
}
}
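
Promoting Token to its own top-level class (it was previously a private inner class of URLTokenizer, removed later in this diff) lets URLTokenFilter share the same value object. Below is a minimal sketch, not part of the commit, of the equality contract this class provides; the demo class is hypothetical and sits in the same package because Token is package-private:

```java
package org.elasticsearch.index.analysis.url;

import org.elasticsearch.index.analysis.URLPart;

// Hypothetical demo, not from the repository: tokens are equal only when
// term, URL part, and both offsets all match.
class TokenEqualityDemo {
    public static void main(String[] args) {
        Token host = new Token("foo.bar.com", URLPart.HOST, 11, 22);
        Token same = new Token("foo.bar.com", URLPart.HOST, 11, 22);
        Token shifted = new Token("foo.bar.com", URLPart.HOST, 15, 26);

        System.out.println(host.equals(same));    // true: all four fields match
        System.out.println(host.equals(shifted)); // false: offsets differ
        // equal tokens share a hash code, so duplicates collapse in hash-based sets
        System.out.println(host.hashCode() == same.hashCode()); // true
    }
}
```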
src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java
@@ -6,6 +6,8 @@
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.common.Strings;
import org.elasticsearch.index.analysis.URLPart;

@@ -27,7 +29,7 @@ public final class URLTokenFilter extends TokenFilter {

private List<URLPart> parts;

private final boolean urlDeocde;
private boolean urlDeocde;

/**
* If true, the url's host will be tokenized using a {@link ReversePathHierarchyTokenizer}
@@ -45,15 +47,17 @@ public final class URLTokenFilter extends TokenFilter {
private boolean tokenizeQuery = true;

private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);

private final boolean allowMalformed;

private boolean tokenizeMalformed;

private boolean passthrough;

private List<String> tokens;
private Iterator<String> iterator;
private List<Token> tokens;
private Iterator<Token> iterator;

public URLTokenFilter(TokenStream input, URLPart part) {
this(input, part, false);
@@ -106,6 +110,12 @@ public URLTokenFilter setTokenizeMalformed(boolean tokenizeMalformed) {
return this;
}

public URLTokenFilter setUrlDeocde(boolean urlDeocde) {
this.urlDeocde = urlDeocde;
return this;
}


@Override
public boolean incrementToken() throws IOException {
if (iterator == null || !iterator.hasNext()) {
@@ -114,11 +124,10 @@ public boolean incrementToken() throws IOException {
}
}
clearAttributes();
String next = iterator.next();
if (allowMalformed) {
next = parseMalformed(next);
}
termAttribute.append(next);
Token next = iterator.next();
termAttribute.append(next.getToken());
typeAttribute.setType(next.getPart().name().toLowerCase());
offsetAttribute.setOffset(next.getStart(), next.getEnd());
return true;
}

@@ -139,7 +148,7 @@ private boolean advance() throws IOException {
} catch (IOException e) {
if (e.getMessage().contains("Malformed URL")) {
if (allowMalformed) {
tokens = ImmutableList.of(urlString);
tokens = ImmutableList.of(new Token(urlString, URLPart.WHOLE, 0, urlString.length()));
} else {
throw new MalformedURLException("Malformed URL: " + urlString);
}
@@ -164,8 +173,8 @@
* @return a list of tokens extracted from the input string
* @throws IOException
*/
private List<String> tokenize(String input) throws IOException {
List<String> tokens = new ArrayList<>();
private List<Token> tokenize(String input) throws IOException {
List<Token> tokens = new ArrayList<>();
URLTokenizer tokenizer = new URLTokenizer();
tokenizer.setParts(parts);
tokenizer.setUrlDecode(urlDeocde);
@@ -176,8 +185,15 @@
tokenizer.setTokenizeMalformed(tokenizeMalformed);
tokenizer.setReader(new StringReader(input));
tokenizer.reset();

String term;
URLPart part;
OffsetAttribute offset;
while (tokenizer.incrementToken()) {
tokens.add(tokenizer.getAttribute(CharTermAttribute.class).toString());
term = tokenizer.getAttribute(CharTermAttribute.class).toString();
part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type());
offset = tokenizer.getAttribute(OffsetAttribute.class);
tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset()));
}
return tokens;
}
@@ -198,6 +214,7 @@ public void reset() throws IOException {
* Attempt to parse a malformed url string
* @param urlString the malformed url string
* @return the url part if it can be parsed, null otherwise
* @deprecated parsing of malformed URLs is now delegated to {@link URLTokenizer}
*/
private String parseMalformed(String urlString) {
if (parts != null && !parts.isEmpty()) {
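
After this change, a consumer of URLTokenFilter can read the URL part name from each token's type attribute and the token's position in the original URL from its offset attribute. Below is a minimal sketch, not from the repository, of driving the filter directly; it assumes a Lucene KeywordTokenizer feeding the raw URL to the filter as a single token, and the commented output follows the testFilterHost assertions shown later in this diff:

```java
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.index.analysis.URLPart;
import org.elasticsearch.index.analysis.url.URLTokenFilter;

class URLTokenFilterDemo {
    public static void main(String[] args) throws Exception {
        // Feed the whole URL to the filter as a single token.
        Tokenizer source = new KeywordTokenizer();
        source.setReader(new StringReader("http://www.foo.bar.com:9200/index_name"));
        URLTokenFilter filter = new URLTokenFilter(source, URLPart.HOST);

        CharTermAttribute term = filter.getAttribute(CharTermAttribute.class);
        TypeAttribute type = filter.getAttribute(TypeAttribute.class);
        OffsetAttribute offset = filter.getAttribute(OffsetAttribute.class);

        filter.reset();
        while (filter.incrementToken()) {
            // First token: www.foo.bar.com / host / [7, 22]
            System.out.printf("%s / %s / [%d, %d]%n",
                    term, type.type(), offset.startOffset(), offset.endOffset());
        }
        filter.end();
        filter.close();
    }
}
```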
src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
@@ -1,6 +1,5 @@
package org.elasticsearch.index.analysis.url;

import com.google.common.base.Objects;
import com.google.common.base.Optional;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
@@ -511,47 +510,4 @@ private List<Token> tokenizeSpecial(URL url) {
}


private class Token {
private final String token;
private final URLPart part;
private final int start;
private final int end;

public Token(String token, URLPart part, int start, int end) {
this.token = token;
this.part = part;
this.start = start;
this.end = end;
}

public String getToken() { return token; }

public URLPart getPart() { return part; }

public int getStart() { return start; }

public int getEnd() { return end; }


@Override
public boolean equals(Object obj) {
if (obj == null || !(obj instanceof Token)) {
return false;
}
Token that = (Token) obj;
return this.start == that.start
&& this.end == that.end
&& Objects.equal(this.token, that.token)
&& Objects.equal(this.part, that.part);
}

@Override
public int hashCode() {
int result = token != null ? token.hashCode() : 0;
result = 31 * result + part.hashCode();
result = 31 * result + start;
result = 31 * result + end;
return result;
}
}
}
src/test/java/org/elasticsearch/index/analysis/url/IsTokenizerWithTokenAndPosition.java → IsTokenStreamWithTokenAndPosition.java
@@ -1,7 +1,7 @@
package org.elasticsearch.index.analysis.url;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.hamcrest.Description;
@@ -14,8 +14,8 @@
* Joe Linn
* 8/2/2015
*/
public class IsTokenizerWithTokenAndPosition extends TypeSafeMatcher<Tokenizer> {
private static final Logger log = Logger.getLogger(IsTokenizerWithTokenAndPosition.class);
public class IsTokenStreamWithTokenAndPosition extends TypeSafeMatcher<TokenStream> {
private static final Logger log = Logger.getLogger(IsTokenStreamWithTokenAndPosition.class);

private final String token;
private final int start;
@@ -25,14 +25,14 @@ public class IsTokenizerWithTokenAndPosition extends TypeSafeMatcher<Tokenizer>
private int actualStart;
private int actualEnd;

public IsTokenizerWithTokenAndPosition(String token, int start, int end) {
public IsTokenStreamWithTokenAndPosition(String token, int start, int end) {
this.token = token;
this.start = start;
this.end = end;
}

@Override
protected boolean matchesSafely(Tokenizer tokenizer) {
protected boolean matchesSafely(TokenStream tokenizer) {
CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class);
OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);
try {
@@ -71,7 +71,7 @@ public void describeTo(Description description) {


@Override
protected void describeMismatchSafely(Tokenizer item, Description mismatchDescription) {
protected void describeMismatchSafely(TokenStream item, Description mismatchDescription) {
if(!foundToken){
mismatchDescription.appendText("tokenizer which did not contain token ").appendValue(token);
} else {
@@ -85,7 +85,7 @@ protected void describeMismatchSafely(TokenStream item, Description mismatchDescription) {
}

@Factory
public static IsTokenizerWithTokenAndPosition hasTokenAtOffset(String token, int start, int end) {
return new IsTokenizerWithTokenAndPosition(token, start, end);
public static IsTokenStreamWithTokenAndPosition hasTokenAtOffset(String token, int start, int end) {
return new IsTokenStreamWithTokenAndPosition(token, start, end);
}
}
src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java
@@ -10,6 +10,8 @@
import java.io.IOException;
import java.net.MalformedURLException;

import static org.elasticsearch.index.analysis.url.IsTokenStreamWithTokenAndPosition.hasTokenAtOffset;

public class URLTokenFilterTest extends BaseTokenStreamTestCase {
public static final String TEST_HTTP_URL = "http://www.foo.bar.com:9200/index_name/type_name/_search.html?foo=bar&baz=bat#tag";
public static final String TEST_HTTP_URL2 = "http://www.foo.bar.com";
@@ -27,6 +29,13 @@ public void testFilterProtocol() throws IOException {
@Test
public void testFilterHost() throws IOException {
assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.HOST).setTokenizeHost(false), "www.foo.bar.com");

URLTokenFilter filter = createFilter(TEST_HTTP_URL, URLPart.HOST)
.setUrlDeocde(false);
assertThat(filter, hasTokenAtOffset("www.foo.bar.com", 7, 22));
assertThat(filter, hasTokenAtOffset("foo.bar.com", 11, 22));
assertThat(filter, hasTokenAtOffset("bar.com", 15, 22));
assertThat(filter, hasTokenAtOffset("com", 19, 22));
}

@Test
@@ -77,12 +86,15 @@ public void testInferPort() throws IOException {
@Test
public void testMalformed() throws IOException {
URLTokenFilter filter = createFilter("http://:::::::/baz", URLPart.PROTOCOL, false, true);
filter.setTokenizeMalformed(true);
assertTokenStreamContents(filter, "http");

filter = createFilter("foo.com/bar?baz=bat", URLPart.QUERY, false, true);
filter.setTokenizeMalformed(true);
assertTokenStreamContents(filter, "baz=bat");

filter = createFilter("baz.com:3456/foo", URLPart.PORT, false, true);
filter.setTokenizeMalformed(true);
assertTokenStreamContents(filter, "3456");
}

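
The createFilter helper these tests call is defined elsewhere in the test class and is not part of this diff. Below is a hypothetical reconstruction, under the assumption that URLTokenFilter has a four-argument constructor taking the url-decode and allow-malformed flags in that order:

```java
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.elasticsearch.index.analysis.URLPart;
import org.elasticsearch.index.analysis.url.URLTokenFilter;

class URLTokenFilterTestSupport {
    // Hypothetical shape of the test helper (its real definition is not shown
    // in this diff): wrap the raw URL in a single-token stream, then build the filter.
    static URLTokenFilter createFilter(String url, URLPart part,
                                       boolean urlDecode, boolean allowMalformed) throws IOException {
        Tokenizer source = new KeywordTokenizer();
        source.setReader(new StringReader(url));
        return new URLTokenFilter(source, part, urlDecode, allowMalformed);
    }
}
```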
src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
@@ -12,7 +12,7 @@
import java.util.ArrayList;
import java.util.List;

import static org.elasticsearch.index.analysis.url.IsTokenizerWithTokenAndPosition.hasTokenAtOffset;
import static org.elasticsearch.index.analysis.url.IsTokenStreamWithTokenAndPosition.hasTokenAtOffset;
import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.CoreMatchers.hasItem;

src/test/resources/test-settings.json (3 changes: 2 additions & 1 deletion)
@@ -50,7 +50,8 @@
"url_port_malformed": {
"type": "url",
"part": "port",
"allow_malformed": true
"allow_malformed": true,
"tokenize_malformed": true
},
"url_host_passthrough": {
"type": "url",
