From 03d775f2edc6fdd0744af76b8bde585e5b6a427a Mon Sep 17 00:00:00 2001 From: jlinn Date: Tue, 28 Jul 2015 16:26:43 -0700 Subject: [PATCH] Stop attempting to parse tokens if parsing has already succeeded --- README.md | 4 ++-- pom.xml | 2 +- .../index/analysis/url/URLTokenFilter.java | 12 +++++++++++- .../analysis/url/URLTokenFilterIntegrationTest.java | 1 + 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c933e06..7a351b2 100644 --- a/README.md +++ b/README.md @@ -9,13 +9,13 @@ This plugin enables URL token filtering by URL part. | Elasticsearch Version | Plugin Version | |-----------------------|----------------| -| 1.6.0 | 1.2.0 | +| 1.6.0 | 1.2.1 | | 1.5.2 | 1.1.0 | | 1.4.2 | 1.0.0 | ## Installation ```bash -bin/plugin --install analysis-url --url https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v1.1.0/elasticsearch-analysis-url-1.1.0.zip +bin/plugin --install analysis-url --url https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v1.2.1/elasticsearch-analysis-url-1.2.1.zip ``` ## Usage diff --git a/pom.xml b/pom.xml index a8a73c3..606c183 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ org.elasticsearch elasticsearch-analysis-url - 1.2.0 + 1.2.1 jar Elasticsearch URL token filter plugin diff --git a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java index e939444..37a0d57 100644 --- a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java +++ b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java @@ -28,6 +28,8 @@ public final class URLTokenFilter extends TokenFilter { private final boolean allowMalformed; + private boolean parsed; + public URLTokenFilter(TokenStream input, URLPart part) { this(input, part, false); } @@ -45,7 +47,7 @@ public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolea @Override public boolean incrementToken() throws IOException { - if (input.incrementToken()) { + if (input.incrementToken() && !parsed) { final String urlString = termAttribute.toString(); termAttribute.setEmpty(); if (Strings.isNullOrEmpty(urlString) || urlString.equals("null")) { @@ -77,12 +79,14 @@ public boolean incrementToken() throws IOException { default: partString = url.toString(); } + parsed = !Strings.isNullOrEmpty(partString); } catch (MalformedURLException e) { if (allowMalformed) { partString = parseMalformed(urlString); if (Strings.isNullOrEmpty(partString)) { return false; } + parsed = true; } else { throw e; } @@ -96,6 +100,12 @@ public boolean incrementToken() throws IOException { return false; } + @Override + public void reset() throws IOException { + super.reset(); + parsed = false; + } + private static final Pattern REGEX_PROTOCOL = Pattern.compile("^([a-zA-Z]+)(?=://)"); private static final Pattern REGEX_PORT = Pattern.compile(":([0-9]{1,5})"); private static final Pattern REGEX_QUERY = Pattern.compile("\\?(.+)"); diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterIntegrationTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterIntegrationTest.java index f618113..02e4a03 100644 --- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterIntegrationTest.java +++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterIntegrationTest.java @@ -64,6 +64,7 @@ public void testEmptyString() { @Test public void testUrlDecode() { assertURLAnalyzesTo("https://foo.bar.com?email=foo%40bar.com", "url_query", "email=foo@bar.com"); + assertURLAnalyzesTo("https://ssl.google-analytics.com/r/__utm.gif?utmwv=5.6.4&utms=1&utmn=1031590447&utmhn=www.linkedin.com&utmcs=-&utmsr=1024x768&utmvp=1256x2417&utmsc=24-bit&utmul=en-us&utmje=1&utmfl=-&utmdt=Wells%20Fargo%20Capital%20Finance%20%7C%20LinkedIn&utmhid=735221740&utmr=http%3A%2F%2Fwww.google.com%2Fsearch%3Fq%3Dsite%253Alinkedin.com%2Bwells%2Bfargo%26rls%3Dcom.microsoft%3Aen-us%26ie%3DUTF-8%26oe%3DUTF-8%26startIndex%3D%26startPage%3D1&utmp=biz-overview-public&utmht=1428449620694&utmac=UA-3242811-1&utmcc=__utma%3D23068709.1484257758.1428449621.1428449621.1428449621.1%3B%2B__utmz%3D23068709.1428449621.1.1.utmcsr%3Dgoogle%7Cutmccn%3D(organic)%7Cutmcmd%3Dorganic%7Cutmctr%3Dsite%253Alinkedin.com%2520wells%2520fargo%3B&utmjid=1336170366&utmredir=1&utmu=qBCAAAAAAAAAAAAAAAAAAAAE~", "url_port", "443"); } @Test