diff --git a/src/main/java/org/archive/url/IAURLCanonicalizer.java b/src/main/java/org/archive/url/IAURLCanonicalizer.java index 0cf7c8a4..2455f89b 100644 --- a/src/main/java/org/archive/url/IAURLCanonicalizer.java +++ b/src/main/java/org/archive/url/IAURLCanonicalizer.java @@ -142,15 +142,11 @@ public int compare(StringTuple o1, StringTuple o2) { } - public static final Pattern WWWN_PATTERN = Pattern.compile("^www\\d*\\."); + public static final Pattern WWWN_PATTERN = Pattern.compile("(^www\\d*\\.).+\\."); public static String massageHost(String host) { - while(true) { - Matcher m = WWWN_PATTERN.matcher(host); - if(m.find()) { - host = host.substring(m.group(0).length()); - } else { - break; - } + Matcher m = WWWN_PATTERN.matcher(host); + if(m.find()) { + host = host.substring(m.group(1).length()); } return host; } diff --git a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java index c21bcbe8..4313e199 100644 --- a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java +++ b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java @@ -33,7 +33,7 @@ public void testGetHex() { assertEquals(14,guc.getHex('E')); assertEquals(15,guc.getHex('F')); assertEquals(-1,guc.getHex('G')); - assertEquals(-1,guc.getHex('G')); + assertEquals(-1,guc.getHex('g')); assertEquals(-1,guc.getHex('q')); assertEquals(-1,guc.getHex(' ')); } diff --git a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java index e2c46258..da669704 100644 --- a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java +++ b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java @@ -42,9 +42,12 @@ public void testAlphaReorderQuery() { } public void testMassageHost() { + assertEquals("www.com",IAURLCanonicalizer.massageHost("www.com")); + assertEquals("www3288.com",IAURLCanonicalizer.massageHost("www3288.com")); assertEquals("foo.com",IAURLCanonicalizer.massageHost("foo.com")); assertEquals("foo.com",IAURLCanonicalizer.massageHost("www.foo.com")); assertEquals("foo.com",IAURLCanonicalizer.massageHost("www12.foo.com")); + assertEquals("www.foo.com",IAURLCanonicalizer.massageHost("www12.www.foo.com")); assertEquals("www2foo.com",IAURLCanonicalizer.massageHost("www2foo.com")); assertEquals("www2foo.com",IAURLCanonicalizer.massageHost("www2.www2foo.com")); }