From 8d97ecb7d599f977d0b45fa3c716e1cf887fd33d Mon Sep 17 00:00:00 2001
From: Tom Morris
Date: Mon, 28 Aug 2023 14:42:42 -0400
Subject: [PATCH] Don't remove www as 2nd level domain. Fixes #29

Requires at least two dots (.) in domain name before removing leading www\d*
---
 src/main/java/org/archive/url/IAURLCanonicalizer.java     | 4 ++--
 src/test/java/org/archive/url/IAURLCanonicalizerTest.java | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/archive/url/IAURLCanonicalizer.java b/src/main/java/org/archive/url/IAURLCanonicalizer.java
index 0cf7c8a4..740ea232 100644
--- a/src/main/java/org/archive/url/IAURLCanonicalizer.java
+++ b/src/main/java/org/archive/url/IAURLCanonicalizer.java
@@ -142,12 +142,12 @@ public int compare(StringTuple o1, StringTuple o2) {
 
 	}
 
-	public static final Pattern WWWN_PATTERN = Pattern.compile("^www\\d*\\.");
+	public static final Pattern WWWN_PATTERN = Pattern.compile("(^www\\d*\\.).+\\.");
 	public static String massageHost(String host) {
 		while(true) {
 			Matcher m = WWWN_PATTERN.matcher(host);
 			if(m.find()) {
-				host = host.substring(m.group(0).length());
+				host = host.substring(m.group(1).length());
 			} else {
 				break;
 			}
diff --git a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java
index e2c46258..771858ca 100644
--- a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java
+++ b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java
@@ -42,9 +42,12 @@ public void testAlphaReorderQuery() {
 	}
 
 	public void testMassageHost() {
+		assertEquals("www.com",IAURLCanonicalizer.massageHost("www.com"));
+		assertEquals("www3288.com",IAURLCanonicalizer.massageHost("www3288.com"));
 		assertEquals("foo.com",IAURLCanonicalizer.massageHost("foo.com"));
 		assertEquals("foo.com",IAURLCanonicalizer.massageHost("www.foo.com"));
 		assertEquals("foo.com",IAURLCanonicalizer.massageHost("www12.foo.com"));
+		assertEquals("foo.com",IAURLCanonicalizer.massageHost("www12.www.foo.com"));
 		assertEquals("www2foo.com",IAURLCanonicalizer.massageHost("www2foo.com"));
 		assertEquals("www2foo.com",IAURLCanonicalizer.massageHost("www2.www2foo.com"));
 	}