Skip to content

Commit

Permalink
Don't remove www as 2nd level domain. Fixes commoncrawl#29
Browse files (browse the repository at this point in the history)
Requires at least two dots (.) in domain name before removing
leading www\d*
  • Loading branch information
tfmorris committed Aug 28, 2023
1 parent 0d273fb commit 8d97ecb
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 2 deletions.
4 changes: 2 additions & 2 deletions src/main/java/org/archive/url/IAURLCanonicalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -142,12 +142,12 @@ public int compare(StringTuple o1, StringTuple o2) {
}


// Diff context (presumably the removed side of this change): this version matches any
// leading "www<digits>." label, so find() also succeeds on a host such as "www.com".
public static final Pattern WWWN_PATTERN = Pattern.compile("^www\\d*\\.");
// Replacement version: group 1 captures the leading "www<digits>." label, and the
// trailing ".+\\." additionally requires at least one more character followed by
// another dot, so the pattern only matches hosts that contain at least two dots.
public static final Pattern WWWN_PATTERN = Pattern.compile("(^www\\d*\\.).+\\.");
public static String massageHost(String host) {
while(true) {
Matcher m = WWWN_PATTERN.matcher(host);
if(m.find()) {
host = host.substring(m.group(0).length());
host = host.substring(m.group(1).length());
} else {
break;
}
Expand Down
3 changes: 3 additions & 0 deletions src/test/java/org/archive/url/IAURLCanonicalizerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,12 @@ public void testAlphaReorderQuery() {
}

/**
 * Exercises IAURLCanonicalizer.massageHost with hosts that do and do not
 * start with a "www" / "www<digits>" label, pinning the expected result of each.
 */
public void testMassageHost() {
// "www" or "www<digits>" directly in front of the last label: expected unchanged.
assertEquals("www.com",IAURLCanonicalizer.massageHost("www.com"));
assertEquals("www3288.com",IAURLCanonicalizer.massageHost("www3288.com"));
// No www prefix at all: expected unchanged.
assertEquals("foo.com",IAURLCanonicalizer.massageHost("foo.com"));
// A single leading "www." or "www<digits>." label is expected to be stripped.
assertEquals("foo.com",IAURLCanonicalizer.massageHost("www.foo.com"));
assertEquals("foo.com",IAURLCanonicalizer.massageHost("www12.foo.com"));
// Stacked prefixes are expected to be stripped one after another.
assertEquals("foo.com",IAURLCanonicalizer.massageHost("www12.www.foo.com"));
// "www2" not followed by a dot is part of the label itself: expected unchanged.
assertEquals("www2foo.com",IAURLCanonicalizer.massageHost("www2foo.com"));
// Only the separate leading "www2." label goes; the "www2foo" label stays intact.
assertEquals("www2foo.com",IAURLCanonicalizer.massageHost("www2.www2foo.com"));
}
Expand Down

0 comments on commit 8d97ecb

Please sign in to comment.