Skip to content

Commit

Permalink
apply fuzzy match if url prefix and regex match
Browse files Browse the repository at this point in the history
even if no groups are captured by the regex

see webrecorder/pywb#524 for some discussion
  • Loading branch information
nlevitt committed Dec 3, 2019
1 parent 86c2c27 commit bdb5849
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 2 deletions.
52 changes: 51 additions & 1 deletion src/outbackcdx/UrlCanonicalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,53 @@ public class UrlCanonicalizer {
private static final Pattern UNDOTTED_IP = Pattern.compile("(?:0x)?[0-9]{1,12}");
static final Pattern DOTTED_IP = Pattern.compile("[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}");

/**
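* A fuzzy-match rule, modelled on pywb's fuzzy match rules (see links below).
* In brief: when a SURT starts with one of the rule's {@code url_prefix}
* values and the {@code fuzzy_lookup} regex is found in it, the SURT is
* replaced by a {@code fuzzy:} key, so that URLs differing only in the parts
* the rule ignores canonicalize to the same key. The regex only has to match;
* it does not have to capture any groups.
*
* <p>TODO explain in detail how these rules work.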
*
* <ul>
* <li><a href="https://github.com/webrecorder/pywb/wiki/Fuzzy-Match-Rules">pywb wiki: Fuzzy Match Rules</a>
* <li><a href="https://github.com/webrecorder/pywb/blob/5f938e68797/pywb/warcserver/index/fuzzymatcher.py">pywb fuzzymatcher.py</a>
* </ul>
*
* <pre>rules:
* - url_prefix: 'com,twitter)/i/profiles/show/'
* fuzzy_lookup: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
* - url_prefix: 'com,twitter)/i/timeline'
* fuzzy_lookup:
* - max_position
* - include_entities
* - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/photoviewerpagelet'
* fuzzy_lookup:
* match: '("(?:cursor|cursorindex)":["\d\w]+)'
* find_all: true
* - url_prefix: 'com,staticflickr,'
* fuzzy_lookup:
* match: '([0-9]+_[a-z0-9]+).*?.jpg'
* replace: '/'
* # replace: 'staticflickr,'
* - url_prefix: ['com,yimg,l)/g/combo', 'com,yimg,s)/pw/combo', 'com,yahooapis,yui)/combo']
* fuzzy_lookup: '([^/]+(?:\.css|\.js))'
* - url_prefix: 'com,vimeo,av)/'
* # only use non query part of url, ignore query
* fuzzy_lookup: '()'
* - url_prefix: 'com,googlevideo,'
* fuzzy_lookup:
* match:
* regex: 'com,googlevideo.*&#47;videoplayback.*'
* args:
* - id
* - itag
* #- mime
* filter:
* - 'urlkey:{0}'
* - '!mimetype:text/plain'
* type: 'domain'
* - url_prefix: com,example,zuh)/
* fuzzy_lookup: '[&?](?:.*)'
* </pre>
*
* @author nlevitt
*/
public static class FuzzyRule {
final List<String> urlPrefixes;
final Pattern pattern;
Expand Down Expand Up @@ -129,12 +176,15 @@ public String apply(String surt) {
if (surt.startsWith(prefix)) {
Matcher m = pattern.matcher(surt);
List<String> groups = new ArrayList<String>();
boolean regexMatches = false;
if (findAll) {
while (m.find()) {
regexMatches = true;
groups.add(m.group());
}
} else {
if (m.find()) {
regexMatches = true;
for (int i = 1; i <= m.groupCount(); i++) {
if (m.group(i) != null) {
groups.add(m.group(i));
Expand All @@ -143,7 +193,7 @@ public String apply(String surt) {
}
}

if (!groups.isEmpty()) {
if (regexMatches) {
int replaceAfterIndex = surt.indexOf(replaceAfter);
String pref;
if (isDomain) {
Expand Down
17 changes: 16 additions & 1 deletion test/outbackcdx/UrlCanonicalizerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ public void initFuzz() throws UnsupportedEncodingException, ConfigurationExcepti
" - 'urlkey:{0}'\n" +
" - '!mimetype:text/plain'\n" +
" type: 'domain'\n" +
"- url_prefix: com,example,zuh)/\n" +
" fuzzy_lookup: '[&?](?:.*)'\n" +
"";

fuzzCanon = new UrlCanonicalizer(new ByteArrayInputStream(yaml.getBytes("UTF-8")));
Expand Down Expand Up @@ -188,11 +190,18 @@ public void testFuzzCanon() {
assertEquals(
fuzzCanon.surtCanonicalize("http://o-o.preferred.nuq04t11.v3.cache1.googlevideo.com/videoplayback?id=1c98fe7da5ffb404&itag=5&app=blogger&ip=0.0.0.0&ipbits=0&expire=1335344084&sparams=id,itag,ip,ipbits,expire&signature=5371654FF54A9C169F2F42334235D096F41053A7.448A800D1DED819ED5C476E29BA69F38FEE48B26&key=ck1&redirect_counter=2&cms_options=map=ts_be&cms_redirect=yes"),
"fuzzy:com,googlevideo,?id=1c98fe7da5ffb404&itag=5");

assertEquals(
fuzzCanon.surtCanonicalize("http://zuh.example.com/?some=query&params"),
fuzzCanon.surtCanonicalize("http://zuh.example.com/?some=other&query=params"));
assertEquals(
fuzzCanon.surtCanonicalize("http://zuh.example.com/?some=query&params"),
"fuzzy:com,example,zuh)/?");
}

@Test
public void testFuzzConfig() {
assertEquals(fuzzCanon.fuzzyRules.size(), 7);
assertEquals(fuzzCanon.fuzzyRules.size(), 8);

assertEquals(fuzzCanon.fuzzyRules.get(0).urlPrefixes, Arrays.asList("com,twitter)/i/profiles/show/"));
assertEquals(fuzzCanon.fuzzyRules.get(0).pattern.pattern(), "/profiles/show/.*with_replies\\?.*(max_id=[^&]+)");
Expand Down Expand Up @@ -235,5 +244,11 @@ public void testFuzzConfig() {
assertEquals(fuzzCanon.fuzzyRules.get(6).replaceAfter, "?");
assertEquals(fuzzCanon.fuzzyRules.get(6).findAll, false);
assertEquals(fuzzCanon.fuzzyRules.get(6).isDomain, true);

assertEquals(fuzzCanon.fuzzyRules.get(7).urlPrefixes, Arrays.asList("com,example,zuh)/"));
assertEquals(fuzzCanon.fuzzyRules.get(7).pattern.pattern(), "[&?](?:.*)");
assertEquals(fuzzCanon.fuzzyRules.get(7).replaceAfter, "?");
assertEquals(fuzzCanon.fuzzyRules.get(7).findAll, false);
assertEquals(fuzzCanon.fuzzyRules.get(7).isDomain, false);
}
}

0 comments on commit bdb5849

Please sign in to comment.