Skip to content
This repository has been archived by the owner on Aug 2, 2024. It is now read-only.

Commit

Permalink
memento-fix: fix for ukwa/ukwa-pywb#37.
Browse files Browse the repository at this point in the history
- support memento timegate on top-frame (when no timestamp is provided)
- treat top-frame no-timestamp url as canonical timegate
- tests: update tests, add memento redirect mode tests for timegate, timegate with accept-dt header
  • Loading branch information
ikreymer committed Feb 14, 2019
1 parent 623f0da commit 791a706
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 15 deletions.
29 changes: 17 additions & 12 deletions pywb/apps/rewriterapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,13 +281,16 @@ def render_content(self, wb_url, kwargs, environ):
headers=headers)
else:
wb_url.mod = pref_mod

else:
response = self.handle_custom_response(environ, wb_url,
full_prefix, host_prefix,
kwargs)

if response:
return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)
# don't return top-frame response for timegate with exact redirects
if not is_timegate or not redirect_to_exact:
return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)

if is_proxy:
environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
Expand Down Expand Up @@ -376,15 +379,14 @@ def render_content(self, wb_url, kwargs, environ):
if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
set_content_loc = True

# if redir to exact, redir if url or ts are different
if redirect_to_exact:
if (set_content_loc or
(wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):

# if redirect to exact timestamp, but only if not live
if redirect_to_exact and not cdx.get('is_live'):
if set_content_loc or is_timegate or wb_url.timestamp != cdx.get('timestamp'):
new_url = urlrewriter.get_new_url(url=target_uri,
timestamp=cdx['timestamp'],
mod=wb_url.mod)


resp = WbResponse.redir_response(new_url, '307 Temporary Redirect')
if self.enable_memento:
if is_timegate and not is_proxy:
Expand All @@ -393,7 +395,8 @@ def render_content(self, wb_url, kwargs, environ):
resp.status_headers,
is_timegate, is_proxy,
pref_applied=pref_applied,
mod=pref_mod)
mod=pref_mod,
is_memento=False)

else:
resp.status_headers['Link'] = MementoUtils.make_link(target_uri, 'original')
Expand Down Expand Up @@ -478,21 +481,22 @@ def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy):

def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
status_headers, is_timegate, is_proxy, coll=None,
pref_applied=None, mod=None):
pref_applied=None, mod=None, is_memento=True):

mod = mod or self.replay_mod
replay_mod = mod or self.replay_mod

# memento url + header
if not memento_dt and memento_ts:
memento_dt = timestamp_to_http_date(memento_ts)

if memento_dt:
status_headers.headers.append(('Memento-Datetime', memento_dt))
if is_memento:
status_headers.headers.append(('Memento-Datetime', memento_dt))

if is_proxy:
memento_url = url
else:
memento_url = full_prefix + memento_ts + mod
memento_url = full_prefix + memento_ts + replay_mod
memento_url += '/' + url
else:
memento_url = None
Expand Down Expand Up @@ -526,6 +530,7 @@ def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
def _get_timegate_timemap(self, url, full_prefix, mod):
# timegate url
timegate_url = full_prefix
mod = ''
if mod:
timegate_url += mod + '/'

Expand Down Expand Up @@ -620,7 +625,7 @@ def make_timemap(self, wb_url, res, full_prefix, output):
status = str(res.status_code) + ' ' + res.reason

if res.status_code == 200 and output == 'link':
timegate, timemap = self._get_timegate_timemap(wb_url.url, full_prefix, self.replay_mod)
timegate, timemap = self._get_timegate_timemap(wb_url.url, full_prefix, wb_url.mod)

text = MementoUtils.wrap_timemap_header(wb_url.url,
timegate,
Expand Down
78 changes: 75 additions & 3 deletions tests/test_memento.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def _assert_memento(self, resp, url, ts, fmod, dt=''):
assert resp.headers['Content-Location'] in memento_link

# timegate link
assert self.make_timegate_link(url, fmod) in links
assert self.make_timegate_link(url, '') in links

# timemap link
assert self.make_timemap_link(url) in links
Expand All @@ -57,7 +57,7 @@ def test_memento_top_frame(self):
assert self.make_memento_link(url, '20140127171238', dt, 'mp_', include_coll=False) in links

#timegate link
assert self.make_timegate_link(url, 'mp_') in links
assert self.make_timegate_link(url, '') in links

# Body
assert '"20140127171238"' in resp.text
Expand Down Expand Up @@ -129,7 +129,7 @@ def test_timemap(self):

exp = """\
<http://localhost:80/pywb/timemap/link/http://example.com?example=1>; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT",
<http://localhost:80/pywb/mp_/http://example.com?example=1>; rel="timegate",
<http://localhost:80/pywb/http://example.com?example=1>; rel="timegate",
<http://example.com?example=1>; rel="original",
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb",
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb"
Expand Down Expand Up @@ -183,3 +183,75 @@ def test_error_bad_accept_datetime(self):
assert resp.status_int == 400


# ============================================================================
class TestMementoRedirectClassic(MementoMixin, BaseConfigTest):
    """Memento timegate tests run against 'config_test_redirect_classic.yaml'.

    The timegate tests request a top-frame URL without a timestamp and expect
    a 307 redirect to a timestamped URL that carries memento ``Link`` headers
    but no ``Memento-Datetime`` header.

    NOTE(review): ``fmod`` is requested by several tests but never read in
    their bodies; presumably it is a pytest fixture declared elsewhere. It is
    kept in the signatures so test collection is unchanged -- confirm.
    """

    @classmethod
    def setup_class(cls):
        """Forward the redirect-classic config file name to the parent setup."""
        config_name = 'config_test_redirect_classic.yaml'
        super(TestMementoRedirectClassic, cls).setup_class(config_name)

    def _check_top_frame_timegate(self, ts, dt, headers=None):
        """Request the timestamp-less iana.org URL and verify the redirect.

        :param ts: 14-digit timestamp expected in the redirect ``Location``,
                   in the memento link and in the replayed body
        :param dt: HTTP date string expected in the memento link
        :param headers: optional request headers (e.g. ``Accept-Datetime``)
        """
        url = 'http://www.iana.org/'

        redirect = self.testapp.get('/pywb/http://www.iana.org/', headers=headers)
        assert redirect.status_code == 307
        assert redirect.headers['Location'].endswith('/' + ts + '/http://www.iana.org/')
        assert redirect.headers['Link'] != ''

        # Memento headers: the redirect varies on the request but is not
        # itself a memento, so it must not carry a Memento-Datetime header
        assert VARY in redirect.headers
        assert MEMENTO_DATETIME not in redirect.headers

        link_values = self.get_links(redirect)

        # memento link
        assert self.make_memento_link(url, ts, dt, 'mp_', include_coll=False) in link_values

        # timegate link (no modifier in the timegate url)
        assert self.make_timegate_link(url, '') in link_values

        replay = redirect.follow()

        # Body
        assert '"' + ts + '"' in replay.text
        assert '"http://www.iana.org/"' in replay.text, replay.text

    def test_memento_top_frame_timegate(self, fmod):
        """Timegate request without Accept-Datetime redirects to 20140127171238."""
        self._check_top_frame_timegate('20140127171238',
                                       'Mon, 27 Jan 2014 17:12:38 GMT')

    def test_memento_top_frame_timegate_accept_dt(self, fmod):
        """Timegate request with Accept-Datetime redirects to the matching capture."""
        accept_dt = 'Sun, 26 Jan 2014 20:06:24 GMT'
        self._check_top_frame_timegate('20140126200624',
                                       accept_dt,
                                       headers={'Accept-Datetime': accept_dt})

    def test_memento_not_time_gate(self, fmod):
        """A URL that already has a timestamp segment is answered with 200."""
        request_headers = {'Accept-Datetime': 'Sun, 26 Jan 2014 20:06:24 GMT'}
        result = self.testapp.get('/pywb/2/http://www.iana.org/',
                                  headers=request_headers)
        assert result.status_code == 200

    def test_timegate_error_not_found(self):
        """Timegate request for a URL expected to be missing yields 404."""
        result = self.testapp.get('/pywb/http://example.com/x-not-found', status=404)
        assert result.status_code == 404

0 comments on commit 791a706

Please sign in to comment.