Fix S3 sync issue with keys containing urlencoded values
Fixes aws#749.  This was a regression from the fix for aws#675,
where we use an encoding_type of "url" to work around
the stdlib xmlparser not handling new lines.

The problem is that pagination in S3 uses the last key name as
the marker, and because the keys are returned urlencoded, we
need to urldecode them so that botocore sends the correct next
marker.  In the case where urldecoded(key) != key, we would
incorrectly sync files as if they were new.
jamesls committed Apr 17, 2014
1 parent cbacc26 commit 32ac411
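
To make the failure concrete, here is a minimal sketch of the encoding mismatch (the key name is made up, and `urllib.parse.unquote_plus` stands in for awscli's `unquote_str`, which handles `+` and percent-escapes the same way):

```python
from urllib.parse import unquote_plus  # stand-in for awscli's unquote_str

# With encoding_type='url', ListObjects returns keys percent-encoded and
# with spaces encoded as '+'.  The real object key is the decoded form.
encoded_key = 'foo+%2B0001'           # as it appears in the XML response
real_key = unquote_plus(encoded_key)  # 'foo +0001', the key S3 stores

# The paginator takes the next marker from the parsed response.  If the
# response still holds encoded_key, the marker sorts differently from the
# real key, listing resumes at the wrong position, and sync re-copies
# files it already transferred.
assert real_key != encoded_key
```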
Showing 3 changed files with 66 additions and 11 deletions.
41 changes: 33 additions & 8 deletions awscli/customizations/s3/utils.py
@@ -302,14 +302,39 @@ def list_objects(self, bucket, prefix=None):
         kwargs = {'bucket': bucket, 'encoding_type': 'url'}
         if prefix is not None:
             kwargs['prefix'] = prefix
-        pages = self._operation.paginate(self._endpoint, **kwargs)
-        for response, page in pages:
-            contents = page['Contents']
-            for content in contents:
-                source_path = bucket + '/' + unquote_str(content['Key'])
-                size = content['Size']
-                last_update = self._date_parser(content['LastModified'])
-                yield source_path, size, last_update
+        # This event handler is needed because we use encoding_type url and
+        # we're paginating.  The pagination token is the last Key of the
+        # Contents list.  However, botocore does not know that the encoding
+        # type needs to be urldecoded.
+        with ScopedEventHandler(self._operation.session, 'after-call.s3.ListObjects',
+                                self._decode_keys):
+            pages = self._operation.paginate(self._endpoint, **kwargs)
+            for response, page in pages:
+                contents = page['Contents']
+                for content in contents:
+                    source_path = bucket + '/' + content['Key']
+                    size = content['Size']
+                    last_update = self._date_parser(content['LastModified'])
+                    yield source_path, size, last_update
+
+    def _decode_keys(self, parsed, **kwargs):
+        for content in parsed['Contents']:
+            content['Key'] = unquote_str(content['Key'])
+
+
+class ScopedEventHandler(object):
+    """Register an event callback for the duration of a scope."""
+
+    def __init__(self, session, event_name, handler):
+        self._session = session
+        self._event_name = event_name
+        self._handler = handler
+
+    def __enter__(self):
+        self._session.register(self._event_name, self._handler)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._session.unregister(self._event_name, self._handler)
+
+
 IORequest = namedtuple('IORequest', ['filename', 'offset', 'data'])
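For reference, a small sketch of how `ScopedEventHandler` behaves. It only calls `register`/`unregister` on the object it is given, so botocore's `HierarchicalEmitter` serves as a stand-in for the session here, exactly as the unit tests below do:

```python
from botocore.hooks import HierarchicalEmitter
from awscli.customizations.s3.utils import ScopedEventHandler

emitter = HierarchicalEmitter()
seen = []

def record(parsed, **kwargs):
    seen.append(parsed)

with ScopedEventHandler(emitter, 'after-call.s3.ListObjects', record):
    # Registered inside the block: this emit reaches the handler.
    emitter.emit('after-call.s3.ListObjects', parsed={'Contents': []})

# Unregistered on exit: this emit goes unseen.
emitter.emit('after-call.s3.ListObjects', parsed={'Contents': []})
assert len(seen) == 1
```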
17 changes: 17 additions & 0 deletions tests/integration/customizations/s3/test_plugin.py
@@ -481,6 +481,23 @@ def test_sync_with_delete_option_with_same_prefix(self):
         self.assertNotIn('delete:', p.stdout)
         self.assertEqual('', p.stdout)
 
+    def test_sync_with_plus_chars(self):
+        # 1. Create > 1000 files with '+' in the filename.
+        # 2. Sync up to s3.
+        # 3. Sync down from s3.
+        # 4. Verify nothing was synced back down from s3.
+        bucket_name = self.create_bucket()
+        for i in range(1010):
+            # Create a file with a space char and a '+' char in the filename.
+            self.files.create_file('foo +%04d' % i, contents='')
+        p = aws('s3 sync %s s3://%s/' % (self.files.rootdir, bucket_name))
+        self.assert_no_errors(p)
+        p = aws('s3 sync s3://%s/ %s' % (bucket_name, self.files.rootdir))
+        self.assertNotIn('download:', p.stdout)
+        self.assertNotIn('delete:', p.stdout)
+        self.assertEqual('', p.stdout)
+        self.assert_no_errors(p)
+
 
 @unittest.skipIf(platform.system() not in ['Darwin', 'Linux'],
                  'Symlink tests only supported on mac/linux')
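(Note: ListObjects returns at most 1,000 keys per page, so the 1,010 files guarantee at least two pages and therefore a pagination marker derived from a key containing '+'. Before this fix, the second sync would incorrectly re-download those files.)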
19 changes: 16 additions & 3 deletions tests/unit/customizations/s3/test_utils.py
@@ -6,6 +6,7 @@
 
 import mock
 
+from botocore.hooks import HierarchicalEmitter
 from awscli.customizations.s3.utils import find_bucket_key, find_chunksize
 from awscli.customizations.s3.utils import ReadFileChunk
 from awscli.customizations.s3.utils import relative_path
@@ -196,13 +197,23 @@ def test_priority_attr_is_missing(self):
 class TestBucketList(unittest.TestCase):
     def setUp(self):
         self.operation = mock.Mock()
+        self.emitter = HierarchicalEmitter()
+        self.operation.session.register = self.emitter.register
+        self.operation.session.unregister = self.emitter.unregister
         self.endpoint = mock.sentinel.endpoint
         self.date_parser = mock.Mock()
         self.date_parser.return_value = mock.sentinel.now
+        self.responses = []
+
+    def fake_paginate(self, *args, **kwargs):
+        for response in self.responses:
+            self.emitter.emit('after-call.s3.ListObjects', parsed=response[1])
+        return self.responses
 
     def test_list_objects(self):
         now = mock.sentinel.now
-        self.operation.paginate.return_value = [
+        self.operation.paginate = self.fake_paginate
+        self.responses = [
             (None, {'Contents': [
                 {'LastModified': '2014-02-27T04:20:38.000Z',
                  'Key': 'a', 'Size': 1},
@@ -224,7 +235,8 @@ def test_urlencoded_keys(self):
         # them before yielding them.  For example, note the %0D
         # in bar.txt:
         now = mock.sentinel.now
-        self.operation.paginate.return_value = [
+        self.operation.paginate = self.fake_paginate
+        self.responses = [
             (None, {'Contents': [
                 {'LastModified': '2014-02-27T04:20:38.000Z',
                  'Key': 'bar%0D.txt', 'Size': 1}]}),
@@ -236,7 +248,8 @@ def test_urlencoded_with_unicode_keys(self):
 
     def test_urlencoded_with_unicode_keys(self):
         now = mock.sentinel.now
-        self.operation.paginate.return_value = [
+        self.operation.paginate = self.fake_paginate
+        self.responses = [
             (None, {'Contents': [
                 {'LastModified': '2014-02-27T04:20:38.000Z',
                  'Key': '%E2%9C%93', 'Size': 1}]}),
