Fix S3 sync issue with keys containing urlencoded values
Fixes aws#749.  This was a regression from the fix for aws#675,
where we use the encoding_type of "url" to work around
the stdlib xmlparser not handling new lines.

The problem is that pagination in S3 uses the last key name as
the marker, and because the keys are returned urlencoded, we
need to urldecode the keys so botocore sends the correct next
marker.  In the case where urldecoded(key) != key, we would
incorrectly treat already-synced files as new and sync them again.

Also added an integ test for syncing with '+' chars.
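
To make the marker problem concrete, here is a minimal illustrative sketch of the decode step the fix adds. It is not the real botocore event API: plain urllib unquoting stands in for the CLI's unquote_str helper, and a hand-built page dict stands in for the parsed ListObjects response the after-call handler receives.

    # Keys come back url-encoded because the request asks for encoding_type='url';
    # they must be decoded in place before the paginator reads the last Key of
    # Contents to build the next Marker.
    from urllib.parse import unquote  # stand-in for awscli's unquote_str

    def decode_keys(parsed, **kwargs):
        for content in parsed['Contents']:
            content['Key'] = unquote(content['Key'])

    # Hypothetical parsed page for a key named 'foo +000001':
    page = {'Contents': [{'Key': 'foo%20%2B000001', 'Size': 0}]}
    decode_keys(page)
    assert page['Contents'][-1]['Key'] == 'foo +000001'  # the correct next marker

Without the decode, the encoded value would be fed back as the marker whenever urldecoded(key) != key, which is exactly what the new integration test guards against: a second sync of the same files should upload nothing.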
jamesls committed Apr 17, 2014
1 parent cbacc26 commit 1236dd2
Showing 4 changed files with 113 additions and 36 deletions.
41 changes: 33 additions & 8 deletions awscli/customizations/s3/utils.py
@@ -302,14 +302,39 @@ def list_objects(self, bucket, prefix=None):
        kwargs = {'bucket': bucket, 'encoding_type': 'url'}
        if prefix is not None:
            kwargs['prefix'] = prefix
        pages = self._operation.paginate(self._endpoint, **kwargs)
        for response, page in pages:
            contents = page['Contents']
            for content in contents:
                source_path = bucket + '/' + unquote_str(content['Key'])
                size = content['Size']
                last_update = self._date_parser(content['LastModified'])
                yield source_path, size, last_update
        # This event handler is needed because we use encoding_type url and
        # we're paginating. The pagination token is the last Key of the
        # Contents list. However, botocore does not know that the keys need
        # to be urldecoded before they are used as the next marker.
        with ScopedEventHandler(self._operation.session, 'after-call.s3.ListObjects',
                                self._decode_keys):
            pages = self._operation.paginate(self._endpoint, **kwargs)
            for response, page in pages:
                contents = page['Contents']
                for content in contents:
                    source_path = bucket + '/' + content['Key']
                    size = content['Size']
                    last_update = self._date_parser(content['LastModified'])
                    yield source_path, size, last_update

    def _decode_keys(self, parsed, **kwargs):
        for content in parsed['Contents']:
            content['Key'] = unquote_str(content['Key'])


class ScopedEventHandler(object):
    """Register an event callback for the duration of a scope."""

    def __init__(self, session, event_name, handler):
        self._session = session
        self._event_name = event_name
        self._handler = handler

    def __enter__(self):
        self._session.register(self._event_name, self._handler)

    def __exit__(self, exc_type, exc_value, traceback):
        self._session.unregister(self._event_name, self._handler)


IORequest = namedtuple('IORequest', ['filename', 'offset', 'data'])
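For reference, the new ScopedEventHandler is an ordinary context manager: it registers the handler on entry and unregisters it on exit, so the ListObjects decoding hook never outlives list_objects. A hedged usage sketch follows; the stub session below is an assumption for illustration, standing in for the register/unregister interface of a real botocore session.

    from awscli.customizations.s3.utils import ScopedEventHandler

    class StubSession(object):
        # Minimal stand-in exposing the two methods ScopedEventHandler needs.
        def __init__(self):
            self.handlers = []

        def register(self, event_name, handler):
            self.handlers.append((event_name, handler))

        def unregister(self, event_name, handler):
            self.handlers.remove((event_name, handler))

    def handler(**kwargs):
        pass

    session = StubSession()
    with ScopedEventHandler(session, 'after-call.s3.ListObjects', handler):
        assert session.handlers    # registered for the duration of the block
    assert not session.handlers    # unregistered once the scope ends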
72 changes: 47 additions & 25 deletions tests/integration/customizations/s3/test_plugin.py
@@ -1,4 +1,3 @@

# -*- coding: utf-8 -*-
# Copyright 2013 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
@@ -25,6 +24,7 @@
import signal

import botocore.session
import six

from tests.integration import aws
from tests.unit.customizations.s3 import create_bucket as _create_bucket
@@ -68,6 +68,17 @@ def extra_teardown(self):
        # Subclasses can use this to define extra teardown steps.
        pass

    def assert_key_contents_equal(self, bucket, key, expected_contents):
        if isinstance(expected_contents, six.StringIO):
            expected_contents = expected_contents.getvalue()
        actual_contents = self.get_key_contents(bucket, key)
        # The contents can be huge so we try to give helpful error messages
        # without necessarily printing the actual contents.
        self.assertEqual(len(actual_contents), len(expected_contents))
        if actual_contents != expected_contents:
            self.fail("Contents for %s/%s do not match (but they "
                      "have the same length)" % (bucket, key))

    def create_bucket(self):
        bucket_name = _create_bucket(self.session)
        self.addCleanup(self.delete_bucket, bucket_name)
@@ -147,8 +158,7 @@ def test_mv_local_to_s3(self):
        # When we move an object, the local file is gone:
        self.assertTrue(not os.path.exists(full_path))
        # And now resides in s3.
        contents = self.get_key_contents(bucket_name, 'foo.txt')
        self.assertEqual(contents, 'this is foo.txt')
        self.assert_key_contents_equal(bucket_name, 'foo.txt', 'this is foo.txt')

    def test_mv_s3_to_local(self):
        bucket_name = self.create_bucket()
@@ -179,22 +189,21 @@ def test_mv_s3_to_s3(self):
    def test_mv_s3_to_s3_multipart(self):
        from_bucket = self.create_bucket()
        to_bucket = self.create_bucket()
        file_contents = 'abcd' * (1024 * 1024 * 10)
        file_contents = six.StringIO('abcd' * (1024 * 1024 * 10))
        self.put_object(from_bucket, 'foo.txt', file_contents)

        p = aws('s3 mv s3://%s/foo.txt s3://%s/foo.txt' % (from_bucket,
                                                           to_bucket))
        self.assert_no_errors(p)
        contents = self.get_key_contents(to_bucket, 'foo.txt')
        self.assertEqual(contents, file_contents)
        self.assert_key_contents_equal(to_bucket, 'foo.txt', file_contents)
        # And verify that the object no longer exists in the from_bucket.
        self.assertTrue(not self.key_exists(from_bucket, key_name='foo.txt'))

    def test_mv_s3_to_s3_multipart_recursive(self):
        from_bucket = self.create_bucket()
        to_bucket = self.create_bucket()

        large_file_contents = 'abcd' * (1024 * 1024 * 10)
        large_file_contents = six.StringIO('abcd' * (1024 * 1024 * 10))
        small_file_contents = 'small file contents'
        self.put_object(from_bucket, 'largefile', large_file_contents)
        self.put_object(from_bucket, 'smallfile', small_file_contents)
@@ -211,29 +220,28 @@ def test_mv_s3_to_s3_multipart_recursive(self):
        self.assertTrue(self.key_exists(to_bucket, key_name='smallfile'))

        # And the contents are what we expect.
        self.assertEqual(self.get_key_contents(to_bucket, 'smallfile'),
                         small_file_contents)
        self.assertEqual(self.get_key_contents(to_bucket, 'largefile'),
                         large_file_contents)
        self.assert_key_contents_equal(to_bucket, 'smallfile',
                                       small_file_contents)
        self.assert_key_contents_equal(to_bucket, 'largefile',
                                       large_file_contents)

    def test_mv_with_large_file(self):
        bucket_name = self.create_bucket()
        # 40MB will force a multipart upload.
        file_contents = 'abcd' * (1024 * 1024 * 10)
        foo_txt = self.files.create_file('foo.txt', file_contents)
        file_contents = six.StringIO('abcd' * (1024 * 1024 * 10))
        foo_txt = self.files.create_file('foo.txt', file_contents.getvalue())
        p = aws('s3 mv %s s3://%s/foo.txt' % (foo_txt, bucket_name))
        self.assert_no_errors(p)
        # When we move an object, the local file is gone:
        self.assertTrue(not os.path.exists(foo_txt))
        # And now resides in s3.
        contents = self.get_key_contents(bucket_name, 'foo.txt')
        self.assertEqual(len(contents), len(file_contents))
        self.assert_key_contents_equal(bucket_name, 'foo.txt', file_contents)

        # Now verify we can download this file.
        p = aws('s3 mv s3://%s/foo.txt %s' % (bucket_name, foo_txt))
        self.assert_no_errors(p)
        self.assertTrue(os.path.exists(foo_txt))
        self.assertEqual(os.path.getsize(foo_txt), len(file_contents))
        self.assertEqual(os.path.getsize(foo_txt), len(file_contents.getvalue()))

    def test_mv_to_nonexistent_bucket(self):
        full_path = self.files.create_file('foo.txt', 'this is foo.txt')
@@ -315,16 +323,12 @@ def test_cp_without_trailing_slash(self):
    def test_cp_s3_s3_multipart(self):
        from_bucket = self.create_bucket()
        to_bucket = self.create_bucket()
        file_contents = 'abcd' * (1024 * 1024 * 10)
        file_contents = six.StringIO('abcd' * (1024 * 1024 * 10))
        self.put_object(from_bucket, 'foo.txt', file_contents)

        p = aws('s3 cp s3://%s/foo.txt s3://%s/foo.txt' % (from_bucket, to_bucket))
        self.assert_no_errors(p)
        contents = self.get_key_contents(to_bucket, 'foo.txt')
        # Don't use assertEqual() here, this will spit out a huge
        # 20mb diff of 'abcd' chars. Just let the user know we failed.
        if contents != file_contents:
            self.fail("Downlaoded contents of 10mb file are not the same.")
        self.assert_key_contents_equal(to_bucket, 'foo.txt', file_contents)
        self.assertTrue(self.key_exists(from_bucket, key_name='foo.txt'))

    def test_guess_mime_type(self):
@@ -342,18 +346,19 @@ def test_guess_mime_type(self):
    def test_download_large_file(self):
        # This will force a multipart download.
        bucket_name = self.create_bucket()
        foo_contents = 'abcd' * (1024 * 1024 * 10)
        foo_contents = six.StringIO('abcd' * (1024 * 1024 * 10))
        self.put_object(bucket_name, key_name='foo.txt', contents=foo_contents)
        local_foo_txt = self.files.full_path('foo.txt')
        p = aws('s3 cp s3://%s/foo.txt %s' % (bucket_name, local_foo_txt))
        self.assert_no_errors(p)
        self.assertEqual(os.path.getsize(local_foo_txt), len(foo_contents))
        self.assertEqual(os.path.getsize(local_foo_txt),
                         len(foo_contents.getvalue()))

    @unittest.skipIf(platform.system() not in ['Darwin', 'Linux'],
                     'SIGINT not supported on Windows.')
    def test_download_ctrl_c_does_not_hang(self):
        bucket_name = self.create_bucket()
        foo_contents = 'abcd' * (1024 * 1024 * 20)
        foo_contents = six.StringIO('abcd' * (1024 * 1024 * 20))
        self.put_object(bucket_name, key_name='foo.txt', contents=foo_contents)
        local_foo_txt = self.files.full_path('foo.txt')
        process = aws('s3 cp s3://%s/foo.txt %s' % (bucket_name, local_foo_txt), wait_for_finish=False)
@@ -398,6 +403,23 @@ def test_download_non_existent_key(self):


class TestSync(BaseS3CLICommand):
    def test_sync_with_plus_chars(self):
        # 1. Create > 1000 files with '+' in the filename.
        # 2. Sync up to s3.
        # 3. Sync up to s3 again.
        # 4. Verify nothing was synced up or down from s3 in step 3.
        bucket_name = self.create_bucket()
        filenames = []
        for i in range(2000):
            # Create a file with a space char and a '+' char in the filename.
            filenames.append(self.files.create_file('foo +%06d' % i, contents=''))
        p = aws('s3 sync %s s3://%s/' % (self.files.rootdir, bucket_name))
        self.assert_no_errors(p)
        time.sleep(1)
        p2 = aws('s3 sync %s s3://%s/' % (self.files.rootdir, bucket_name))
        self.assertNotIn('upload:', p2.stdout)
        self.assertEqual('', p2.stdout)

    def test_sync_to_from_s3(self):
        bucket_name = self.create_bucket()
        foo_txt = self.files.create_file('foo.txt', 'foo contents')
6 changes: 6 additions & 0 deletions tests/unit/customizations/s3/fake_session.py
@@ -70,6 +70,12 @@ def emit(self, *args, **kwargs):
    def emit_first_non_none_response(self, *args, **kwargs):
        pass

    def register(self, name, handler):
        pass

    def unregister(self, name, handler):
        pass


class FakeService(object):
"""
30 changes: 27 additions & 3 deletions tests/unit/customizations/s3/test_utils.py
@@ -6,11 +6,13 @@

import mock

from botocore.hooks import HierarchicalEmitter
from awscli.customizations.s3.utils import find_bucket_key, find_chunksize
from awscli.customizations.s3.utils import ReadFileChunk
from awscli.customizations.s3.utils import relative_path
from awscli.customizations.s3.utils import StablePriorityQueue
from awscli.customizations.s3.utils import BucketLister
from awscli.customizations.s3.utils import ScopedEventHandler
from awscli.customizations.s3.constants import MAX_SINGLE_UPLOAD_SIZE


@@ -196,13 +198,23 @@ def test_priority_attr_is_missing(self):
class TestBucketList(unittest.TestCase):
    def setUp(self):
        self.operation = mock.Mock()
        self.emitter = HierarchicalEmitter()
        self.operation.session.register = self.emitter.register
        self.operation.session.unregister = self.emitter.unregister
        self.endpoint = mock.sentinel.endpoint
        self.date_parser = mock.Mock()
        self.date_parser.return_value = mock.sentinel.now
        self.responses = []

    def fake_paginate(self, *args, **kwargs):
        for response in self.responses:
            self.emitter.emit('after-call.s3.ListObjects', parsed=response[1])
        return self.responses

    def test_list_objects(self):
        now = mock.sentinel.now
        self.operation.paginate.return_value = [
        self.operation.paginate = self.fake_paginate
        self.responses = [
            (None, {'Contents': [
                {'LastModified': '2014-02-27T04:20:38.000Z',
                 'Key': 'a', 'Size': 1},
@@ -224,7 +236,8 @@ def test_urlencoded_keys(self):
        # them before yielding them. For example, note the %0D
        # in bar.txt:
        now = mock.sentinel.now
        self.operation.paginate.return_value = [
        self.operation.paginate = self.fake_paginate
        self.responses = [
            (None, {'Contents': [
                {'LastModified': '2014-02-27T04:20:38.000Z',
                 'Key': 'bar%0D.txt', 'Size': 1}]}),
@@ -236,7 +249,8 @@ def test_urlencoded_with_unicode_keys(self):

    def test_urlencoded_with_unicode_keys(self):
        now = mock.sentinel.now
        self.operation.paginate.return_value = [
        self.operation.paginate = self.fake_paginate
        self.responses = [
            (None, {'Contents': [
                {'LastModified': '2014-02-27T04:20:38.000Z',
                 'Key': '%E2%9C%93', 'Size': 1}]}),
@@ -246,5 +260,15 @@ def test_urlencoded_with_unicode_keys(self):
        # And note how it's been converted to '\r'.
        self.assertEqual(objects, [(u'foo/\u2713', 1, now)])


class TestScopedEventHandler(unittest.TestCase):
    def test_scoped_session_handler(self):
        session = mock.Mock()
        scoped = ScopedEventHandler(session, 'eventname', 'handler')
        with scoped:
            session.register.assert_called_with('eventname', 'handler')
        session.unregister.assert_called_with('eventname', 'handler')


if __name__ == "__main__":
    unittest.main()
