Fix S3 sync issue with keys containing urlencoded values
Fixes aws#749.  This was a regression from the fix for aws#675,
where we use the encoding_type of "url" to work around
the stdlib xmlparser not handling new lines.

The problem is that pagination in S3 uses the last key name as
the marker, and because the keys are returned urlencoded, we
need to urldecode the keys so botocore sends the correct next
marker.  In the case where urldecoded(key) != key, we would
incorrectly treat already-synced files as new and sync them again.

Also added an integ test for syncing with '+' chars.
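
To make the marker problem concrete, here is a minimal illustrative sketch of the decode step the fix adds. It is not the real botocore event API: plain urllib unquoting stands in for the CLI's unquote_str helper, and a hand-built page dict stands in for the parsed ListObjects response the after-call handler receives.

    # Keys come back url-encoded because the request asks for encoding_type='url';
    # they must be decoded in place before the paginator reads the last Key of
    # Contents to build the next Marker.
    from urllib.parse import unquote  # stand-in for awscli's unquote_str

    def decode_keys(parsed, **kwargs):
        for content in parsed['Contents']:
            content['Key'] = unquote(content['Key'])

    # Hypothetical parsed page for a key named 'foo +000001':
    page = {'Contents': [{'Key': 'foo%20%2B000001', 'Size': 0}]}
    decode_keys(page)
    assert page['Contents'][-1]['Key'] == 'foo +000001'  # the correct next marker

Without the decode, the encoded value would be fed back as the marker whenever urldecoded(key) != key, which is exactly what the new integration test guards against: a second sync of the same files should upload nothing.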
jamesls committed Apr 17, 2014
1 parent cbacc26 commit 1236dd2
Showing 4 changed files with 113 additions and 36 deletions.
41 changes: 33 additions & 8 deletions awscli/customizations/s3/utils.py
@@ -302,14 +302,39 @@ def list_objects(self, bucket, prefix=None):
        kwargs = {'bucket': bucket, 'encoding_type': 'url'}
        if prefix is not None:
            kwargs['prefix'] = prefix
        pages = self._operation.paginate(self._endpoint, **kwargs)
        for response, page in pages:
            contents = page['Contents']
            for content in contents:
                source_path = bucket + '/' + unquote_str(content['Key'])
                size = content['Size']
                last_update = self._date_parser(content['LastModified'])
                yield source_path, size, last_update
        # This event handler is needed because we use encoding_type url and
        # we're paginating. The pagination token is the last Key of the
        # Contents list. However, botocore does not know that the keys need
        # to be urldecoded before they are used as the next marker.
        with ScopedEventHandler(self._operation.session, 'after-call.s3.ListObjects',
                                self._decode_keys):
            pages = self._operation.paginate(self._endpoint, **kwargs)
            for response, page in pages:
                contents = page['Contents']
                for content in contents:
                    source_path = bucket + '/' + content['Key']
                    size = content['Size']
                    last_update = self._date_parser(content['LastModified'])
                    yield source_path, size, last_update

    def _decode_keys(self, parsed, **kwargs):
        for content in parsed['Contents']:
            content['Key'] = unquote_str(content['Key'])


class ScopedEventHandler(object):
    """Register an event callback for the duration of a scope."""

    def __init__(self, session, event_name, handler):
        self._session = session
        self._event_name = event_name
        self._handler = handler

    def __enter__(self):
        self._session.register(self._event_name, self._handler)

    def __exit__(self, exc_type, exc_value, traceback):
        self._session.unregister(self._event_name, self._handler)


IORequest = namedtuple('IORequest', ['filename', 'offset', 'data'])
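For reference, the new ScopedEventHandler is an ordinary context manager: it registers the handler on entry and unregisters it on exit, so the ListObjects decoding hook never outlives list_objects. A hedged usage sketch follows; the stub session below is an assumption for illustration, standing in for the register/unregister interface of a real botocore session.

    from awscli.customizations.s3.utils import ScopedEventHandler

    class StubSession(object):
        # Minimal stand-in exposing the two methods ScopedEventHandler needs.
        def __init__(self):
            self.handlers = []

        def register(self, event_name, handler):
            self.handlers.append((event_name, handler))

        def unregister(self, event_name, handler):
            self.handlers.remove((event_name, handler))

    def handler(**kwargs):
        pass

    session = StubSession()
    with ScopedEventHandler(session, 'after-call.s3.ListObjects', handler):
        assert session.handlers    # registered for the duration of the block
    assert not session.handlers    # unregistered once the scope ends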
72 changes: 47 additions & 25 deletions tests/integration/customizations/s3/test_plugin.py
@@ -1,4 +1,3 @@

# -*- coding: utf-8 -*-
# Copyright 2013 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
@@ -25,6 +24,7 @@
import signal

import botocore.session
import six

from tests.integration import aws
from tests.unit.customizations.s3 import create_bucket as _create_bucket
@@ -68,6 +68,17 @@ def extra_teardown(self):
        # Subclasses can use this to define extra teardown steps.
        pass

    def assert_key_contents_equal(self, bucket, key, expected_contents):
        if isinstance(expected_contents, six.StringIO):
            expected_contents = expected_contents.getvalue()
        actual_contents = self.get_key_contents(bucket, key)
        # The contents can be huge so we try to give helpful error messages
        # without necessarily printing the actual contents.
        self.assertEqual(len(actual_contents), len(expected_contents))
        if actual_contents != expected_contents:
            self.fail("Contents for %s/%s do not match (but they "
                      "have the same length)" % (bucket, key))

    def create_bucket(self):
        bucket_name = _create_bucket(self.session)
        self.addCleanup(self.delete_bucket, bucket_name)
@@ -147,8 +158,7 @@ def test_mv_local_to_s3(self):
        # When we move an object, the local file is gone:
        self.assertTrue(not os.path.exists(full_path))
        # And now resides in s3.
        contents = self.get_key_contents(bucket_name, 'foo.txt')
        self.assertEqual(contents, 'this is foo.txt')
        self.assert_key_contents_equal(bucket_name, 'foo.txt', 'this is foo.txt')

    def test_mv_s3_to_local(self):
        bucket_name = self.create_bucket()
@@ -179,22 +189,21 @@ def test_mv_s3_to_s3(self):
    def test_mv_s3_to_s3_multipart(self):
        from_bucket = self.create_bucket()
        to_bucket = self.create_bucket()
        file_contents = 'abcd' * (1024 * 1024 * 10)
        file_contents = six.StringIO('abcd' * (1024 * 1024 * 10))
        self.put_object(from_bucket, 'foo.txt', file_contents)

        p = aws('s3 mv s3://%s/foo.txt s3://%s/foo.txt' % (from_bucket,
                                                           to_bucket))
        self.assert_no_errors(p)
        contents = self.get_key_contents(to_bucket, 'foo.txt')
        self.assertEqual(contents, file_contents)
        self.assert_key_contents_equal(to_bucket, 'foo.txt', file_contents)
        # And verify that the object no longer exists in the from_bucket.
        self.assertTrue(not self.key_exists(from_bucket, key_name='foo.txt'))

    def test_mv_s3_to_s3_multipart_recursive(self):
        from_bucket = self.create_bucket()
        to_bucket = self.create_bucket()

        large_file_contents = 'abcd' * (1024 * 1024 * 10)
        large_file_contents = six.StringIO('abcd' * (1024 * 1024 * 10))
        small_file_contents = 'small file contents'
        self.put_object(from_bucket, 'largefile', large_file_contents)
        self.put_object(from_bucket, 'smallfile', small_file_contents)
@@ -211,29 +220,28 @@ def test_mv_s3_to_s3_multipart_recursive(self):
        self.assertTrue(self.key_exists(to_bucket, key_name='smallfile'))

        # And the contents are what we expect.
        self.assertEqual(self.get_key_contents(to_bucket, 'smallfile'),
                         small_file_contents)
        self.assertEqual(self.get_key_contents(to_bucket, 'largefile'),
                         large_file_contents)
        self.assert_key_contents_equal(to_bucket, 'smallfile',
                                       small_file_contents)
        self.assert_key_contents_equal(to_bucket, 'largefile',
                                       large_file_contents)

    def test_mv_with_large_file(self):
        bucket_name = self.create_bucket()
        # 40MB will force a multipart upload.
        file_contents = 'abcd' * (1024 * 1024 * 10)
        foo_txt = self.files.create_file('foo.txt', file_contents)
        file_contents = six.StringIO('abcd' * (1024 * 1024 * 10))
        foo_txt = self.files.create_file('foo.txt', file_contents.getvalue())
        p = aws('s3 mv %s s3://%s/foo.txt' % (foo_txt, bucket_name))
        self.assert_no_errors(p)
        # When we move an object, the local file is gone:
        self.assertTrue(not os.path.exists(foo_txt))
        # And now resides in s3.
        contents = self.get_key_contents(bucket_name, 'foo.txt')
        self.assertEqual(len(contents), len(file_contents))
        self.assert_key_contents_equal(bucket_name, 'foo.txt', file_contents)

        # Now verify we can download this file.
        p = aws('s3 mv s3://%s/foo.txt %s' % (bucket_name, foo_txt))
        self.assert_no_errors(p)
        self.assertTrue(os.path.exists(foo_txt))
        self.assertEqual(os.path.getsize(foo_txt), len(file_contents))
        self.assertEqual(os.path.getsize(foo_txt), len(file_contents.getvalue()))

    def test_mv_to_nonexistent_bucket(self):
        full_path = self.files.create_file('foo.txt', 'this is foo.txt')
@@ -315,16 +323,12 @@ def test_cp_without_trailing_slash(self):
    def test_cp_s3_s3_multipart(self):
        from_bucket = self.create_bucket()
        to_bucket = self.create_bucket()
        file_contents = 'abcd' * (1024 * 1024 * 10)
        file_contents = six.StringIO('abcd' * (1024 * 1024 * 10))
        self.put_object(from_bucket, 'foo.txt', file_contents)

        p = aws('s3 cp s3://%s/foo.txt s3://%s/foo.txt' % (from_bucket, to_bucket))
        self.assert_no_errors(p)
        contents = self.get_key_contents(to_bucket, 'foo.txt')
        # Don't use assertEqual() here, this will spit out a huge
        # 20mb diff of 'abcd' chars. Just let the user know we failed.
        if contents != file_contents:
            self.fail("Downlaoded contents of 10mb file are not the same.")
        self.assert_key_contents_equal(to_bucket, 'foo.txt', file_contents)
        self.assertTrue(self.key_exists(from_bucket, key_name='foo.txt'))

    def test_guess_mime_type(self):
@@ -342,18 +346,19 @@ def test_guess_mime_type(self):
    def test_download_large_file(self):
        # This will force a multipart download.
        bucket_name = self.create_bucket()
        foo_contents = 'abcd' * (1024 * 1024 * 10)
        foo_contents = six.StringIO('abcd' * (1024 * 1024 * 10))
        self.put_object(bucket_name, key_name='foo.txt', contents=foo_contents)
        local_foo_txt = self.files.full_path('foo.txt')
        p = aws('s3 cp s3://%s/foo.txt %s' % (bucket_name, local_foo_txt))
        self.assert_no_errors(p)
        self.assertEqual(os.path.getsize(local_foo_txt), len(foo_contents))
        self.assertEqual(os.path.getsize(local_foo_txt),
                         len(foo_contents.getvalue()))

    @unittest.skipIf(platform.system() not in ['Darwin', 'Linux'],
                     'SIGINT not supported on Windows.')
    def test_download_ctrl_c_does_not_hang(self):
        bucket_name = self.create_bucket()
        foo_contents = 'abcd' * (1024 * 1024 * 20)
        foo_contents = six.StringIO('abcd' * (1024 * 1024 * 20))
        self.put_object(bucket_name, key_name='foo.txt', contents=foo_contents)
        local_foo_txt = self.files.full_path('foo.txt')
        process = aws('s3 cp s3://%s/foo.txt %s' % (bucket_name, local_foo_txt), wait_for_finish=False)
@@ -398,6 +403,23 @@ def test_download_non_existent_key(self):


class TestSync(BaseS3CLICommand):
    def test_sync_with_plus_chars(self):
        # 1. Create > 1000 files with '+' in the filename.
        # 2. Sync up to s3.
        # 3. Sync up to s3 again.
        # 4. Verify nothing was synced up or down from s3 in step 3.
        bucket_name = self.create_bucket()
        filenames = []
        for i in range(2000):
            # Create a file with a space char and a '+' char in the filename.
            filenames.append(self.files.create_file('foo +%06d' % i, contents=''))
        p = aws('s3 sync %s s3://%s/' % (self.files.rootdir, bucket_name))
        self.assert_no_errors(p)
        time.sleep(1)
        p2 = aws('s3 sync %s s3://%s/' % (self.files.rootdir, bucket_name))
        self.assertNotIn('upload:', p2.stdout)
        self.assertEqual('', p2.stdout)

    def test_sync_to_from_s3(self):
        bucket_name = self.create_bucket()
        foo_txt = self.files.create_file('foo.txt', 'foo contents')
6 changes: 6 additions & 0 deletions tests/unit/customizations/s3/fake_session.py
@@ -70,6 +70,12 @@ def emit(self, *args, **kwargs):
    def emit_first_non_none_response(self, *args, **kwargs):
        pass

    def register(self, name, handler):
        pass

    def unregister(self, name, handler):
        pass


class FakeService(object):
"""
30 changes: 27 additions & 3 deletions tests/unit/customizations/s3/test_utils.py
@@ -6,11 +6,13 @@

import mock

from botocore.hooks import HierarchicalEmitter
from awscli.customizations.s3.utils import find_bucket_key, find_chunksize
from awscli.customizations.s3.utils import ReadFileChunk
from awscli.customizations.s3.utils import relative_path
from awscli.customizations.s3.utils import StablePriorityQueue
from awscli.customizations.s3.utils import BucketLister
from awscli.customizations.s3.utils import ScopedEventHandler
from awscli.customizations.s3.constants import MAX_SINGLE_UPLOAD_SIZE


@@ -196,13 +198,23 @@ def test_priority_attr_is_missing(self):
class TestBucketList(unittest.TestCase):
    def setUp(self):
        self.operation = mock.Mock()
        self.emitter = HierarchicalEmitter()
        self.operation.session.register = self.emitter.register
        self.operation.session.unregister = self.emitter.unregister
        self.endpoint = mock.sentinel.endpoint
        self.date_parser = mock.Mock()
        self.date_parser.return_value = mock.sentinel.now
        self.responses = []

    def fake_paginate(self, *args, **kwargs):
        for response in self.responses:
            self.emitter.emit('after-call.s3.ListObjects', parsed=response[1])
        return self.responses

    def test_list_objects(self):
        now = mock.sentinel.now
        self.operation.paginate.return_value = [
        self.operation.paginate = self.fake_paginate
        self.responses = [
            (None, {'Contents': [
                {'LastModified': '2014-02-27T04:20:38.000Z',
                 'Key': 'a', 'Size': 1},
@@ -224,7 +236,8 @@ def test_urlencoded_keys(self):
        # them before yielding them. For example, note the %0D
        # in bar.txt:
        now = mock.sentinel.now
        self.operation.paginate.return_value = [
        self.operation.paginate = self.fake_paginate
        self.responses = [
            (None, {'Contents': [
                {'LastModified': '2014-02-27T04:20:38.000Z',
                 'Key': 'bar%0D.txt', 'Size': 1}]}),
@@ -236,7 +249,8 @@ def test_urlencoded_with_unicode_keys(self):

    def test_urlencoded_with_unicode_keys(self):
        now = mock.sentinel.now
        self.operation.paginate.return_value = [
        self.operation.paginate = self.fake_paginate
        self.responses = [
            (None, {'Contents': [
                {'LastModified': '2014-02-27T04:20:38.000Z',
                 'Key': '%E2%9C%93', 'Size': 1}]}),
@@ -246,5 +260,15 @@ def test_urlencoded_with_unicode_keys(self):
        # And note how it's been converted to '\r'.
        self.assertEqual(objects, [(u'foo/\u2713', 1, now)])


class TestScopedEventHandler(unittest.TestCase):
    def test_scoped_session_handler(self):
        session = mock.Mock()
        scoped = ScopedEventHandler(session, 'eventname', 'handler')
        with scoped:
            session.register.assert_called_with('eventname', 'handler')
        session.unregister.assert_called_with('eventname', 'handler')


if __name__ == "__main__":
    unittest.main()
