Added the ability to stream data using cp.
This feature enables users to stream from stdin to S3 or from S3 to stdout.
Streaming of large files uses both multiple threads and multipart transfers.
The streaming feature is limited to single-file ``cp`` commands.
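
For illustration, the end-to-end usage this enables can be driven from Python with pipes, as in the sketch below. It assumes an AWS CLI build containing this change is on the PATH and that ``-`` is the stdin/stdout placeholder (consistent with the dummy ``'-'`` path yielded by ``list_local_file_stream`` further down); the bucket and file names are placeholders:

    import subprocess

    # Upload: pipe a local file to S3 through the CLI's stdin.
    with open("backup.tar.gz", "rb") as src:
        subprocess.check_call(
            ["aws", "s3", "cp", "-", "s3://my-bucket/backup.tar.gz"],
            stdin=src)

    # Download: capture the object from the CLI's stdout.
    with open("restored.tar.gz", "wb") as dst:
        subprocess.check_call(
            ["aws", "s3", "cp", "s3://my-bucket/backup.tar.gz", "-"],
            stdout=dst)
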
kyleknap committed Sep 2, 2014
1 parent 4e093bd commit abce027
Showing 22 changed files with 871 additions and 143 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.rst
@@ -5,6 +5,9 @@ CHANGELOG
Next Release (TBD)
==================

* feature:``aws s3 cp``: Added ability to upload local
file streams from standard input to s3 and download s3
objects as local file streams to standard output.
* feature:Page Size: Add a ``--page-size`` option that
  controls page size when performing an operation that
  uses pagination.
1 change: 1 addition & 0 deletions awscli/customizations/s3/constants.py
@@ -18,3 +18,4 @@
MAX_SINGLE_UPLOAD_SIZE = 5 * (1024 ** 3)
MAX_UPLOAD_SIZE = 5 * (1024 ** 4)
MAX_QUEUE_SIZE = 1000
STREAM_INPUT_TIMEOUT = 0.1
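
A note on the new constant: this excerpt does not show where ``STREAM_INPUT_TIMEOUT`` is consumed, but a 0.1-second timeout is the usual shape of a polling loop that waits for the next chunk read from stdin without blocking a worker thread forever. The following standalone sketch shows only that pattern; the names and structure are illustrative, not the CLI's actual stdin reader:

    import queue
    import sys
    import threading

    STREAM_INPUT_TIMEOUT = 0.1    # seconds, mirroring the constant above
    CHUNK_SIZE = 1024 * 1024

    def read_stdin(chunk_queue):
        """Producer: push raw stdin chunks onto a queue; b'' marks EOF."""
        while True:
            chunk = sys.stdin.buffer.read(CHUNK_SIZE)
            chunk_queue.put(chunk)
            if not chunk:
                return

    def consume(chunk_queue):
        """Consumer: poll with a short timeout so the thread stays responsive."""
        while True:
            try:
                chunk = chunk_queue.get(timeout=STREAM_INPUT_TIMEOUT)
            except queue.Empty:
                continue              # nothing buffered yet; check again
            if not chunk:
                break                 # EOF sentinel from the producer
            # ... hand the chunk to an upload task here ...

    chunks = queue.Queue()
    threading.Thread(target=read_stdin, args=(chunks,), daemon=True).start()
    consume(chunks)
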
36 changes: 20 additions & 16 deletions awscli/customizations/s3/executor.py
@@ -15,8 +15,8 @@
import sys
import threading

from awscli.customizations.s3.utils import uni_print, \
IORequest, IOCloseRequest, StablePriorityQueue
from awscli.customizations.s3.utils import uni_print, bytes_print, \
IORequest, IOCloseRequest, StablePriorityQueue
from awscli.customizations.s3.tasks import OrderableTask


@@ -50,8 +50,7 @@ def __init__(self, num_threads, result_queue,
self.quiet = quiet
self.threads_list = []
self.write_queue = write_queue
self.print_thread = PrintThread(self.result_queue,
self.quiet)
self.print_thread = PrintThread(self.result_queue, self.quiet)
self.print_thread.daemon = True
self.io_thread = IOWriterThread(self.write_queue)

@@ -153,23 +152,28 @@ def run(self):
self._cleanup()
return
elif isinstance(task, IORequest):
filename, offset, data = task
fileobj = self.fd_descriptor_cache.get(filename)
if fileobj is None:
fileobj = open(filename, 'rb+')
self.fd_descriptor_cache[filename] = fileobj
fileobj.seek(offset)
filename, offset, data, is_stream = task
if is_stream:
fileobj = sys.stdout
bytes_print(data)
else:
fileobj = self.fd_descriptor_cache.get(filename)
if fileobj is None:
fileobj = open(filename, 'rb+')
self.fd_descriptor_cache[filename] = fileobj
fileobj.seek(offset)
fileobj.write(data)
LOGGER.debug("Writing data to: %s, offset: %s",
filename, offset)
fileobj.write(data)
fileobj.flush()
elif isinstance(task, IOCloseRequest):
LOGGER.debug("IOCloseRequest received for %s, closing file.",
task.filename)
fileobj = self.fd_descriptor_cache.get(task.filename)
if fileobj is not None:
fileobj.close()
del self.fd_descriptor_cache[task.filename]
if not task.is_stream:
fileobj = self.fd_descriptor_cache.get(task.filename)
if fileobj is not None:
fileobj.close()
del self.fd_descriptor_cache[task.filename]

def _cleanup(self):
for fileobj in self.fd_descriptor_cache.values():
@@ -237,7 +241,7 @@ def __init__(self, result_queue, quiet):
self._lock = threading.Lock()
self._needs_newline = False

self._total_parts = 0
self._total_parts = '...'
self._total_files = '...'

# This is a public attribute that clients can inspect to determine
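
To summarize the executor change: an ``IORequest`` now carries an ``is_stream`` flag, streamed data bypasses the file-descriptor cache and goes straight to stdout, and ``IOCloseRequest`` skips the close/evict step for streams. A minimal standalone sketch of that dispatch follows; ``IORequest`` and ``bytes_print`` here are simplified stand-ins for the awscli versions, whose definitions are not shown in this excerpt:

    import sys
    from collections import namedtuple

    # Simplified stand-ins for the awscli types referenced above.
    IORequest = namedtuple('IORequest',
                           ['filename', 'offset', 'data', 'is_stream'])

    def bytes_print(data):
        # Write raw bytes to stdout with no text decoding.
        sys.stdout.buffer.write(data)

    def handle_io_request(task, fd_cache):
        if task.is_stream:
            # Streamed downloads are written sequentially to stdout, so no
            # file handle or seek is involved.
            bytes_print(task.data)
            sys.stdout.buffer.flush()
        else:
            fileobj = fd_cache.get(task.filename)
            if fileobj is None:
                fileobj = open(task.filename, 'rb+')
                fd_cache[task.filename] = fileobj
            fileobj.seek(task.offset)
            fileobj.write(task.data)
            fileobj.flush()

    # A stream request prints straight to stdout and leaves the cache empty.
    handle_io_request(IORequest('-', 0, b'hello world\n', True), {})
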
23 changes: 19 additions & 4 deletions awscli/customizations/s3/filegenerator.py
@@ -95,7 +95,7 @@ def __init__(self, directory, filename):
class FileStat(object):
def __init__(self, src, dest=None, compare_key=None, size=None,
last_update=None, src_type=None, dest_type=None,
operation_name=None):
operation_name=None, is_stream=False):
self.src = src
self.dest = dest
self.compare_key = compare_key
@@ -104,6 +104,7 @@ def __init__(self, src, dest=None, compare_key=None, size=None,
self.src_type = src_type
self.dest_type = dest_type
self.operation_name = operation_name
self.is_stream = is_stream


class FileGenerator(object):
@@ -115,7 +116,8 @@ class FileGenerator(object):
``FileInfo`` objects to send to a ``Comparator`` or ``S3Handler``.
"""
def __init__(self, service, endpoint, operation_name,
follow_symlinks=True, page_size=None, result_queue=None):
follow_symlinks=True, page_size=None, result_queue=None,
is_stream=False):
self._service = service
self._endpoint = endpoint
self.operation_name = operation_name
@@ -124,6 +126,7 @@ def __init__(self, service, endpoint, operation_name,
self.result_queue = result_queue
if not result_queue:
self.result_queue = queue.Queue()
self.is_stream = is_stream

def call(self, files):
"""
@@ -135,7 +138,11 @@ def call(self, files):
dest = files['dest']
src_type = src['type']
dest_type = dest['type']
function_table = {'s3': self.list_objects, 'local': self.list_files}
function_table = {'s3': self.list_objects}
if self.is_stream:
function_table['local'] = self.list_local_file_stream
else:
function_table['local'] = self.list_files
sep_table = {'s3': '/', 'local': os.sep}
source = src['path']
file_list = function_table[src_type](source, files['dir_op'])
@@ -155,7 +162,15 @@ def call(self, files):
compare_key=compare_key, size=size,
last_update=last_update, src_type=src_type,
dest_type=dest_type,
operation_name=self.operation_name)
operation_name=self.operation_name,
is_stream=self.is_stream)

def list_local_file_stream(self, path, dir_op):
"""
Yield some dummy values for a local file stream since it does not
actually have a file.
"""
yield '-', 0, None

def list_files(self, path, dir_op):
"""
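
In short, when ``is_stream`` is set the generator swaps ``list_files`` for ``list_local_file_stream``, which yields a single dummy entry (path ``'-'``, size 0, no mtime), so exactly one ``FileStat`` flows downstream with ``is_stream=True``. Below is a standalone sketch of the object that effectively results; the class is a pared-down stand-in and the ``dest``/``operation_name`` values are illustrative:

    class FileStat(object):
        # Pared-down stand-in mirroring the fields shown in the diff above.
        def __init__(self, src, dest=None, compare_key=None, size=None,
                     last_update=None, src_type=None, dest_type=None,
                     operation_name=None, is_stream=False):
            self.src = src
            self.dest = dest
            self.compare_key = compare_key
            self.size = size
            self.last_update = last_update
            self.src_type = src_type
            self.dest_type = dest_type
            self.operation_name = operation_name
            self.is_stream = is_stream

    def stream_file_stat(dest):
        # What call() effectively produces for an upload from stdin: the
        # dummy values from list_local_file_stream, with is_stream marked.
        src, size, last_update = '-', 0, None
        return FileStat(src=src, dest=dest, compare_key=None, size=size,
                        last_update=last_update, src_type='local',
                        dest_type='s3', operation_name='upload',
                        is_stream=True)

    stat = stream_file_stat('my-bucket/backup.tar.gz')
    print(stat.src, stat.size, stat.is_stream)   # -> - 0 True
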
110 changes: 72 additions & 38 deletions awscli/customizations/s3/fileinfo.py
@@ -11,7 +11,7 @@
from botocore.compat import quote
from awscli.customizations.s3.utils import find_bucket_key, \
check_etag, check_error, operate, uni_print, \
guess_content_type, MD5Error
guess_content_type, MD5Error, bytes_print


class CreateDirectoryError(Exception):
@@ -26,7 +26,7 @@ def read_file(filename):
return in_file.read()


def save_file(filename, response_data, last_update):
def save_file(filename, response_data, last_update, is_stream=False):
"""
This writes to the file upon downloading. It reads the data in the
response. Makes a new directory if needed and then writes the
@@ -35,31 +35,57 @@ def save_file(filename, response_data, last_update):
"""
body = response_data['Body']
etag = response_data['ETag'][1:-1]
d = os.path.dirname(filename)
try:
if not os.path.exists(d):
os.makedirs(d)
except OSError as e:
if not e.errno == errno.EEXIST:
raise CreateDirectoryError(
"Could not create directory %s: %s" % (d, e))
if not is_stream:
d = os.path.dirname(filename)
try:
if not os.path.exists(d):
os.makedirs(d)
except OSError as e:
if not e.errno == errno.EEXIST:
raise CreateDirectoryError(
"Could not create directory %s: %s" % (d, e))
md5 = hashlib.md5()
file_chunks = iter(partial(body.read, 1024 * 1024), b'')
with open(filename, 'wb') as out_file:
if not _is_multipart_etag(etag):
for chunk in file_chunks:
md5.update(chunk)
out_file.write(chunk)
else:
for chunk in file_chunks:
out_file.write(chunk)
if is_stream:
# Need to save the data to be able to check the etag for a stream
        # because once the data is written to the stream there is no
# undoing it.
payload = write_to_file(None, etag, md5, file_chunks, True)
else:
with open(filename, 'wb') as out_file:
write_to_file(out_file, etag, md5, file_chunks)

if not _is_multipart_etag(etag):
if etag != md5.hexdigest():
os.remove(filename)
if not is_stream:
os.remove(filename)
raise MD5Error(filename)
last_update_tuple = last_update.timetuple()
mod_timestamp = time.mktime(last_update_tuple)
os.utime(filename, (int(mod_timestamp), int(mod_timestamp)))

if not is_stream:
last_update_tuple = last_update.timetuple()
mod_timestamp = time.mktime(last_update_tuple)
os.utime(filename, (int(mod_timestamp), int(mod_timestamp)))
else:
# Now write the output to stdout since the md5 is correct.
bytes_print(payload)
sys.stdout.flush()


def write_to_file(out_file, etag, md5, file_chunks, is_stream=False):
"""
    Updates the md5 digest used to verify the etag for each file chunk. It
    writes the chunk to the file when given a file, but for a stream it
    returns a byte string to be written to the stream later.
"""
body = b''
for chunk in file_chunks:
if not _is_multipart_etag(etag):
md5.update(chunk)
if is_stream:
body += chunk
else:
out_file.write(chunk)
return body


def _is_multipart_etag(etag):
@@ -140,7 +166,7 @@ class FileInfo(TaskInfo):
def __init__(self, src, dest=None, compare_key=None, size=None,
last_update=None, src_type=None, dest_type=None,
operation_name=None, service=None, endpoint=None,
parameters=None, source_endpoint=None):
parameters=None, source_endpoint=None, is_stream=False):
super(FileInfo, self).__init__(src, src_type=src_type,
operation_name=operation_name,
service=service,
@@ -157,6 +183,7 @@ def __init__(self, src, dest=None, compare_key=None, size=None,
self.parameters = {'acl': None,
'sse': None}
self.source_endpoint = source_endpoint
self.is_stream = is_stream

def _permission_to_param(self, permission):
if permission == 'read':
@@ -204,24 +231,30 @@ def _handle_object_params(self, params):
if self.parameters['expires']:
params['expires'] = self.parameters['expires'][0]

def upload(self):
def upload(self, payload=None):
"""
Redirects the file to the multipart upload function if the file is
large. If it is small enough, it puts the file as an object in s3.
"""
with open(self.src, 'rb') as body:
bucket, key = find_bucket_key(self.dest)
params = {
'endpoint': self.endpoint,
'bucket': bucket,
'key': key,
'body': body,
}
self._handle_object_params(params)
response_data, http = operate(self.service, 'PutObject', params)
etag = response_data['ETag'][1:-1]
body.seek(0)
check_etag(etag, body)
if payload:
self._handle_upload(payload)
else:
with open(self.src, 'rb') as body:
self._handle_upload(body)

def _handle_upload(self, body):
bucket, key = find_bucket_key(self.dest)
params = {
'endpoint': self.endpoint,
'bucket': bucket,
'key': key,
'body': body,
}
self._handle_object_params(params)
response_data, http = operate(self.service, 'PutObject', params)
etag = response_data['ETag'][1:-1]
body.seek(0)
check_etag(etag, body)

def _inject_content_type(self, params, filename):
# Add a content type param if we can guess the type.
@@ -237,7 +270,8 @@ def download(self):
bucket, key = find_bucket_key(self.src)
params = {'endpoint': self.endpoint, 'bucket': bucket, 'key': key}
response_data, http = operate(self.service, 'GetObject', params)
save_file(self.dest, response_data, self.last_update)
save_file(self.dest, response_data, self.last_update,
self.is_stream)

def copy(self):
"""
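
Two points in this file are worth spelling out. For downloads, ``save_file`` with ``is_stream=True`` buffers the whole body via ``write_to_file`` and prints it only after the MD5 check passes, because bytes already written to stdout cannot be unwritten; it also skips ``os.remove``/``os.utime``, since there is no file on disk. For uploads, ``upload`` now accepts an optional ``payload`` so a stream wrapped in a file-like object can be put directly. Below is a standalone sketch of the buffer-then-verify idea for the download path; the helpers are simplified stand-ins and the fake response stands in for an S3 ``GetObject`` result:

    import hashlib
    import io
    import sys
    from functools import partial

    class MD5Error(Exception):
        pass

    def _is_multipart_etag(etag):
        return '-' in etag

    def verify_then_print(response_data):
        # Buffer the body, check its MD5 against a non-multipart ETag, and
        # only then emit it to stdout, mirroring save_file(is_stream=True).
        body = response_data['Body']
        etag = response_data['ETag'][1:-1]        # strip surrounding quotes
        md5 = hashlib.md5()
        payload = b''
        for chunk in iter(partial(body.read, 1024 * 1024), b''):
            if not _is_multipart_etag(etag):
                md5.update(chunk)
            payload += chunk
        if not _is_multipart_etag(etag) and etag != md5.hexdigest():
            raise MD5Error('download corrupted; nothing was sent to stdout')
        sys.stdout.buffer.write(payload)
        sys.stdout.buffer.flush()

    # Fake GetObject-style response for demonstration.
    data = b'hello stream\n'
    fake_response = {'Body': io.BytesIO(data),
                     'ETag': '"%s"' % hashlib.md5(data).hexdigest()}
    verify_then_print(fake_response)
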
1 change: 1 addition & 0 deletions awscli/customizations/s3/fileinfobuilder.py
@@ -42,6 +42,7 @@ def _inject_info(self, file_base):
file_info_attr['src_type'] = file_base.src_type
file_info_attr['dest_type'] = file_base.dest_type
file_info_attr['operation_name'] = file_base.operation_name
file_info_attr['is_stream'] = file_base.is_stream
file_info_attr['service'] = self._service
file_info_attr['endpoint'] = self._endpoint
file_info_attr['source_endpoint'] = self._source_endpoint
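
This one-line change in the builder is what lets the flag survive the hand-off from ``FileStat`` to ``FileInfo``: every attribute, now including ``is_stream``, is copied into a kwargs dict before the ``FileInfo`` is constructed. A stripped-down sketch of that flow, using stand-ins with only the fields needed here:

    from collections import namedtuple

    # Stand-ins with just enough fields to show the hand-off.
    FileBase = namedtuple('FileBase',
                          ['src', 'dest', 'operation_name', 'is_stream'])

    class FileInfo(object):
        def __init__(self, **kwargs):
            self.__dict__.update(kwargs)

    def inject_info(file_base, service=None, endpoint=None,
                    source_endpoint=None):
        file_info_attr = {
            'src': file_base.src,
            'dest': file_base.dest,
            'operation_name': file_base.operation_name,
            'is_stream': file_base.is_stream,     # the newly forwarded flag
            'service': service,
            'endpoint': endpoint,
            'source_endpoint': source_endpoint,
        }
        return FileInfo(**file_info_attr)

    info = inject_info(FileBase('-', 'my-bucket/key', 'upload', True))
    print(info.is_stream)   # -> True
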