From 643eee1e4aab97f290dd92187727c4e3fa298a65 Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Thu, 4 Oct 2018 12:44:06 +0100
Subject: [PATCH 1/4] Use ZipFile.open instead of ZipFile.read

To avoid huge memory usage in unusual situations (e.g. a TensorFlow wheel
on a Raspberry Pi), use ZipFile.open and shutil.copyfileobj instead of
reading all the decompressed data into a byte-string.
---
 src/pip/_internal/utils/misc.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pip/_internal/utils/misc.py b/src/pip/_internal/utils/misc.py
index 84a421fe4fc..b555b89ff63 100644
--- a/src/pip/_internal/utils/misc.py
+++ b/src/pip/_internal/utils/misc.py
@@ -468,7 +468,6 @@ def unzip_file(filename, location, flatten=True):
         leading = has_leading_dir(zip.namelist()) and flatten
         for info in zip.infolist():
             name = info.filename
-            data = zip.read(name)
             fn = name
             if leading:
                 fn = split_leading_dir(name)[1]
@@ -479,9 +478,10 @@ def unzip_file(filename, location, flatten=True):
                 ensure_dir(fn)
             else:
                 ensure_dir(dir)
-                fp = open(fn, 'wb')
+                fp = zip.open(name)
                 try:
-                    fp.write(data)
+                    with open(fn, 'wb') as destfp:
+                        shutil.copyfileobj(fp, destfp)
                 finally:
                     fp.close()
                 mode = info.external_attr >> 16

From c0d3e145451249f57adb265cc87c4d388aabf75d Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Fri, 12 Oct 2018 23:10:32 +0100
Subject: [PATCH 2/4] Address comments in PR

Add news file and comment to zip.open line
---
 news/5848.bugfix                | 6 ++++++
 src/pip/_internal/utils/misc.py | 3 +++
 2 files changed, 9 insertions(+)
 create mode 100644 news/5848.bugfix

diff --git a/news/5848.bugfix b/news/5848.bugfix
new file mode 100644
index 00000000000..7b31aa6bf41
--- /dev/null
+++ b/news/5848.bugfix
@@ -0,0 +1,6 @@
+Extract files from wheel archives in chunks, instead of decompressing the
+entire file content into memory before writing it to disk. For the vast
+majority of cases (PCs with ample RAM and/or small wheels) this makes no
+difference. But there are some circumstances (e.g. a Raspberry Pi with no swap,
+the default config, plus a wheel containing huge libraries like Tensorflow) in
+which this leads to a failure to unpack and a crash with a MemoryError.
diff --git a/src/pip/_internal/utils/misc.py b/src/pip/_internal/utils/misc.py
index b555b89ff63..da3638a8203 100644
--- a/src/pip/_internal/utils/misc.py
+++ b/src/pip/_internal/utils/misc.py
@@ -478,6 +478,9 @@ def unzip_file(filename, location, flatten=True):
                 ensure_dir(fn)
             else:
                 ensure_dir(dir)
+                # Open the archive member as a file-like-object so we can
+                # copy it in chunks with copyfileobj; using read() instead
+                # potentially allocates an arbitrarily large amount of memory
                 fp = zip.open(name)
                 try:
                     with open(fn, 'wb') as destfp:

From 864287e13eda47a5770e8c3eaa20ce502c3afeb0 Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Fri, 12 Oct 2018 23:36:47 +0100
Subject: [PATCH 3/4] Shorten commentary
---
 news/5848.bugfix                | 8 ++------
 src/pip/_internal/utils/misc.py | 5 ++---
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/news/5848.bugfix b/news/5848.bugfix
index 7b31aa6bf41..ebf7094db31 100644
--- a/news/5848.bugfix
+++ b/news/5848.bugfix
@@ -1,6 +1,2 @@
-Extract files from wheel archives in chunks, instead of decompressing the
-entire file content into memory before writing it to disk. For the vast
-majority of cases (PCs with ample RAM and/or small wheels) this makes no
-difference. But there are some circumstances (e.g. a Raspberry Pi with no swap,
-the default config, plus a wheel containing huge libraries like Tensorflow) in
-which this leads to a failure to unpack and a crash with a MemoryError.
+Extract files from wheels in chunks, to avoid memory issues on smaller
+platforms when handling wheels containing large files.
diff --git a/src/pip/_internal/utils/misc.py b/src/pip/_internal/utils/misc.py
index da3638a8203..353074167e9 100644
--- a/src/pip/_internal/utils/misc.py
+++ b/src/pip/_internal/utils/misc.py
@@ -478,9 +478,8 @@ def unzip_file(filename, location, flatten=True):
                 ensure_dir(fn)
             else:
                 ensure_dir(dir)
-                # Open the archive member as a file-like-object so we can
-                # copy it in chunks with copyfileobj; using read() instead
-                # potentially allocates an arbitrarily large amount of memory
+                # Don't use read() to avoid allocating an arbitrarily large
+                # chunk of memory for the file's content
                 fp = zip.open(name)
                 try:
                     with open(fn, 'wb') as destfp:

From 64f089716b3f533e457e3cfc190c2cc630cc4016 Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Mon, 15 Oct 2018 09:06:54 +0100
Subject: [PATCH 4/4] Single line news entry
---
 news/5848.bugfix | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/news/5848.bugfix b/news/5848.bugfix
index ebf7094db31..f525bc79144 100644
--- a/news/5848.bugfix
+++ b/news/5848.bugfix
@@ -1,2 +1 @@
-Extract files from wheels in chunks, to avoid memory issues on smaller
-platforms when handling wheels containing large files.
+Greatly reduce memory usage when installing wheels containing large files.
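
For illustration, here is a minimal standalone sketch of the streaming
pattern these patches apply inside pip's unzip_file helper. The
extract_member function and its argument names are hypothetical, invented
for this example rather than taken from pip's code:

    # Hypothetical example (not pip code): stream one archive member to
    # disk in chunks. ZipFile.open() returns a file-like object over the
    # decompressed data, and shutil.copyfileobj() copies it one small
    # buffer at a time, so peak memory use stays roughly constant no
    # matter how large the member is.
    import shutil
    import zipfile

    def extract_member(zip_path, member_name, dest_path):
        with zipfile.ZipFile(zip_path) as zf:
            with zf.open(member_name) as src, open(dest_path, 'wb') as dest:
                shutil.copyfileobj(src, dest)

By contrast, ZipFile.read() returns the entire decompressed member as one
bytes object, which is what let a single large wheel (e.g. TensorFlow)
exhaust memory on a swapless Raspberry Pi.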