From c42700d10362ea6299ee40b3463dcb8a1043734e Mon Sep 17 00:00:00 2001 From: Carey Metcalfe Date: Tue, 25 Apr 2023 20:01:04 -0400 Subject: [PATCH 1/5] Fix Zip64 extensions not being properly applied in some cases This commit fixes an issue where adding a small file to a `ZipFile` object while forcing zip64 extensions causes an extra Zip64 record to be added to the zip, but doesn't update the `min_version` or file sizes. Fixes #103861 --- Lib/test/test_zipfile/test_core.py | 35 +++++++++++++++++++ Lib/zipfile/__init__.py | 12 +++---- ...-04-25-19-58-13.gh-issue-103861.JeozgD.rst | 2 ++ 3 files changed, 42 insertions(+), 7 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2023-04-25-19-58-13.gh-issue-103861.JeozgD.rst diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 73c6b0185a1a0e..9b7bfa39efad89 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1080,6 +1080,41 @@ def test_generated_valid_zip64_extra(self): self.assertEqual(zinfo.header_offset, expected_header_offset) self.assertEqual(zf.read(zinfo), expected_content) + def test_force_zip64(self): + """Test that forcing zip64 extensions correctly notes this in the zip file""" + data = io.BytesIO() + with zipfile.ZipFile(data, mode="w", allowZip64=True) as zf: + with zf.open("text.txt", mode="w", force_zip64=True) as zi: + zi.write(b"_") + + zipdata = data.getvalue() + + # pull out and check zip information + ( + header, vers, os, flags, comp, csize, usize, fn_len, + ex_total_len, filename, ex_id, ex_len, ex_usize, ex_csize, cd_sig + ) = struct.unpack("<4sBBHH8xIIHH8shhQQx4s", zipdata[:63]) + + self.assertEqual(header, b"PK\x03\x04") # local file header + self.assertGreaterEqual(vers, zipfile.ZIP64_VERSION) # requires zip64 to extract + self.assertEqual(os, 0) # compatible with MS-DOS + self.assertEqual(flags, 0) # no flags + self.assertEqual(comp, 0) # compression method = stored + self.assertEqual(csize, 0xFFFFFFFF) # sizes are in zip64 extra + self.assertEqual(usize, 0xFFFFFFFF) + self.assertEqual(fn_len, 8) # filename len + self.assertEqual(ex_total_len, 20) # size of extra records + self.assertEqual(ex_id, 1) # Zip64 extra record + self.assertEqual(ex_len, 16) # 16 bytes of data + self.assertEqual(ex_usize, 1) # uncompressed size + self.assertEqual(ex_csize, 1) # compressed size + self.assertEqual(cd_sig, b"PK\x01\x02") # ensure the central directory header is next + + z = zipfile.ZipFile(io.BytesIO(zipdata)) + zinfos = z.infolist() + self.assertEqual(len(zinfos), 1) + self.assertGreaterEqual(zinfos[0].extract_version, zipfile.ZIP64_VERSION) # requires zip64 to extract + @requires_zlib() class DeflateTestZip64InSmallFiles(AbstractTestZip64InSmallFiles, diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 95c047991f872b..6352a1daf228dd 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -455,17 +455,15 @@ def FileHeader(self, zip64=None): extra = self.extra min_version = 0 - if zip64 is None: - zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT + if (file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT): + if zip64 is None: + zip64 = True + elif not zip64: + raise LargeZipFile("Filesize would require ZIP64 extensions") if zip64: fmt = ' ZIP64_LIMIT or compress_size > ZIP64_LIMIT: - if not zip64: - raise LargeZipFile("Filesize would require ZIP64 extensions") - # File is larger than what fits into a 4 byte integer, - # fall back to the ZIP64 extension file_size = 0xffffffff compress_size = 0xffffffff min_version = ZIP64_VERSION diff --git a/Misc/NEWS.d/next/Library/2023-04-25-19-58-13.gh-issue-103861.JeozgD.rst b/Misc/NEWS.d/next/Library/2023-04-25-19-58-13.gh-issue-103861.JeozgD.rst new file mode 100644 index 00000000000000..cc1c444449fe93 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-04-25-19-58-13.gh-issue-103861.JeozgD.rst @@ -0,0 +1,2 @@ +Fix ``zipfile.Zipfile`` creating invalid zip files when ``force_zip64`` was +used to add files to them. Patch by Carey Metcalfe. From d24c6a410c6e79e3673e36d5a2db7a2b6e266104 Mon Sep 17 00:00:00 2001 From: Carey Metcalfe Date: Thu, 27 Apr 2023 20:23:57 -0400 Subject: [PATCH 2/5] Fix edge cases in checking if zip64 extensions are required This fixes an issue where if data requiring zip64 extensions was added to an unseekable stream without specifying `force_zip64=True`, zip64 extensions would not be used and a RuntimeError would not be raised when closing the file (even though the size would be known at that point). This would result in successfully writing corrupt zip files. Deciding if zip64 extensions are required outside of the `FileHeader` function means that both `FileHeader` and `_ZipWriteFile` will always be in sync. Previously, the `FileHeader` function could enable zip64 extensions without propagating that decision to the `_ZipWriteFile` class, which would then not correctly write the data descriptor record or check for errors on close. --- Lib/test/test_zipfile/test_core.py | 86 ++++++++++++++++++++++++++++++ Lib/zipfile/__init__.py | 25 ++++----- 2 files changed, 96 insertions(+), 15 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 9b7bfa39efad89..1d4aead2fc8a97 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1115,6 +1115,92 @@ def test_force_zip64(self): self.assertEqual(len(zinfos), 1) self.assertGreaterEqual(zinfos[0].extract_version, zipfile.ZIP64_VERSION) # requires zip64 to extract + def test_unseekable_zip_unknown_filesize(self): + """Test that creating a zip with/without seeking will raise a RuntimeError if zip64 was required but not used""" + + def make_zip(fp): + with zipfile.ZipFile(fp, mode="w", allowZip64=True) as zf: + with zf.open("text.txt", mode="w", force_zip64=False) as zi: + zi.write(b"_" * (zipfile.ZIP64_LIMIT + 1)) + + self.assertRaises(RuntimeError, make_zip, io.BytesIO()) + self.assertRaises(RuntimeError, make_zip, Unseekable(io.BytesIO())) + + def test_zip64_required_not_allowed_fail(self): + """Test that trying to add a large file to a zip that doesn't allow zip64 extensions fails on add""" + def make_zip(fp): + with zipfile.ZipFile(fp, mode="w", allowZip64=False) as zf: + # pretend zipfile.ZipInfo.from_file was used to get the name and filesize + info = zipfile.ZipInfo("text.txt") + info.file_size = zipfile.ZIP64_LIMIT + 1 + zf.open(info, mode="w") + + self.assertRaises(zipfile.LargeZipFile, make_zip, io.BytesIO()) + self.assertRaises(zipfile.LargeZipFile, make_zip, Unseekable(io.BytesIO())) + + def test_unseekable_zip_known_filesize(self): + """Test that creating a zip without seeking will use zip64 extensions if the file size is provided up-front""" + + file_size = zipfile.ZIP64_LIMIT + 1 + + def make_zip(fp): + with zipfile.ZipFile(fp, mode="w", allowZip64=True) as zf: + # pretend zipfile.ZipInfo.from_file was used to get the name and filesize + info = zipfile.ZipInfo("text.txt") + info.file_size = file_size + with zf.open(info, mode="w", force_zip64=False) as zi: + zi.write(b"_" * file_size) + return fp + + # check seekable file information + seekable_data = make_zip(io.BytesIO()).getvalue() + ( + header, vers, os, flags, comp, csize, usize, fn_len, + ex_total_len, filename, ex_id, ex_len, ex_usize, ex_csize, + cd_sig + ) = struct.unpack("<4sBBHH8xIIHH8shhQQ{}x4s".format(file_size), seekable_data[:62 + file_size]) + + self.assertEqual(header, b"PK\x03\x04") + self.assertGreaterEqual(vers, zipfile.ZIP64_VERSION) # requires zip64 to extract + self.assertEqual(os, 0) # compatible with MS-DOS + self.assertEqual(flags, 0) # no flags set + self.assertEqual(comp, 0) # compression method = stored + self.assertEqual(csize, 0xFFFFFFFF) # sizes are in zip64 extra + self.assertEqual(usize, 0xFFFFFFFF) + self.assertEqual(fn_len, 8) # filename len + self.assertEqual(ex_total_len, 20) # size of extra records + self.assertEqual(ex_id, 1) # Zip64 extra record + self.assertEqual(ex_len, 16) # 16 bytes of data + self.assertEqual(ex_usize, file_size) # uncompressed size + self.assertEqual(ex_csize, file_size) # compressed size + self.assertEqual(cd_sig, b"PK\x01\x02") # ensure the central directory header is next + + # check unseekable file information + unseekable_data = make_zip(Unseekable(io.BytesIO())).fp.getvalue() + ( + header, vers, os, flags, comp, csize, usize, fn_len, + ex_total_len, filename, ex_id, ex_len, ex_usize, ex_csize, + dd_header, dd_usize, dd_csize, cd_sig + ) = struct.unpack("<4sBBHH8xIIHH8shhQQ{}x4s4xQQ4s".format(file_size), unseekable_data[:86 + file_size]) + + self.assertEqual(header, b"PK\x03\x04") + self.assertGreaterEqual(vers, zipfile.ZIP64_VERSION) # requires zip64 to extract + self.assertEqual(os, 0) # compatible with MS-DOS + self.assertEqual("{:b}".format(flags), "1000") # streaming flag set + self.assertEqual(comp, 0) # compression method = stored + self.assertEqual(csize, 0xFFFFFFFF) # sizes are in zip64 extra + self.assertEqual(usize, 0xFFFFFFFF) + self.assertEqual(fn_len, 8) # filename len + self.assertEqual(ex_total_len, 20) # size of extra records + self.assertEqual(ex_id, 1) # Zip64 extra record + self.assertEqual(ex_len, 16) # 16 bytes of data + self.assertEqual(ex_usize, 0) # uncompressed size - 0 to defer to data descriptor + self.assertEqual(ex_csize, 0) # compressed size - 0 to defer to data descriptor + self.assertEqual(dd_header, b"PK\07\x08") # data descriptor + self.assertEqual(dd_usize, file_size) # file size (8 bytes because zip64) + self.assertEqual(dd_csize, file_size) # compressed size (8 bytes because zip64) + self.assertEqual(cd_sig, b"PK\x01\x02") # ensure the central directory header is next + @requires_zlib() class DeflateTestZip64InSmallFiles(AbstractTestZip64InSmallFiles, diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 6352a1daf228dd..7cb37a30667c50 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -439,7 +439,7 @@ def __repr__(self): result.append('>') return ''.join(result) - def FileHeader(self, zip64=None): + def FileHeader(self, zip64): """Return the per-file header as a bytes object.""" dt = self.date_time dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] @@ -455,11 +455,6 @@ def FileHeader(self, zip64=None): extra = self.extra min_version = 0 - if (file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT): - if zip64 is None: - zip64 = True - elif not zip64: - raise LargeZipFile("Filesize would require ZIP64 extensions") if zip64: fmt = ' ZIP64_LIMIT: + raise RuntimeError("File size too large, try using force_zip64") + if self._compress_size > ZIP64_LIMIT: + raise RuntimeError("Compressed size too large, try using force_zip64") + # Write updated header info if self._zinfo.flag_bits & _MASK_USE_DATA_DESCRIPTOR: # Write CRC and file sizes after the file data @@ -1223,13 +1224,6 @@ def close(self): self._zinfo.compress_size, self._zinfo.file_size)) self._zipfile.start_dir = self._fileobj.tell() else: - if not self._zip64: - if self._file_size > ZIP64_LIMIT: - raise RuntimeError( - 'File size too large, try using force_zip64') - if self._compress_size > ZIP64_LIMIT: - raise RuntimeError( - 'Compressed size too large, try using force_zip64') # Seek backwards and write file header (which will now include # correct CRC and file sizes) @@ -1668,8 +1662,9 @@ def _open_to_write(self, zinfo, force_zip64=False): zinfo.external_attr = 0o600 << 16 # permissions: ?rw------- # Compressed size can be larger than uncompressed size - zip64 = self._allowZip64 and \ - (force_zip64 or zinfo.file_size * 1.05 > ZIP64_LIMIT) + zip64 = force_zip64 or (zinfo.file_size * 1.05 > ZIP64_LIMIT) + if not self._allowZip64 and zip64: + raise LargeZipFile("Filesize would require ZIP64 extensions") if self._seekable: self.fp.seek(self.start_dir) From 9b66476f542e9ad7f869e2a5f22f8f4eaf231ca9 Mon Sep 17 00:00:00 2001 From: Carey Metcalfe Date: Sun, 30 Apr 2023 22:10:47 -0400 Subject: [PATCH 3/5] SQUASHME: Add zip64 default back to FileHeader --- Lib/zipfile/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 7cb37a30667c50..b1a41730082fcc 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -439,7 +439,7 @@ def __repr__(self): result.append('>') return ''.join(result) - def FileHeader(self, zip64): + def FileHeader(self, zip64=False): """Return the per-file header as a bytes object.""" dt = self.date_time dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] From b68e70f696f967c52eb4dca3de8ff58f6e3a78a0 Mon Sep 17 00:00:00 2001 From: Carey Metcalfe Date: Sun, 30 Apr 2023 22:57:59 -0400 Subject: [PATCH 4/5] SQUASHME: Added comments to tests --- Lib/test/test_zipfile/test_core.py | 36 ++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 1d4aead2fc8a97..9960259c4cde0c 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1082,6 +1082,25 @@ def test_generated_valid_zip64_extra(self): def test_force_zip64(self): """Test that forcing zip64 extensions correctly notes this in the zip file""" + + # GH-103861 describes an issue where forcing a small file to use zip64 + # extensions would add a zip64 extra record, but not change the data + # sizes to 0xFFFFFFFF to indicate to the extractor that the zip64 + # record should be read. Additionally, it would not set the required + # version to indicate that zip64 extensions are required to extract it. + # This test replicates the situation and reads the raw data to specifically ensure: + # - The required extract version is always >= ZIP64_VERSION + # - The compressed and uncompressed size in the file headers are both + # 0xFFFFFFFF (ie. point to zip64 record) + # - The zip64 record is provided and has the correct sizes in it + # Other aspects of the zip are checked as well, but verifying the above is the main goal. + # Because this is hard to verify by parsing the data as a zip, the raw + # bytes are checked to ensure that they line up with the zip spec. + # The spec for this can be found at: https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT + # The relevent sections for this test are: + # - 4.3.7 for local file header + # - 4.5.3 for zip64 extra field + data = io.BytesIO() with zipfile.ZipFile(data, mode="w", allowZip64=True) as zf: with zf.open("text.txt", mode="w", force_zip64=True) as zi: @@ -1141,6 +1160,19 @@ def make_zip(fp): def test_unseekable_zip_known_filesize(self): """Test that creating a zip without seeking will use zip64 extensions if the file size is provided up-front""" + # This test ensures that the zip will use a zip64 data descriptor (same + # as a regular data descriptor except the sizes are 8 bytes instead of + # 4) record to communicate the size of a file if the zip is being + # written to an unseekable stream. + # Because this sort of thing is hard to verify by parsing the data back + # in as a zip, this test looks at the raw bytes created to ensure that + # the correct data has been generated. + # The spec for this can be found at: https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT + # The relevent sections for this test are: + # - 4.3.7 for local file header + # - 4.3.9 for the data descriptor + # - 4.5.3 for zip64 extra field + file_size = zipfile.ZIP64_LIMIT + 1 def make_zip(fp): @@ -1160,7 +1192,7 @@ def make_zip(fp): cd_sig ) = struct.unpack("<4sBBHH8xIIHH8shhQQ{}x4s".format(file_size), seekable_data[:62 + file_size]) - self.assertEqual(header, b"PK\x03\x04") + self.assertEqual(header, b"PK\x03\x04") # local file header self.assertGreaterEqual(vers, zipfile.ZIP64_VERSION) # requires zip64 to extract self.assertEqual(os, 0) # compatible with MS-DOS self.assertEqual(flags, 0) # no flags set @@ -1183,7 +1215,7 @@ def make_zip(fp): dd_header, dd_usize, dd_csize, cd_sig ) = struct.unpack("<4sBBHH8xIIHH8shhQQ{}x4s4xQQ4s".format(file_size), unseekable_data[:86 + file_size]) - self.assertEqual(header, b"PK\x03\x04") + self.assertEqual(header, b"PK\x03\x04") # local file header self.assertGreaterEqual(vers, zipfile.ZIP64_VERSION) # requires zip64 to extract self.assertEqual(os, 0) # compatible with MS-DOS self.assertEqual("{:b}".format(flags), "1000") # streaming flag set From ee7c78bd6f183e346cc4bdaa427043aaae9da874 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Mon, 15 May 2023 23:26:56 -0700 Subject: [PATCH 5/5] Restore ZipInfo.FileHeader zip64=None default to maintain backwards compatibility in the API. Code within this module always passes an explicit zip64, so the overall bug fix remains valid. We just don't want to break existing user code constructing their own ZipInfo objects and calling zi.FileHeader themselves for whatever reasons. _(hopefully rare, but it isn't a protected or private API so we can't make assumptions)_ --- Lib/zipfile/__init__.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index b1a41730082fcc..aabd5cfbe6a2cd 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -439,8 +439,13 @@ def __repr__(self): result.append('>') return ''.join(result) - def FileHeader(self, zip64=False): - """Return the per-file header as a bytes object.""" + def FileHeader(self, zip64=None): + """Return the per-file header as a bytes object. + + When the optional zip64 arg is None rather than a bool, we will + decide based upon the file_size and compress_size, if known, + False otherwise. + """ dt = self.date_time dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) @@ -455,6 +460,10 @@ def FileHeader(self, zip64=False): extra = self.extra min_version = 0 + if zip64 is None: + # We always explicitly pass zip64 within this module.... This + # remains for anyone using ZipInfo.FileHeader as a public API. + zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT if zip64: fmt = '