From cd788e6b6b9f2e8547f93364e26b516626156f2a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 18 May 2026 20:42:29 +0300 Subject: [PATCH 1/4] gh-84353: Preserve non-UTF-8 filenames when appending to ZipFile --- Lib/test/test_zipfile/test_core.py | 40 +++++++++++-------- Lib/zipfile/__init__.py | 22 +++++----- ...6-05-19-19-00-49.gh-issue-84353.ZU5zaQ.rst | 2 + 3 files changed, 39 insertions(+), 25 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-05-19-19-00-49.gh-issue-84353.ZU5zaQ.rst diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 0d407371f40a0f7..b03563153df5783 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -3639,29 +3639,23 @@ def test_read_with_unsuitable_metadata_encoding(self): def test_read_after_append(self): newname = '\u56db' # Han 'four' - expected_names = [name.encode('shift_jis').decode('cp437') - for name in self.file_names[:2]] + self.file_names[2:] - expected_names.append(newname) - expected_content = (*self.file_content, b"newcontent") + newname2 = 'fünf' # encodeable in cp437 + expected_names = [*self.file_names, newname, newname2] + bad_expected_names = [name.encode('shift_jis').decode('cp437') + if i < 2 else name + for i, name in enumerate(expected_names)] + expected_content = (*self.file_content, b"newcontent", b"newcontent2") with zipfile.ZipFile(TESTFN, "a") as zipfp: zipfp.writestr(newname, "newcontent") - self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names)) + zipfp.writestr(newname2, "newcontent2") + self.assertEqual(sorted(zipfp.namelist()), sorted(bad_expected_names)) with zipfile.ZipFile(TESTFN, "r") as zipfp: - self._test_read(zipfp, expected_names, expected_content) + self._test_read(zipfp, bad_expected_names, expected_content) with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp: - self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names)) - for i, (name, content) in enumerate(zip(expected_names, expected_content)): - info = zipfp.getinfo(name) - self.assertEqual(info.filename, name) - self.assertEqual(info.file_size, len(content)) - if i < 2: - with self.assertRaises(zipfile.BadZipFile): - zipfp.read(name) - else: - self.assertEqual(zipfp.read(name), content) + self._test_read(zipfp, expected_names, expected_content) def test_write_with_metadata_encoding(self): ZF = zipfile.ZipFile @@ -3670,6 +3664,20 @@ def test_write_with_metadata_encoding(self): "^metadata_encoding is only"): ZF("nonesuch.zip", mode, metadata_encoding="shift_jis") + def test_add_comment(self): + with zipfile.ZipFile(TESTFN, "r") as zipfp: + bad_expected_names = zipfp.namelist() + + with zipfile.ZipFile(TESTFN, "a") as zipfp: + zipfp.comment = b'comment' + self.assertEqual(zipfp.namelist(), bad_expected_names) + + with zipfile.ZipFile(TESTFN, "r") as zipfp: + self._test_read(zipfp, bad_expected_names, self.file_content) + + with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp: + self._test_read(zipfp, self.file_names, self.file_content) + def test_cli_with_metadata_encoding(self): errmsg = "Non-conforming encodings not supported with -c." args = ["--metadata-encoding=shift_jis", "-c", "nonesuch", "nonesuch"] diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 86c3bc36b695c79..1ae37ad5463ccaf 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -515,7 +515,7 @@ def __repr__(self): result.append('>') return ''.join(result) - def FileHeader(self, zip64=None): + def FileHeader(self, zip64=None, metadata_encoding=None): """Return the per-file header as a bytes object. When the optional zip64 arg is None rather than a bool, we will @@ -557,7 +557,7 @@ def FileHeader(self, zip64=None): self.extract_version = max(min_version, self.extract_version) self.create_version = max(min_version, self.create_version) - filename, flag_bits = self._encodeFilenameFlags() + filename, flag_bits = self._encodeFilenameFlags(metadata_encoding) header = struct.pack(structFileHeader, stringFileHeader, self.extract_version, self.reserved, flag_bits, self.compress_type, dostime, dosdate, CRC, @@ -565,9 +565,11 @@ def FileHeader(self, zip64=None): len(filename), len(extra)) return header + filename + extra - def _encodeFilenameFlags(self): + def _encodeFilenameFlags(self, encoding): + if not encoding or self.flag_bits & _MASK_UTF_FILENAME: + encoding = 'ascii' try: - return self.filename.encode('ascii'), self.flag_bits + return self.filename.encode(encoding), self.flag_bits & ~_MASK_UTF_FILENAME except UnicodeEncodeError: return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME @@ -1370,7 +1372,7 @@ def close(self): # Preserve current position in file self._zipfile.start_dir = self._fileobj.tell() self._fileobj.seek(self._zinfo.header_offset) - self._fileobj.write(self._zinfo.FileHeader(self._zip64)) + self._fileobj.write(self._zinfo.FileHeader(self._zip64, self._zipfile.metadata_encoding)) self._fileobj.seek(self._zipfile.start_dir) # Successfully written: Add file to our caches @@ -1571,6 +1573,8 @@ def _RealGetContents(self): else: # Historical ZIP filename encoding filename = filename.decode(self.metadata_encoding or 'cp437') + if not self.metadata_encoding and not filename.isascii(): + self.metadata_encoding = "cp437" # Create ZipInfo instance to store file information x = ZipInfo(filename) x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) @@ -1808,7 +1812,7 @@ def _open_to_write(self, zinfo, force_zip64=False): zinfo.compress_size = 0 zinfo.CRC = 0 - zinfo.flag_bits = 0x00 + zinfo.flag_bits = _MASK_UTF_FILENAME if zinfo.compress_type == ZIP_LZMA: # Compressed data includes an end-of-stream (EOS) marker zinfo.flag_bits |= _MASK_COMPRESS_OPTION_1 @@ -1830,7 +1834,7 @@ def _open_to_write(self, zinfo, force_zip64=False): self._writecheck(zinfo) self._didModify = True - self.fp.write(zinfo.FileHeader(zip64)) + self.fp.write(zinfo.FileHeader(zip64, self.metadata_encoding)) self._writing = True return _ZipWriteFile(self, zinfo, zip64) @@ -2062,7 +2066,7 @@ def mkdir(self, zinfo_or_directory_name, mode=511): self.filelist.append(zinfo) self.NameToInfo[zinfo.filename] = zinfo - self.fp.write(zinfo.FileHeader(False)) + self.fp.write(zinfo.FileHeader(False, self.metadata_encoding)) self.start_dir = self.fp.tell() def __del__(self): @@ -2133,7 +2137,7 @@ def _write_end_record(self): extract_version = max(min_version, zinfo.extract_version) create_version = max(min_version, zinfo.create_version) - filename, flag_bits = zinfo._encodeFilenameFlags() + filename, flag_bits = zinfo._encodeFilenameFlags(self.metadata_encoding) centdir = struct.pack(structCentralDir, stringCentralDir, create_version, zinfo.create_system, extract_version, zinfo.reserved, diff --git a/Misc/NEWS.d/next/Library/2026-05-19-19-00-49.gh-issue-84353.ZU5zaQ.rst b/Misc/NEWS.d/next/Library/2026-05-19-19-00-49.gh-issue-84353.ZU5zaQ.rst new file mode 100644 index 000000000000000..198af61ba9dd1df --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-05-19-19-00-49.gh-issue-84353.ZU5zaQ.rst @@ -0,0 +1,2 @@ +Preserve non-ASCII filenames encoded not in UTF-8 when appending to +:class:`zipfile.ZipFile`. From 7d3a02f62fe3e6fcdff41828b26fed2c65f4ab34 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 19 May 2026 19:56:28 +0300 Subject: [PATCH 2/4] Simplify!!! --- Lib/zipfile/__init__.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 1ae37ad5463ccaf..7e6e94d82d11fd3 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -515,7 +515,7 @@ def __repr__(self): result.append('>') return ''.join(result) - def FileHeader(self, zip64=None, metadata_encoding=None): + def FileHeader(self, zip64=None): """Return the per-file header as a bytes object. When the optional zip64 arg is None rather than a bool, we will @@ -557,7 +557,7 @@ def FileHeader(self, zip64=None, metadata_encoding=None): self.extract_version = max(min_version, self.extract_version) self.create_version = max(min_version, self.create_version) - filename, flag_bits = self._encodeFilenameFlags(metadata_encoding) + filename, flag_bits = self._encodeFilenameFlags() header = struct.pack(structFileHeader, stringFileHeader, self.extract_version, self.reserved, flag_bits, self.compress_type, dostime, dosdate, CRC, @@ -565,9 +565,11 @@ def FileHeader(self, zip64=None, metadata_encoding=None): len(filename), len(extra)) return header + filename + extra - def _encodeFilenameFlags(self, encoding): - if not encoding or self.flag_bits & _MASK_UTF_FILENAME: + def _encodeFilenameFlags(self): + if self.flag_bits & _MASK_UTF_FILENAME: encoding = 'ascii' + else: + encoding = 'cp437' try: return self.filename.encode(encoding), self.flag_bits & ~_MASK_UTF_FILENAME except UnicodeEncodeError: @@ -1372,7 +1374,7 @@ def close(self): # Preserve current position in file self._zipfile.start_dir = self._fileobj.tell() self._fileobj.seek(self._zinfo.header_offset) - self._fileobj.write(self._zinfo.FileHeader(self._zip64, self._zipfile.metadata_encoding)) + self._fileobj.write(self._zinfo.FileHeader(self._zip64)) self._fileobj.seek(self._zipfile.start_dir) # Successfully written: Add file to our caches @@ -1573,8 +1575,6 @@ def _RealGetContents(self): else: # Historical ZIP filename encoding filename = filename.decode(self.metadata_encoding or 'cp437') - if not self.metadata_encoding and not filename.isascii(): - self.metadata_encoding = "cp437" # Create ZipInfo instance to store file information x = ZipInfo(filename) x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) @@ -1834,7 +1834,7 @@ def _open_to_write(self, zinfo, force_zip64=False): self._writecheck(zinfo) self._didModify = True - self.fp.write(zinfo.FileHeader(zip64, self.metadata_encoding)) + self.fp.write(zinfo.FileHeader(zip64)) self._writing = True return _ZipWriteFile(self, zinfo, zip64) @@ -2066,7 +2066,7 @@ def mkdir(self, zinfo_or_directory_name, mode=511): self.filelist.append(zinfo) self.NameToInfo[zinfo.filename] = zinfo - self.fp.write(zinfo.FileHeader(False, self.metadata_encoding)) + self.fp.write(zinfo.FileHeader(False)) self.start_dir = self.fp.tell() def __del__(self): @@ -2137,7 +2137,7 @@ def _write_end_record(self): extract_version = max(min_version, zinfo.extract_version) create_version = max(min_version, zinfo.create_version) - filename, flag_bits = zinfo._encodeFilenameFlags(self.metadata_encoding) + filename, flag_bits = zinfo._encodeFilenameFlags() centdir = struct.pack(structCentralDir, stringCentralDir, create_version, zinfo.create_system, extract_version, zinfo.reserved, From 828bc55c736c3db0f54b646929738df250575873 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Wed, 27 May 2026 10:19:59 -0700 Subject: [PATCH 3/4] Address review: clarify test naming and NEWS wording - Rename bad_expected_names -> mojibake_expected_names in the append / comment tests. - The comment on newname2 was potentially misleading: it is representable in cp437 but is still stored as UTF-8 (new entries default to UTF-8 flag). - Reword the NEWS entry. --- Lib/test/test_zipfile/test_core.py | 18 +++++++++--------- ...26-05-19-19-00-49.gh-issue-84353.ZU5zaQ.rst | 6 ++++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index b03563153df5783..dbad5f0d1255cfd 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -3639,20 +3639,20 @@ def test_read_with_unsuitable_metadata_encoding(self): def test_read_after_append(self): newname = '\u56db' # Han 'four' - newname2 = 'fünf' # encodeable in cp437 + newname2 = 'fünf' # representable in cp437, but still stored as UTF-8 expected_names = [*self.file_names, newname, newname2] - bad_expected_names = [name.encode('shift_jis').decode('cp437') - if i < 2 else name - for i, name in enumerate(expected_names)] + mojibake_expected_names = [name.encode('shift_jis').decode('cp437') + if i < 2 else name + for i, name in enumerate(expected_names)] expected_content = (*self.file_content, b"newcontent", b"newcontent2") with zipfile.ZipFile(TESTFN, "a") as zipfp: zipfp.writestr(newname, "newcontent") zipfp.writestr(newname2, "newcontent2") - self.assertEqual(sorted(zipfp.namelist()), sorted(bad_expected_names)) + self.assertEqual(sorted(zipfp.namelist()), sorted(mojibake_expected_names)) with zipfile.ZipFile(TESTFN, "r") as zipfp: - self._test_read(zipfp, bad_expected_names, expected_content) + self._test_read(zipfp, mojibake_expected_names, expected_content) with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp: self._test_read(zipfp, expected_names, expected_content) @@ -3666,14 +3666,14 @@ def test_write_with_metadata_encoding(self): def test_add_comment(self): with zipfile.ZipFile(TESTFN, "r") as zipfp: - bad_expected_names = zipfp.namelist() + mojibake_expected_names = zipfp.namelist() with zipfile.ZipFile(TESTFN, "a") as zipfp: zipfp.comment = b'comment' - self.assertEqual(zipfp.namelist(), bad_expected_names) + self.assertEqual(zipfp.namelist(), mojibake_expected_names) with zipfile.ZipFile(TESTFN, "r") as zipfp: - self._test_read(zipfp, bad_expected_names, self.file_content) + self._test_read(zipfp, mojibake_expected_names, self.file_content) with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp: self._test_read(zipfp, self.file_names, self.file_content) diff --git a/Misc/NEWS.d/next/Library/2026-05-19-19-00-49.gh-issue-84353.ZU5zaQ.rst b/Misc/NEWS.d/next/Library/2026-05-19-19-00-49.gh-issue-84353.ZU5zaQ.rst index 198af61ba9dd1df..659a7e2b3560b68 100644 --- a/Misc/NEWS.d/next/Library/2026-05-19-19-00-49.gh-issue-84353.ZU5zaQ.rst +++ b/Misc/NEWS.d/next/Library/2026-05-19-19-00-49.gh-issue-84353.ZU5zaQ.rst @@ -1,2 +1,4 @@ -Preserve non-ASCII filenames encoded not in UTF-8 when appending to -:class:`zipfile.ZipFile`. +Preserve non-UTF-8 encoded filenames when appending to a +:class:`zipfile.ZipFile`. Previously, names stored in a legacy encoding +(without the UTF-8 flag bit set) were corrupted when the central directory +was rewritten. From 38bc1b1f5d211a3cde9d789ed64091191f788476 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Wed, 27 May 2026 10:26:56 -0700 Subject: [PATCH 4/4] reword news again --- .../Library/2026-05-19-19-00-49.gh-issue-84353.ZU5zaQ.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2026-05-19-19-00-49.gh-issue-84353.ZU5zaQ.rst b/Misc/NEWS.d/next/Library/2026-05-19-19-00-49.gh-issue-84353.ZU5zaQ.rst index 659a7e2b3560b68..84fb12e2abd81a0 100644 --- a/Misc/NEWS.d/next/Library/2026-05-19-19-00-49.gh-issue-84353.ZU5zaQ.rst +++ b/Misc/NEWS.d/next/Library/2026-05-19-19-00-49.gh-issue-84353.ZU5zaQ.rst @@ -1,4 +1,5 @@ Preserve non-UTF-8 encoded filenames when appending to a -:class:`zipfile.ZipFile`. Previously, names stored in a legacy encoding -(without the UTF-8 flag bit set) were corrupted when the central directory -was rewritten. +:class:`zipfile.ZipFile`. Previously, non-ASCII names stored in a legacy +encoding (without the UTF-8 flag bit set) could be corrupted when the +central directory was rewritten: they were decoded as cp437 and then +re-stored as UTF-8.