Skip to content

Commit a2a74cb

Browse files
serhiy-storchaka, gpshead
authored and committed
gh-84353: Preserve non-UTF-8 filenames when appending to ZipFile (GH-150091)
Preserve non-UTF-8 filenames when appending to a ZipFile.

---------

(cherry picked from commit 24c6bbc)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Gregory P. Smith <greg@krypto.org>
1 parent 40c8043 commit a2a74cb

3 files changed

Lines changed: 35 additions & 18 deletions

File tree

Lib/test/test_zipfile/test_core.py

Lines changed: 24 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -3640,29 +3640,23 @@ def test_read_with_unsuitable_metadata_encoding(self):
36403640

36413641
def test_read_after_append(self):
36423642
newname = '\u56db' # Han 'four'
3643-
expected_names = [name.encode('shift_jis').decode('cp437')
3644-
for name in self.file_names[:2]] + self.file_names[2:]
3645-
expected_names.append(newname)
3646-
expected_content = (*self.file_content, b"newcontent")
3643+
newname2 = 'fünf' # representable in cp437, but still stored as UTF-8
3644+
expected_names = [*self.file_names, newname, newname2]
3645+
mojibake_expected_names = [name.encode('shift_jis').decode('cp437')
3646+
if i < 2 else name
3647+
for i, name in enumerate(expected_names)]
3648+
expected_content = (*self.file_content, b"newcontent", b"newcontent2")
36473649

36483650
with zipfile.ZipFile(TESTFN, "a") as zipfp:
36493651
zipfp.writestr(newname, "newcontent")
3650-
self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names))
3652+
zipfp.writestr(newname2, "newcontent2")
3653+
self.assertEqual(sorted(zipfp.namelist()), sorted(mojibake_expected_names))
36513654

36523655
with zipfile.ZipFile(TESTFN, "r") as zipfp:
3653-
self._test_read(zipfp, expected_names, expected_content)
3656+
self._test_read(zipfp, mojibake_expected_names, expected_content)
36543657

36553658
with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp:
3656-
self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names))
3657-
for i, (name, content) in enumerate(zip(expected_names, expected_content)):
3658-
info = zipfp.getinfo(name)
3659-
self.assertEqual(info.filename, name)
3660-
self.assertEqual(info.file_size, len(content))
3661-
if i < 2:
3662-
with self.assertRaises(zipfile.BadZipFile):
3663-
zipfp.read(name)
3664-
else:
3665-
self.assertEqual(zipfp.read(name), content)
3659+
self._test_read(zipfp, expected_names, expected_content)
36663660

36673661
def test_write_with_metadata_encoding(self):
36683662
ZF = zipfile.ZipFile
@@ -3671,6 +3665,20 @@ def test_write_with_metadata_encoding(self):
36713665
"^metadata_encoding is only"):
36723666
ZF("nonesuch.zip", mode, metadata_encoding="shift_jis")
36733667

3668+
def test_add_comment(self):
3669+
with zipfile.ZipFile(TESTFN, "r") as zipfp:
3670+
mojibake_expected_names = zipfp.namelist()
3671+
3672+
with zipfile.ZipFile(TESTFN, "a") as zipfp:
3673+
zipfp.comment = b'comment'
3674+
self.assertEqual(zipfp.namelist(), mojibake_expected_names)
3675+
3676+
with zipfile.ZipFile(TESTFN, "r") as zipfp:
3677+
self._test_read(zipfp, mojibake_expected_names, self.file_content)
3678+
3679+
with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp:
3680+
self._test_read(zipfp, self.file_names, self.file_content)
3681+
36743682
def test_cli_with_metadata_encoding(self):
36753683
errmsg = "Non-conforming encodings not supported with -c."
36763684
args = ["--metadata-encoding=shift_jis", "-c", "nonesuch", "nonesuch"]

Lib/zipfile/__init__.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -566,8 +566,12 @@ def FileHeader(self, zip64=None):
566566
return header + filename + extra
567567

568568
def _encodeFilenameFlags(self):
569+
if self.flag_bits & _MASK_UTF_FILENAME:
570+
encoding = 'ascii'
571+
else:
572+
encoding = 'cp437'
569573
try:
570-
return self.filename.encode('ascii'), self.flag_bits
574+
return self.filename.encode(encoding), self.flag_bits & ~_MASK_UTF_FILENAME
571575
except UnicodeEncodeError:
572576
return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME
573577

@@ -1812,7 +1816,7 @@ def _open_to_write(self, zinfo, force_zip64=False):
18121816
zinfo.compress_size = 0
18131817
zinfo.CRC = 0
18141818

1815-
zinfo.flag_bits = 0x00
1819+
zinfo.flag_bits = _MASK_UTF_FILENAME
18161820
if zinfo.compress_type == ZIP_LZMA:
18171821
# Compressed data includes an end-of-stream (EOS) marker
18181822
zinfo.flag_bits |= _MASK_COMPRESS_OPTION_1
Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
Preserve non-UTF-8 encoded filenames when appending to a
2+
:class:`zipfile.ZipFile`. Previously, non-ASCII names stored in a legacy
3+
encoding (without the UTF-8 flag bit set) could be corrupted when the
4+
central directory was rewritten: they were decoded as cp437 and then
5+
re-stored as UTF-8.

0 commit comments

Comments
 (0)