压缩包目录结构
里面有中文.zip
├── 中文文件.txt
└── 中文文件夹
    └── 中文文件.txt
解压代码
from zipfile import ZipFile

# Baseline demo: extract with the stock ZipFile, applying no filename fix-up.
with ZipFile(file=r'./里面有中文.zip') as archive:
    archive.extractall(path=r'./原版解压')
解压目录样式
原版解压
├── ╓╨╬─╬─╝■.txt
└── ╓╨╬─╬─╝■╝╨
    └── ╓╨╬─╬─╝■.txt
zipfile.py中ZipFile在初始化时执行了这样一个方法,目的是读取目录结构,代码如下(可以不看):
def _RealGetContents(self):
    """Read in the table of contents for the ZIP file.

    Locates the end-of-central-directory record, reads the whole central
    directory into memory and walks it record by record.  For every record
    a ZipInfo is built, appended to ``self.filelist`` and registered in
    ``self.NameToInfo`` under its decoded file name.

    Raises:
        BadZipFile: if no end record is found, a central directory record
            is truncated, or a record has the wrong signature.
        NotImplementedError: if an entry's ``extract_version`` exceeds
            ``MAX_EXTRACT_VERSION``.
    """
    fp = self.fp
    try:
        endrec = _EndRecData(fp)
    except OSError:
        raise BadZipFile("File is not a zip file")
    if not endrec:
        raise BadZipFile("File is not a zip file")
    if self.debug > 1:
        print(endrec)
    size_cd = endrec[_ECD_SIZE]             # bytes in central directory
    offset_cd = endrec[_ECD_OFFSET]         # offset of central directory
    self._comment = endrec[_ECD_COMMENT]    # archive comment

    # "concat" is zero, unless zip was concatenated to another file
    concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
    if endrec[_ECD_SIGNATURE] == stringEndArchive64:
        # If Zip64 extension structures are present, account for them
        concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)

    if self.debug > 2:
        inferred = concat + offset_cd
        print("given, inferred, offset", offset_cd, inferred, concat)
    # self.start_dir: Position of start of central directory
    self.start_dir = offset_cd + concat
    fp.seek(self.start_dir, 0)
    data = fp.read(size_cd)
    # From here on ``fp`` is an in-memory view of the central directory only.
    fp = io.BytesIO(data)
    total = 0
    while total < size_cd:
        centdir = fp.read(sizeCentralDir)
        if len(centdir) != sizeCentralDir:
            raise BadZipFile("Truncated central directory")
        centdir = struct.unpack(structCentralDir, centdir)
        if centdir[_CD_SIGNATURE] != stringCentralDir:
            raise BadZipFile("Bad magic number for central directory")
        if self.debug > 2:
            print(centdir)
        filename = fp.read(centdir[_CD_FILENAME_LENGTH])
        # Index 5 is the same field that is stored as x.flag_bits below.
        flags = centdir[5]
        # Only two decodings are attempted: UTF-8 when bit 0x800 is set,
        # cp437 otherwise.  Names written in any other code page (e.g. GBK)
        # are therefore decoded as cp437 and come out garbled.
        if flags & 0x800:
            # UTF-8 file names extension
            filename = filename.decode('utf-8')
        else:
            # Historical ZIP filename encoding
            filename = filename.decode('cp437')
        # Create ZipInfo instance to store file information
        x = ZipInfo(filename)
        x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
        x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
        x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
        (x.create_version, x.create_system, x.extract_version, x.reserved,
         x.flag_bits, x.compress_type, t, d,
         x.CRC, x.compress_size, x.file_size) = centdir[1:12]
        if x.extract_version > MAX_EXTRACT_VERSION:
            raise NotImplementedError("zip file version %.1f" %
                                      (x.extract_version / 10))
        x.volume, x.internal_attr, x.external_attr = centdir[15:18]
        # Convert date/time code to (year, month, day, hour, min, sec)
        x._raw_time = t
        x.date_time = ((d >> 9) + 1980, (d >> 5) & 0xF, d & 0x1F,
                       t >> 11, (t >> 5) & 0x3F, (t & 0x1F) * 2)

        x._decodeExtra()
        # Shift the local header offset by the same "concat" correction
        # that was applied to the central directory position.
        x.header_offset = x.header_offset + concat
        # The decoded name is kept in two places: on the ZipInfo held in
        # filelist, and as the key of the NameToInfo mapping.
        self.filelist.append(x)
        self.NameToInfo[x.filename] = x

        # update total bytes read from central directory
        total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]
                 + centdir[_CD_EXTRA_FIELD_LENGTH]
                 + centdir[_CD_COMMENT_LENGTH])

        if self.debug > 2:
            print("total", total)
其中关于文件名编码的处理是这样的:
# Excerpt: how the raw filename bytes of an entry are turned into a str.
# Bit 0x800 of the entry's flags selects UTF-8; every other name is
# decoded as cp437, whatever code page it was actually written in.
if flags & 0x800:
    # UTF-8 file names extension
    filename = filename.decode('utf-8')
else:
    # Historical ZIP filename encoding
    filename = filename.decode('cp437')
文件名要么按 `utf-8` 解码,要么按 `cp437` 解码。然而咱们在 Windows 平台上生成的压缩包,文件名编码大多是 gbk,这里却用 `cp437` 解码了,所以会乱码。
只需要把 filename 先用 `cp437` 重新编码回原始字节,再按 `gbk` 解码就好了。网络上大多数做法是对解压后的文件和目录操作,不太优雅。优雅一点点的方式是直接修改 ZipFile 对象中的 filename。
首先找到 filename 存在于哪里。同样在 `_RealGetContents` 这个函数中,可以找到如下代码:
# Excerpt: the two places where the decoded filename ends up.
x = ZipInfo(filename)
...  # several lines omitted
# self is the ZipFile object
self.filelist.append(x)          # the ZipInfo object carries x.filename
self.NameToInfo[x.filename] = x  # the name is also the mapping key
所以至少有两处存在filename
加个补丁函数把这两处改了试试,代码如下:
from zipfile import ZipFile


def support_gbk(zip_file: ZipFile, encoding: str = 'gbk') -> ZipFile:
    """Re-decode legacy (non-UTF-8) entry names of an opened archive.

    ZipFile decodes every name that is not flagged as UTF-8 with cp437.
    For archives whose names were really written in another code page
    (GBK by default) this produces mojibake.  This function reverses the
    cp437 decoding and decodes the original bytes with ``encoding``,
    updating both ``ZipInfo.filename`` and the ``NameToInfo`` key so that
    ``namelist()``, ``read()`` and ``extractall()`` use the fixed name.

    Entries are left untouched when:
      * their UTF-8 flag (bit 0x800 of ``flag_bits``) is set, because the
        name was already decoded correctly and usually cannot even be
        re-encoded as cp437;
      * the name cannot be round-tripped through cp437/``encoding``, i.e.
        it is most likely a genuine cp437 name.

    Args:
        zip_file: an already opened ``ZipFile``; it is modified in place.
        encoding: the code page the legacy names were written in.

    Returns:
        The same ``zip_file`` object, so the call can wrap the constructor
        inside a ``with`` statement.
    """
    name_to_info = zip_file.NameToInfo
    # Iterate over a snapshot: keys are removed and inserted while looping.
    for name, info in list(name_to_info.items()):
        if info.flag_bits & 0x800:
            # Name was stored as UTF-8 and is already correct.
            continue
        try:
            real_name = name.encode('cp437').decode(encoding)
        except UnicodeError:
            # Not representable in cp437 or not valid in the target code
            # page: keep the name ZipFile produced instead of failing.
            continue
        if real_name != name:
            # info is the same object that is stored in zip_file.filelist,
            # so changing it here fixes both places that hold the name.
            info.filename = real_name
            del name_to_info[name]
            name_to_info[real_name] = info
    return zip_file


if __name__ == '__main__':
    with support_gbk(ZipFile(r'./里面有中文.zip')) as zfp:
        zfp.extractall(r'./中文不乱码')
解压出来目录结构如下
中文不乱码
├── 中文文件.txt
└── 中文文件夹
    └── 中文文件.txt
完事!
如有问题,欢迎交流!