1515#from importlib import _bootstrap_external
1616#from importlib import _bootstrap # for _verbose_message
1717import _frozen_importlib_external as _bootstrap_external
18- from _frozen_importlib_external import _unpack_uint16 , _unpack_uint32
18+ from _frozen_importlib_external import _unpack_uint16 , _unpack_uint32 , _unpack_uint64
1919import _frozen_importlib as _bootstrap # for _verbose_message
2020import _imp # for check_hash_based_pycs
2121import _io # for open
@@ -40,8 +40,14 @@ class ZipImportError(ImportError):
4040_module_type = type (sys )
4141
4242END_CENTRAL_DIR_SIZE = 22
43- STRING_END_ARCHIVE = b'PK\x05 \x06 '
43+ END_CENTRAL_DIR_SIZE_64 = 56
44+ END_CENTRAL_DIR_LOCATOR_SIZE_64 = 20
45+ STRING_END_ARCHIVE = b'PK\x05 \x06 ' # standard EOCD signature
46+ STRING_END_LOCATOR_64 = b'PK\x06 \x07 ' # Zip64 EOCD Locator signature
47+ STRING_END_ZIP_64 = b'PK\x06 \x06 ' # Zip64 EOCD signature
4448MAX_COMMENT_LEN = (1 << 16 ) - 1
49+ MAX_UINT32 = 0xffffffff
50+ ZIP64_EXTRA_TAG = 0x1
4551
4652class zipimporter (_bootstrap_external ._LoaderBasics ):
4753 """zipimporter(archivepath) -> zipimporter object
@@ -356,49 +362,72 @@ def _read_directory(archive):
356362 # to not cause problems when someone runs 'python3 /dev/fd/9 9<some_script'
357363 start_offset = fp .tell ()
358364 try :
365+ # Check if there's a comment.
359366 try :
360- fp .seek (- END_CENTRAL_DIR_SIZE , 2 )
361- header_position = fp .tell ()
362- buffer = fp .read (END_CENTRAL_DIR_SIZE )
367+ fp .seek (0 , 2 )
368+ file_size = fp .tell ()
363369 except OSError :
364- raise ZipImportError (f"can't read Zip file: { archive !r} " , path = archive )
365- if len (buffer ) != END_CENTRAL_DIR_SIZE :
366- raise ZipImportError (f"can't read Zip file: { archive !r} " , path = archive )
367- if buffer [:4 ] != STRING_END_ARCHIVE :
368- # Bad: End of Central Dir signature
369- # Check if there's a comment.
370- try :
371- fp .seek (0 , 2 )
372- file_size = fp .tell ()
373- except OSError :
374- raise ZipImportError (f"can't read Zip file: { archive !r} " ,
375- path = archive )
376- max_comment_start = max (file_size - MAX_COMMENT_LEN -
377- END_CENTRAL_DIR_SIZE , 0 )
378- try :
379- fp .seek (max_comment_start )
380- data = fp .read ()
381- except OSError :
382- raise ZipImportError (f"can't read Zip file: { archive !r} " ,
383- path = archive )
384- pos = data .rfind (STRING_END_ARCHIVE )
385- if pos < 0 :
386- raise ZipImportError (f'not a Zip file: { archive !r} ' ,
387- path = archive )
370+ raise ZipImportError (f"can't read Zip file: { archive !r} " ,
371+ path = archive )
372+ max_comment_plus_dirs_size = (
373+ MAX_COMMENT_LEN + END_CENTRAL_DIR_SIZE +
374+ END_CENTRAL_DIR_SIZE_64 + END_CENTRAL_DIR_LOCATOR_SIZE_64 )
375+ max_comment_start = max (file_size - max_comment_plus_dirs_size , 0 )
376+ try :
377+ fp .seek (max_comment_start )
378+ data = fp .read (max_comment_plus_dirs_size )
379+ except OSError :
380+ raise ZipImportError (f"can't read Zip file: { archive !r} " ,
381+ path = archive )
382+ pos = data .rfind (STRING_END_ARCHIVE )
383+ pos64 = data .rfind (STRING_END_ZIP_64 )
384+
385+ if (pos64 >= 0 and pos64 + END_CENTRAL_DIR_SIZE_64 + END_CENTRAL_DIR_LOCATOR_SIZE_64 == pos ):
386+ # Zip64 at "correct" offset from standard EOCD
387+ buffer = data [pos64 :pos64 + END_CENTRAL_DIR_SIZE_64 ]
388+ if len (buffer ) != END_CENTRAL_DIR_SIZE_64 :
389+ raise ZipImportError (
390+ f"corrupt Zip64 file: Expected { END_CENTRAL_DIR_SIZE_64 } byte "
391+ f"zip64 central directory, but read { len (buffer )} bytes." ,
392+ path = archive )
393+ header_position = file_size - len (data ) + pos64
394+
395+ central_directory_size = _unpack_uint64 (buffer [40 :48 ])
396+ central_directory_position = _unpack_uint64 (buffer [48 :56 ])
397+ num_entries = _unpack_uint64 (buffer [24 :32 ])
398+ elif pos >= 0 :
388399 buffer = data [pos :pos + END_CENTRAL_DIR_SIZE ]
389400 if len (buffer ) != END_CENTRAL_DIR_SIZE :
390401 raise ZipImportError (f"corrupt Zip file: { archive !r} " ,
391402 path = archive )
403+
392404 header_position = file_size - len (data ) + pos
393405
394- header_size = _unpack_uint32 (buffer [12 :16 ])
395- header_offset = _unpack_uint32 (buffer [16 :20 ])
396- if header_position < header_size :
406+ # Buffer now contains a valid EOCD, and header_position gives the
407+ # starting position of it.
408+ central_directory_size = _unpack_uint32 (buffer [12 :16 ])
409+ central_directory_position = _unpack_uint32 (buffer [16 :20 ])
410+ num_entries = _unpack_uint16 (buffer [8 :10 ])
411+
412+ # N.b. if someday you want to prefer the standard (non-zip64) EOCD,
413+ # you need to adjust position by 76 (56-byte Zip64 EOCD + 20-byte locator) for arc_offset to be 0.
414+ else :
415+ raise ZipImportError (f'not a Zip file: { archive !r} ' ,
416+ path = archive )
417+
418+ # Buffer now contains a valid EOCD, and header_position gives the
419+ # starting position of it.
420+ # XXX: These are cursory checks but are not as exact or strict as they
421+ # could be. Checking the arc-adjusted value is probably good too.
422+ if header_position < central_directory_size :
397423 raise ZipImportError (f'bad central directory size: { archive !r} ' , path = archive )
398- if header_position < header_offset :
424+ if header_position < central_directory_position :
399425 raise ZipImportError (f'bad central directory offset: { archive !r} ' , path = archive )
400- header_position -= header_size
401- arc_offset = header_position - header_offset
426+ header_position -= central_directory_size
427+ # On just-a-zipfile these values are the same and arc_offset is zero; if
428+ # the file has some bytes prepended, `arc_offset` is the number of such
429+ # bytes. This is used for pex as well as self-extracting .exe.
430+ arc_offset = header_position - central_directory_position
402431 if arc_offset < 0 :
403432 raise ZipImportError (f'bad central directory size or offset: { archive !r} ' , path = archive )
404433
@@ -415,6 +444,11 @@ def _read_directory(archive):
415444 raise EOFError ('EOF read where not expected' )
416445 # Start of file header
417446 if buffer [:4 ] != b'PK\x01 \x02 ' :
447+ if count != num_entries :
448+ raise ZipImportError (
449+ f"mismatched num_entries: { count } should be { num_entries } in { archive !r} " ,
450+ path = archive ,
451+ )
418452 break # Bad: Central Dir File Header
419453 if len (buffer ) != 46 :
420454 raise EOFError ('EOF read where not expected' )
@@ -430,9 +464,6 @@ def _read_directory(archive):
430464 comment_size = _unpack_uint16 (buffer [32 :34 ])
431465 file_offset = _unpack_uint32 (buffer [42 :46 ])
432466 header_size = name_size + extra_size + comment_size
433- if file_offset > header_offset :
434- raise ZipImportError (f'bad local header offset: { archive !r} ' , path = archive )
435- file_offset += arc_offset
436467
437468 try :
438469 name = fp .read (name_size )
@@ -444,7 +475,10 @@ def _read_directory(archive):
444475 # slower than reading the data because fseek flushes stdio's
445476 # internal buffers. See issue #8745.
446477 try :
447- if len (fp .read (header_size - name_size )) != header_size - name_size :
478+ extra_data_len = header_size - name_size
479+ extra_data = memoryview (fp .read (extra_data_len ))
480+
481+ if len (extra_data ) != extra_data_len :
448482 raise ZipImportError (f"can't read Zip file: { archive !r} " , path = archive )
449483 except OSError :
450484 raise ZipImportError (f"can't read Zip file: { archive !r} " , path = archive )
@@ -461,6 +495,60 @@ def _read_directory(archive):
461495
462496 name = name .replace ('/' , path_sep )
463497 path = _bootstrap_external ._path_join (archive , name )
498+
499+ # Ordering matches unpacking below.
500+ if (
501+ file_size == MAX_UINT32 or
502+ data_size == MAX_UINT32 or
503+ file_offset == MAX_UINT32
504+ ):
505+ # need to decode extra_data looking for a zip64 extra (which might not
506+ # be present)
507+ while extra_data :
508+ if len (extra_data ) < 4 :
509+ raise ZipImportError (f"can't read header extra: { archive !r} " , path = archive )
510+ tag = _unpack_uint16 (extra_data [:2 ])
511+ size = _unpack_uint16 (extra_data [2 :4 ])
512+ if len (extra_data ) < 4 + size :
513+ raise ZipImportError (f"can't read header extra: { archive !r} " , path = archive )
514+ if tag == ZIP64_EXTRA_TAG :
515+ if (len (extra_data ) - 4 ) % 8 != 0 :
516+ raise ZipImportError (f"can't read header extra: { archive !r} " , path = archive )
517+ num_extra_values = (len (extra_data ) - 4 ) // 8
518+ if num_extra_values > 3 :
519+ raise ZipImportError (f"can't read header extra: { archive !r} " , path = archive )
520+ values = struct .unpack_from (f"<{ min (num_extra_values , 3 )} Q" ,
521+ extra_data , offset = 4 )
522+
523+ # N.b. Here be dragons: the ordering of these is different than
524+ # the header fields, and it's really easy to get it wrong since
525+ # naturally-occurring zips that use all 3 are >4GB
526+ if file_size == MAX_UINT32 :
527+ file_size = values .pop (0 )
528+ if data_size == MAX_UINT32 :
529+ data_size = values .pop (0 )
530+ if file_offset == MAX_UINT32 :
531+ file_offset = values .pop (0 )
532+
533+ break
534+
535+ # For a typical zip, this bytes-slicing only happens 2-3 times, on
536+ # small data like timestamps and filesizes.
537+ extra_data = extra_data [4 + size :]
538+ else :
539+ _bootstrap ._verbose_message (
540+ "zipimport: suspected zip64 but no zip64 extra for {!r}" ,
541+ path ,
542+ )
543+ # XXX These two statements seem swapped because `central_directory_position`
544+ # is a position within the actual file, but `file_offset` (when compared) is
545+ # as encoded in the entry, not adjusted for this file.
546+ # N.b. this must be after we've potentially read the zip64 extra which can
547+ # change `file_offset`.
548+ if file_offset > central_directory_position :
549+ raise ZipImportError (f'bad local header offset: { archive !r} ' , path = archive )
550+ file_offset += arc_offset
551+
464552 t = (path , compress , data_size , file_size , file_offset , time , date , crc )
465553 files [name ] = t
466554 count += 1
0 commit comments