internetarchive · siznax · Dec 11, 2019 · Dec 11, 2019 · Dec 11, 2019 · Dec 11, 2019
diff --git a/.travis.yml b/.travis.yml
@@ -2,17 +2,13 @@ language: python
 
 python:
  - 2.7
- - 3.2
- - 3.3
- - 3.4
  - 3.5
  - nightly
  - pypy
  - pypy3
 
 matrix:
  allow_failures:
-  - python: 3.5
   - python: nightly
 
 script: python setup.py test

diff --git a/hanzo/warctools/arc.py b/hanzo/warctools/arc.py
@@ -1,54 +1,61 @@
-"""An object to represent arc records
-http://archive.org/web/researcher/ArcFileFormat.php
+"""
+ARC Record
+~~~~~~~~~~
 """
 
 import re
 
 from hanzo.warctools.record import ArchiveRecord, ArchiveParser
 from hanzo.warctools.archive_detect import register_record_type
 
+
 # URL<sp>IP-address<sp>Archive-date<sp>Content-type<sp>
-#Result-code<sp>Checksum<sp>Location<sp> Offset<sp>Filename<sp>
-#Archive-length<nl> 
-# 
+# Result-code<sp>Checksum<sp>Location<sp>Offset<sp>Filename<sp>
+# Archive-length<nl>
+
 @ArchiveRecord.HEADERS(
-    URL = b'URL',
-    IP = b'IP-address',
-    DATE = b'Archive-date',
-    CONTENT_TYPE = b'Content-type',
-    CONTENT_LENGTH = b'Archive-length',
-    RESULT_CODE = b'Result-code',
-    CHECKSUM = b'Checksum',
-    LOCATION = b'Location',
-    OFFSET = b'Offset',
-    FILENAME = b'Filename',
+    URL=b'URL',
+    IP=b'IP-address',
+    DATE=b'Archive-date',
+    CONTENT_TYPE=b'Content-type',
+    CONTENT_LENGTH=b'Archive-length',
+    RESULT_CODE=b'Result-code',
+    CHECKSUM=b'Checksum',
+    LOCATION=b'Location',
+    OFFSET=b'Offset',
+    FILENAME=b'Filename',
 )
 class ArcRecord(ArchiveRecord):
 
+    """An object representing an ARC record. Format description:
+    http://archive.org/web/researcher/ArcFileFormat.php
+    """
+
     TRAILER = b'\n'  # an ARC record is trailed by single unix newline
 
     """Represents a record in an arc file."""
     def __init__(self, headers=None, content=None, errors=None):
-        ArchiveRecord.__init__(self, headers, content, errors) 
+        ArchiveRecord.__init__(self, headers, content, errors)
 
     @property
     def type(self):
         return b"response"
 
     def _write_to(self, out, nl):
-        #TODO: empty method?
+        # TODO: empty method?
         pass
 
     @classmethod
     def make_parser(cls):
         """Constructs a parser for arc records."""
         return ArcParser()
 
+
 class ArcRecordHeader(ArcRecord):
     """Represents the headers in an arc record."""
     def __init__(self, headers=None, content=None, errors=None, version=None,
                  raw_headers=None):
-        ArcRecord.__init__(self, headers, content, errors) 
+        ArcRecord.__init__(self, headers, content, errors)
         self.version = version
         self.raw_headers = raw_headers
 
@@ -60,20 +67,22 @@ def raw(self):
         """Return the raw representation of this record."""
         return b"".join(self.raw_headers) + self.content[1]
 
+
 def rx(pat):
     """Helper function to compile a regular expression with the IGNORECASE
     flag."""
     return re.compile(pat, flags=re.IGNORECASE)
 
+
 nl_rx = rx('^\r\n|\r|\n$')
-length_rx = rx(b'^' + ArcRecord.CONTENT_LENGTH + b'$') #pylint: disable-msg=E1101
-type_rx = rx(b'^' + ArcRecord.CONTENT_TYPE + b'$')     #pylint: disable-msg=E1101
+length_rx = rx(b'^' + ArcRecord.CONTENT_LENGTH + b'$')
+type_rx = rx(b'^' + ArcRecord.CONTENT_TYPE + b'$')  # pylint: disable-msg=E1101
 SPLIT = re.compile(br'\b\s|\s\b').split
 
+
 class ArcParser(ArchiveParser):
     """A parser for arc archives."""
 
-
     def __init__(self):
         self.version = 0
         # we don't know which version to parse initially - a v1 or v2 file so
@@ -82,16 +91,18 @@ def __init__(self):
 
         # question? will we get arc fragments?
         # should we store both headers & detect records by header length?
-        # if we don't know 
+        # if we don't know
 
         self.headers = []
 
     def parse(self, stream, offset, line=None):
         """Parses a stream as an arc archive and returns an Arc record along
         with the offset in the stream of the end of the record."""
+
         record = None
-        content_type = None
+        # content_type = None
         content_length = None
+
         if line is None:
             line = stream.readline()
 
@@ -115,18 +126,18 @@ def parse(self, stream, offset, line=None):
             # configure parser instance
             self.version = arc_version.split()[0]
             self.headers = arc_names_line.strip().split()
-            
+
             # now we have read header field in record body
             # we can extract the headers from the current record,
             # and read the length field
 
             # which is in a different place with v1 and v2
-        
-            # read headers 
+
+            # read headers
             arc_headers = self.parse_header_list(line)
-            
+
             # extract content, ignoring header lines parsed already
-            content_type, content_length, errors = \
+            _content_type, content_length, errors = \
                 self.get_content_headers(arc_headers)
 
             content_length = content_length \
@@ -141,10 +152,10 @@ def parse(self, stream, offset, line=None):
             if not self.headers:
                 raise Exception('missing filedesc')
             headers = self.parse_header_list(line)
-            content_type, content_length, errors = \
+            _content_type, content_length, errors = \
                 self.get_content_headers(headers)
 
-            record = ArcRecord(headers = headers, errors=errors)
+            record = ArcRecord(headers=headers, errors=errors)
 
         line = None
 
@@ -153,28 +164,29 @@ def parse(self, stream, offset, line=None):
 
         return (record, (), offset)
 
-    def trim(self, stream):
-        return ()
-
     def parse_header_list(self, line):
+        """Return the record headers as a list of (name, value) pairs."""
+
         # some people use ' ' as the empty value. lovely.
         line = line.rstrip(b'\r\n')
         values = SPLIT(line)
         if len(self.headers) != len(values):
             if self.headers[0] in (ArcRecord.URL, ArcRecord.CONTENT_TYPE):
                 # fencepost
-                values = [s[::-1] for s in reversed(SPLIT(line[::-1], len(self.headers)-1))]
+                values = [s[::-1] for s in reversed(
+                    SPLIT(line[::-1], len(self.headers)-1))]
             else:
                 values = SPLIT(line, len(self.headers)-1)
 
         if len(self.headers) != len(values):
-            raise Exception('missing headers %s %s'%(",".join(values), ",".join(self.headers)))
-
-        return list(zip(self.headers, values))
+            raise Exception('missing headers %s %s' % (
+                ",".join(values), ",".join(self.headers)))
 
+        return list(zip(self.headers, values))
 
     @staticmethod
     def get_content_headers(headers):
+        """Return (content_type, content_length, errors) from headers."""
         content_type = None
         content_length = None
         errors = []

diff --git a/hanzo/warctools/geezip.py b/hanzo/warctools/geezip.py
@@ -0,0 +1,111 @@
+"""Extend gzip.GzipFile to track raw gzip member offsets (Python 2 and 3)."""
+
+import gzip
+import io
+
+try:
+    import builtins
+except ImportError:
+    pass
+
+try:
+
+    # this branch contributed by Kenji:
+    # https://github.com/kngenie/warctools/commit/159bfdfa45cc0b51ed4a4a4d7d744ef7bf82ae23
+
+    # Python 3.5 made a major change to the gzip module: the essential
+    # gunzip work is now implemented in _GzipReader, and GzipFile simply
+    # wraps around it.
+
+    class _GeeZipReader(gzip._GzipReader):
+
+        """Extends Python 3.5+ gzip._GzipReader to record member_offset."""
+
+        def _read_gzip_header(self):
+            pos = self._raw_pos()
+            has_record = super(_GeeZipReader, self)._read_gzip_header()
+
+            if has_record:
+                self.member_offset = pos
+
+            return has_record
+
+        def _raw_pos(self):
+            """Return offset in raw gzip file corresponding to this
+            object's state."""
+
+            # _fp is a PaddedFile object with a prepend() method but no
+            # tell(). It has seek(), but that cannot stand in for tell(),
+            # so we compute the offset from its internal attributes.
+
+            pos = self._fp.file.tell()
+
+            if self._fp._read is not None:
+                pos -= (self._fp._length - self._fp._read)
+
+            return pos
+
+    class GeeZipFile(gzip.GzipFile):
+
+        def __init__(self, filename=None, mode=None, fileobj=None):
+            if mode is None:
+                mode = getattr(fileobj, 'mode', 'rb')
+
+            if mode.startswith('r'):
+                if mode and 'b' not in mode:
+                    mode += 'b'
+                if fileobj is None:
+                    fileobj = self.myfileobj = builtins.open(
+                        filename, mode or 'rb')
+                if filename is None:
+                    filename = getattr(fileobj, 'name', '')
+                    if not isinstance(filename, (str, bytes)):
+                        filename = ''
+                self.mode = gzip.READ
+                raw = _GeeZipReader(fileobj)
+                self._buffer = io.BufferedReader(raw)
+                self.name = filename
+                self.fileobj = fileobj
+                self._raw = raw
+            else:
+                super(GeeZipFile, self).__init__(filename, mode, fileobj)
+
+        @property
+        def member_offset(self):
+            return self._raw.member_offset
+
+except AttributeError:
+
+    # fallback for Python versions before 3.5, which lack gzip._GzipReader
+
+    class GeeZipFile(gzip.GzipFile):
+        """Extends gzip.GzipFile to remember self.member_offset, the raw
+        file offset of the current gzip member."""
+
+        def __init__(self, filename=None, mode=None,
+                     compresslevel=9, fileobj=None, mtime=None):
+            print('[DEBUG] GeeZipFile(gzip.GzipFile)')
+            print('[DEBUG] => fileobj', fileobj)
+
+            # ignore mtime for python 2.6
+            gzip.GzipFile.__init__(self, filename=filename, mode=mode,
+                                   compresslevel=compresslevel,
+                                   fileobj=fileobj)
+
+            self.member_offset = None
+            print('[DEBUG] => member_offset', self.member_offset)
+
+        # hook in to the place we seem to be able to reliably get the raw gzip
+        # member offset
+        def _read(self, size=1024):
+            if self._new_member:
+                try:
+                    # works for python3.2
+                    self.member_offset = (self.fileobj.tell()
+                                          - self.fileobj._length
+                                          + (self.fileobj._read or 0))
+                except AttributeError:
+                    # works for python2.7
+                    self.member_offset = self.fileobj.tell()
+
+            return gzip.GzipFile._read(self, size)