From eb38918b2a519d5419133d8add8c634f681752b6 Mon Sep 17 00:00:00 2001
From: Steve Sisney <steve@archive.org>
Date: Wed, 11 Dec 2019 01:50:11 +0000
Subject: [PATCH 1/6] Added .pylintrc, and basic pylint fixes to arc.py

---
 .pylintrc              |  2 +
 hanzo/warctools/arc.py | 86 ++++++++++++++++++++++++------------------
 2 files changed, 51 insertions(+), 37 deletions(-)
 create mode 100644 .pylintrc
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 0000000..c234cea
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,2 @@
+[TYPECHECK]
+ignored-classes=ArcRecord,WarcRecord,WarcParser
\ No newline at end of file
diff --git a/hanzo/warctools/arc.py b/hanzo/warctools/arc.py
index 545b59c..3bc262f 100644
--- a/hanzo/warctools/arc.py
+++ b/hanzo/warctools/arc.py
@@ -1,5 +1,6 @@
-"""An object to represent arc records
-http://archive.org/web/researcher/ArcFileFormat.php
+"""
+ARC Record
+~~~~~~~~~~
 """
 
 import re
@@ -7,36 +8,41 @@
 from hanzo.warctools.record import ArchiveRecord, ArchiveParser
 from hanzo.warctools.archive_detect import register_record_type
 
+
 # URL<sp>IP-address<sp>Archive-date<sp>Content-type<sp>
-#Result-code<sp>Checksum<sp>Location<sp> Offset<sp>Filename<sp>
-#Archive-length<nl> 
-# 
+# Result-code<sp>Checksum<sp>Location<sp> Offset<sp>Filename<sp>
+# Archive-length<nl>
+
 @ArchiveRecord.HEADERS(
-    URL = b'URL',
-    IP = b'IP-address',
-    DATE = b'Archive-date',
-    CONTENT_TYPE = b'Content-type',
-    CONTENT_LENGTH = b'Archive-length',
-    RESULT_CODE = b'Result-code',
-    CHECKSUM = b'Checksum',
-    LOCATION = b'Location',
-    OFFSET = b'Offset',
-    FILENAME = b'Filename',
+    URL=b'URL',
+    IP=b'IP-address',
+    DATE=b'Archive-date',
+    CONTENT_TYPE=b'Content-type',
+    CONTENT_LENGTH=b'Archive-length',
+    RESULT_CODE=b'Result-code',
+    CHECKSUM=b'Checksum',
+    LOCATION=b'Location',
+    OFFSET=b'Offset',
+    FILENAME=b'Filename',
 )
 class ArcRecord(ArchiveRecord):
 
+    """An object to represent arc records
+    http://archive.org/web/researcher/ArcFileFormat.php
+    """
+
     TRAILER = b'\n'  # an ARC record is trailed by single unix newline
 
     """Represents a record in an arc file."""
     def __init__(self, headers=None, content=None, errors=None):
-        ArchiveRecord.__init__(self, headers, content, errors) 
+        ArchiveRecord.__init__(self, headers, content, errors)
 
     @property
     def type(self):
         return b"response"
 
     def _write_to(self, out, nl):
-        #TODO: empty method?
+        # TODO: empty method?
         pass
 
     @classmethod
@@ -44,11 +50,12 @@ def make_parser(cls):
         """Constructs a parser for arc records."""
         return ArcParser()
 
+
 class ArcRecordHeader(ArcRecord):
     """Represents the headers in an arc record."""
     def __init__(self, headers=None, content=None, errors=None, version=None,
                  raw_headers=None):
-        ArcRecord.__init__(self, headers, content, errors) 
+        ArcRecord.__init__(self, headers, content, errors)
         self.version = version
         self.raw_headers = raw_headers
 
@@ -60,20 +67,22 @@ def raw(self):
         """Return the raw representation of this record."""
         return b"".join(self.raw_headers) + self.content[1]
 
+
 def rx(pat):
     """Helper function to compile a regular expression with the IGNORECASE
     flag."""
     return re.compile(pat, flags=re.IGNORECASE)
 
+
 nl_rx = rx('^\r\n|\r|\n$')
-length_rx = rx(b'^' + ArcRecord.CONTENT_LENGTH + b'$') #pylint: disable-msg=E1101
-type_rx = rx(b'^' + ArcRecord.CONTENT_TYPE + b'$')     #pylint: disable-msg=E1101
+length_rx = rx(b'^' + ArcRecord.CONTENT_LENGTH + b'$')
+type_rx = rx(b'^' + ArcRecord.CONTENT_TYPE + b'$')  # pylint: disable-msg=E1101
 SPLIT = re.compile(br'\b\s|\s\b').split
 
+
 class ArcParser(ArchiveParser):
     """A parser for arc archives."""
 
-
     def __init__(self):
         self.version = 0
         # we don't know which version to parse initially - a v1 or v2 file so
@@ -82,16 +91,18 @@ def __init__(self):
 
         # question? will we get arc fragments?
         # should we store both headers & detect records by header length?
-        # if we don't know 
+        # if we don't know
 
         self.headers = []
 
     def parse(self, stream, offset, line=None):
         """Parses a stream as an arc archive and returns an Arc record along
         with the offset in the stream of the end of the record."""
+
         record = None
-        content_type = None
+        # content_type = None
         content_length = None
+
         if line is None:
             line = stream.readline()
 
@@ -115,18 +126,18 @@ def parse(self, stream, offset, line=None):
             # configure parser instance
             self.version = arc_version.split()[0]
             self.headers = arc_names_line.strip().split()
-            
+
             # now we have read header field in record body
             # we can extract the headers from the current record,
             # and read the length field
 
             # which is in a different place with v1 and v2
-        
-            # read headers 
+
+            # read headers
             arc_headers = self.parse_header_list(line)
-            
+
             # extract content, ignoring header lines parsed already
-            content_type, content_length, errors = \
+            _content_type, content_length, errors = \
                 self.get_content_headers(arc_headers)
 
             content_length = content_length \
@@ -141,10 +152,10 @@ def parse(self, stream, offset, line=None):
             if not self.headers:
                 raise Exception('missing filedesc')
             headers = self.parse_header_list(line)
-            content_type, content_length, errors = \
+            _content_type, content_length, errors = \
                 self.get_content_headers(headers)
 
-            record = ArcRecord(headers = headers, errors=errors)
+            record = ArcRecord(headers=headers, errors=errors)
 
         line = None
 
@@ -153,28 +164,29 @@ def parse(self, stream, offset, line=None):
 
         return (record, (), offset)
 
-    def trim(self, stream):
-        return ()
-
     def parse_header_list(self, line):
+        """returns the list of record headers"""
+
         # some people use ' ' as the empty value. lovely.
         line = line.rstrip(b'\r\n')
         values = SPLIT(line)
         if len(self.headers) != len(values):
             if self.headers[0] in (ArcRecord.URL, ArcRecord.CONTENT_TYPE):
                 # fencepost
-                values = [s[::-1] for s in reversed(SPLIT(line[::-1], len(self.headers)-1))]
+                values = [s[::-1] for s in reversed(
+                    SPLIT(line[::-1], len(self.headers)-1))]
             else:
                 values = SPLIT(line, len(self.headers)-1)
 
         if len(self.headers) != len(values):
-            raise Exception('missing headers %s %s'%(",".join(values), ",".join(self.headers)))
-                
-        return list(zip(self.headers, values))
+            raise Exception('missing headers %s %s' % (
+                ",".join(values), ",".join(self.headers)))
 
+        return list(zip(self.headers, values))
 
     @staticmethod
     def get_content_headers(headers):
+        """returns content_type, content_length, errors from headers"""
         content_type = None
         content_length = None
         errors = []

From 64940b7ffea390a5a7cb570dbfca13d666acbe8e Mon Sep 17 00:00:00 2001
From: Steve Sisney <steve@archive.org>
Date: Wed, 11 Dec 2019 01:50:54 +0000
Subject: [PATCH 2/6] Made basic pylint fixes to warc.py

---
 hanzo/warctools/warc.py | 135 ++++++++++++++++++++--------------------
 1 file changed, 69 insertions(+), 66 deletions(-)

diff --git a/hanzo/warctools/warc.py b/hanzo/warctools/warc.py
index d274510..e39ce71 100644
--- a/hanzo/warctools/warc.py
+++ b/hanzo/warctools/warc.py
@@ -1,13 +1,16 @@
-"""An object to represent warc records, using the abstract record in
-record.py"""
+"""
+WARC Record
+~~~~~~~~~~~
+"""
 
-import re
 import hashlib
+import re
+import uuid
+
 from hanzo.warctools.record import ArchiveRecord, ArchiveParser
 from hanzo.warctools.archive_detect import register_record_type
-import uuid
 
-bad_lines = 5 # when to give up looking for the version stamp
+BAD_LINES = 5  # when to give up looking for the version stamp
 
 
 @ArchiveRecord.HEADERS(
@@ -30,10 +33,8 @@
 )
 class WarcRecord(ArchiveRecord):
 
-    # Pylint is very bad at decorators, E1101 is the message that says
-    # a member variable does not exist
-
-    # pylint: disable-msg=E1101
+    """An object to represent warc records, using the abstract record
+    in record.py"""
 
     VERSION = b"WARC/1.0"
     VERSION18 = b"WARC/0.18"
@@ -46,7 +47,8 @@ class WarcRecord(ArchiveRecord):
     CONVERSION = b"conversion"
     WARCINFO = b"warcinfo"
 
-    PROFILE_IDENTICAL_PAYLOAD_DIGEST = b"http://netpreserve.org/warc/1.0/revisit/identical-payload-digest"
+    PROFILE_IDENTICAL_PAYLOAD_DIGEST = (
+        b"http://netpreserve.org/warc/1.0/revisit/identical-payload-digest")
 
     TRAILER = b'\r\n\r\n'
 
@@ -55,15 +57,16 @@ def __init__(self, version=VERSION, headers=None, content=None,
         """
         WarcRecord constructor.
 
-        Either content or content_file must be provided, but not both. If
-        content, which is a tuple (content_type, content_buffer), is provided,
-        when writing the warc record, any Content-Type and Content-Length that
-        appear in the supplied headers are ignored, and the values content[0]
-        and len(content[1]), respectively, are used. 
+        Either content or content_file must be provided, but not
+        both. If content, which is a tuple (content_type,
+        content_buffer), is provided, when writing the warc record,
+        any Content-Type and Content-Length that appear in the
+        supplied headers are ignored, and the values content[0] and
+        len(content[1]), respectively, are used.
 
-        When reading, the caller can stream content_file or use content, which is
-        lazily filled using content_file, and after which content_file is
-        unavailable.
+        When reading, the caller can stream content_file or use
+        content, which is lazily filled using content_file, and after
+        which content_file is unavailable.
         """
         ArchiveRecord.__init__(self, headers, content, errors)
         self.version = version
@@ -86,17 +89,19 @@ def _write_to(self, out, nl):
         out.write(self.version)
         out.write(nl)
         for k, v in self.headers:
-            if self.content_file is not None or k not in (self.CONTENT_TYPE, self.CONTENT_LENGTH):
+            if self.content_file is not None or k not in (
+                    self.CONTENT_TYPE, self.CONTENT_LENGTH):
                 out.write(k)
                 out.write(b": ")
                 out.write(v)
                 out.write(nl)
 
         if self.content_file is not None:
-            out.write(nl) # end of header blank nl
+            out.write(nl)  # end of header blank nl
             while True:
                 buf = self.content_file.read(8192)
-                if buf == b'': break
+                if buf == b'':
+                    break
                 out.write(buf)
         else:
             # if content tuple is provided, set Content-Type and
@@ -117,10 +122,10 @@ def _write_to(self, out, nl):
             out.write(str(content_length).encode('ascii'))
             out.write(nl)
 
-            out.write(nl) # end of header blank nl
+            out.write(nl)  # end of header blank nl
             if content_buffer:
                 out.write(content_buffer)
-     
+
         # end of record nl nl
         out.write(nl)
         out.write(nl)
@@ -145,7 +150,8 @@ def block_digest(self, content_buffer):
 
     @staticmethod
     def warc_uuid(text):
-        return "<urn:uuid:{}>".format(uuid.UUID(hashlib.sha1(text).hexdigest()[0:32])).encode('ascii')
+        return "<urn:uuid:{}>".format(
+            uuid.UUID(hashlib.sha1(text).hexdigest()[0:32])).encode('ascii')
 
     @staticmethod
     def random_warc_uuid():
@@ -156,24 +162,29 @@ def rx(pat):
     """Helper to compile regexps with IGNORECASE option set."""
     return re.compile(pat, flags=re.IGNORECASE)
 
+
 version_rx = rx(br'^(?P<prefix>.*?)(?P<version>\s*WARC/(?P<number>.*?))'
                 b'(?P<nl>\r\n|\r|\n)\\Z')
+
 # a header is key: <ws> value plus any following lines with leading whitespace
 header_rx = rx(br'^(?P<name>.*?):\s?(?P<value>.*?)' b'(?P<nl>\r\n|\r|\n)\\Z')
 value_rx = rx(br'^\s+(?P<value>.+?)' b'(?P<nl>\r\n|\r|\n)\\Z')
 nl_rx = rx(b'^(?P<nl>\r\n|\r|\n\\Z)')
-length_rx = rx(b'^' + WarcRecord.CONTENT_LENGTH + b'$' ) # pylint: disable-msg=E1101
-type_rx = rx(b'^' + WarcRecord.CONTENT_TYPE + b'$')     # pylint: disable-msg=E1101
+length_rx = rx(b'^' + WarcRecord.CONTENT_LENGTH + b'$')
+type_rx = rx(b'^' + WarcRecord.CONTENT_TYPE + b'$')
 
 required_headers = set((
-        WarcRecord.TYPE.lower(),           # pylint: disable-msg=E1101
-        WarcRecord.ID.lower(),             # pylint: disable-msg=E1101
-        WarcRecord.CONTENT_LENGTH.lower(), # pylint: disable-msg=E1101
-        WarcRecord.DATE.lower(),           # pylint: disable-msg=E1101
-        ))
+    WarcRecord.TYPE.lower(),
+    WarcRecord.ID.lower(),
+    WarcRecord.CONTENT_LENGTH.lower(),
+    WarcRecord.DATE.lower(),
+))
 
 
 class WarcParser(ArchiveParser):
+
+    """WARC Parser"""
+
     KNOWN_VERSIONS = set((b'1.0', b'0.17', b'0.18'))
 
     def parse(self, stream, offset, line=None):
@@ -181,7 +192,6 @@ def parse(self, stream, offset, line=None):
         (record, errors).  Either records is null or errors is
         null. Any record-specific errors are contained in the record -
         errors is only used when *nothing* could be parsed"""
-        # pylint: disable-msg=E1101
         errors = []
         version = None
         # find WARC/.*
@@ -201,7 +211,7 @@ def parse(self, stream, offset, line=None):
                     offset += len(line)
                 if not nl_rx.match(line):
                     errors.append(('ignored line', line))
-                    if len(errors) > bad_lines:
+                    if len(errors) > BAD_LINES:
                         errors.append(('too many errors, giving up hope',))
                         return (None, errors, offset)
                 line = stream.readline()
@@ -211,7 +221,6 @@ def parse(self, stream, offset, line=None):
             return (None, errors, offset)
         if line:
             content_length = 0
-            content_type = None
 
             record = WarcRecord(errors=errors, version=version)
 
@@ -228,11 +237,11 @@ def parse(self, stream, offset, line=None):
             if prefix:
                 record.error('bad prefix on WARC version header', prefix)
 
-            #Read headers
+            # Read headers
             line = stream.readline()
             while line and not nl_rx.match(line):
 
-                #print 'header', repr(line)
+                # print 'header', repr(line)
                 match = header_rx.match(line)
                 if match:
                     if match.group('nl') != b'\x0d\x0a':
@@ -240,12 +249,12 @@ def parse(self, stream, offset, line=None):
                                      match.group('nl'))
                     name = match.group('name').strip()
                     value = [match.group('value').strip()]
-                    #print 'match',name, value
+                    # print 'match',name, value
 
                     line = stream.readline()
                     match = value_rx.match(line)
                     while match:
-                        #print 'follow', repr(line)
+                        # print 'follow', repr(line)
                         if match.group('nl') != b'\x0d\x0a':
                             record.error('incorrect newline in follow header',
                                          line, match.group('nl'))
@@ -258,15 +267,13 @@ def parse(self, stream, offset, line=None):
                     record.headers.append((name, value))
 
                     if type_rx.match(name):
-                        if value:
-                            content_type = value
-                        else:
+                        if not value:
                             record.error('invalid header', name, value)
                     elif length_rx.match(name):
                         try:
-                            #print name, value
+                            # print name, value
                             content_length = int(value)
-                            #print content_length
+                            # print content_length
                         except ValueError:
                             record.error('invalid header', name, value)
 
@@ -275,7 +282,7 @@ def parse(self, stream, offset, line=None):
             record.content_file = stream
             record.content_file.bytes_to_eoc = content_length
 
-            # check mandatory headers 
+            # check mandatory headers
             # WARC-Type WARC-Date WARC-Record-ID Content-Length
 
             return (record, (), offset)
@@ -287,14 +294,13 @@ def parse(self, stream, offset, line=None):
 
 
 def make_response(id, date, url, content, request_id):
-    # pylint: disable-msg=E1101
     headers = [
-            (WarcRecord.TYPE, WarcRecord.RESPONSE),
-            (WarcRecord.ID, id),
-            (WarcRecord.DATE, date),
-            (WarcRecord.URL, url),
-
+        (WarcRecord.TYPE, WarcRecord.RESPONSE),
+        (WarcRecord.ID, id),
+        (WarcRecord.DATE, date),
+        (WarcRecord.URL, url),
     ]
+
     if request_id:
         headers.append((WarcRecord.CONCURRENT_TO, request_id))
 
@@ -304,14 +310,13 @@ def make_response(id, date, url, content, request_id):
 
 
 def make_request(request_id, date, url, content, response_id):
-    # pylint: disable-msg=E1101
     headers = [
-            (WarcRecord.TYPE, WarcRecord.REQUEST),
-            (WarcRecord.ID, request_id),
-            (WarcRecord.DATE, date),
-            (WarcRecord.URL, url),
-
+        (WarcRecord.TYPE, WarcRecord.REQUEST),
+        (WarcRecord.ID, request_id),
+        (WarcRecord.DATE, date),
+        (WarcRecord.URL, url),
     ]
+
     if response_id:
         headers.append((WarcRecord.CONCURRENT_TO, response_id))
 
@@ -321,13 +326,12 @@ def make_request(request_id, date, url, content, response_id):
 
 
 def make_metadata(meta_id, date, content, concurrent_to=None, url=None):
-    # pylint: disable-msg=E1101
     headers = [
-            (WarcRecord.TYPE, WarcRecord.METADATA),
-            (WarcRecord.ID, meta_id),
-            (WarcRecord.DATE, date),
-
+        (WarcRecord.TYPE, WarcRecord.METADATA),
+        (WarcRecord.ID, meta_id),
+        (WarcRecord.DATE, date),
     ]
+
     if concurrent_to:
         headers.append((WarcRecord.CONCURRENT_TO, concurrent_to))
 
@@ -340,13 +344,12 @@ def make_metadata(meta_id, date, content, concurrent_to=None, url=None):
 
 
 def make_conversion(conv_id, date, content, refers_to=None, url=None):
-    # pylint: disable-msg=E1101
     headers = [
-            (WarcRecord.TYPE, WarcRecord.CONVERSION),
-            (WarcRecord.ID, conv_id),
-            (WarcRecord.DATE, date),
-
+        (WarcRecord.TYPE, WarcRecord.CONVERSION),
+        (WarcRecord.ID, conv_id),
+        (WarcRecord.DATE, date),
     ]
+
     if refers_to:
         headers.append((WarcRecord.REFERS_TO, refers_to))
 

From 29666d047018946c7e39303ad95a6ae988d2e4f6 Mon Sep 17 00:00:00 2001
From: Steve Sisney <steve@archive.org>
Date: Wed, 11 Dec 2019 02:14:52 +0000
Subject: [PATCH 3/6] Added python 3.5 GzipFile extension: geezip.py, used in
 stream.py

---
 hanzo/warctools/geezip.py | 111 ++++++++++++++++++++++++++++++++++++++
 hanzo/warctools/stream.py |  85 +++++++++++++----------------
 2 files changed, 148 insertions(+), 48 deletions(-)
 create mode 100644 hanzo/warctools/geezip.py

diff --git a/hanzo/warctools/geezip.py b/hanzo/warctools/geezip.py
new file mode 100644
index 0000000..92d87b8
--- /dev/null
+++ b/hanzo/warctools/geezip.py
@@ -0,0 +1,111 @@
+"""Extends gzip.GzipFile for raw gzip offset in python 2 and 3"""
+
+import gzip
+import io
+
+try:
+    import builtins
+except ImportError:
+    pass
+
+try:
+
+    # this branch contributed by Kenji:
+    # https://github.com/kngenie/warctools/commit/159bfdfa45cc0b51ed4a4a4d7d744ef7bf82ae23
+
+    # Python 3.5 got a major change to gzip module. Essential gunzip
+    # work is now implemented in _GzipReader and GzipFile simply wraps
+    # around it.
+
+    class _GeeZipReader(gzip._GzipReader):
+
+        """Extends python 3.5 gzip._GzipReader"""
+
+        def _read_gzip_header(self):
+            pos = self._raw_pos()
+            has_record = super(_GeeZipReader, self)._read_gzip_header()
+
+            if has_record:
+                self.member_offset = pos
+
+            return has_record
+
+        def _raw_pos(self):
+            """Return offset in raw gzip file corresponding to this
+            object's state."""
+
+            # _fp is PaddedFile object with prepend method. it doesn't have
+            # tell(). It has seek(), but it's useless as a replacement for
+            # tell(). We need to compute offset from internal attributes.
+
+            pos = self._fp.file.tell()
+
+            if self._fp._read is not None:
+                pos -= (self._fp._length - self._fp._read)
+
+            return pos
+
+    class GeeZipFile(gzip.GzipFile):
+
+        def __init__(self, filename=None, mode=None, fileobj=None):
+            if mode is None:
+                mode = getattr(fileobj, 'mode', 'rb')
+
+            if mode.startswith('r'):
+                if mode and 'b' not in mode:
+                    mode += 'b'
+                if fileobj is None:
+                    fileobj = self.myfileobj = builtins.open(
+                        filename, mode or 'rb')
+                if filename is None:
+                    filename = getattr(fileobj, 'name', '')
+                    if not isinstance(filename, (str, bytes)):
+                        filename = ''
+                self.mode = gzip.READ
+                raw = _GeeZipReader(fileobj)
+                self._buffer = io.BufferedReader(raw)
+                self.name = filename
+                self.fileobj = fileobj
+                self._raw = raw
+            else:
+                super(GeeZipFile, self).__init__(filename, mode, fileobj)
+
+        @property
+        def member_offset(self):
+            return self._raw.member_offset
+
+except AttributeError:
+
+    # this branch falls back to python 2.7+
+
+    class GeeZipFile(gzip.GzipFile):
+        """Extends gzip.GzipFile to remember self.member_offset, the raw
+        file offset of the current gzip member."""
+
+        def __init__(self, filename=None, mode=None,
+                     compresslevel=9, fileobj=None, mtime=None):
+            print('[DEBUG] GeeZipFile(gzip.GzipFile)')
+            print('[DEBUG] => fileobj', fileobj)
+
+            # ignore mtime for python 2.6
+            gzip.GzipFile.__init__(self, filename=filename, mode=mode,
+                                   compresslevel=compresslevel,
+                                   fileobj=fileobj)
+
+            self.member_offset = None
+            print('[DEBUG] => member_offset', self.member_offset)
+
+        # hook in to the place we seem to be able to reliably get the raw gzip
+        # member offset
+        def _read(self, size=1024):
+            if self._new_member:
+                try:
+                    # works for python3.2
+                    self.member_offset = (self.fileobj.tell()
+                                          - self.fileobj._length
+                                          + (self.fileobj._read or 0))
+                except AttributeError:
+                    # works for python2.7
+                    self.member_offset = self.fileobj.tell()
+
+            return gzip.GzipFile._read(self, size)
diff --git a/hanzo/warctools/stream.py b/hanzo/warctools/stream.py
index 1fecc91..360964a 100644
--- a/hanzo/warctools/stream.py
+++ b/hanzo/warctools/stream.py
@@ -5,47 +5,54 @@
 
 from hanzo.warctools.archive_detect import is_gzip_file, guess_record_type
 
+from . import s3
+from .geezip import GeeZipFile
+
+CHUNK_SIZE = 8192  # the size to read in, make this bigger things go faster.
+
+
 def open_record_stream(record_class=None, filename=None, file_handle=None,
-                       mode="rb", gzip="auto", offset=None, length=None):
+                       mode="rb", _gzip="auto", offset=None, length=None):
     """Can take a filename or a file_handle. Normally called
     indirectly from A record class i.e WarcRecord.open_archive. If the
     first parameter is None, will try to guess"""
 
     if file_handle is None:
         if filename.startswith('s3://'):
-            from . import s3
             file_handle = s3.open_url(filename, offset=offset, length=length)
         else:
             file_handle = open(filename, mode=mode)
             if offset is not None:
                 file_handle.seek(offset)
 
-    if record_class == None:
+    if record_class is None:
         record_class = guess_record_type(file_handle)
 
-    if record_class == None:
+    if record_class is None:
         raise Exception('Failed to guess compression')
 
     record_parser = record_class.make_parser()
 
-    if gzip == 'auto':
-        if (filename and filename.endswith('.gz')) or is_gzip_file(file_handle):
-            gzip = 'record'
-            #debug('autodetect: record gzip')
+    if _gzip == 'auto':
+        if ((filename and filename.endswith('.gz'))
+                or is_gzip_file(file_handle)):
+            _gzip = 'record'
+            # debug('autodetect: record gzip')
         else:
             # assume uncompressed file
-            #debug('autodetected: uncompressed file')
-            gzip = None
+            # debug('autodetected: uncompressed file')
+            _gzip = None
 
-    if gzip == 'record':
+    if _gzip == 'record':
         return GzipRecordStream(file_handle, record_parser)
-    elif gzip == 'file':
+
+    if _gzip == 'file':
         return GzipFileStream(file_handle, record_parser)
-    else:
-        return RecordStream(file_handle, record_parser)
 
+    return RecordStream(file_handle, record_parser)
 
-class RecordStream(object):
+
+class RecordStream():
     """A readable/writable stream of Archive Records. Can be iterated over
     or read_records can give more control, and potentially offset information.
     """
@@ -98,7 +105,9 @@ def _read_record(self, offsets):
             if not re.match(br'^[\r\n]+$', line):
                 break
 
-        record, errors, offset = self.record_parser.parse(self, offset, line)
+        record, errors, offset = self.record_parser.parse(
+            self, offset, line)
+
         return offset, record, errors
 
     def write(self, record):
@@ -117,7 +126,8 @@ def _skip_to_eoc(self):
             read_size = min(CHUNK_SIZE, self.bytes_to_eoc)
             buf = self._read(read_size)
             if len(buf) < read_size:
-                raise Exception('expected {} bytes but only read {}'.format(read_size, len(buf)))
+                raise Exception('expected {} bytes but only read {}'.format(
+                    read_size, len(buf)))
 
     def _read(self, count=None):
         """Raw read, will read into next record if caller isn't careful"""
@@ -178,36 +188,14 @@ def readline(self, maxlen=None):
             self.bytes_to_eoc -= len(result)
         return result
 
-CHUNK_SIZE = 8192 # the size to read in, make this bigger things go faster.
-
-class GeeZipFile(gzip.GzipFile):
-    """Extends gzip.GzipFile to remember self.member_offset, the raw file
-    offset of the current gzip member."""
-
-    def __init__(self, filename=None, mode=None,
-                 compresslevel=9, fileobj=None, mtime=None):
-        # ignore mtime for python 2.6
-        gzip.GzipFile.__init__(self, filename=filename, mode=mode, compresslevel=compresslevel, fileobj=fileobj)
-        self.member_offset = None
-
-    # hook in to the place we seem to be able to reliably get the raw gzip
-    # member offset
-    def _read(self, size=1024):
-        if self._new_member:
-            try:
-                # works for python3.2
-                self.member_offset = self.fileobj.tell() - self.fileobj._length + (self.fileobj._read or 0)
-            except AttributeError:
-                # works for python2.7
-                self.member_offset = self.fileobj.tell()
-
-        return gzip.GzipFile._read(self, size)
 
 class GzipRecordStream(RecordStream):
     """A stream to read/write concatted file made up of gzipped
     archive records"""
+
     def __init__(self, file_handle, record_parser):
-        RecordStream.__init__(self, GeeZipFile(fileobj=file_handle), record_parser)
+        RecordStream.__init__(self, GeeZipFile(fileobj=file_handle),
+                              record_parser)
         self.raw_fh = file_handle
 
     def _read_record(self, offsets):
@@ -221,8 +209,8 @@ def _read_record(self, offsets):
             if not re.match(br'^[\r\n]+$', line):
                 break
 
-        record, errors, _offset = \
-            self.record_parser.parse(self, offset=None, line=line)
+        record, errors, _offset = self.record_parser.parse(
+            self, offset=None, line=line)
 
         offset = self.fh.member_offset
 
@@ -231,9 +219,11 @@ def _read_record(self, offsets):
     def seek(self, offset, pos=0):
         """Same as a seek on a file"""
         self.raw_fh.seek(offset, pos)
-        # trick to avoid closing and recreating GzipFile, does it always work?
+        # XXX trick to avoid closing and recreating GzipFile, does it
+        # always work?
         self.fh._new_member = True
 
+
 class GzipFileStream(RecordStream):
     """A stream to read/write gzipped file made up of all archive records"""
     def __init__(self, file_handle, record):
@@ -251,8 +241,7 @@ def _read_record(self, offsets):
             if not re.match(br'^[\r\n]+$', line):
                 break
 
-        record, errors, _offset = \
-            self.record_parser.parse(self, offset=None, line=line)
+        record, errors, offset = self.record_parser.parse(
+            self, offset=None, line=line)
 
         return offset, record, errors
-

From 97133a7b01745c29afa433aea2634c046675c3fe Mon Sep 17 00:00:00 2001
From: Steve Sisney <steve@archive.org>
Date: Wed, 11 Dec 2019 02:16:45 +0000
Subject: [PATCH 4/6] Very minor readability (not lint) fixes to
 test_warctools.py

---
 hanzo/warctools/tests/test_warctools.py | 123 +++++++++++++-----------
 1 file changed, 66 insertions(+), 57 deletions(-)

diff --git a/hanzo/warctools/tests/test_warctools.py b/hanzo/warctools/tests/test_warctools.py
index 4576da5..6e7bdd9 100644
--- a/hanzo/warctools/tests/test_warctools.py
+++ b/hanzo/warctools/tests/test_warctools.py
@@ -1,16 +1,9 @@
 # vim: set sw=4 et:
 
+import gzip
 import unittest
 
-# want unittest2 for python2.6
-try:
-    unittest.TestCase.assertIsNone
-except AttributeError:
-    import unittest2
-    unittest = unittest2
-
-import tempfile
-import gzip
+from datetime import datetime
 from hanzo import warctools, httptools
 
 try:
@@ -19,17 +12,24 @@
     from StringIO import StringIO
     BytesIO = StringIO
 
+try:
+    unittest.TestCase.assertIsNone
+except AttributeError:
+    import unittest2
+    unittest = unittest2
+
+
 class ArcRecordTerminatorTest(unittest.TestCase):
     REC1_CONTENT = (b'1 0 InternetArchive\n'
-                  + b'URL IP-address Archive-date Content-type Archive-length\n'
-                  + b'Here is some funky arc header content!\n')
+                    b'URL IP-address Archive-date Content-type Archive-length\n'
+                    b'Here is some funky arc header content!\n')
     RECORD1 = b'filedesc://ArcRecordTerminatorTest.arc 0.0.0.0 20131113000000 text/plain ' + str(len(REC1_CONTENT)).encode('ascii') + b'\n' + REC1_CONTENT
 
     REC2_CONTENT = (b'HTTP/1.1 200 OK\r\n'
-                  + b'Content-Type: text/plain\r\n'
-                  + b'Content-Length: 12\r\n'
-                  + b'\r\n'
-                  + b'01234567890\r\n')
+                    b'Content-Type: text/plain\r\n'
+                    b'Content-Length: 12\r\n'
+                    b'\r\n'
+                    b'01234567890\r\n')
     RECORD2 = b'http://example.org/ 192.168.1.1 20131113000000 text/plain ' + str(len(REC2_CONTENT)).encode('ascii') + b'\n' + REC2_CONTENT
 
     REC1_GZ = b"\x1f\x8b\x08\x00\xbf\xa9\x99R\x02\xff=NK\x0e\x820\x14\xdc\xf7\x14\xcf\x03\xf0\xa9\xc4\x8d;\xe3F\x12\x17\x86\xe0\x01\x9av\x90Fh\xc9\xeb\xd3\xc8\xedE4\xce\xec\xe6\x97\xe9\xfc\x00\x87d\xf7Eq`\xdb\xc0Fv-x\xf4\xc1H\xe4\x16Ir\xc3\x96\xca|%mK]i\xad\xabr\x05\t^RL\x83\xf1\x81\xb4\xde)M%\xd5A\xc0\x01\xb2\xac\xf5\xfe\tum\xceT_2\xe3\x1c#%\xfa\xc9\x993\x02:\xc6%\x1c$\x93y\xc2\xdf\x19\x10n\xd2\xab\x13\x18\xe4\x13\xa58\x82\xbaG\xb8\xcf\xf49\xd2\xc380\xd9os\xa3\xd4\x1b\xa0\xa9\x1c5\xc1\x00\x00\x00"
@@ -50,7 +50,7 @@ def _test_terminator(self, terminator):
             self._run_checks(fin, terminator, False)
         finally:
             fin.close()
-        
+
         fin = self._arc_gz(terminator)
         try:
             self._run_checks(fin, terminator, True)
@@ -113,27 +113,31 @@ def runTest(self):
         self._test_terminator(b'\r\r\r\r\r\r\n\n')
         self._test_terminator(b'\r\r\r\r\r\r\n\n\n')
 
+        # debug aid: uncomment to see test-runner stdout even when tests pass
+        # raise ValueError('[DEBUG] ArcRecordTerminatorTest Done.')
+
+
 class WarcRecordTerminatorTest(unittest.TestCase):
     RECORD1 = (b'WARC/1.0\r\n'
-             + b'WARC-Record-ID: <urn:uuid:00000000-0000-0000-0000-000000000000>\r\n'
-             + b'WARC-Type: warcinfo\r\n'
-             + b'Content-Type: application/warc-fields\r\n'
-             + b'Content-Length: 30\r\n'
-             + b'\r\n'
-             + b'format: WARC File Format 1.0\r\n')
+               b'WARC-Record-ID: <urn:uuid:00000000-0000-0000-0000-000000000000>\r\n'
+               b'WARC-Type: warcinfo\r\n'
+               b'Content-Type: application/warc-fields\r\n'
+               b'Content-Length: 30\r\n'
+               b'\r\n'
+               b'format: WARC File Format 1.0\r\n')
 
     RECORD2 = (b'WARC/1.0\r\n'
-             + b'WARC-Type: response\r\n'
-             + b'WARC-Record-ID: <urn:uuid:00000000-0000-0000-0000-000000000001>\r\n'
-             + b'WARC-Target-URI: http://example.org/\r\n'
-             + b'Content-Type: application/http;msgtype=response\r\n'
-             + b'Content-Length: 78\r\n'
-             + b'\r\n'
-             + b'HTTP/1.1 200 OK\r\n'
-             + b'Content-Type: text/plain\r\n'
-             + b'Content-Length: 12\r\n'
-             + b'\r\n'
-             + b'01234567890\r\n')
+               b'WARC-Type: response\r\n'
+               b'WARC-Record-ID: <urn:uuid:00000000-0000-0000-0000-000000000001>\r\n'
+               b'WARC-Target-URI: http://example.org/\r\n'
+               b'Content-Type: application/http;msgtype=response\r\n'
+               b'Content-Length: 78\r\n'
+               b'\r\n'
+               b'HTTP/1.1 200 OK\r\n'
+               b'Content-Type: text/plain\r\n'
+               b'Content-Length: 12\r\n'
+               b'\r\n'
+               b'01234567890\r\n')
 
     RECORD1_GZ = b'\x1f\x8b\x08\x00\xce\xae\x99R\x02\xff\x0bw\x0cr\xd67\xd43\xe0\xe5\n\x07\xb2t\x83R\x93\xf3\x8bRt=]\xac\x14lJ\x8b\xf2\xacJK3S\xac\x0c\xa0@\x17\x0b\x01\x03vP\x03B*\x0bR\xad\x14\xca\x13\x8b\x923\xf3\xd2\xf2y\xb9\x9c\xf3\xf3JR\xf3J\xa0\xe2\x89\x05\x059\x99\xc9\x89%\x99\xf9y\xfa 5\xbai\x99\xa99)\xc5\x08e>\xa9y\xe9%\x19V\n\xc6@\x07\xf1r\xa5\xe5\x17\xe5&\x96X)\x80LVp\xcb\xccIUp\x03\x8b(\x80\x1d\x0c\x82\x00\x04h\xbe\xd2\xbf\x00\x00\x00'
     RECORD2_GZ = b'\x1f\x8b\x08\x00\xce\xae\x99R\x02\xffm\x8f\xc9\n\xc20\x10\x86\xef\x81\xbcC^\xa0MR\x97j\\@\xeaAQPJ\xa5\xe7\xa0C-\xd4$\xa4S\xd0\xb7\xb7\x85\x16A\xfd\x0f\xc3\xac\xdf\xcc\xe4\x9b4\xe12\x14\x94\xe4\xad\x17d/\x07\x8ay\xa8\x9d55\xf4\xc9\x14\xae\xd6\xdf\x82\xfdV\xb1e\xe3\x8dj\x9a\xf2\xa6D\xaf\xe0\x8f\xe9%\xd7\x03U\xfb\x020\xb8\xa4{\xc5\xee\x88Nq\x0eO\xfdp\x15\x84\xd6\x17\x9c\x92\xc4\x1a\x04\x83\xfdz\xed\\U^5\x96\xd6\xf0\xae}\xf1\xa8\x0bl+\xab\xcf]\xc3\xc0\x11L\x81w\xc5\xe2\x19%\x94\xec\xb2\xec\xdc>#Y$\x04;\x1d\xbe\xb9\x08O\xe4\xae\xd2\xa5\xf9\x05\xc8\xa8\x03\x08\x19\x8d\xc6\x93i<\x9b\x8b.\xa4\xe4\rV`\x1c`\x1f\x01\x00\x00'
@@ -153,7 +157,7 @@ def _test_terminator(self, terminator):
             self._run_checks(fin, terminator, False)
         finally:
             fin.close()
-        
+
         fin = self._warc_gz(terminator)
         try:
             self._run_checks(fin, terminator, True)
@@ -197,7 +201,7 @@ def _run_checks(self, fin, terminator, gzipped):
 
     def runTest(self):
         # anything works as long as it contains only \r and \n and ends with \n
-        self._test_terminator(b'\r\n\r\n') # the good one
+        self._test_terminator(b'\r\n\r\n')  # the good one
         self._test_terminator(b'\r\n')
         self._test_terminator(b'\n\r\n')
         self._test_terminator(b'\n\n\r\n')
@@ -213,11 +217,15 @@ def runTest(self):
         self._test_terminator(b'\r\r\r\r\r\r\n\n')
         self._test_terminator(b'\r\r\r\r\r\r\n\n\n')
 
+        # debug aid: uncomment to see test-runner stdout even when tests pass
+        # raise ValueError('[DEBUG] WarcRecordTerminatorTest Done.')
+
 
 class WarcWritingTest(unittest.TestCase):
 
     # XXX should this a part of the library?
-    def build_warc_record(self, url, warc_date=None, content_buffer=None,
+    def build_warc_record(
+            self, url, warc_date=None, content_buffer=None,
             content_file=None, content_length=None, concurrent_to=None,
             warc_type=None, content_type=None, remote_ip=None, profile=None,
             refers_to=None, refers_to_target_uri=None, refers_to_date=None,
@@ -232,9 +240,11 @@ def build_warc_record(self, url, warc_date=None, content_buffer=None,
         headers = []
         if warc_type is not None:
             headers.append((warctools.WarcRecord.TYPE, warc_type))
+
         headers.append((warctools.WarcRecord.ID, record_id))
         headers.append((warctools.WarcRecord.DATE, warc_date))
         headers.append((warctools.WarcRecord.URL, url))
+
         if remote_ip is not None:
             headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
         if profile is not None:
@@ -269,43 +279,43 @@ def build_warc_record(self, url, warc_date=None, content_buffer=None,
 
     def build_record_using_tuple(self):
         content_buffer = b'Luke, I am your payload'
-        record = self.build_warc_record(url=b'http://example.org/',
-                content_buffer=content_buffer,
-                record_id=b'<urn:uuid:00000000-0000-0000-0000-000000000000>',
-                warc_date=b'2013-11-15T00:00:00Z',
-                warc_type=warctools.WarcRecord.RESPONSE,
-                content_type=httptools.RequestMessage.CONTENT_TYPE)
+        record = self.build_warc_record(
+            url=b'http://example.org/',
+            content_buffer=content_buffer,
+            record_id=b'<urn:uuid:00000000-0000-0000-0000-000000000000>',
+            warc_date=b'2013-11-15T00:00:00Z',
+            warc_type=warctools.WarcRecord.RESPONSE,
+            content_type=httptools.RequestMessage.CONTENT_TYPE)
         return record
 
     def build_record_using_stream(self):
         content_buffer = b'Shmuke, I gam four snayglob'
         fh = BytesIO(content_buffer)
-        record = self.build_warc_record(url=b'http://example.org/',
-                content_file=fh, content_length=str(len(content_buffer)).encode('ascii'),
-                record_id=b'<urn:uuid:00000000-0000-0000-0000-000000000000>',
-                warc_date=b'2013-11-15T00:00:00Z',
-                warc_type=warctools.WarcRecord.RESPONSE,
-                content_type=httptools.RequestMessage.CONTENT_TYPE)
+        record = self.build_warc_record(
+            url=b'http://example.org/',
+            content_file=fh, content_length=str(len(content_buffer)).encode('ascii'),
+            record_id=b'<urn:uuid:00000000-0000-0000-0000-000000000000>',
+            warc_date=b'2013-11-15T00:00:00Z',
+            warc_type=warctools.WarcRecord.RESPONSE,
+            content_type=httptools.RequestMessage.CONTENT_TYPE)
         return record
 
-
     def test_write_using_tuple(self):
         record = self.build_record_using_tuple()
 
         f = BytesIO()
         record.write_to(f)
         self.assertEqual(f.getvalue(), 
-                b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: <urn:uuid:00000000-0000-0000-0000-000000000000>\r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your payload\r\n\r\n')
+                         b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: <urn:uuid:00000000-0000-0000-0000-000000000000>\r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your payload\r\n\r\n')
         f.close()
 
         # should work again if we do it again
         f = BytesIO()
         record.write_to(f)
-        self.assertEqual(f.getvalue(), 
-                b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: <urn:uuid:00000000-0000-0000-0000-000000000000>\r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your payload\r\n\r\n')
+        self.assertEqual(f.getvalue(),
+                         b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: <urn:uuid:00000000-0000-0000-0000-000000000000>\r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your payload\r\n\r\n')
         f.close()
 
-
     def test_write_using_tuple_gz(self):
         record = self.build_record_using_tuple()
 
@@ -326,14 +336,14 @@ def test_write_using_tuple_gz(self):
         g.close()
         f.close()
 
-
     def test_write_using_stream(self):
         record = self.build_record_using_stream()
 
         f = BytesIO()
         record.write_to(f)
-        self.assertEqual(f.getvalue(), 
-                b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: <urn:uuid:00000000-0000-0000-0000-000000000000>\r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 27\r\n\r\nShmuke, I gam four snayglob\r\n\r\n')
+        self.assertEqual(
+            f.getvalue(),
+            b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: <urn:uuid:00000000-0000-0000-0000-000000000000>\r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 27\r\n\r\nShmuke, I gam four snayglob\r\n\r\n')
         f.close()
 
         # throws exception because record.content_file position has advanced
@@ -342,7 +352,6 @@ def test_write_using_stream(self):
             record.write_to(f)
         f.close()
 
-
     def test_write_using_stream_gz(self):
         record = self.build_record_using_stream()
 

From 433a7f58981ed3800859c99917eb9a5c59b66c82 Mon Sep 17 00:00:00 2001
From: Steve Sisney <steve@archive.org>
Date: Wed, 11 Dec 2019 02:22:06 +0000
Subject: [PATCH 5/6] Merged .pylintrc directives into pylint.rc

---
 .pylintrc | 2 --
 pylint.rc | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)
 delete mode 100644 .pylintrc

diff --git a/.pylintrc b/.pylintrc
deleted file mode 100644
index c234cea..0000000
--- a/.pylintrc
+++ /dev/null
@@ -1,2 +0,0 @@
-[TYPECHECK]
-ignored-classes=ArcRecord,WarcRecord,WarcParser
\ No newline at end of file
diff --git a/pylint.rc b/pylint.rc
index 88e7d5b..e1af048 100644
--- a/pylint.rc
+++ b/pylint.rc
@@ -158,7 +158,7 @@ ignore-mixin-members=yes
 
 # List of classes names for which member attributes should not be checked
 # (useful for classes with attributes dynamically set).
-ignored-classes=SQLObject
+ignored-classes=SQLObject,ArcRecord,WarcRecord,WarcParser
 
 # When zope mode is activated, add a predefined set of Zope acquired attributes
 # to generated-members.

From 47693d9eb579add3938d76fb8cc89e549db3ddf3 Mon Sep 17 00:00:00 2001
From: Steve Sisney <steve@archive.org>
Date: Wed, 11 Dec 2019 02:23:57 +0000
Subject: [PATCH 6/6] Removed support for py32, py33, py34 from .travis.yml

---
 .travis.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 86d04d3..bd1d72d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,9 +2,6 @@ language: python
 
 python:
  - 2.7
- - 3.2
- - 3.3
- - 3.4
  - 3.5
  - nightly
  - pypy
@@ -12,7 +9,6 @@ python:
 
 matrix:
  allow_failures:
-  - python: 3.5
   - python: nightly
 
 script: python setup.py test