From eb38918b2a519d5419133d8add8c634f681752b6 Mon Sep 17 00:00:00 2001 From: Steve Sisney Date: Wed, 11 Dec 2019 01:50:11 +0000 Subject: [PATCH 1/6] Added .pylintrc, and basic pylint fixes to arc.py --- .pylintrc | 2 + hanzo/warctools/arc.py | 86 ++++++++++++++++++++++++------------------ 2 files changed, 51 insertions(+), 37 deletions(-) create mode 100644 .pylintrc diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..c234cea --- /dev/null +++ b/.pylintrc @@ -0,0 +1,2 @@ +[TYPECHECK] +ignored-classes=ArcRecord,WarcRecord,WarcParser \ No newline at end of file diff --git a/hanzo/warctools/arc.py b/hanzo/warctools/arc.py index 545b59c..3bc262f 100644 --- a/hanzo/warctools/arc.py +++ b/hanzo/warctools/arc.py @@ -1,5 +1,6 @@ -"""An object to represent arc records -http://archive.org/web/researcher/ArcFileFormat.php +""" +ARC Record +~~~~~~~~~~ """ import re @@ -7,36 +8,41 @@ from hanzo.warctools.record import ArchiveRecord, ArchiveParser from hanzo.warctools.archive_detect import register_record_type + # URLIP-addressArchive-dateContent-type -#Result-codeChecksumLocation OffsetFilename -#Archive-length -# +# Result-codeChecksumLocation OffsetFilename +# Archive-length + @ArchiveRecord.HEADERS( - URL = b'URL', - IP = b'IP-address', - DATE = b'Archive-date', - CONTENT_TYPE = b'Content-type', - CONTENT_LENGTH = b'Archive-length', - RESULT_CODE = b'Result-code', - CHECKSUM = b'Checksum', - LOCATION = b'Location', - OFFSET = b'Offset', - FILENAME = b'Filename', + URL=b'URL', + IP=b'IP-address', + DATE=b'Archive-date', + CONTENT_TYPE=b'Content-type', + CONTENT_LENGTH=b'Archive-length', + RESULT_CODE=b'Result-code', + CHECKSUM=b'Checksum', + LOCATION=b'Location', + OFFSET=b'Offset', + FILENAME=b'Filename', ) class ArcRecord(ArchiveRecord): + """An object to represent arc records + http://archive.org/web/researcher/ArcFileFormat.php + """ + TRAILER = b'\n' # an ARC record is trailed by single unix newline """Represents a record in an arc file.""" def __init__(self, headers=None, content=None, errors=None): - ArchiveRecord.__init__(self, headers, content, errors) + ArchiveRecord.__init__(self, headers, content, errors) @property def type(self): return b"response" def _write_to(self, out, nl): - #TODO: empty method? + # TODO: empty method? pass @classmethod @@ -44,11 +50,12 @@ def make_parser(cls): """Constructs a parser for arc records.""" return ArcParser() + class ArcRecordHeader(ArcRecord): """Represents the headers in an arc record.""" def __init__(self, headers=None, content=None, errors=None, version=None, raw_headers=None): - ArcRecord.__init__(self, headers, content, errors) + ArcRecord.__init__(self, headers, content, errors) self.version = version self.raw_headers = raw_headers @@ -60,20 +67,22 @@ def raw(self): """Return the raw representation of this record.""" return b"".join(self.raw_headers) + self.content[1] + def rx(pat): """Helper function to compile a regular expression with the IGNORECASE flag.""" return re.compile(pat, flags=re.IGNORECASE) + nl_rx = rx('^\r\n|\r|\n$') -length_rx = rx(b'^' + ArcRecord.CONTENT_LENGTH + b'$') #pylint: disable-msg=E1101 -type_rx = rx(b'^' + ArcRecord.CONTENT_TYPE + b'$') #pylint: disable-msg=E1101 +length_rx = rx(b'^' + ArcRecord.CONTENT_LENGTH + b'$') +type_rx = rx(b'^' + ArcRecord.CONTENT_TYPE + b'$') # pylint: disable-msg=E1101 SPLIT = re.compile(br'\b\s|\s\b').split + class ArcParser(ArchiveParser): """A parser for arc archives.""" - def __init__(self): self.version = 0 # we don't know which version to parse initially - a v1 or v2 file so @@ -82,16 +91,18 @@ def __init__(self): # question? will we get arc fragments? # should we store both headers & detect records by header length? - # if we don't know + # if we don't know self.headers = [] def parse(self, stream, offset, line=None): """Parses a stream as an arc archive and returns an Arc record along with the offset in the stream of the end of the record.""" + record = None - content_type = None + # content_type = None content_length = None + if line is None: line = stream.readline() @@ -115,18 +126,18 @@ def parse(self, stream, offset, line=None): # configure parser instance self.version = arc_version.split()[0] self.headers = arc_names_line.strip().split() - + # now we have read header field in record body # we can extract the headers from the current record, # and read the length field # which is in a different place with v1 and v2 - - # read headers + + # read headers arc_headers = self.parse_header_list(line) - + # extract content, ignoring header lines parsed already - content_type, content_length, errors = \ + _content_type, content_length, errors = \ self.get_content_headers(arc_headers) content_length = content_length \ @@ -141,10 +152,10 @@ def parse(self, stream, offset, line=None): if not self.headers: raise Exception('missing filedesc') headers = self.parse_header_list(line) - content_type, content_length, errors = \ + _content_type, content_length, errors = \ self.get_content_headers(headers) - record = ArcRecord(headers = headers, errors=errors) + record = ArcRecord(headers=headers, errors=errors) line = None @@ -153,28 +164,29 @@ def parse(self, stream, offset, line=None): return (record, (), offset) - def trim(self, stream): - return () - def parse_header_list(self, line): + """returns the list of record headers""" + # some people use ' ' as the empty value. lovely. line = line.rstrip(b'\r\n') values = SPLIT(line) if len(self.headers) != len(values): if self.headers[0] in (ArcRecord.URL, ArcRecord.CONTENT_TYPE): # fencepost - values = [s[::-1] for s in reversed(SPLIT(line[::-1], len(self.headers)-1))] + values = [s[::-1] for s in reversed( + SPLIT(line[::-1], len(self.headers)-1))] else: values = SPLIT(line, len(self.headers)-1) if len(self.headers) != len(values): - raise Exception('missing headers %s %s'%(",".join(values), ",".join(self.headers))) - - return list(zip(self.headers, values)) + raise Exception('missing headers %s %s' % ( + ",".join(values), ",".join(self.headers))) + return list(zip(self.headers, values)) @staticmethod def get_content_headers(headers): + """returns content_type, content_length, errors from headers""" content_type = None content_length = None errors = [] From 64940b7ffea390a5a7cb570dbfca13d666acbe8e Mon Sep 17 00:00:00 2001 From: Steve Sisney Date: Wed, 11 Dec 2019 01:50:54 +0000 Subject: [PATCH 2/6] Made basic pylint fixes to warc.py --- hanzo/warctools/warc.py | 135 ++++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 66 deletions(-) diff --git a/hanzo/warctools/warc.py b/hanzo/warctools/warc.py index d274510..e39ce71 100644 --- a/hanzo/warctools/warc.py +++ b/hanzo/warctools/warc.py @@ -1,13 +1,16 @@ -"""An object to represent warc records, using the abstract record in -record.py""" +""" +WARC Record +~~~~~~~~~~~ +""" -import re import hashlib +import re +import uuid + from hanzo.warctools.record import ArchiveRecord, ArchiveParser from hanzo.warctools.archive_detect import register_record_type -import uuid -bad_lines = 5 # when to give up looking for the version stamp +BAD_LINES = 5 # when to give up looking for the version stamp @ArchiveRecord.HEADERS( @@ -30,10 +33,8 @@ ) class WarcRecord(ArchiveRecord): - # Pylint is very bad at decorators, E1101 is the message that says - # a member variable does not exist - - # pylint: disable-msg=E1101 + """An object to represent warc records, using the abstract record + in record.py""" VERSION = b"WARC/1.0" VERSION18 = b"WARC/0.18" @@ -46,7 +47,8 @@ class WarcRecord(ArchiveRecord): CONVERSION = b"conversion" WARCINFO = b"warcinfo" - PROFILE_IDENTICAL_PAYLOAD_DIGEST = b"http://netpreserve.org/warc/1.0/revisit/identical-payload-digest" + PROFILE_IDENTICAL_PAYLOAD_DIGEST = ( + b"http://netpreserve.org/warc/1.0/revisit/identical-payload-digest") TRAILER = b'\r\n\r\n' @@ -55,15 +57,16 @@ def __init__(self, version=VERSION, headers=None, content=None, """ WarcRecord constructor. - Either content or content_file must be provided, but not both. If - content, which is a tuple (content_type, content_buffer), is provided, - when writing the warc record, any Content-Type and Content-Length that - appear in the supplied headers are ignored, and the values content[0] - and len(content[1]), respectively, are used. + Either content or content_file must be provided, but not + both. If content, which is a tuple (content_type, + content_buffer), is provided, when writing the warc record, + any Content-Type and Content-Length that appear in the + supplied headers are ignored, and the values content[0] and + len(content[1]), respectively, are used. - When reading, the caller can stream content_file or use content, which is - lazily filled using content_file, and after which content_file is - unavailable. + When reading, the caller can stream content_file or use + content, which is lazily filled using content_file, and after + which content_file is unavailable. """ ArchiveRecord.__init__(self, headers, content, errors) self.version = version @@ -86,17 +89,19 @@ def _write_to(self, out, nl): out.write(self.version) out.write(nl) for k, v in self.headers: - if self.content_file is not None or k not in (self.CONTENT_TYPE, self.CONTENT_LENGTH): + if self.content_file is not None or k not in ( + self.CONTENT_TYPE, self.CONTENT_LENGTH): out.write(k) out.write(b": ") out.write(v) out.write(nl) if self.content_file is not None: - out.write(nl) # end of header blank nl + out.write(nl) # end of header blank nl while True: buf = self.content_file.read(8192) - if buf == b'': break + if buf == b'': + break out.write(buf) else: # if content tuple is provided, set Content-Type and @@ -117,10 +122,10 @@ def _write_to(self, out, nl): out.write(str(content_length).encode('ascii')) out.write(nl) - out.write(nl) # end of header blank nl + out.write(nl) # end of header blank nl if content_buffer: out.write(content_buffer) - + # end of record nl nl out.write(nl) out.write(nl) @@ -145,7 +150,8 @@ def block_digest(self, content_buffer): @staticmethod def warc_uuid(text): - return "".format(uuid.UUID(hashlib.sha1(text).hexdigest()[0:32])).encode('ascii') + return "".format( + uuid.UUID(hashlib.sha1(text).hexdigest()[0:32])).encode('ascii') @staticmethod def random_warc_uuid(): @@ -156,24 +162,29 @@ def rx(pat): """Helper to compile regexps with IGNORECASE option set.""" return re.compile(pat, flags=re.IGNORECASE) + version_rx = rx(br'^(?P.*?)(?P\s*WARC/(?P.*?))' b'(?P\r\n|\r|\n)\\Z') + # a header is key: value plus any following lines with leading whitespace header_rx = rx(br'^(?P.*?):\s?(?P.*?)' b'(?P\r\n|\r|\n)\\Z') value_rx = rx(br'^\s+(?P.+?)' b'(?P\r\n|\r|\n)\\Z') nl_rx = rx(b'^(?P\r\n|\r|\n\\Z)') -length_rx = rx(b'^' + WarcRecord.CONTENT_LENGTH + b'$' ) # pylint: disable-msg=E1101 -type_rx = rx(b'^' + WarcRecord.CONTENT_TYPE + b'$') # pylint: disable-msg=E1101 +length_rx = rx(b'^' + WarcRecord.CONTENT_LENGTH + b'$') +type_rx = rx(b'^' + WarcRecord.CONTENT_TYPE + b'$') required_headers = set(( - WarcRecord.TYPE.lower(), # pylint: disable-msg=E1101 - WarcRecord.ID.lower(), # pylint: disable-msg=E1101 - WarcRecord.CONTENT_LENGTH.lower(), # pylint: disable-msg=E1101 - WarcRecord.DATE.lower(), # pylint: disable-msg=E1101 - )) + WarcRecord.TYPE.lower(), + WarcRecord.ID.lower(), + WarcRecord.CONTENT_LENGTH.lower(), + WarcRecord.DATE.lower(), +)) class WarcParser(ArchiveParser): + + """WARC Parser""" + KNOWN_VERSIONS = set((b'1.0', b'0.17', b'0.18')) def parse(self, stream, offset, line=None): @@ -181,7 +192,6 @@ def parse(self, stream, offset, line=None): (record, errors). Either records is null or errors is null. Any record-specific errors are contained in the record - errors is only used when *nothing* could be parsed""" - # pylint: disable-msg=E1101 errors = [] version = None # find WARC/.* @@ -201,7 +211,7 @@ def parse(self, stream, offset, line=None): offset += len(line) if not nl_rx.match(line): errors.append(('ignored line', line)) - if len(errors) > bad_lines: + if len(errors) > BAD_LINES: errors.append(('too many errors, giving up hope',)) return (None, errors, offset) line = stream.readline() @@ -211,7 +221,6 @@ def parse(self, stream, offset, line=None): return (None, errors, offset) if line: content_length = 0 - content_type = None record = WarcRecord(errors=errors, version=version) @@ -228,11 +237,11 @@ def parse(self, stream, offset, line=None): if prefix: record.error('bad prefix on WARC version header', prefix) - #Read headers + # Read headers line = stream.readline() while line and not nl_rx.match(line): - #print 'header', repr(line) + # print 'header', repr(line) match = header_rx.match(line) if match: if match.group('nl') != b'\x0d\x0a': @@ -240,12 +249,12 @@ def parse(self, stream, offset, line=None): match.group('nl')) name = match.group('name').strip() value = [match.group('value').strip()] - #print 'match',name, value + # print 'match',name, value line = stream.readline() match = value_rx.match(line) while match: - #print 'follow', repr(line) + # print 'follow', repr(line) if match.group('nl') != b'\x0d\x0a': record.error('incorrect newline in follow header', line, match.group('nl')) @@ -258,15 +267,13 @@ def parse(self, stream, offset, line=None): record.headers.append((name, value)) if type_rx.match(name): - if value: - content_type = value - else: + if not value: record.error('invalid header', name, value) elif length_rx.match(name): try: - #print name, value + # print name, value content_length = int(value) - #print content_length + # print content_length except ValueError: record.error('invalid header', name, value) @@ -275,7 +282,7 @@ def parse(self, stream, offset, line=None): record.content_file = stream record.content_file.bytes_to_eoc = content_length - # check mandatory headers + # check mandatory headers # WARC-Type WARC-Date WARC-Record-ID Content-Length return (record, (), offset) @@ -287,14 +294,13 @@ def parse(self, stream, offset, line=None): def make_response(id, date, url, content, request_id): - # pylint: disable-msg=E1101 headers = [ - (WarcRecord.TYPE, WarcRecord.RESPONSE), - (WarcRecord.ID, id), - (WarcRecord.DATE, date), - (WarcRecord.URL, url), - + (WarcRecord.TYPE, WarcRecord.RESPONSE), + (WarcRecord.ID, id), + (WarcRecord.DATE, date), + (WarcRecord.URL, url), ] + if request_id: headers.append((WarcRecord.CONCURRENT_TO, request_id)) @@ -304,14 +310,13 @@ def make_response(id, date, url, content, request_id): def make_request(request_id, date, url, content, response_id): - # pylint: disable-msg=E1101 headers = [ - (WarcRecord.TYPE, WarcRecord.REQUEST), - (WarcRecord.ID, request_id), - (WarcRecord.DATE, date), - (WarcRecord.URL, url), - + (WarcRecord.TYPE, WarcRecord.REQUEST), + (WarcRecord.ID, request_id), + (WarcRecord.DATE, date), + (WarcRecord.URL, url), ] + if response_id: headers.append((WarcRecord.CONCURRENT_TO, response_id)) @@ -321,13 +326,12 @@ def make_request(request_id, date, url, content, response_id): def make_metadata(meta_id, date, content, concurrent_to=None, url=None): - # pylint: disable-msg=E1101 headers = [ - (WarcRecord.TYPE, WarcRecord.METADATA), - (WarcRecord.ID, meta_id), - (WarcRecord.DATE, date), - + (WarcRecord.TYPE, WarcRecord.METADATA), + (WarcRecord.ID, meta_id), + (WarcRecord.DATE, date), ] + if concurrent_to: headers.append((WarcRecord.CONCURRENT_TO, concurrent_to)) @@ -340,13 +344,12 @@ def make_metadata(meta_id, date, content, concurrent_to=None, url=None): def make_conversion(conv_id, date, content, refers_to=None, url=None): - # pylint: disable-msg=E1101 headers = [ - (WarcRecord.TYPE, WarcRecord.CONVERSION), - (WarcRecord.ID, conv_id), - (WarcRecord.DATE, date), - + (WarcRecord.TYPE, WarcRecord.CONVERSION), + (WarcRecord.ID, conv_id), + (WarcRecord.DATE, date), ] + if refers_to: headers.append((WarcRecord.REFERS_TO, refers_to)) From 29666d047018946c7e39303ad95a6ae988d2e4f6 Mon Sep 17 00:00:00 2001 From: Steve Sisney Date: Wed, 11 Dec 2019 02:14:52 +0000 Subject: [PATCH 3/6] Added python 3.5 GzipFile extension: geezip.py, using in stream.py --- hanzo/warctools/geezip.py | 111 ++++++++++++++++++++++++++++++++++++++ hanzo/warctools/stream.py | 85 +++++++++++++---------------- 2 files changed, 148 insertions(+), 48 deletions(-) create mode 100644 hanzo/warctools/geezip.py diff --git a/hanzo/warctools/geezip.py b/hanzo/warctools/geezip.py new file mode 100644 index 0000000..92d87b8 --- /dev/null +++ b/hanzo/warctools/geezip.py @@ -0,0 +1,111 @@ +"""Extends gzip.GzipFile for raw gzip offset in python 2 and 3""" + +import gzip +import io + +try: + import builtins +except ImportError: + pass + +try: + + # this branch contributed by Kenji: + # https://github.com/kngenie/warctools/commit/159bfdfa45cc0b51ed4a4a4d7d744ef7bf82ae23 + + # Python 3.5 got a major change to gzip module. Essential gunzip + # work is now implemented in _GzipReader and GzipFile simply wraps + # around it. + + class _GeeZipReader(gzip._GzipReader): + + """Extends python 3.5 gzip._GzipReader""" + + def _read_gzip_header(self): + pos = self._raw_pos() + has_record = super(_GeeZipReader, self)._read_gzip_header() + + if has_record: + self.member_offset = pos + + return has_record + + def _raw_pos(self): + """Return offset in raw gzip file corresponding to this + object's state.""" + + # _fp is PaddedFile object with prepend method. it doesn't have + # tell(). It has seek(), but it's useless as a replacement for + # tell(). We need to compute offset from internal attributes. + + pos = self._fp.file.tell() + + if self._fp._read is not None: + pos -= (self._fp._length - self._fp._read) + + return pos + + class GeeZipFile(gzip.GzipFile): + + def __init__(self, filename=None, mode=None, fileobj=None): + if mode is None: + mode = getattr(fileobj, 'mode', 'rb') + + if mode.startswith('r'): + if mode and 'b' not in mode: + mode += 'b' + if fileobj is None: + fileobj = self.myfileobj = builtins.open( + filename, mode or 'rb') + if filename is None: + filename = getattr(fileobj, 'name', '') + if not isinstance(filename, (str, bytes)): + filename = '' + self.mode = gzip.READ + raw = _GeeZipReader(fileobj) + self._buffer = io.BufferedReader(raw) + self.name = filename + self.fileobj = fileobj + self._raw = raw + else: + super(GeeZipFile, self).__init__(filename, mode, fileobj) + + @property + def member_offset(self): + return self._raw.member_offset + +except AttributeError: + + # this branch falls back to python 2.7+ + + class GeeZipFile(gzip.GzipFile): + """Extends gzip.GzipFile to remember self.member_offset, the raw + file offset of the current gzip member.""" + + def __init__(self, filename=None, mode=None, + compresslevel=9, fileobj=None, mtime=None): + print('[DEBUG] GeeZipFile(gzip.GzipFile)') + print('[DEBUG] => fileobj', fileobj) + + # ignore mtime for python 2.6 + gzip.GzipFile.__init__(self, filename=filename, mode=mode, + compresslevel=compresslevel, + fileobj=fileobj) + + self.member_offset = None + print('[DEBUG] => member_offset', self.member_offset) + + # hook in to the place we seem to be able to reliably get the raw gzip + # member offset + def _read(self, size=1024): + if self._new_member: + try: + # works for python3.2 + self.member_offset = (self.fileobj.tell() + - self.fileobj._length + + (self.fileobj._read or 0)) + except AttributeError: + # works for python2.7 + self.member_offset = self.fileobj.tell() + + return gzip.GzipFile._read(self, size) diff --git a/hanzo/warctools/stream.py b/hanzo/warctools/stream.py index 1fecc91..360964a 100644 --- a/hanzo/warctools/stream.py +++ b/hanzo/warctools/stream.py @@ -5,47 +5,54 @@ from hanzo.warctools.archive_detect import is_gzip_file, guess_record_type +from . import s3 +from .geezip import GeeZipFile + +CHUNK_SIZE = 8192 # the size to read in, make this bigger things go faster. + + def open_record_stream(record_class=None, filename=None, file_handle=None, - mode="rb", gzip="auto", offset=None, length=None): + mode="rb", _gzip="auto", offset=None, length=None): """Can take a filename or a file_handle. Normally called indirectly from A record class i.e WarcRecord.open_archive. If the first parameter is None, will try to guess""" if file_handle is None: if filename.startswith('s3://'): - from . import s3 file_handle = s3.open_url(filename, offset=offset, length=length) else: file_handle = open(filename, mode=mode) if offset is not None: file_handle.seek(offset) - if record_class == None: + if record_class is None: record_class = guess_record_type(file_handle) - if record_class == None: + if record_class is None: raise Exception('Failed to guess compression') record_parser = record_class.make_parser() - if gzip == 'auto': - if (filename and filename.endswith('.gz')) or is_gzip_file(file_handle): - gzip = 'record' - #debug('autodetect: record gzip') + if _gzip == 'auto': + if ((filename and filename.endswith('.gz')) + or is_gzip_file(file_handle)): + _gzip = 'record' + # debug('autodetect: record gzip') else: # assume uncompressed file - #debug('autodetected: uncompressed file') - gzip = None + # debug('autodetected: uncompressed file') + _gzip = None - if gzip == 'record': + if _gzip == 'record': return GzipRecordStream(file_handle, record_parser) - elif gzip == 'file': + + if _gzip == 'file': return GzipFileStream(file_handle, record_parser) - else: - return RecordStream(file_handle, record_parser) + return RecordStream(file_handle, record_parser) -class RecordStream(object): + +class RecordStream(): """A readable/writable stream of Archive Records. Can be iterated over or read_records can give more control, and potentially offset information. """ @@ -98,7 +105,9 @@ def _read_record(self, offsets): if not re.match(br'^[\r\n]+$', line): break - record, errors, offset = self.record_parser.parse(self, offset, line) + record, errors, offset = self.record_parser.parse( + self, offset, line) + return offset, record, errors def write(self, record): @@ -117,7 +126,8 @@ def _skip_to_eoc(self): read_size = min(CHUNK_SIZE, self.bytes_to_eoc) buf = self._read(read_size) if len(buf) < read_size: - raise Exception('expected {} bytes but only read {}'.format(read_size, len(buf))) + raise Exception('expected {} bytes but only read {}'.format( + read_size, len(buf))) def _read(self, count=None): """Raw read, will read into next record if caller isn't careful""" @@ -178,36 +188,14 @@ def readline(self, maxlen=None): self.bytes_to_eoc -= len(result) return result -CHUNK_SIZE = 8192 # the size to read in, make this bigger things go faster. - -class GeeZipFile(gzip.GzipFile): - """Extends gzip.GzipFile to remember self.member_offset, the raw file - offset of the current gzip member.""" - - def __init__(self, filename=None, mode=None, - compresslevel=9, fileobj=None, mtime=None): - # ignore mtime for python 2.6 - gzip.GzipFile.__init__(self, filename=filename, mode=mode, compresslevel=compresslevel, fileobj=fileobj) - self.member_offset = None - - # hook in to the place we seem to be able to reliably get the raw gzip - # member offset - def _read(self, size=1024): - if self._new_member: - try: - # works for python3.2 - self.member_offset = self.fileobj.tell() - self.fileobj._length + (self.fileobj._read or 0) - except AttributeError: - # works for python2.7 - self.member_offset = self.fileobj.tell() - - return gzip.GzipFile._read(self, size) class GzipRecordStream(RecordStream): """A stream to read/write concatted file made up of gzipped archive records""" + def __init__(self, file_handle, record_parser): - RecordStream.__init__(self, GeeZipFile(fileobj=file_handle), record_parser) + RecordStream.__init__(self, GeeZipFile(fileobj=file_handle), + record_parser) self.raw_fh = file_handle def _read_record(self, offsets): @@ -221,8 +209,8 @@ def _read_record(self, offsets): if not re.match(br'^[\r\n]+$', line): break - record, errors, _offset = \ - self.record_parser.parse(self, offset=None, line=line) + record, errors, _offset = self.record_parser.parse( + self, offset=None, line=line) offset = self.fh.member_offset @@ -231,9 +219,11 @@ def _read_record(self, offsets): def seek(self, offset, pos=0): """Same as a seek on a file""" self.raw_fh.seek(offset, pos) - # trick to avoid closing and recreating GzipFile, does it always work? + # XXX trick to avoid closing and recreating GzipFile, does it + # always work? self.fh._new_member = True + class GzipFileStream(RecordStream): """A stream to read/write gzipped file made up of all archive records""" def __init__(self, file_handle, record): @@ -251,8 +241,7 @@ def _read_record(self, offsets): if not re.match(br'^[\r\n]+$', line): break - record, errors, _offset = \ - self.record_parser.parse(self, offset=None, line=line) + record, errors, offset = self.record_parser.parse( + self, offset=None, line=line) return offset, record, errors - From 97133a7b01745c29afa433aea2634c046675c3fe Mon Sep 17 00:00:00 2001 From: Steve Sisney Date: Wed, 11 Dec 2019 02:16:45 +0000 Subject: [PATCH 4/6] Very minor readability (not lint) fixes to test_warctols.py --- hanzo/warctools/tests/test_warctools.py | 123 +++++++++++++----------- 1 file changed, 66 insertions(+), 57 deletions(-) diff --git a/hanzo/warctools/tests/test_warctools.py b/hanzo/warctools/tests/test_warctools.py index 4576da5..6e7bdd9 100644 --- a/hanzo/warctools/tests/test_warctools.py +++ b/hanzo/warctools/tests/test_warctools.py @@ -1,16 +1,9 @@ # vim: set sw=4 et: +import gzip import unittest -# want unittest2 for python2.6 -try: - unittest.TestCase.assertIsNone -except AttributeError: - import unittest2 - unittest = unittest2 - -import tempfile -import gzip +from datetime import datetime from hanzo import warctools, httptools try: @@ -19,17 +12,24 @@ from StringIO import StringIO BytesIO = StringIO +try: + unittest.TestCase.assertIsNone +except AttributeError: + import unittest2 + unittest = unittest2 + + class ArcRecordTerminatorTest(unittest.TestCase): REC1_CONTENT = (b'1 0 InternetArchive\n' - + b'URL IP-address Archive-date Content-type Archive-length\n' - + b'Here is some funky arc header content!\n') + b'URL IP-address Archive-date Content-type Archive-length\n' + b'Here is some funky arc header content!\n') RECORD1 = b'filedesc://ArcRecordTerminatorTest.arc 0.0.0.0 20131113000000 text/plain ' + str(len(REC1_CONTENT)).encode('ascii') + b'\n' + REC1_CONTENT REC2_CONTENT = (b'HTTP/1.1 200 OK\r\n' - + b'Content-Type: text/plain\r\n' - + b'Content-Length: 12\r\n' - + b'\r\n' - + b'01234567890\r\n') + b'Content-Type: text/plain\r\n' + b'Content-Length: 12\r\n' + b'\r\n' + b'01234567890\r\n') RECORD2 = b'http://example.org/ 192.168.1.1 20131113000000 text/plain ' + str(len(REC2_CONTENT)).encode('ascii') + b'\n' + REC2_CONTENT REC1_GZ = b"\x1f\x8b\x08\x00\xbf\xa9\x99R\x02\xff=NK\x0e\x820\x14\xdc\xf7\x14\xcf\x03\xf0\xa9\xc4\x8d;\xe3F\x12\x17\x86\xe0\x01\x9av\x90Fh\xc9\xeb\xd3\xc8\xedE4\xce\xec\xe6\x97\xe9\xfc\x00\x87d\xf7Eq`\xdb\xc0Fv-x\xf4\xc1H\xe4\x16Ir\xc3\x96\xca|%mK]i\xad\xabr\x05\t^RL\x83\xf1\x81\xb4\xde)M%\xd5A\xc0\x01\xb2\xac\xf5\xfe\tum\xceT_2\xe3\x1c#%\xfa\xc9\x993\x02:\xc6%\x1c$\x93y\xc2\xdf\x19\x10n\xd2\xab\x13\x18\xe4\x13\xa58\x82\xbaG\xb8\xcf\xf49\xd2\xc380\xd9os\xa3\xd4\x1b\xa0\xa9\x1c5\xc1\x00\x00\x00" @@ -50,7 +50,7 @@ def _test_terminator(self, terminator): self._run_checks(fin, terminator, False) finally: fin.close() - + fin = self._arc_gz(terminator) try: self._run_checks(fin, terminator, True) @@ -113,27 +113,31 @@ def runTest(self): self._test_terminator(b'\r\r\r\r\r\r\n\n') self._test_terminator(b'\r\r\r\r\r\r\n\n\n') + # added this to get stdout from test runner when tests pass + # raise ValueError('[DEBUG] ArcRecordTerminatortest Done.') + + class WarcRecordTerminatorTest(unittest.TestCase): RECORD1 = (b'WARC/1.0\r\n' - + b'WARC-Record-ID: \r\n' - + b'WARC-Type: warcinfo\r\n' - + b'Content-Type: application/warc-fields\r\n' - + b'Content-Length: 30\r\n' - + b'\r\n' - + b'format: WARC File Format 1.0\r\n') + b'WARC-Record-ID: \r\n' + b'WARC-Type: warcinfo\r\n' + b'Content-Type: application/warc-fields\r\n' + b'Content-Length: 30\r\n' + b'\r\n' + b'format: WARC File Format 1.0\r\n') RECORD2 = (b'WARC/1.0\r\n' - + b'WARC-Type: response\r\n' - + b'WARC-Record-ID: \r\n' - + b'WARC-Target-URI: http://example.org/\r\n' - + b'Content-Type: application/http;msgtype=response\r\n' - + b'Content-Length: 78\r\n' - + b'\r\n' - + b'HTTP/1.1 200 OK\r\n' - + b'Content-Type: text/plain\r\n' - + b'Content-Length: 12\r\n' - + b'\r\n' - + b'01234567890\r\n') + b'WARC-Type: response\r\n' + b'WARC-Record-ID: \r\n' + b'WARC-Target-URI: http://example.org/\r\n' + b'Content-Type: application/http;msgtype=response\r\n' + b'Content-Length: 78\r\n' + b'\r\n' + b'HTTP/1.1 200 OK\r\n' + b'Content-Type: text/plain\r\n' + b'Content-Length: 12\r\n' + b'\r\n' + b'01234567890\r\n') RECORD1_GZ = b'\x1f\x8b\x08\x00\xce\xae\x99R\x02\xff\x0bw\x0cr\xd67\xd43\xe0\xe5\n\x07\xb2t\x83R\x93\xf3\x8bRt=]\xac\x14lJ\x8b\xf2\xacJK3S\xac\x0c\xa0@\x17\x0b\x01\x03vP\x03B*\x0bR\xad\x14\xca\x13\x8b\x923\xf3\xd2\xf2y\xb9\x9c\xf3\xf3JR\xf3J\xa0\xe2\x89\x05\x059\x99\xc9\x89%\x99\xf9y\xfa 5\xbai\x99\xa99)\xc5\x08e>\xa9y\xe9%\x19V\n\xc6@\x07\xf1r\xa5\xe5\x17\xe5&\x96X)\x80LVp\xcb\xccIUp\x03\x8b(\x80\x1d\x0c\x82\x00\x04h\xbe\xd2\xbf\x00\x00\x00' RECORD2_GZ = b'\x1f\x8b\x08\x00\xce\xae\x99R\x02\xffm\x8f\xc9\n\xc20\x10\x86\xef\x81\xbcC^\xa0MR\x97j\\@\xeaAQPJ\xa5\xe7\xa0C-\xd4$\xa4S\xd0\xb7\xb7\x85\x16A\xfd\x0f\xc3\xac\xdf\xcc\xe4\x9b4\xe12\x14\x94\xe4\xad\x17d/\x07\x8ay\xa8\x9d55\xf4\xc9\x14\xae\xd6\xdf\x82\xfdV\xb1e\xe3\x8dj\x9a\xf2\xa6D\xaf\xe0\x8f\xe9%\xd7\x03U\xfb\x020\xb8\xa4{\xc5\xee\x88Nq\x0eO\xfdp\x15\x84\xd6\x17\x9c\x92\xc4\x1a\x04\x83\xfdz\xed\\U^5\x96\xd6\xf0\xae}\xf1\xa8\x0bl+\xab\xcf]\xc3\xc0\x11L\x81w\xc5\xe2\x19%\x94\xec\xb2\xec\xdc>#Y$\x04;\x1d\xbe\xb9\x08O\xe4\xae\xd2\xa5\xf9\x05\xc8\xa8\x03\x08\x19\x8d\xc6\x93i<\x9b\x8b.\xa4\xe4\rV`\x1c`\x1f\x01\x00\x00' @@ -153,7 +157,7 @@ def _test_terminator(self, terminator): self._run_checks(fin, terminator, False) finally: fin.close() - + fin = self._warc_gz(terminator) try: self._run_checks(fin, terminator, True) @@ -197,7 +201,7 @@ def _run_checks(self, fin, terminator, gzipped): def runTest(self): # anything works as long as it contains only \r and \n and ends with \n - self._test_terminator(b'\r\n\r\n') # the good one + self._test_terminator(b'\r\n\r\n') # the good one self._test_terminator(b'\r\n') self._test_terminator(b'\n\r\n') self._test_terminator(b'\n\n\r\n') @@ -213,11 +217,15 @@ def runTest(self): self._test_terminator(b'\r\r\r\r\r\r\n\n') self._test_terminator(b'\r\r\r\r\r\r\n\n\n') + # added this to get stdout from test runner when tests pass + # raise ValueError('[DEBUG] WarcRecordTerminatortest Done.') + class WarcWritingTest(unittest.TestCase): # XXX should this a part of the library? - def build_warc_record(self, url, warc_date=None, content_buffer=None, + def build_warc_record( + self, url, warc_date=None, content_buffer=None, content_file=None, content_length=None, concurrent_to=None, warc_type=None, content_type=None, remote_ip=None, profile=None, refers_to=None, refers_to_target_uri=None, refers_to_date=None, @@ -232,9 +240,11 @@ def build_warc_record(self, url, warc_date=None, content_buffer=None, headers = [] if warc_type is not None: headers.append((warctools.WarcRecord.TYPE, warc_type)) + headers.append((warctools.WarcRecord.ID, record_id)) headers.append((warctools.WarcRecord.DATE, warc_date)) headers.append((warctools.WarcRecord.URL, url)) + if remote_ip is not None: headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip)) if profile is not None: @@ -269,43 +279,43 @@ def build_warc_record(self, url, warc_date=None, content_buffer=None, def build_record_using_tuple(self): content_buffer = b'Luke, I am your payload' - record = self.build_warc_record(url=b'http://example.org/', - content_buffer=content_buffer, - record_id=b'', - warc_date=b'2013-11-15T00:00:00Z', - warc_type=warctools.WarcRecord.RESPONSE, - content_type=httptools.RequestMessage.CONTENT_TYPE) + record = self.build_warc_record( + url=b'http://example.org/', + content_buffer=content_buffer, + record_id=b'', + warc_date=b'2013-11-15T00:00:00Z', + warc_type=warctools.WarcRecord.RESPONSE, + content_type=httptools.RequestMessage.CONTENT_TYPE) return record def build_record_using_stream(self): content_buffer = b'Shmuke, I gam four snayglob' fh = BytesIO(content_buffer) - record = self.build_warc_record(url=b'http://example.org/', - content_file=fh, content_length=str(len(content_buffer)).encode('ascii'), - record_id=b'', - warc_date=b'2013-11-15T00:00:00Z', - warc_type=warctools.WarcRecord.RESPONSE, - content_type=httptools.RequestMessage.CONTENT_TYPE) + record = self.build_warc_record( + url=b'http://example.org/', + content_file=fh, content_length=str(len(content_buffer)).encode('ascii'), + record_id=b'', + warc_date=b'2013-11-15T00:00:00Z', + warc_type=warctools.WarcRecord.RESPONSE, + content_type=httptools.RequestMessage.CONTENT_TYPE) return record - def test_write_using_tuple(self): record = self.build_record_using_tuple() f = BytesIO() record.write_to(f) self.assertEqual(f.getvalue(), - b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your payload\r\n\r\n') + b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your payload\r\n\r\n') f.close() # should work again if we do it again f = BytesIO() record.write_to(f) - self.assertEqual(f.getvalue(), - b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your payload\r\n\r\n') + self.assertEqual(f.getvalue(), + b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your payload\r\n\r\n') f.close() - def test_write_using_tuple_gz(self): record = self.build_record_using_tuple() @@ -326,14 +336,14 @@ def test_write_using_tuple_gz(self): g.close() f.close() - def test_write_using_stream(self): record = self.build_record_using_stream() f = BytesIO() record.write_to(f) - self.assertEqual(f.getvalue(), - b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 27\r\n\r\nShmuke, I gam four snayglob\r\n\r\n') + self.assertEqual( + f.getvalue(), + b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 27\r\n\r\nShmuke, I gam four snayglob\r\n\r\n') f.close() # throws exception because record.content_file position has advanced @@ -342,7 +352,6 @@ def test_write_using_stream(self): record.write_to(f) f.close() - def test_write_using_stream_gz(self): record = self.build_record_using_stream() From 433a7f58981ed3800859c99917eb9a5c59b66c82 Mon Sep 17 00:00:00 2001 From: Steve Sisney Date: Wed, 11 Dec 2019 02:22:06 +0000 Subject: [PATCH 5/6] Merged .pylintrc directives into pylint.rc --- .pylintrc | 2 -- pylint.rc | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) delete mode 100644 .pylintrc diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index c234cea..0000000 --- a/.pylintrc +++ /dev/null @@ -1,2 +0,0 @@ -[TYPECHECK] -ignored-classes=ArcRecord,WarcRecord,WarcParser \ No newline at end of file diff --git a/pylint.rc b/pylint.rc index 88e7d5b..e1af048 100644 --- a/pylint.rc +++ b/pylint.rc @@ -158,7 +158,7 @@ ignore-mixin-members=yes # List of classes names for which member attributes should not be checked # (useful for classes with attributes dynamically set). -ignored-classes=SQLObject +ignored-classes=SQLObject,ArcRecord,WarcRecord,WarcParser # When zope mode is activated, add a predefined set of Zope acquired attributes # to generated-members. From 47693d9eb579add3938d76fb8cc89e549db3ddf3 Mon Sep 17 00:00:00 2001 From: Steve Sisney Date: Wed, 11 Dec 2019 02:23:57 +0000 Subject: [PATCH 6/6] Removed support for py32, py33 from .travis.yml --- .travis.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 86d04d3..bd1d72d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,9 +2,6 @@ language: python python: - 2.7 - - 3.2 - - 3.3 - - 3.4 - 3.5 - nightly - pypy @@ -12,7 +9,6 @@ python: matrix: allow_failures: - - python: 3.5 - python: nightly script: python setup.py test