Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,13 @@ language: python

python:
- 2.7
- 3.2
- 3.3
- 3.4
- 3.5
- nightly
- pypy
- pypy3

matrix:
allow_failures:
- python: 3.5
- python: nightly

script: python setup.py test
Expand Down
86 changes: 49 additions & 37 deletions hanzo/warctools/arc.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,61 @@
"""An object to represent arc records
http://archive.org/web/researcher/ArcFileFormat.php
"""
ARC Record
~~~~~~~~~~
"""

import re

from hanzo.warctools.record import ArchiveRecord, ArchiveParser
from hanzo.warctools.archive_detect import register_record_type


# URL<sp>IP-address<sp>Archive-date<sp>Content-type<sp>
#Result-code<sp>Checksum<sp>Location<sp> Offset<sp>Filename<sp>
#Archive-length<nl>
#
# Result-code<sp>Checksum<sp>Location<sp> Offset<sp>Filename<sp>
# Archive-length<nl>

@ArchiveRecord.HEADERS(
    URL=b'URL',
    IP=b'IP-address',
    DATE=b'Archive-date',
    CONTENT_TYPE=b'Content-type',
    CONTENT_LENGTH=b'Archive-length',
    RESULT_CODE=b'Result-code',
    CHECKSUM=b'Checksum',
    LOCATION=b'Location',
    OFFSET=b'Offset',
    FILENAME=b'Filename',
)
class ArcRecord(ArchiveRecord):
    """An object to represent arc records
    http://archive.org/web/researcher/ArcFileFormat.php

    The header names passed to ``ArchiveRecord.HEADERS`` above are the
    byte-string field names of an ARC record; each keyword is passed exactly
    once (repeating a keyword in a call is a syntax error).
    """

    TRAILER = b'\n'  # an ARC record is trailed by single unix newline

    def __init__(self, headers=None, content=None, errors=None):
        """Forward ``headers``, ``content`` and ``errors`` to ArchiveRecord."""
        # Initialise the base class exactly once.
        ArchiveRecord.__init__(self, headers, content, errors)

    @property
    def type(self):
        """Always ``b"response"`` for an arc record."""
        return b"response"

    def _write_to(self, out, nl):
        """Intentionally does nothing; ``out`` and ``nl`` are ignored."""
        # TODO: empty method?
        pass

    @classmethod
    def make_parser(cls):
        """Constructs a parser for arc records."""
        return ArcParser()


class ArcRecordHeader(ArcRecord):
"""Represents the headers in an arc record."""
def __init__(self, headers=None, content=None, errors=None, version=None,
raw_headers=None):
ArcRecord.__init__(self, headers, content, errors)
ArcRecord.__init__(self, headers, content, errors)
self.version = version
self.raw_headers = raw_headers

Expand All @@ -60,20 +67,22 @@ def raw(self):
"""Return the raw representation of this record."""
return b"".join(self.raw_headers) + self.content[1]


def rx(pat):
    """Compile ``pat`` as a case-insensitive regular expression.

    ``pat`` may be a str or a bytes pattern; the compiled pattern object is
    returned.
    """
    compiled = re.compile(pat, re.IGNORECASE)
    return compiled


# NOTE(review): nl_rx is a str pattern while the patterns below are bytes,
# and the anchors bind only to the first and last alternatives - confirm
# this is what callers expect.
nl_rx = rx('^\r\n|\r|\n$')

# Case-insensitive, whole-string matchers for the two header names. Each is
# defined exactly once. E1101 (no-member) is silenced on the attribute
# lookups; presumably the HEADERS decorator adds them dynamically.
length_rx = rx(b'^' + ArcRecord.CONTENT_LENGTH + b'$')  # pylint: disable=E1101
type_rx = rx(b'^' + ArcRecord.CONTENT_TYPE + b'$')  # pylint: disable=E1101

# Splits a bytes header line on a single whitespace character that touches a
# word boundary on either side.
SPLIT = re.compile(br'\b\s|\s\b').split


class ArcParser(ArchiveParser):
"""A parser for arc archives."""


def __init__(self):
self.version = 0
# we don't know which version to parse initially - a v1 or v2 file so
Expand All @@ -82,16 +91,18 @@ def __init__(self):

# question? will we get arc fragments?
# should we store both headers & detect records by header length?
# if we don't know
# if we don't know

self.headers = []

def parse(self, stream, offset, line=None):
"""Parses a stream as an arc archive and returns an Arc record along
with the offset in the stream of the end of the record."""

record = None
content_type = None
# content_type = None
content_length = None

if line is None:
line = stream.readline()

Expand All @@ -115,18 +126,18 @@ def parse(self, stream, offset, line=None):
# configure parser instance
self.version = arc_version.split()[0]
self.headers = arc_names_line.strip().split()

# now we have read header field in record body
# we can extract the headers from the current record,
# and read the length field

# which is in a different place with v1 and v2
# read headers

# read headers
arc_headers = self.parse_header_list(line)

# extract content, ignoring header lines parsed already
content_type, content_length, errors = \
_content_type, content_length, errors = \
self.get_content_headers(arc_headers)

content_length = content_length \
Expand All @@ -141,10 +152,10 @@ def parse(self, stream, offset, line=None):
if not self.headers:
raise Exception('missing filedesc')
headers = self.parse_header_list(line)
content_type, content_length, errors = \
_content_type, content_length, errors = \
self.get_content_headers(headers)

record = ArcRecord(headers = headers, errors=errors)
record = ArcRecord(headers=headers, errors=errors)

line = None

Expand All @@ -153,28 +164,29 @@ def parse(self, stream, offset, line=None):

return (record, (), offset)

def trim(self, stream):
    """Return an empty tuple; ``stream`` is never read."""
    nothing = ()
    return nothing

def parse_header_list(self, line):
    """Return the record headers as a list of ``(name, value)`` pairs.

    ``line`` is a bytes header line; trailing CR/LF characters are removed
    and the rest is split with ``SPLIT``. If the number of values does not
    match ``self.headers``, the line is split again with at most
    ``len(self.headers) - 1`` splits:

    * when the first header is the URL or the content type, the split runs
      from the right-hand end of the line, so surplus separators end up
      inside the first value;
    * otherwise it runs from the left, so the surplus ends up in the last
      value.

    Raises:
        Exception: if the number of values still differs from the number
            of headers.
    """
    expected = len(self.headers)

    # some people use ' ' as the empty value. lovely.
    line = line.rstrip(b'\r\n')
    values = SPLIT(line)
    if len(values) != expected:
        if self.headers[0] in (ArcRecord.URL, ArcRecord.CONTENT_TYPE):
            # fencepost: reverse the line, split, then undo the reversal of
            # both the pieces and their order
            values = [s[::-1] for s in reversed(
                SPLIT(line[::-1], expected - 1))]
        else:
            values = SPLIT(line, expected - 1)

    if len(values) != expected:
        # Values and header names are bytes; join them as bytes and decode
        # for the message. Joining with a str separator raises TypeError on
        # Python 3 and would hide this error.
        raise Exception('missing headers %s %s' % (
            b",".join(values).decode('latin-1'),
            b",".join(self.headers).decode('latin-1')))

    return list(zip(self.headers, values))

@staticmethod
def get_content_headers(headers):
"""returns content_type, content_length, errors from headers"""
content_type = None
content_length = None
errors = []
Expand Down
111 changes: 111 additions & 0 deletions hanzo/warctools/geezip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""Extends gzip.GzipFile for raw gzip offset in python 2 and 3"""

import gzip
import io

try:
import builtins
except ImportError:
pass

try:

    # this branch contributed by Kenji:
    # https://github.com/kngenie/warctools/commit/159bfdfa45cc0b51ed4a4a4d7d744ef7bf82ae23

    # Python 3.5 got a major change to gzip module. Essential gunzip
    # work is now implemented in _GzipReader and GzipFile simply wraps
    # around it. If gzip._GzipReader does not exist, the class statement
    # below raises AttributeError and the fallback branch is used instead.

    class _GeeZipReader(gzip._GzipReader):
        """Extends python 3.5 gzip._GzipReader to remember, in
        ``member_offset``, the raw file offset at which the most recently
        read gzip member header started."""

        # None until the first member header has been read, matching the
        # fallback implementation below.
        member_offset = None

        def _read_gzip_header(self):
            """Read the next member header, recording where it started.

            Returns whatever the base implementation returns; the offset
            is only updated when that value is truthy.
            """
            # Capture the position *before* the header bytes are consumed.
            pos = self._raw_pos()
            has_record = super(_GeeZipReader, self)._read_gzip_header()

            if has_record:
                self.member_offset = pos

            return has_record

        def _raw_pos(self):
            """Return offset in raw gzip file corresponding to this
            object's state."""

            # _fp is PaddedFile object with prepend method. it doesn't have
            # tell(). It has seek(), but it's useless as a replacement for
            # tell(). We need to compute offset from internal attributes.

            pos = self._fp.file.tell()

            # When _read is not None, part of the prepended buffer is still
            # unread; those bytes have already been taken from the raw file,
            # so subtract them.
            if self._fp._read is not None:
                pos -= (self._fp._length - self._fp._read)

            return pos

    class GeeZipFile(gzip.GzipFile):
        """Extends gzip.GzipFile to expose ``member_offset``, the raw
        file offset of the current gzip member."""

        def __init__(self, filename=None, mode=None, fileobj=None):
            """Open a gzip file for reading (tracking member offsets) or
            delegate to gzip.GzipFile for every other mode.

            ``mode`` defaults to ``fileobj.mode`` when available, else
            ``'rb'``.
            """
            if mode is None:
                mode = getattr(fileobj, 'mode', 'rb')

            if mode.startswith('r'):
                # Read mode: build the reader ourselves so that our
                # offset-tracking _GeeZipReader is used.
                if 'b' not in mode:
                    mode += 'b'
                if fileobj is None:
                    fileobj = self.myfileobj = builtins.open(filename, mode)
                if filename is None:
                    filename = getattr(fileobj, 'name', '')
                    if not isinstance(filename, (str, bytes)):
                        filename = ''
                self.mode = gzip.READ
                raw = _GeeZipReader(fileobj)
                self._buffer = io.BufferedReader(raw)
                self.name = filename
                self.fileobj = fileobj
                self._raw = raw
            else:
                # Pass by keyword: the third positional parameter of
                # GzipFile.__init__ is compresslevel, not fileobj.
                super(GeeZipFile, self).__init__(
                    filename=filename, mode=mode, fileobj=fileobj)

        @property
        def member_offset(self):
            """Raw offset of the current gzip member, or None when no
            member header has been read (or the file is not in read
            mode)."""
            raw = getattr(self, '_raw', None)
            if raw is None:
                return None
            return raw.member_offset

except AttributeError:

    # this branch falls back to python 2.7+

    class GeeZipFile(gzip.GzipFile):
        """Extends gzip.GzipFile to remember self.member_offset, the raw
        file offset of the current gzip member."""

        def __init__(self, filename=None, mode=None,
                     compresslevel=9, fileobj=None, mtime=None):
            """Accepts the gzip.GzipFile arguments; ``mtime`` is accepted
            but not forwarded."""
            # ignore mtime for python 2.6
            gzip.GzipFile.__init__(self, filename=filename, mode=mode,
                                   compresslevel=compresslevel,
                                   fileobj=fileobj)

            # None until the first member has been encountered by _read().
            self.member_offset = None

        # hook in to the place we seem to be able to reliably get the raw gzip
        # member offset
        def _read(self, size=1024):
            """Record the raw offset when a new member starts, then
            delegate to gzip.GzipFile._read."""
            if self._new_member:
                try:
                    # works for python3.2
                    self.member_offset = (self.fileobj.tell()
                                          - self.fileobj._length
                                          + (self.fileobj._read or 0))
                except AttributeError:
                    # works for python2.7
                    self.member_offset = self.fileobj.tell()

            return gzip.GzipFile._read(self, size)
Loading