diff --git a/README.md b/README.md index d1b360b..1977d04 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,99 @@ -# python-docx2txt # +# python-docx2txt -A pure python-based utility to extract text from docx files. +## Introduction -The code is taken and adapted from [python-docx](https://github.com/python-openxml/python-docx). It can however also extract text from header, footer and hyperlinks. __It can now also extract images.__ +A pure Python-based utility to extract text from docx files. -## How to install? ## -```bash +The code is taken and adapted from [python-docx](https://github.com/python-openxml/python-docx). +It can however also extract text from header, footer and hyperlinks. +__It can now also extract images and properties.__ + +It can be used as a [Python library](#python-library) +or from the [command line](#command-line-utility). + +## Python Library + +### Library Installation + +```sh pip install docx2txt ``` -## How to run? ## +### Library Usage -a. From command line: -```bash -# extract text -docx2txt file.docx -# extract text and images -docx2txt -i /tmp/img_dir file.docx +#### Procedural + +The library is easy to use procedurally. + +```py +>>> import docx2txt +>>> # get document text +>>> docx2txt.process('file.docx') +'header_textmain_textfooter_text' +>>> # or +>>> # get document text, extract images to /tmp/img_dir +>>> docx2txt.process('file.docx', img_dir='/tmp/img_dir/') +'header_textmain_textfooter_text' +``` + +#### Object Oriented + +The DocxFile class provides more granularity. +Its argument list and accompanying behaviors are identical to `process()`. +Document properties are stored as a dictionary. +No keys are guaranteed, so the get() method is recommended. 
+ +```py +>>> import docx2txt +>>> # parse Word doc +>>> document = docx2txt.DocxFile('file.docx', img_dir='/tmp/img_dir/') +>>> # path to file +>>> document.path +'/absolute/path/to/file.docx' +>>> # all document text +>>> document.text +'header_textmain_textfooter_text' +>>> # image directory +>>> document.img_dir +'/tmp/img_dir' +>>> # text components +>>> '||'.join([document.header, document.main, document.footer]) +'header_text||main_text||footer_text' +>>> # images (filename only if not extracted) +>>> document.images +['/tmp/img_dir/image1.jpg', '/tmp/img_dir/image2.jpg'] +>>> # document properties +>>> document.properties +{'property_name': 'property value', ...} +>>> document.properties['title'] +'title_text' +>>> document.properties['nonexistent'] +KeyError +>>> document.properties.get('nonexistent') +None ``` -b. From python: -```python -import docx2txt -# extract text -text = docx2txt.process("file.docx") +## Command Line Utility + +### Utility Installation + +With this README file as the working directory: -# extract text and write images in /tmp/img_dir -text = docx2txt.process("file.docx", "/tmp/img_dir") +```sh +python setup.py install ``` + +### Utility Usage + +```sh +# simple text extraction +docx2txt file.docx +# get text, extract images to /tmp/img_dir +docx2txt -i /tmp/img_dir file.docx +# get all document data +docx2txt -d file.docx +# get all data, extract images to /tmp/img_dir +docx2txt -d -i /tmp/img_dir file.docx +# same as previous, more simply: +docx2txt -di /tmp/img_dir file.docx +``` \ No newline at end of file diff --git a/bin/docx2txt b/bin/docx2txt index 62157c2..992dffc 100755 --- a/bin/docx2txt +++ b/bin/docx2txt @@ -4,6 +4,5 @@ import docx2txt if __name__ == '__main__': import sys - args = docx2txt.process_args() - text = docx2txt.process(args.docx, args.img_dir) - sys.stdout.write(text.encode('utf-8')) + for line in docx2txt.get_output(): + sys.stdout.write(line) diff --git a/docx2txt/__init__.py b/docx2txt/__init__.py 
index 9f51a73..031c20a 100644 --- a/docx2txt/__init__.py +++ b/docx2txt/__init__.py @@ -1,4 +1,4 @@ -from .docx2txt import process -from .docx2txt import process_args +from .docx2txt import get_output, process # noqa +from .docx_file import DocxFile # noqa -VERSION = '0.7' +VERSION = '0.8' diff --git a/docx2txt/dict_util.py b/docx2txt/dict_util.py new file mode 100644 index 0000000..4616fc1 --- /dev/null +++ b/docx2txt/dict_util.py @@ -0,0 +1,18 @@ +"""Dictionary Utilities""" + + +def merge(dicts): + # type: (list) -> dict + merged = {} # type: dict + for d in dicts: + merged.update(d) + + return merged + + +def filter_key(src_dict, key_test, getter=dict.values): + return getter({ + key: val + for key, val + in src_dict.items() + if key_test(key)}) diff --git a/docx2txt/docx2txt.py b/docx2txt/docx2txt.py index 0dac072..596f2df 100755 --- a/docx2txt/docx2txt.py +++ b/docx2txt/docx2txt.py @@ -1,28 +1,31 @@ #! /usr/bin/env python import argparse -import re -import xml.etree.ElementTree as ET -import zipfile import os import sys - -nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} +from . import docx_file def process_args(): - parser = argparse.ArgumentParser(description='A pure python-based utility ' - 'to extract text and images ' - 'from docx files.') - parser.add_argument("docx", help="path of the docx file") - parser.add_argument('-i', '--img_dir', help='path of directory ' - 'to extract images') + """Parse command line arguments if invoked directly + + Returns: + object -- .img_dir: output directory, .details: get document details + """ + desc = 'A pure Python-based utility to extract data from docx files.' 
+ id_help = 'path of directory to extract images' + ad_help = 'get all document data' + + parser = argparse.ArgumentParser(description=desc) + parser.add_argument('docx', help='path of the docx file') + parser.add_argument('-i', '--img_dir', help=id_help) + parser.add_argument('-d', '--details', help=ad_help, action='store_true') args = parser.parse_args() if not os.path.exists(args.docx): - print('File {} does not exist.'.format(args.docx)) + sys.stderr.write('File {!r} does not exist.'.format(args.docx)) sys.exit(1) if args.img_dir is not None: @@ -30,84 +33,36 @@ def process_args(): try: os.makedirs(args.img_dir) except OSError: - print("Unable to create img_dir {}".format(args.img_dir)) + sys.stderr.write( + 'Unable to create img_dir {!r}'.format(args.img_dir)) sys.exit(1) return args -def qn(tag): - """ - Stands for 'qualified name', a utility function to turn a namespace - prefixed tag name into a Clark-notation qualified tag name for lxml. For - example, ``qn('p:cSld')`` returns ``'{http://schemas.../main}cSld'``. - Source: https://github.com/python-openxml/python-docx/ - """ - prefix, tagroot = tag.split(':') - uri = nsmap[prefix] - return '{{{}}}{}'.format(uri, tagroot) +def process(docx, img_dir=None): + document = docx_file.DocxFile(docx, img_dir) + return document -def xml2text(xml): - """ - A string representing the textual content of this run, with content - child elements like ```` translated to their Python - equivalent. 
- Adapted from: https://github.com/python-openxml/python-docx/ - """ - text = u'' - root = ET.fromstring(xml) - for child in root.iter(): - if child.tag == qn('w:t'): - t_text = child.text - text += t_text if t_text is not None else '' - elif child.tag == qn('w:tab'): - text += '\t' - elif child.tag in (qn('w:br'), qn('w:cr')): - text += '\n' - elif child.tag == qn("w:p"): - text += '\n\n' - return text +def detail_text(prop_name, prop_val): + return '{:10s}: {!r}\n'.format(prop_name, prop_val) -def process(docx, img_dir=None): - text = u'' - - # unzip the docx in memory - zipf = zipfile.ZipFile(docx) - filelist = zipf.namelist() - - # get header text - # there can be 3 header files in the zip - header_xmls = 'word/header[0-9]*.xml' - for fname in filelist: - if re.match(header_xmls, fname): - text += xml2text(zipf.read(fname)) - - # get main text - doc_xml = 'word/document.xml' - text += xml2text(zipf.read(doc_xml)) - - # get footer text - # there can be 3 footer files in the zip - footer_xmls = 'word/footer[0-9]*.xml' - for fname in filelist: - if re.match(footer_xmls, fname): - text += xml2text(zipf.read(fname)) - - if img_dir is not None: - # extract images - for fname in filelist: - _, extension = os.path.splitext(fname) - if extension in [".jpg", ".jpeg", ".png", ".bmp"]: - dst_fname = os.path.join(img_dir, os.path.basename(fname)) - with open(dst_fname, "wb") as dst_f: - dst_f.write(zipf.read(fname)) - - zipf.close() - return text.strip() +def get_output(): + args = process_args() + document = process(args.docx, args.img_dir) + + if args.details: + yield detail_text('path', document.path) + yield detail_text('header', document.header) + yield detail_text('main', document.main) + yield detail_text('footer', document.footer) + yield detail_text('images', document.images) + yield detail_text('properties', document.properties) + else: + yield document.text if __name__ == '__main__': - args = process_args() - text = process(args.docx, args.img_dir) - 
sys.stdout.write(text.encode('utf-8')) + for line in get_output(): + sys.stdout.write(line) diff --git a/docx2txt/docx_file.py b/docx2txt/docx_file.py new file mode 100644 index 0000000..0664a1e --- /dev/null +++ b/docx2txt/docx_file.py @@ -0,0 +1,365 @@ +import errno +import os.path as os_path +import sys +from os import makedirs +from zipfile import ZipFile + +from . import dict_util, xml_util + + +def simplify_rel(attribs): + # type: (dict) -> str + """Simplify Type of a Relationship node + + Arguments: + attribs {dict} -- attributes of Relationship node + + Returns: + str - salient portion of rel type (officeDocument, image, etc.) + """ + attr = attribs.get('Type', '') + + return os_path.basename(attr) + + +def locate_rel(attribs): + # type: (dict) -> str + """Get path to file in Relationship node + + Arguments: + attribs {dict} -- attributes of Relationship node + + Returns: + str -- path of Target within package + """ + attr = attribs.get('Target', '') + + return attr.lstrip('/') + + +def get_package_rels(pkg_xml): + # type: (bytes) -> dict + """Get package relationships + + Arguments: + pkg_xml {bytes} -- top level relationships XML (_rels/.rels) + + Returns: + dict -- property and document paths + """ + rels = xml_util.parse(pkg_xml) + + return { + simplify_rel(rel.attrib): locate_rel(rel.attrib) + for rel + in rels.iter()} + + +def parse_properties(prop_xml): + # type: (bytes) -> dict + """Parse XML for document metadata + + Arguments: + prop_xml {bytes} -- property XML file (docProps XML) + + Returns: + dict -- document metadata + """ + props = xml_util.parse(prop_xml) + + return {xml_util.unquote(prop.tag): prop.text for prop in props.iter()} + + +def is_property_rel(rel_type): + # type: (str) -> bool + """Test for string indicating a property relationship + + Arguments: + rel_type {str} -- relationship type + + Returns: + bool -- relationship is a property + """ + return rel_type.endswith('-properties') + + +def get_package_props(pkg, pkg_rels): + # 
type: (ZipFile, dict) -> dict + """Get all properties of package + + Arguments: + pkg {ZipFile} -- package as ZipFile + pkg_rels {dict} -- all package relationships + + Returns: + dict -- properties of package + """ + prop_dicts = [ + parse_properties(pkg.read(path)) + for path + in dict_util.filter_key(pkg_rels, is_property_rel)] + + return dict_util.merge(prop_dicts) + + +def get_part_rels_path(part_path): + # type: (str) -> str + """Get path to document relationships + + Arguments: + part_path {str} -- path to officeDocument relationship + + Returns: + str -- path to relationships for ``part_path`` + """ + path_comps = [ + os_path.dirname(part_path).lstrip('/'), + '_rels', + os_path.basename(part_path) + '.rels'] + + return '/'.join(path_comps) + + +def get_part_rels(pkg, part_key, part_path): + # type: (ZipFile, str, str) -> dict + """Parse relationships of part (document) + + Arguments: + pkg {zipfile.ZipFile} -- package ZipFile + part_key {str} -- key to store path of officeDocument part + part_path {str} -- path to officeDocument part in package + + Returns: + dict -- dictionary of XML data + """ + base_path = os_path.dirname(part_path).lstrip('/') + rels_path = get_part_rels_path(part_path) + rel_nodes = xml_util.parse(pkg.read(rels_path)) + + rels = {} # type: dict + for rel_node in rel_nodes.iter(): + key = simplify_rel(rel_node.attrib) + path = '/'.join([base_path, rel_node.attrib.get('Target', '')]) + + rels[key] = rels.get(key, []) + [path] + + rels.update({part_key: [part_path]}) + + return rels + + +def get_all_rels(pkg, part_type): + # type: (ZipFile, str) -> tuple + """Get relationships for package and part of ``part_type`` + + Arguments: + pkg {ZipFile} -- package as ZipFile + part_type {str} -- type of 'part' to locate (officeDocument) + + Returns: + tuple -- package relationships, part relationships + """ + pkg_rels = get_package_rels(pkg.read('_rels/.rels')) + part_path = pkg_rels.get(part_type, 'word/document.xml') + part_rels = 
get_part_rels(pkg, part_type, part_path) + + return pkg_rels, part_rels + + +def mkdir_p(path): + # type: (str) -> None + """Recursively create directory at ``path`` + + Arguments: + path {str} -- directory to create + """ + try: + makedirs(path) + except OSError as err: + if err.errno != errno.EEXIST or not os_path.isdir(path): + raise + + +def extract_image(img_bytes, img_dir, fname): + # type: (bytes, str, str) -> str + """Write image data to disk + + Arguments: + img_bytes {bytes} -- image data + img_dir {str} -- output directory + fname {str} -- name of source file + + Returns: + str -- absolute path to extracted image + """ + dst_fname = os_path.join(img_dir, os_path.basename(fname)) + + with open(dst_fname, 'wb') as dst_f: + dst_f.write(img_bytes) + + return os_path.abspath(dst_fname) + + +def xml2text(xml): + """ + A string representing the textual content of this run, with content + child elements like ``<w:tab/>`` translated to their Python + equivalent. + Adapted from: https://github.com/python-openxml/python-docx/ + """ + text = u'' + root = xml_util.parse(xml) + whitespace_tags = { + xml_util.quote('w:tab'): '\t', + xml_util.quote('w:br'): '\n', + xml_util.quote('w:cr'): '\n', + xml_util.quote('w:p'): '\n\n', } + text_tag = xml_util.quote('w:t') + for child in root.iter(): + text += whitespace_tags.get(child.tag, '') + if child.tag == text_tag and child.text is not None: + text += child.text + return text + + +def xml2dict(xml): + # type: (bytes) -> dict + """Get dictionary of values from ``xml`` + + Arguments: + xml {bytes} -- contents of XML file + + Returns: + dict -- dictionary of {node.tagName: node.text} + """ + root = xml_util.parse(xml) + data = { + xml_util.unquote(child.tag): child.text + for child in root.iter()} + return data + + +def read_docx(path, img_dir): + # type: (str, str) -> dict + """Load and parse contents of file at ``path`` + + Arguments: + path {str} -- path to DOCX file + + Keyword Arguments: + img_dir {str} -- save images in 
specified directory + + Returns: + dict -- header, main, footer, images, and properties of DOCX file + """ + HEAD_KEY = 'header' + MAIN_KEY = 'officeDocument' + FOOT_KEY = 'footer' + IMG_KEY = 'image' + + with ZipFile(path) as pkg: + pkg_rels, doc_rels = get_all_rels(pkg, MAIN_KEY) + + text = { + key: ''.join([ + xml2text(pkg.read(fname)) + for fname in doc_rels.get(key, [])]) + for key in [HEAD_KEY, MAIN_KEY, FOOT_KEY]} # type: dict + + images = [] # type: list + if img_dir is None: + images += [ + os_path.basename(fname) + for fname in doc_rels.get(IMG_KEY, [])] + else: + mkdir_p(img_dir) + images += [ + extract_image(pkg.read(fname), img_dir, fname) + for fname in doc_rels.get(IMG_KEY, [])] + + props = get_package_props(pkg, pkg_rels) + + return { + 'header': text.get(HEAD_KEY), + 'main': text.get(MAIN_KEY), + 'footer': text.get(FOOT_KEY), + 'images': images, + 'properties': props, } + + +def get_path(path): + # type: (object) -> str + """Get absolute path to document + + Arguments: + path {str} -- path to DOCX file (nominal) + + Returns: + str -- path to document (absolute) + """ + # simple filesystem path string + try: + return os_path.abspath(str(path)) + except TypeError: + pass + + # TextIOWrapper, addinfourl, HTTPResponse... and more? 
+ for attr in (getattr(path, key) for key in ('name', 'url')): + if attr is not None: + return str(attr) + + return '' + + +class DocxFile(object): + def __init__(self, file, img_dir=None): + doc_data = read_docx(file, img_dir) + + self._path = get_path(file) # type: str + self._img_dir = img_dir # type: str + self._header = doc_data['header'] # type: str + self._main = doc_data['main'] # type: str + self._footer = doc_data['footer'] # type: str + self._images = doc_data['images'] # type: list + self._properties = doc_data['properties'] # type: dict + + def __str__(self): + if sys.version_info[0] < 3: + return self._main.encode('utf-8') + + return self._main + + def __repr__(self): + return 'DocxFile({!r}, {!r})'.format(self._path, self._img_dir) + + @property + def path(self): + return self._path + + @property + def img_dir(self): + return self._img_dir + + @property + def header(self): + return self._header + + @property + def main(self): + return self._main + + @property + def footer(self): + return self._footer + + @property + def images(self): + return self._images + + @property + def properties(self): + return self._properties + + @property + def text(self): + return str(self).strip() diff --git a/docx2txt/xml_util.py b/docx2txt/xml_util.py new file mode 100644 index 0000000..7b4d061 --- /dev/null +++ b/docx2txt/xml_util.py @@ -0,0 +1,37 @@ +"""XML Utilities""" + +import xml.etree.ElementTree as ET + + +def quote(tag): + """ + Turn a namespace + prefixed tag name into a Clark-notation qualified tag name for ElementTree. For + example, ``quote('w:t')`` returns ``'{http://schemas.../main}t'``. + Source: https://github.com/python-openxml/python-docx/ + """ + nsmap = { + 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} + prefix, tagroot = tag.split(':') + uri = nsmap[prefix] + return '{{{}}}{}'.format(uri, tagroot) + + +def unquote(tag): + # type: (str) -> str + """Remove namespace from prefixed tag. 
+ + See: [Python issue 18304](https://bugs.python.org/issue18304) + + Arguments: + tag {str} -- (possibly-)namespaced tag + + Returns: + str -- tag name without namespace + """ + return tag.split('}').pop() + + +def parse(xml_bytes): + # type: (bytes) -> ET.Element + return ET.fromstring(xml_bytes) diff --git a/setup.py b/setup.py index 004158a..212c16a 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,13 @@ import glob +# pylint: disable=no-name-in-module,import-error from distutils.core import setup - # get all of the scripts scripts = glob.glob('bin/*') setup( name='docx2txt', packages=['docx2txt'], - version='0.7', + version='0.8', description='A pure python-based utility to extract text and images ' 'from docx files.', author='Ankush Shah',