diff --git a/README.md b/README.md
index d1b360b..1977d04 100644
--- a/README.md
+++ b/README.md
@@ -1,30 +1,99 @@
-# python-docx2txt #
+# python-docx2txt
-A pure python-based utility to extract text from docx files.
+## Introduction
-The code is taken and adapted from [python-docx](https://github.com/python-openxml/python-docx). It can however also extract text from header, footer and hyperlinks. __It can now also extract images.__
+A pure Python-based utility to extract text from docx files.
-## How to install? ##
-```bash
+The code is taken and adapted from [python-docx](https://github.com/python-openxml/python-docx).
+It can, however, also extract text from headers, footers and hyperlinks.
+__It can now also extract images and properties.__
+
+It can be used as a [Python library](#python-library)
+or from the [command line](#command-line-utility).
+
+## Python Library
+
+### Library Installation
+
+```sh
pip install docx2txt
```
-## How to run? ##
+### Library Usage
-a. From command line:
-```bash
-# extract text
-docx2txt file.docx
-# extract text and images
-docx2txt -i /tmp/img_dir file.docx
+#### Procedural
+
+The library is easy to use procedurally.
+
+```py
+>>> import docx2txt
+>>> # get document text
+>>> docx2txt.process('file.docx')
+'header_textmain_textfooter_text'
+>>> # or
+>>> # get document text, extract images to /tmp/img_dir
+>>> docx2txt.process('file.docx', img_dir='/tmp/img_dir/')
+'header_textmain_textfooter_text'
+```
+
+#### Object Oriented
+
+The DocxFile class provides more granularity.
+Its argument list and accompanying behaviors are identical to `process()`.
+Document properties are stored as a dictionary.
+No keys are guaranteed, so the get() method is recommended.
+
+```py
+>>> import docx2txt
+>>> # parse Word doc
+>>> document = docx2txt.DocxFile('file.docx', img_dir='/tmp/img_dir/')
+>>> # path to file
+>>> document.path
+'/absolute/path/to/file.docx'
+>>> # all document text
+>>> document.text
+'header_textmain_textfooter_text'
+>>> # image directory
+>>> document.img_dir
+'/tmp/img_dir/'
+>>> # text components
+>>> '||'.join([document.header, document.main, document.footer])
+'header_text||main_text||footer_text'
+>>> # images (filename only if not extracted)
+>>> document.images
+['/tmp/img_dir/image1.jpg', '/tmp/img_dir/image2.jpg']
+>>> # document properties
+>>> document.properties
+{'property_name': 'property value', ...}
+>>> document.properties['title']
+'title_text'
+>>> document.properties['nonexistent']
+Traceback (most recent call last):
+  ...
+KeyError: 'nonexistent'
+>>> document.properties.get('nonexistent') is None
+True
```
-b. From python:
-```python
-import docx2txt
-# extract text
-text = docx2txt.process("file.docx")
+## Command Line Utility
+
+### Utility Installation
+
+From the directory that contains this README file, run:
-# extract text and write images in /tmp/img_dir
-text = docx2txt.process("file.docx", "/tmp/img_dir")
+```sh
+python setup.py install
```
+
+### Utility Usage
+
+```sh
+# simple text extraction
+docx2txt file.docx
+# get text, extract images to /tmp/img_dir
+docx2txt -i /tmp/img_dir file.docx
+# get all document data
+docx2txt -d file.docx
+# get all data, extract images to /tmp/img_dir
+docx2txt -d -i /tmp/img_dir file.docx
+# same as previous, more simply:
+docx2txt -di /tmp/img_dir file.docx
+```
\ No newline at end of file
diff --git a/bin/docx2txt b/bin/docx2txt
index 62157c2..992dffc 100755
--- a/bin/docx2txt
+++ b/bin/docx2txt
@@ -4,6 +4,5 @@ import docx2txt
if __name__ == '__main__':
import sys
- args = docx2txt.process_args()
- text = docx2txt.process(args.docx, args.img_dir)
- sys.stdout.write(text.encode('utf-8'))
+ for line in docx2txt.get_output():
+ sys.stdout.write(line)
diff --git a/docx2txt/__init__.py b/docx2txt/__init__.py
index 9f51a73..031c20a 100644
--- a/docx2txt/__init__.py
+++ b/docx2txt/__init__.py
@@ -1,4 +1,4 @@
-from .docx2txt import process
-from .docx2txt import process_args
+from .docx2txt import get_output, process # noqa
+from .docx_file import DocxFile # noqa
-VERSION = '0.7'
+VERSION = '0.8'
diff --git a/docx2txt/dict_util.py b/docx2txt/dict_util.py
new file mode 100644
index 0000000..4616fc1
--- /dev/null
+++ b/docx2txt/dict_util.py
@@ -0,0 +1,18 @@
+"""Dictionary Utilities"""
+
+
+def merge(dicts):
+    # type: (list) -> dict
+    """Combine several dictionaries into one new dictionary
+
+    Arguments:
+        dicts {list} -- dictionaries to combine, applied in order
+
+    Returns:
+        dict -- every key/value pair; on duplicate keys the later
+            dictionary wins
+    """
+    combined = {}  # type: dict
+    for mapping in dicts:
+        combined.update(mapping)
+
+    return combined
+
+
+def filter_key(src_dict, key_test, getter=dict.values):
+    """Keep the entries of ``src_dict`` whose key passes ``key_test``
+
+    Arguments:
+        src_dict {dict} -- dictionary to filter
+        key_test {callable} -- predicate called with each key
+
+    Keyword Arguments:
+        getter {callable} -- applied to the filtered dictionary to build
+            the result (default: {dict.values})
+
+    Returns:
+        object -- whatever ``getter`` returns for the filtered dictionary
+    """
+    kept = {}
+    for key, val in src_dict.items():
+        if key_test(key):
+            kept[key] = val
+
+    return getter(kept)
diff --git a/docx2txt/docx2txt.py b/docx2txt/docx2txt.py
index 0dac072..596f2df 100755
--- a/docx2txt/docx2txt.py
+++ b/docx2txt/docx2txt.py
@@ -1,28 +1,31 @@
#! /usr/bin/env python
import argparse
-import re
-import xml.etree.ElementTree as ET
-import zipfile
import os
import sys
-
-nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
+from . import docx_file
def process_args():
- parser = argparse.ArgumentParser(description='A pure python-based utility '
- 'to extract text and images '
- 'from docx files.')
- parser.add_argument("docx", help="path of the docx file")
- parser.add_argument('-i', '--img_dir', help='path of directory '
- 'to extract images')
+ """Parse command line arguments if invoked directly
+
+ Returns:
+ object -- .img_dir: output directory, .details: get document details
+ """
+ desc = 'A pure Python-based utility to extract data from docx files.'
+ id_help = 'path of directory to extract images'
+ ad_help = 'get all document data'
+
+ parser = argparse.ArgumentParser(description=desc)
+ parser.add_argument('docx', help='path of the docx file')
+ parser.add_argument('-i', '--img_dir', help=id_help)
+ parser.add_argument('-d', '--details', help=ad_help, action='store_true')
args = parser.parse_args()
if not os.path.exists(args.docx):
- print('File {} does not exist.'.format(args.docx))
+ sys.stderr.write('File {!r} does not exist.'.format(args.docx))
sys.exit(1)
if args.img_dir is not None:
@@ -30,84 +33,36 @@ def process_args():
try:
os.makedirs(args.img_dir)
except OSError:
- print("Unable to create img_dir {}".format(args.img_dir))
+ sys.stderr.write(
+ 'Unable to create img_dir {!r}'.format(args.img_dir))
sys.exit(1)
return args
-def qn(tag):
- """
- Stands for 'qualified name', a utility function to turn a namespace
- prefixed tag name into a Clark-notation qualified tag name for lxml. For
- example, ``qn('p:cSld')`` returns ``'{http://schemas.../main}cSld'``.
- Source: https://github.com/python-openxml/python-docx/
- """
- prefix, tagroot = tag.split(':')
- uri = nsmap[prefix]
- return '{{{}}}{}'.format(uri, tagroot)
+def process(docx, img_dir=None):
+    """Extract the text of a DOCX file, optionally saving its images
+
+    Arguments:
+        docx {str} -- DOCX file to read (handed to ``DocxFile``)
+
+    Keyword Arguments:
+        img_dir {str} -- directory to extract images into (default: {None})
+
+    Returns:
+        str -- header, main and footer text joined, with surrounding
+            whitespace stripped (the same contract as version 0.7)
+    """
+    document = docx_file.DocxFile(docx, img_dir)
+    # return text, not the DocxFile object: existing callers of process()
+    # and the README examples expect a string
+    parts = [document.header, document.main, document.footer]
+    return u''.join(parts).strip()
-def xml2text(xml):
- """
- A string representing the textual content of this run, with content
- child elements like ```` translated to their Python
- equivalent.
- Adapted from: https://github.com/python-openxml/python-docx/
- """
- text = u''
- root = ET.fromstring(xml)
- for child in root.iter():
- if child.tag == qn('w:t'):
- t_text = child.text
- text += t_text if t_text is not None else ''
- elif child.tag == qn('w:tab'):
- text += '\t'
- elif child.tag in (qn('w:br'), qn('w:cr')):
- text += '\n'
- elif child.tag == qn("w:p"):
- text += '\n\n'
- return text
+def detail_text(prop_name, prop_val):
+    """Format one ``name: value`` line of the ``--details`` report
+
+    Arguments:
+        prop_name {str} -- label, left-aligned and padded to 10 characters
+        prop_val {object} -- value, rendered with ``repr()``
+
+    Returns:
+        str -- formatted line ending in a newline
+    """
+    return '{:10s}: {!r}\n'.format(prop_name, prop_val)
-def process(docx, img_dir=None):
- text = u''
-
- # unzip the docx in memory
- zipf = zipfile.ZipFile(docx)
- filelist = zipf.namelist()
-
- # get header text
- # there can be 3 header files in the zip
- header_xmls = 'word/header[0-9]*.xml'
- for fname in filelist:
- if re.match(header_xmls, fname):
- text += xml2text(zipf.read(fname))
-
- # get main text
- doc_xml = 'word/document.xml'
- text += xml2text(zipf.read(doc_xml))
-
- # get footer text
- # there can be 3 footer files in the zip
- footer_xmls = 'word/footer[0-9]*.xml'
- for fname in filelist:
- if re.match(footer_xmls, fname):
- text += xml2text(zipf.read(fname))
-
- if img_dir is not None:
- # extract images
- for fname in filelist:
- _, extension = os.path.splitext(fname)
- if extension in [".jpg", ".jpeg", ".png", ".bmp"]:
- dst_fname = os.path.join(img_dir, os.path.basename(fname))
- with open(dst_fname, "wb") as dst_f:
- dst_f.write(zipf.read(fname))
-
- zipf.close()
- return text.strip()
+def get_output():
+    """Yield the command line output for the arguments in ``sys.argv``
+
+    With ``-d/--details`` one formatted line is yielded per document
+    attribute; otherwise the document text is yielded as a single item.
+
+    Yields:
+        str -- chunk of text to write to standard output
+    """
+    args = process_args()
+    # build the DocxFile directly: the report needs its attributes,
+    # while process() only hands back text
+    document = docx_file.DocxFile(args.docx, args.img_dir)
+
+    if args.details:
+        yield detail_text('path', document.path)
+        yield detail_text('header', document.header)
+        yield detail_text('main', document.main)
+        yield detail_text('footer', document.footer)
+        yield detail_text('images', document.images)
+        yield detail_text('properties', document.properties)
+    else:
+        yield document.text
if __name__ == '__main__':
- args = process_args()
- text = process(args.docx, args.img_dir)
- sys.stdout.write(text.encode('utf-8'))
+ for line in get_output():
+ sys.stdout.write(line)
diff --git a/docx2txt/docx_file.py b/docx2txt/docx_file.py
new file mode 100644
index 0000000..0664a1e
--- /dev/null
+++ b/docx2txt/docx_file.py
@@ -0,0 +1,365 @@
+import errno
+import os.path as os_path
+import sys
+from os import makedirs
+from zipfile import ZipFile
+
+from . import dict_util, xml_util
+
+
+def simplify_rel(attribs):
+    # type: (dict) -> str
+    """Reduce the Type URI of a Relationship node to its last segment
+
+    Arguments:
+        attribs {dict} -- attributes of Relationship node
+
+    Returns:
+        str -- final path component of ``Type`` (officeDocument, image,
+            etc.), or an empty string when ``Type`` is absent
+    """
+    type_uri = attribs.get('Type', '')
+    _, last_segment = os_path.split(type_uri)
+
+    return last_segment
+
+
+def locate_rel(attribs):
+    # type: (dict) -> str
+    """Read the Target of a Relationship node as a package-relative path
+
+    Arguments:
+        attribs {dict} -- attributes of Relationship node
+
+    Returns:
+        str -- ``Target`` without leading slashes, or an empty string
+            when ``Target`` is absent
+    """
+    target = attribs['Target'] if 'Target' in attribs else ''
+
+    return target.lstrip('/')
+
+
+def get_package_rels(pkg_xml):
+    # type: (bytes) -> dict
+    """Get package relationships
+
+    Arguments:
+        pkg_xml {bytes} -- top level relationships XML (_rels/.rels)
+
+    Returns:
+        dict -- simplified relationship type mapped to the target path
+            inside the package
+    """
+    rels = xml_util.parse(pkg_xml)
+
+    # iter() also yields the root <Relationships> element, which has no
+    # Type/Target and would otherwise add a meaningless '' -> '' entry
+    return {
+        simplify_rel(rel.attrib): locate_rel(rel.attrib)
+        for rel
+        in rels.iter()
+        if 'Type' in rel.attrib}
+
+
+def parse_properties(prop_xml):
+    # type: (bytes) -> dict
+    """Collect document metadata from a property XML file
+
+    Arguments:
+        prop_xml {bytes} -- property XML file (docProps XML)
+
+    Returns:
+        dict -- text of every element, keyed by its tag name without
+            namespace
+    """
+    metadata = {}
+    for node in xml_util.parse(prop_xml).iter():
+        metadata[xml_util.unquote(node.tag)] = node.text
+
+    return metadata
+
+
+def is_property_rel(rel_type):
+    # type: (str) -> bool
+    """Tell whether a relationship type names a property part
+
+    Arguments:
+        rel_type {str} -- relationship type
+
+    Returns:
+        bool -- ``True`` when ``rel_type`` ends with ``-properties``
+    """
+    property_suffix = '-properties'
+
+    return rel_type.endswith(property_suffix)
+
+
+def get_package_props(pkg, pkg_rels):
+    # type: (ZipFile, dict) -> dict
+    """Read and merge every property part of the package
+
+    Arguments:
+        pkg {ZipFile} -- package as ZipFile
+        pkg_rels {dict} -- all package relationships
+
+    Returns:
+        dict -- properties of package, merged across all property parts
+    """
+    # paths of the relationships whose type marks them as properties
+    prop_paths = dict_util.filter_key(pkg_rels, is_property_rel)
+
+    parsed = []
+    for path in prop_paths:
+        parsed.append(parse_properties(pkg.read(path)))
+
+    return dict_util.merge(parsed)
+
+
+def get_part_rels_path(part_path):
+    # type: (str) -> str
+    """Get path to document relationships
+
+    Arguments:
+        part_path {str} -- path to officeDocument relationship
+
+    Returns:
+        str -- path to relationships for ``part_path``
+    """
+    path_comps = [
+        os_path.dirname(part_path).lstrip('/'),
+        '_rels',
+        os_path.basename(part_path) + '.rels']
+
+    # a part stored at the package root has an empty directory component;
+    # dropping it avoids a leading '/' that no ZIP member name has
+    return '/'.join(comp for comp in path_comps if comp)
+
+
+def get_part_rels(pkg, part_key, part_path):
+    # type: (ZipFile, str, str) -> dict
+    """Parse relationships of part (document)
+
+    Arguments:
+        pkg {zipfile.ZipFile} -- package ZipFile
+        part_key {str} -- key to store path of officeDocument part
+        part_path {str} -- path to officeDocument part in package
+
+    Returns:
+        dict -- simplified relationship type mapped to a list of paths
+            inside the package; ``part_key`` maps to ``[part_path]``
+    """
+    import posixpath  # ZIP member names use '/' on every platform
+
+    base_path = os_path.dirname(part_path).lstrip('/')
+    rels_path = get_part_rels_path(part_path)
+
+    try:
+        rels_xml = pkg.read(rels_path)
+    except KeyError:
+        # the part has no relationships file: only the part itself is known
+        return {part_key: [part_path]}
+
+    rels = {}  # type: dict
+    for rel_node in xml_util.parse(rels_xml).iter():
+        target = rel_node.attrib.get('Target')
+        if target is None:
+            # the root <Relationships> element carries no Target
+            continue
+        if rel_node.attrib.get('TargetMode') == 'External':
+            # e.g. a hyperlink URL: not a file inside the package
+            continue
+
+        if target.startswith('/'):
+            # already relative to the package root
+            path = target.lstrip('/')
+        else:
+            # relative to the part's directory; collapse any '..' segments
+            path = posixpath.normpath(posixpath.join(base_path, target))
+
+        rels.setdefault(simplify_rel(rel_node.attrib), []).append(path)
+
+    rels[part_key] = [part_path]
+
+    return rels
+
+
+def get_all_rels(pkg, part_type):
+    # type: (ZipFile, str) -> tuple
+    """Collect the relationships of the package and of its main part
+
+    Arguments:
+        pkg {ZipFile} -- package as ZipFile
+        part_type {str} -- type of 'part' to locate (officeDocument)
+
+    Returns:
+        tuple -- package relationships, part relationships
+    """
+    package_rels = get_package_rels(pkg.read('_rels/.rels'))
+
+    # fall back to the conventional location when the package does not
+    # declare a part of the requested type
+    if part_type in package_rels:
+        main_part = package_rels[part_type]
+    else:
+        main_part = 'word/document.xml'
+
+    return package_rels, get_part_rels(pkg, part_type, main_part)
+
+
+def mkdir_p(path):
+    # type: (str) -> None
+    """Create directory ``path`` and any missing parents
+
+    An already existing directory is not an error; any other failure is
+    re-raised.
+
+    Arguments:
+        path {str} -- directory to create
+    """
+    try:
+        makedirs(path)
+    except OSError as err:
+        already_there = err.errno == errno.EEXIST and os_path.isdir(path)
+        if not already_there:
+            raise
+
+
+def extract_image(img_bytes, img_dir, fname):
+    # type: (bytes, str, str) -> str
+    """Save image data under ``img_dir``, named after the source file
+
+    Arguments:
+        img_bytes {bytes} -- image data
+        img_dir {str} -- output directory
+        fname {str} -- name of source file; only its base name is used
+
+    Returns:
+        str -- absolute path to extracted image
+    """
+    out_name = os_path.basename(fname)
+    out_path = os_path.join(img_dir, out_name)
+
+    with open(out_path, 'wb') as out_file:
+        out_file.write(img_bytes)
+
+    return os_path.abspath(out_path)
+
+
+def xml2text(xml):
+    """
+    Return the textual content of a WordprocessingML part as one string.
+    Text elements (``<w:t>``) contribute their text; ``<w:tab/>`` becomes
+    a tab, ``<w:br/>`` and ``<w:cr/>`` a newline, and every ``<w:p>``
+    contributes two newlines.
+    Adapted from: https://github.com/python-openxml/python-docx/
+    """
+    run_tag = xml_util.quote('w:t')
+    replacements = {
+        xml_util.quote('w:tab'): '\t',
+        xml_util.quote('w:br'): '\n',
+        xml_util.quote('w:cr'): '\n',
+        xml_util.quote('w:p'): '\n\n', }
+
+    pieces = [u'']
+    for node in xml_util.parse(xml).iter():
+        if node.tag in replacements:
+            pieces.append(replacements[node.tag])
+        elif node.tag == run_tag and node.text is not None:
+            pieces.append(node.text)
+    return u''.join(pieces)
+
+
+def xml2dict(xml):
+    # type: (bytes) -> dict
+    """Map every element of ``xml`` to its text
+
+    Arguments:
+        xml {bytes} -- contents of XML file
+
+    Returns:
+        dict -- dictionary of {tag name without namespace: node.text}
+    """
+    values = {}
+    for node in xml_util.parse(xml).iter():
+        values[xml_util.unquote(node.tag)] = node.text
+    return values
+
+
+def read_docx(path, img_dir):
+    # type: (str, str) -> dict
+    """Load and parse contents of file at ``path``
+
+    Arguments:
+        path {str} -- path to DOCX file
+        img_dir {str} -- save images in specified directory; when ``None``
+            nothing is written and only image file names are collected
+
+    Returns:
+        dict -- header, main, footer, images, and properties of DOCX file
+    """
+    # simplified relationship types used as keys of the part relationships
+    HEAD_KEY = 'header'
+    MAIN_KEY = 'officeDocument'
+    FOOT_KEY = 'footer'
+    IMG_KEY = 'image'
+
+    with ZipFile(path) as pkg:
+        pkg_rels, doc_rels = get_all_rels(pkg, MAIN_KEY)
+
+        # one string per section: the text of every part of that type,
+        # concatenated; a section with no parts yields ''
+        text = {
+            key: ''.join([
+                xml2text(pkg.read(fname))
+                for fname in doc_rels.get(key, [])])
+            for key in [HEAD_KEY, MAIN_KEY, FOOT_KEY]}  # type: dict
+
+        images = []  # type: list
+        if img_dir is None:
+            # no extraction requested: report the image file names only
+            images += [
+                os_path.basename(fname)
+                for fname in doc_rels.get(IMG_KEY, [])]
+        else:
+            # write each image to img_dir and report its absolute path
+            mkdir_p(img_dir)
+            images += [
+                extract_image(pkg.read(fname), img_dir, fname)
+                for fname in doc_rels.get(IMG_KEY, [])]
+
+        props = get_package_props(pkg, pkg_rels)
+
+    return {
+        'header': text.get(HEAD_KEY),
+        'main': text.get(MAIN_KEY),
+        'footer': text.get(FOOT_KEY),
+        'images': images,
+        'properties': props, }
+
+
+def get_path(path):
+    # type: (object) -> str
+    """Get absolute path to document
+
+    Arguments:
+        path {object} -- path to DOCX file, or a file-like/response object
+
+    Returns:
+        str -- absolute path for a filesystem path; otherwise the
+            object's ``name`` or ``url`` attribute as a string; an empty
+            string when neither is available
+    """
+    # simple filesystem path string (str and, on Python 2, unicode)
+    if isinstance(path, (type(''), type(u''))):
+        return os_path.abspath(path)
+
+    # path objects such as pathlib.Path
+    if hasattr(path, '__fspath__'):
+        return os_path.abspath(str(path))
+
+    # TextIOWrapper, addinfourl, HTTPResponse... and more?
+    # str() succeeds on any object, so these must be probed explicitly;
+    # a default of None keeps a missing attribute from raising
+    for key in ('name', 'url'):
+        attr = getattr(path, key, None)
+        if attr is not None:
+            return str(attr)
+
+    return ''
+
+
+class DocxFile(object):
+    """Parsed contents of a DOCX file
+
+    The file is read once, in the constructor; the results are exposed
+    through read-only properties.
+
+    Arguments:
+        file {str} -- DOCX file to read (handed to ``read_docx``)
+
+    Keyword Arguments:
+        img_dir {str} -- directory to extract images into (default: {None})
+    """
+
+    def __init__(self, file, img_dir=None):
+        doc_data = read_docx(file, img_dir)
+
+        self._path = get_path(file)  # type: str
+        self._img_dir = img_dir  # type: str
+        self._header = doc_data['header']  # type: str
+        self._main = doc_data['main']  # type: str
+        self._footer = doc_data['footer']  # type: str
+        self._images = doc_data['images']  # type: list
+        self._properties = doc_data['properties']  # type: dict
+
+    def __str__(self):
+        # Python 2 requires __str__ to return a byte string
+        if sys.version_info[0] < 3:
+            return self._main.encode('utf-8')
+
+        return self._main
+
+    def __repr__(self):
+        return 'DocxFile({!r}, {!r})'.format(self._path, self._img_dir)
+
+    @property
+    def path(self):
+        """Path of the document as resolved by ``get_path``"""
+        return self._path
+
+    @property
+    def img_dir(self):
+        """Image directory exactly as passed to the constructor"""
+        return self._img_dir
+
+    @property
+    def header(self):
+        """Text of the header parts"""
+        return self._header
+
+    @property
+    def main(self):
+        """Text of the main document part"""
+        return self._main
+
+    @property
+    def footer(self):
+        """Text of the footer parts"""
+        return self._footer
+
+    @property
+    def images(self):
+        """Image entries collected by ``read_docx``"""
+        return self._images
+
+    @property
+    def properties(self):
+        """Document properties as a dictionary"""
+        return self._properties
+
+    @property
+    def text(self):
+        """Header, main and footer text joined, outer whitespace stripped"""
+        full_text = u''.join([self._header, self._main, self._footer])
+        if sys.version_info[0] < 3:
+            # same convention as __str__: a UTF-8 byte string on Python 2
+            full_text = full_text.encode('utf-8')
+
+        return full_text.strip()
diff --git a/docx2txt/xml_util.py b/docx2txt/xml_util.py
new file mode 100644
index 0000000..7b4d061
--- /dev/null
+++ b/docx2txt/xml_util.py
@@ -0,0 +1,37 @@
+"""XML Utilities"""
+
+import xml.etree.ElementTree as ET
+
+
+def quote(tag):
+    """
+    Turn a namespace prefixed tag name into a Clark-notation qualified tag
+    name. For example, ``quote('w:t')`` returns
+    ``'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t'``.
+    Only the ``w`` prefix is known; any other prefix raises ``KeyError``.
+    Source: https://github.com/python-openxml/python-docx/
+    """
+    namespaces = {
+        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
+    prefix, local_name = tag.split(':')
+    return '{' + namespaces[prefix] + '}' + local_name
+
+
+def unquote(tag):
+    # type: (str) -> str
+    """Strip the Clark-notation namespace from a tag name.
+
+    See: [Python issue 18304](https://bugs.python.org/issue18304)
+
+    Arguments:
+        tag {str} -- (possibly-)namespaced tag
+
+    Returns:
+        str -- everything after the last ``}``; the whole tag when it
+            has no namespace
+    """
+    _, _, local_name = tag.rpartition('}')
+    return local_name
+
+
+def parse(xml_bytes):
+    # type: (bytes) -> ET.Element
+    """Parse an XML document held in memory
+
+    Arguments:
+        xml_bytes {bytes} -- contents of XML file
+
+    Returns:
+        ET.Element -- root element of the parsed document
+    """
+    root = ET.fromstring(xml_bytes)
+    return root
diff --git a/setup.py b/setup.py
index 004158a..212c16a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,13 +1,13 @@
import glob
+# pylint: disable=no-name-in-module,import-error
from distutils.core import setup
-
# get all of the scripts
scripts = glob.glob('bin/*')
setup(
name='docx2txt',
packages=['docx2txt'],
- version='0.7',
+ version='0.8',
description='A pure python-based utility to extract text and images '
'from docx files.',
author='Ankush Shah',