From a54c8592f1fccab083064b5ce254867a2b6cd69d Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 17 Aug 2018 13:04:31 -0700 Subject: [PATCH 01/22] clear linter messages --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 004158a..4929689 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ import glob +# pylint: disable=no-name-in-module,import-error from distutils.core import setup - # get all of the scripts scripts = glob.glob('bin/*') From b1ffc88bf986052a78e69016a77aa5b34d4d5f77 Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 17 Aug 2018 13:24:49 -0700 Subject: [PATCH 02/22] write errors to stderr add -d/--details flag for all document data reduce calls to qn() in xml2text add DocxFile class for object-type access add get_output() for script invocation --- docx2txt/docx2txt.py | 278 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 226 insertions(+), 52 deletions(-) diff --git a/docx2txt/docx2txt.py b/docx2txt/docx2txt.py index 0dac072..04b496c 100755 --- a/docx2txt/docx2txt.py +++ b/docx2txt/docx2txt.py @@ -1,28 +1,32 @@ #! /usr/bin/env python import argparse +import os import re +import sys import xml.etree.ElementTree as ET import zipfile -import os -import sys -nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} +def process_args(): + """Parse command line arguments if invoked directly + Returns: + object -- .img_dir: output directory, .details: get document details + """ + desc = 'A pure Python-based utility to extract data from docx files.' + id_help = 'path of directory to extract images' + ad_help = 'get all document data' -def process_args(): - parser = argparse.ArgumentParser(description='A pure python-based utility ' - 'to extract text and images ' - 'from docx files.') - parser.add_argument("docx", help="path of the docx file") - parser.add_argument('-i', '--img_dir', help='path of directory ' - 'to extract images') + parser = argparse.ArgumentParser(description=desc) + parser.add_argument('docx', help='path of the docx file') + parser.add_argument('-i', '--img_dir', help=id_help) + parser.add_argument('-d', '--details', help=ad_help, action='store_true') args = parser.parse_args() if not os.path.exists(args.docx): - print('File {} does not exist.'.format(args.docx)) + sys.stderr.write('File {!r} does not exist.'.format(args.docx)) sys.exit(1) if args.img_dir is not None: @@ -30,11 +34,102 @@ def process_args(): try: os.makedirs(args.img_dir) except OSError: - print("Unable to create img_dir {}".format(args.img_dir)) + sys.stderr.write( + 'Unable to create img_dir {!r}'.format(args.img_dir)) sys.exit(1) return args +def get_rel_key(attrib): + # type: (dict) -> str + """Get dictionary key for XML node + + Arguments: + attrib {dict} -- relationship node attributes + + Returns: + str -- simplified key name + """ + node_type = attrib.get('Type', '') + key = str(re.sub(r'.+[\/\-]+', '', node_type)) + return key + + +def get_rel_path(parent, attrib): + # type: (str, dict) -> str + """Get path to relationship in REL file + + Arguments: + parent {str} -- parent directory of relationship + attrib {dict} -- relationship node attributes + + Returns: + str -- full path to relationship + """ + target = attrib.get('Target', '') + path = (parent + target).lstrip('/') + + return path + + +def load_rels(xml, fname): + # type: (bytes, str) -> dict + """Parse document REL file + + Arguments: + xml {bytes} -- contents of XML file + fname {str} -- path to XML file + + Returns: + dict -- dictionary of XML data + """ + root = ET.fromstring(xml) + base_path = str(re.sub(r'_rels/.+', '', fname)) + data = {} # type: dict + + for node in root.iter(): + key = get_rel_key(node.attrib) + path = get_rel_path(base_path, node.attrib) + data[key] = data.get(key, []) + [path] + + return data + + +def extract_image(img_bytes, img_dir, fname): + # type: (bytes, str, str) -> str + """Write image data to disk + + Arguments: + img_bytes {bytes} -- image data + img_dir {str} -- output directory + fname {str} -- name of source file + + Returns: + str -- absolute path to extracted image + """ + dst_fname = os.path.join(img_dir, os.path.basename(fname)) + + with open(dst_fname, 'wb') as dst_f: + dst_f.write(img_bytes) + + return os.path.abspath(dst_fname) + + +def un(tag): + # type: (str) -> str + """Stands for 'unqualified name'. Removes namespace from prefixed tag. + + See: [Python issue 18304](https://bugs.python.org/issue18304) + + Arguments: + tag {str} -- (possibly-)namespaced tag + + Returns: + str -- tag name without namespace + """ + return tag.split('}').pop() + + def qn(tag): """ Stands for 'qualified name', a utility function to turn a namespace @@ -42,6 +137,8 @@ def qn(tag): example, ``qn('p:cSld')`` returns ``'{http://schemas.../main}cSld'``. Source: https://github.com/python-openxml/python-docx/ """ + nsmap = { + 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} prefix, tagroot = tag.split(':') uri = nsmap[prefix] return '{{{}}}{}'.format(uri, tagroot) @@ -56,58 +153,135 @@ def xml2text(xml): """ text = u'' root = ET.fromstring(xml) + whitespace_tags = { + qn('w:tab'): '\t', + qn('w:br'): '\n', + qn('w:cr'): '\n', + qn('w:p'): '\n\n', } + text_tag = qn('w:t') for child in root.iter(): - if child.tag == qn('w:t'): - t_text = child.text - text += t_text if t_text is not None else '' - elif child.tag == qn('w:tab'): - text += '\t' - elif child.tag in (qn('w:br'), qn('w:cr')): - text += '\n' - elif child.tag == qn("w:p"): - text += '\n\n' + text += whitespace_tags.get(child.tag, '') + if child.tag == text_tag and child.text is not None: + text += child.text return text -def process(docx, img_dir=None): - text = u'' +def xml2dict(xml): + # type: (bytes) -> dict + """Get dictionary of values from ``xml`` + + Arguments: + xml {bytes} -- contents of XML file - # unzip the docx in memory - zipf = zipfile.ZipFile(docx) - filelist = zipf.namelist() + Returns: + dict -- dictionary of {node.tagName: node.text} + """ + root = ET.fromstring(xml) + data = { + un(child.tag): child.text + for child in root.iter()} + return data - # get header text - # there can be 3 header files in the zip - header_xmls = 'word/header[0-9]*.xml' - for fname in filelist: - if re.match(header_xmls, fname): - text += xml2text(zipf.read(fname)) - # get main text - doc_xml = 'word/document.xml' - text += xml2text(zipf.read(doc_xml)) +def parse_docx(path, img_dir): + # type: (str, str) -> dict + """Load and parse contents of file at ``path`` - # get footer text - # there can be 3 footer files in the zip - footer_xmls = 'word/footer[0-9]*.xml' - for fname in filelist: - if re.match(footer_xmls, fname): - text += xml2text(zipf.read(fname)) + Arguments: + path {str} -- path to DOCX file + + Keyword Arguments: + img_dir {str} -- save images in specififed directory (default: {None}) + + Returns: + dict -- header, main, footer, images, and properties of DOCX file + """ + TEXT_KEYS = ['header', 'officeDocument', 'footer'] + PROP_KEY = 'properties' + IMG_KEY = 'image' + + zipf = zipfile.ZipFile(path) + paths = {} + for fname in ['_rels/.rels', 'word/_rels/document.xml.rels']: + paths.update(load_rels(zipf.read(fname), fname)) + + doc_data = {IMG_KEY: paths[IMG_KEY], PROP_KEY: {}} + doc_data.update({ + key: ''.join([ + xml2text(zipf.read(fname)) + for fname in paths.get(key, [])]) + for key in TEXT_KEYS}) if img_dir is not None: - # extract images - for fname in filelist: - _, extension = os.path.splitext(fname) - if extension in [".jpg", ".jpeg", ".png", ".bmp"]: - dst_fname = os.path.join(img_dir, os.path.basename(fname)) - with open(dst_fname, "wb") as dst_f: - dst_f.write(zipf.read(fname)) + doc_data[IMG_KEY] = [ + extract_image(zipf.read(fname), img_dir, fname) + for fname in paths[IMG_KEY]] + + for fname in paths[PROP_KEY]: + doc_data[PROP_KEY].update(xml2dict(zipf.read(fname))) zipf.close() - return text.strip() + return { + 'header': doc_data[TEXT_KEYS[0]], + 'main': doc_data[TEXT_KEYS[1]], + 'footer': doc_data[TEXT_KEYS[2]], + 'images': doc_data[IMG_KEY], + PROP_KEY: doc_data[PROP_KEY], } -if __name__ == '__main__': + +class DocxFile(object): + def __init__(self, path, img_dir=None): + doc_data = parse_docx(path, img_dir) + + self.path = os.path.abspath(path) # type: str + self.img_dir = img_dir # type: str + self.header = doc_data['header'] # type: str + self.main = doc_data['main'] # type: str + self.footer = doc_data['footer'] # type: str + self.images = doc_data['images'] # type: list + self.properties = doc_data['properties'] # type: dict + + def __str__(self): + str_val = ''.join([self.header, self.main, self.footer]) + + if sys.version_info[0] < 3: + return str_val.encode('utf-8') + + return str_val + + def __repr__(self): + return 'DocxFile({!r}, {!r})'.format(self.path, self.img_dir) + + def __getattr__(self, attr_name): + if attr_name == 'text': + return str(self).strip() + + +def process(docx, img_dir=None): + document = DocxFile(docx, img_dir) + return document + + +def detail_text(prop_name, prop_val): + return '{:10s}: {!r}\n'.format(prop_name, prop_val) + + +def get_output(): args = process_args() - text = process(args.docx, args.img_dir) - sys.stdout.write(text.encode('utf-8')) + document = process(args.docx, args.img_dir) + + if args.details: + yield detail_text('path', document.path) + yield detail_text('header', document.header) + yield detail_text('main', document.main) + yield detail_text('footer', document.footer) + yield detail_text('images', document.images) + yield detail_text('properties', document.properties) + else: + yield document.text + + +if __name__ == '__main__': + for line in get_output(): + sys.stdout.write(line) From ecf5d245df85e8a708980a3f1663351cd96e35bb Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 17 Aug 2018 13:25:00 -0700 Subject: [PATCH 03/22] use get_output for invocation --- bin/docx2txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/docx2txt b/bin/docx2txt index 62157c2..992dffc 100755 --- a/bin/docx2txt +++ b/bin/docx2txt @@ -4,6 +4,5 @@ import docx2txt if __name__ == '__main__': import sys - args = docx2txt.process_args() - text = docx2txt.process(args.docx, args.img_dir) - sys.stdout.write(text.encode('utf-8')) + for line in docx2txt.get_output(): + sys.stdout.write(line) From bbc35a6cc682bdccd5267af44f42bcb37aa779e6 Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 17 Aug 2018 13:25:31 -0700 Subject: [PATCH 04/22] export get_output() for script invocation increment version no. --- docx2txt/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docx2txt/__init__.py b/docx2txt/__init__.py index 9f51a73..b5cbd4f 100644 --- a/docx2txt/__init__.py +++ b/docx2txt/__init__.py @@ -1,4 +1,3 @@ -from .docx2txt import process -from .docx2txt import process_args +from .docx2txt import get_output, process # noqa -VERSION = '0.7' +VERSION = '0.8' From 2cb5a30dee871a116957d6df354a4f7e2381ea2f Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 17 Aug 2018 13:35:17 -0700 Subject: [PATCH 05/22] refactor DocxFile --- docx2txt/docx2txt.py | 225 +----------------------------------------- docx2txt/docx_file.py | 223 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 226 insertions(+), 222 deletions(-) create mode 100644 docx2txt/docx_file.py diff --git a/docx2txt/docx2txt.py b/docx2txt/docx2txt.py index 04b496c..596f2df 100755 --- a/docx2txt/docx2txt.py +++ b/docx2txt/docx2txt.py @@ -2,10 +2,9 @@ import argparse import os -import re import sys -import xml.etree.ElementTree as ET -import zipfile + +from . import docx_file def process_args(): @@ -40,226 +39,8 @@ def process_args(): return args -def get_rel_key(attrib): - # type: (dict) -> str - """Get dictionary key for XML node - - Arguments: - attrib {dict} -- relationship node attributes - - Returns: - str -- simplified key name - """ - node_type = attrib.get('Type', '') - key = str(re.sub(r'.+[\/\-]+', '', node_type)) - return key - - -def get_rel_path(parent, attrib): - # type: (str, dict) -> str - """Get path to relationship in REL file - - Arguments: - parent {str} -- parent directory of relationship - attrib {dict} -- relationship node attributes - - Returns: - str -- full path to relationship - """ - target = attrib.get('Target', '') - path = (parent + target).lstrip('/') - - return path - - -def load_rels(xml, fname): - # type: (bytes, str) -> dict - """Parse document REL file - - Arguments: - xml {bytes} -- contents of XML file - fname {str} -- path to XML file - - Returns: - dict -- dictionary of XML data - """ - root = ET.fromstring(xml) - base_path = str(re.sub(r'_rels/.+', '', fname)) - data = {} # type: dict - - for node in root.iter(): - key = get_rel_key(node.attrib) - path = get_rel_path(base_path, node.attrib) - data[key] = data.get(key, []) + [path] - - return data - - -def extract_image(img_bytes, img_dir, fname): - # type: (bytes, str, str) -> str - """Write image data to disk - - Arguments: - img_bytes {bytes} -- image data - img_dir {str} -- output directory - fname {str} -- name of source file - - Returns: - str -- absolute path to extracted image - """ - dst_fname = os.path.join(img_dir, os.path.basename(fname)) - - with open(dst_fname, 'wb') as dst_f: - dst_f.write(img_bytes) - - return os.path.abspath(dst_fname) - - -def un(tag): - # type: (str) -> str - """Stands for 'unqualified name'. Removes namespace from prefixed tag. - - See: [Python issue 18304](https://bugs.python.org/issue18304) - - Arguments: - tag {str} -- (possibly-)namespaced tag - - Returns: - str -- tag name without namespace - """ - return tag.split('}').pop() - - -def qn(tag): - """ - Stands for 'qualified name', a utility function to turn a namespace - prefixed tag name into a Clark-notation qualified tag name for lxml. For - example, ``qn('p:cSld')`` returns ``'{http://schemas.../main}cSld'``. - Source: https://github.com/python-openxml/python-docx/ - """ - nsmap = { - 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} - prefix, tagroot = tag.split(':') - uri = nsmap[prefix] - return '{{{}}}{}'.format(uri, tagroot) - - -def xml2text(xml): - """ - A string representing the textual content of this run, with content - child elements like ```` translated to their Python - equivalent. - Adapted from: https://github.com/python-openxml/python-docx/ - """ - text = u'' - root = ET.fromstring(xml) - whitespace_tags = { - qn('w:tab'): '\t', - qn('w:br'): '\n', - qn('w:cr'): '\n', - qn('w:p'): '\n\n', } - text_tag = qn('w:t') - for child in root.iter(): - text += whitespace_tags.get(child.tag, '') - if child.tag == text_tag and child.text is not None: - text += child.text - return text - - -def xml2dict(xml): - # type: (bytes) -> dict - """Get dictionary of values from ``xml`` - - Arguments: - xml {bytes} -- contents of XML file - - Returns: - dict -- dictionary of {node.tagName: node.text} - """ - root = ET.fromstring(xml) - data = { - un(child.tag): child.text - for child in root.iter()} - return data - - -def parse_docx(path, img_dir): - # type: (str, str) -> dict - """Load and parse contents of file at ``path`` - - Arguments: - path {str} -- path to DOCX file - - Keyword Arguments: - img_dir {str} -- save images in specififed directory (default: {None}) - - Returns: - dict -- header, main, footer, images, and properties of DOCX file - """ - TEXT_KEYS = ['header', 'officeDocument', 'footer'] - PROP_KEY = 'properties' - IMG_KEY = 'image' - - zipf = zipfile.ZipFile(path) - paths = {} - for fname in ['_rels/.rels', 'word/_rels/document.xml.rels']: - paths.update(load_rels(zipf.read(fname), fname)) - - doc_data = {IMG_KEY: paths[IMG_KEY], PROP_KEY: {}} - doc_data.update({ - key: ''.join([ - xml2text(zipf.read(fname)) - for fname in paths.get(key, [])]) - for key in TEXT_KEYS}) - - if img_dir is not None: - doc_data[IMG_KEY] = [ - extract_image(zipf.read(fname), img_dir, fname) - for fname in paths[IMG_KEY]] - - for fname in paths[PROP_KEY]: - doc_data[PROP_KEY].update(xml2dict(zipf.read(fname))) - - zipf.close() - - return { - 'header': doc_data[TEXT_KEYS[0]], - 'main': doc_data[TEXT_KEYS[1]], - 'footer': doc_data[TEXT_KEYS[2]], - 'images': doc_data[IMG_KEY], - PROP_KEY: doc_data[PROP_KEY], } - - -class DocxFile(object): - def __init__(self, path, img_dir=None): - doc_data = parse_docx(path, img_dir) - - self.path = os.path.abspath(path) # type: str - self.img_dir = img_dir # type: str - self.header = doc_data['header'] # type: str - self.main = doc_data['main'] # type: str - self.footer = doc_data['footer'] # type: str - self.images = doc_data['images'] # type: list - self.properties = doc_data['properties'] # type: dict - - def __str__(self): - str_val = ''.join([self.header, self.main, self.footer]) - - if sys.version_info[0] < 3: - return str_val.encode('utf-8') - - return str_val - - def __repr__(self): - return 'DocxFile({!r}, {!r})'.format(self.path, self.img_dir) - - def __getattr__(self, attr_name): - if attr_name == 'text': - return str(self).strip() - - def process(docx, img_dir=None): - document = DocxFile(docx, img_dir) + document = docx_file.DocxFile(docx, img_dir) return document diff --git a/docx2txt/docx_file.py b/docx2txt/docx_file.py new file mode 100644 index 0000000..e7df6e0 --- /dev/null +++ b/docx2txt/docx_file.py @@ -0,0 +1,223 @@ +import os +import re +import sys +import xml.etree.ElementTree as ET +import zipfile + + +def get_rel_key(attrib): + # type: (dict) -> str + """Get dictionary key for XML node + + Arguments: + attrib {dict} -- relationship node attributes + + Returns: + str -- simplified key name + """ + node_type = attrib.get('Type', '') + key = str(re.sub(r'.+[\/\-]+', '', node_type)) + return key + + +def get_rel_path(parent, attrib): + # type: (str, dict) -> str + """Get path to relationship in REL file + + Arguments: + parent {str} -- parent directory of relationship + attrib {dict} -- relationship node attributes + + Returns: + str -- full path to relationship + """ + target = attrib.get('Target', '') + path = (parent + target).lstrip('/') + + return path + + +def load_rels(xml, fname): + # type: (bytes, str) -> dict + """Parse document REL file + + Arguments: + xml {bytes} -- contents of XML file + fname {str} -- path to XML file + + Returns: + dict -- dictionary of XML data + """ + root = ET.fromstring(xml) + base_path = str(re.sub(r'_rels/.+', '', fname)) + data = {} # type: dict + + for node in root.iter(): + key = get_rel_key(node.attrib) + path = get_rel_path(base_path, node.attrib) + data[key] = data.get(key, []) + [path] + + return data + + +def extract_image(img_bytes, img_dir, fname): + # type: (bytes, str, str) -> str + """Write image data to disk + + Arguments: + img_bytes {bytes} -- image data + img_dir {str} -- output directory + fname {str} -- name of source file + + Returns: + str -- absolute path to extracted image + """ + dst_fname = os.path.join(img_dir, os.path.basename(fname)) + + with open(dst_fname, 'wb') as dst_f: + dst_f.write(img_bytes) + + return os.path.abspath(dst_fname) + + +def un(tag): + # type: (str) -> str + """Stands for 'unqualified name'. Removes namespace from prefixed tag. + + See: [Python issue 18304](https://bugs.python.org/issue18304) + + Arguments: + tag {str} -- (possibly-)namespaced tag + + Returns: + str -- tag name without namespace + """ + return tag.split('}').pop() + + +def qn(tag): + """ + Stands for 'qualified name', a utility function to turn a namespace + prefixed tag name into a Clark-notation qualified tag name for lxml. For + example, ``qn('p:cSld')`` returns ``'{http://schemas.../main}cSld'``. + Source: https://github.com/python-openxml/python-docx/ + """ + nsmap = { + 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} + prefix, tagroot = tag.split(':') + uri = nsmap[prefix] + return '{{{}}}{}'.format(uri, tagroot) + + +def xml2text(xml): + """ + A string representing the textual content of this run, with content + child elements like ```` translated to their Python + equivalent. + Adapted from: https://github.com/python-openxml/python-docx/ + """ + text = u'' + root = ET.fromstring(xml) + whitespace_tags = { + qn('w:tab'): '\t', + qn('w:br'): '\n', + qn('w:cr'): '\n', + qn('w:p'): '\n\n', } + text_tag = qn('w:t') + for child in root.iter(): + text += whitespace_tags.get(child.tag, '') + if child.tag == text_tag and child.text is not None: + text += child.text + return text + + +def xml2dict(xml): + # type: (bytes) -> dict + """Get dictionary of values from ``xml`` + + Arguments: + xml {bytes} -- contents of XML file + + Returns: + dict -- dictionary of {node.tagName: node.text} + """ + root = ET.fromstring(xml) + data = { + un(child.tag): child.text + for child in root.iter()} + return data + + +def parse_docx(path, img_dir): + # type: (str, str) -> dict + """Load and parse contents of file at ``path`` + + Arguments: + path {str} -- path to DOCX file + + Keyword Arguments: + img_dir {str} -- save images in specififed directory (default: {None}) + + Returns: + dict -- header, main, footer, images, and properties of DOCX file + """ + TEXT_KEYS = ['header', 'officeDocument', 'footer'] + PROP_KEY = 'properties' + IMG_KEY = 'image' + + zipf = zipfile.ZipFile(path) + paths = {} + for fname in ['_rels/.rels', 'word/_rels/document.xml.rels']: + paths.update(load_rels(zipf.read(fname), fname)) + + doc_data = {IMG_KEY: paths[IMG_KEY], PROP_KEY: {}} + doc_data.update({ + key: ''.join([ + xml2text(zipf.read(fname)) + for fname in paths.get(key, [])]) + for key in TEXT_KEYS}) + + if img_dir is not None: + doc_data[IMG_KEY] = [ + extract_image(zipf.read(fname), img_dir, fname) + for fname in paths[IMG_KEY]] + + for fname in paths[PROP_KEY]: + doc_data[PROP_KEY].update(xml2dict(zipf.read(fname))) + + zipf.close() + + return { + 'header': doc_data[TEXT_KEYS[0]], + 'main': doc_data[TEXT_KEYS[1]], + 'footer': doc_data[TEXT_KEYS[2]], + 'images': doc_data[IMG_KEY], + PROP_KEY: doc_data[PROP_KEY], } + + +class DocxFile(object): + def __init__(self, path, img_dir=None): + doc_data = parse_docx(path, img_dir) + + self.path = os.path.abspath(path) # type: str + self.img_dir = img_dir # type: str + self.header = doc_data['header'] # type: str + self.main = doc_data['main'] # type: str + self.footer = doc_data['footer'] # type: str + self.images = doc_data['images'] # type: list + self.properties = doc_data['properties'] # type: dict + + def __str__(self): + str_val = ''.join([self.header, self.main, self.footer]) + + if sys.version_info[0] < 3: + return str_val.encode('utf-8') + + return str_val + + def __repr__(self): + return 'DocxFile({!r}, {!r})'.format(self.path, self.img_dir) + + def __getattr__(self, attr_name): + if attr_name == 'text': + return str(self).strip() From ba3fd0bb197998510ccf6e566c2e6e268c4683d0 Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 17 Aug 2018 14:27:46 -0700 Subject: [PATCH 06/22] export DocxFile --- docx2txt/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docx2txt/__init__.py b/docx2txt/__init__.py index b5cbd4f..bc2e9aa 100644 --- a/docx2txt/__init__.py +++ b/docx2txt/__init__.py @@ -1,3 +1,4 @@ from .docx2txt import get_output, process # noqa +from .docx_file import DocxFile # noqa VERSION = '0.8' From e140f481532959f472ef20b45f80f774f7eb0df7 Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 17 Aug 2018 14:28:28 -0700 Subject: [PATCH 07/22] get image filenames if not extracting --- docx2txt/docx_file.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/docx2txt/docx_file.py b/docx2txt/docx_file.py index e7df6e0..39df158 100644 --- a/docx2txt/docx_file.py +++ b/docx2txt/docx_file.py @@ -170,18 +170,22 @@ def parse_docx(path, img_dir): for fname in ['_rels/.rels', 'word/_rels/document.xml.rels']: paths.update(load_rels(zipf.read(fname), fname)) - doc_data = {IMG_KEY: paths[IMG_KEY], PROP_KEY: {}} - doc_data.update({ + doc_data = { key: ''.join([ xml2text(zipf.read(fname)) for fname in paths.get(key, [])]) - for key in TEXT_KEYS}) + for key in TEXT_KEYS} # type: dict - if img_dir is not None: + if img_dir is None: + doc_data[IMG_KEY] = [ + os.path.basename(fname) + for fname in paths[IMG_KEY]] + else: doc_data[IMG_KEY] = [ extract_image(zipf.read(fname), img_dir, fname) for fname in paths[IMG_KEY]] + doc_data[PROP_KEY] = {} for fname in paths[PROP_KEY]: doc_data[PROP_KEY].update(xml2dict(zipf.read(fname))) From a221843b844caaf22fadb2697198679bf3eb938c Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 17 Aug 2018 14:28:49 -0700 Subject: [PATCH 08/22] full documentation --- README.md | 101 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 82 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index d1b360b..91446e5 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,93 @@ -# python-docx2txt # +# python-docx2txt -A pure python-based utility to extract text from docx files. +## Introduction -The code is taken and adapted from [python-docx](https://github.com/python-openxml/python-docx). It can however also extract text from header, footer and hyperlinks. __It can now also extract images.__ +A pure Python-based utility to extract text from docx files. -## How to install? ## -```bash +The code is taken and adapted from [python-docx](https://github.com/python-openxml/python-docx). +It can however also extract text from header, footer and hyperlinks. +__It can now also extract images and properties.__ + +It can be used as a [Python library](#Python%20Library) +or from the [command line](#Command%20Line%20Utility). + +## Python Library + +### Library Installation + +```sh pip install docx2txt ``` -## How to run? ## +### Library Usage -a. From command line: -```bash -# extract text -docx2txt file.docx -# extract text and images -docx2txt -i /tmp/img_dir file.docx +#### Procedural + +The library is easy to use procedurally. + +```py +>>> import docx2txt +>>> # get document text +>>> docx2txt.process('file.docx') +'header_textmain_textfooter_text' +>>> # or +>>> # get document text, extract images to /tmp/img_dir +>>> process('file.docx', img_dir='/tmp/img_dir/') +'header_textmain_textfooter_text' +``` + +#### Object Oriented + +The DocxFile class provides more granularity. +Its argument list and accompanying behaviors are identical to `process()`. +Document properties are stored as a dictionary. +No keys are guaranteed, so the get() method is recommended. + +```py +>>> import docx2txt +>>> # parse Word doc +>>> document = docx2txt.DocxFile('file.docx', img_dir='/tmp/img_dir/') +>>> # path to file +>>> document.path +'/absolute/path/to/file.docx' +>>> # all document text +>>> document.text +'header_textmain_textfooter_text' +>>> # image directory +>>> document.img_dir +>>> '/tmp/img_dir' +>>> # text components +>>> '||'.join([document.header, document.main, document.footer]) +'header_text||main_text||footer_text' +>>> # images (filename only if not extracted) +>>> document.images +['/tmp/img_dir/image1.jpg', '/tmp/img_dir/image2.jpg'] +>>> # document properties +>>> document.properties +{'property_name': 'property value', ...} ``` -b. From python: -```python -import docx2txt -# extract text -text = docx2txt.process("file.docx") +## Command Line Utility + +### Utility Installation + +With this README file as the working directory: -# extract text and write images in /tmp/img_dir -text = docx2txt.process("file.docx", "/tmp/img_dir") +```sh +python setup.py install ``` + +### Utility Usage + +```sh +# simple text extraction +docx2txt file.docx +# get text, extract images to /tmp/img_dir +docx2txt -i /tmp/img_dir file.docx +# get all document data +docx2txt -d file.docx +# get all data, extract images to /tmp/img_dir +docx2txt -d -i /tmp/img_dir file.docx +# same as previous, more simply: +docx2txt -di /tmp/img_dir file.docx +``` \ No newline at end of file From 4250f173ae60e8f5e86b08da011038aa9606cb94 Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 17 Aug 2018 14:38:43 -0700 Subject: [PATCH 09/22] clarify unqualify namespace --- docx2txt/docx_file.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docx2txt/docx_file.py b/docx2txt/docx_file.py index 39df158..715b4d5 100644 --- a/docx2txt/docx_file.py +++ b/docx2txt/docx_file.py @@ -80,21 +80,6 @@ def extract_image(img_bytes, img_dir, fname): return os.path.abspath(dst_fname) -def un(tag): - # type: (str) -> str - """Stands for 'unqualified name'. Removes namespace from prefixed tag. - - See: [Python issue 18304](https://bugs.python.org/issue18304) - - Arguments: - tag {str} -- (possibly-)namespaced tag - - Returns: - str -- tag name without namespace - """ - return tag.split('}').pop() - - def qn(tag): """ Stands for 'qualified name', a utility function to turn a namespace @@ -109,6 +94,21 @@ def qn(tag): return '{{{}}}{}'.format(uri, tagroot) +def un_qn(tag): + # type: (str) -> str + """Stands for 'unqualified name'. Removes namespace from prefixed tag. + + See: [Python issue 18304](https://bugs.python.org/issue18304) + + Arguments: + tag {str} -- (possibly-)namespaced tag + + Returns: + str -- tag name without namespace + """ + return tag.split('}').pop() + + def xml2text(xml): """ A string representing the textual content of this run, with content @@ -143,7 +143,7 @@ def xml2dict(xml): """ root = ET.fromstring(xml) data = { - un(child.tag): child.text + un_qn(child.tag): child.text for child in root.iter()} return data From 8272a6511d86fb3ae454520b177f84e5df6d66b2 Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 17 Aug 2018 14:40:30 -0700 Subject: [PATCH 10/22] formatting --- docx2txt/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docx2txt/__init__.py b/docx2txt/__init__.py index bc2e9aa..031c20a 100644 --- a/docx2txt/__init__.py +++ b/docx2txt/__init__.py @@ -1,4 +1,4 @@ from .docx2txt import get_output, process # noqa -from .docx_file import DocxFile # noqa +from .docx_file import DocxFile # noqa VERSION = '0.8' From 8593a383930fe57752a382eb0ae0e07f13a2aa80 Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 17 Aug 2018 14:44:07 -0700 Subject: [PATCH 11/22] clarify dict.get() mention --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 91446e5..c83f420 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,12 @@ No keys are guaranteed, so the get() method is recommended. >>> # document properties >>> document.properties {'property_name': 'property value', ...} +>>> document.properties['title'] +'title_text' +>>> document.properties['nonexistent'] +KeyError +>>> document.properties.get('nonexistent') +None ``` ## Command Line Utility From 4fa534b425439ffca304bb4f349b423af6ca9391 Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 17 Aug 2018 15:36:48 -0700 Subject: [PATCH 12/22] update for GitHub anchors --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c83f420..1977d04 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ The code is taken and adapted from [python-docx](https://github.com/python-openx It can however also extract text from header, footer and hyperlinks. __It can now also extract images and properties.__ -It can be used as a [Python library](#Python%20Library) -or from the [command line](#Command%20Line%20Utility). +It can be used as a [Python library](#python-library) +or from the [command line](#command-line-utility). ## Python Library From 334b3f89e7cbd23d152a1f655c28bab106bc2c3e Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 17 Aug 2018 16:35:05 -0700 Subject: [PATCH 13/22] update installed version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4929689..212c16a 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name='docx2txt', packages=['docx2txt'], - version='0.7', + version='0.8', description='A pure python-based utility to extract text and images ' 'from docx files.', author='Ankush Shah', From f3078ea8f4a66aacf65bc916f1ece59ff4c52108 Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 17 Aug 2018 18:18:34 -0700 Subject: [PATCH 14/22] follow my own advice about dict.get() --- docx2txt/docx_file.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docx2txt/docx_file.py b/docx2txt/docx_file.py index 715b4d5..8f7b169 100644 --- a/docx2txt/docx_file.py +++ b/docx2txt/docx_file.py @@ -179,11 +179,11 @@ def parse_docx(path, img_dir): if img_dir is None: doc_data[IMG_KEY] = [ os.path.basename(fname) - for fname in paths[IMG_KEY]] + for fname in paths.get(IMG_KEY, [])] else: doc_data[IMG_KEY] = [ extract_image(zipf.read(fname), img_dir, fname) - for fname in paths[IMG_KEY]] + for fname in paths.get(IMG_KEY, [])] doc_data[PROP_KEY] = {} for fname in paths[PROP_KEY]: From 3da2e17e1a93198913cc87b3d2f7263fa7cefc3b Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Sun, 26 Aug 2018 13:39:37 -0700 Subject: [PATCH 15/22] simplify .text property --- docx2txt/docx_file.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docx2txt/docx_file.py b/docx2txt/docx_file.py index 8f7b169..95dec1d 100644 --- a/docx2txt/docx_file.py +++ b/docx2txt/docx_file.py @@ -222,6 +222,6 @@ def __str__(self): def __repr__(self): return 'DocxFile({!r}, {!r})'.format(self.path, self.img_dir) - def __getattr__(self, attr_name): - if attr_name == 'text': - return str(self).strip() + @property + def text(self): + return str(self).strip() From bea08baf00f3280104bedeb7e6454712024b78dd Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Tue, 4 Sep 2018 18:42:39 -0700 Subject: [PATCH 16/22] add get_path() to support addinfourl, TextIOWrapper classes --- docx2txt/docx_file.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/docx2txt/docx_file.py b/docx2txt/docx_file.py index 95dec1d..ae0eec6 100644 --- a/docx2txt/docx_file.py +++ b/docx2txt/docx_file.py @@ -199,11 +199,29 @@ def parse_docx(path, img_dir): PROP_KEY: doc_data[PROP_KEY], } +def get_path(path): + # type: (object) -> str + """Get absolute path to document + + Returns: + str -- path to document + """ + try: + return os.path.abspath(str(path)) + except TypeError: + pass + + try: + return os.path.abspath(path.name) # type: ignore + except (AttributeError, TypeError): + return '' + + class DocxFile(object): def __init__(self, path, img_dir=None): doc_data = parse_docx(path, img_dir) - self.path = os.path.abspath(path) # type: str + self.path = get_path(path) # type: str self.img_dir = img_dir # type: str self.header = doc_data['header'] # type: str self.main = doc_data['main'] # type: str From 62c97c13f6e2d4f63a118f16a0a3af8f4bbd0afd Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Tue, 4 Sep 2018 18:58:45 -0700 Subject: [PATCH 17/22] add HTTPResponse support --- docx2txt/docx_file.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/docx2txt/docx_file.py b/docx2txt/docx_file.py index ae0eec6..5c7840c 100644 --- a/docx2txt/docx_file.py +++ b/docx2txt/docx_file.py @@ -203,18 +203,23 @@ def get_path(path): # type: (object) -> str """Get absolute path to document + Arguments: + path {str} -- path to DOCX file (nominal) + Returns: - str -- path to document + str -- path to document (absolute) """ + # simple filesystem path string try: return os.path.abspath(str(path)) except TypeError: pass - try: - return os.path.abspath(path.name) # type: ignore - except (AttributeError, TypeError): - return '' + # addinfourl, TextIOWrapper, HTTPResponse, ? + for attr in [key for key in ['name', 'url'] if hasattr(path, key)]: + return getattr(path, attr) + + return '' class DocxFile(object): From 986c3223526b8160d86d18ba70f2c90ed3c69e0b Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Tue, 4 Sep 2018 18:58:45 -0700 Subject: [PATCH 18/22] add HTTPResponse support --- docx2txt/docx_file.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/docx2txt/docx_file.py b/docx2txt/docx_file.py index ae0eec6..ffad759 100644 --- a/docx2txt/docx_file.py +++ b/docx2txt/docx_file.py @@ -203,25 +203,30 @@ def get_path(path): # type: (object) -> str """Get absolute path to document + Arguments: + path {str} -- path to DOCX file (nominal) + Returns: - str -- path to document + str -- path to document (absolute) """ try: return os.path.abspath(str(path)) except TypeError: pass - try: - return os.path.abspath(path.name) # type: ignore - except (AttributeError, TypeError): - return '' + # TextIOWrapper, addinfourl, HTTPResponse... and more? + for attr in (getattr(path, key) for key in ('name', 'url')): + if attr is not None: + return str(attr) + + return '' class DocxFile(object): - def __init__(self, path, img_dir=None): - doc_data = parse_docx(path, img_dir) + def __init__(self, file, img_dir=None): + doc_data = parse_docx(file, img_dir) - self.path = get_path(path) # type: str + self.path = get_path(file) # type: str self.img_dir = img_dir # type: str self.header = doc_data['header'] # type: str self.main = doc_data['main'] # type: str From e66b0894bb63d2610d3c5845b1a62b69f06da540 Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 8 Feb 2019 13:07:35 -0800 Subject: [PATCH 19/22] Office 365 support --- docx2txt/docx_file.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/docx2txt/docx_file.py b/docx2txt/docx_file.py index 21e2d9c..83843d1 100644 --- a/docx2txt/docx_file.py +++ b/docx2txt/docx_file.py @@ -37,6 +37,19 @@ def get_rel_path(parent, attrib): return path +def find_rels(name_list): + # type: (list) -> list + """Filter rels with list of paths in ``name_list`` + + Returns: + list[str] -- existing paths + """ + rel_base = 'word/rels/document{}.xml.rels' + candidates = [rel_base.format(''), rel_base.format('2')] + + return ['_rels/.rels'] + [rel for rel in candidates if rel in name_list] + + def load_rels(xml, fname): # type: (bytes, str) -> dict """Parse document REL file @@ -165,9 +178,9 @@ def parse_docx(path, img_dir): PROP_KEY = 'properties' IMG_KEY = 'image' - zipf = zipfile.ZipFile(path) paths = {} - for fname in ['_rels/.rels', 'word/_rels/document.xml.rels']: + zipf = zipfile.ZipFile(path) + for fname in find_rels(zipf.namelist()): paths.update(load_rels(zipf.read(fname), fname)) doc_data = { From 12e71e43fc5602cdca58e05a539d984fc65d0f0b Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Thu, 14 Feb 2019 13:39:08 -0800 Subject: [PATCH 20/22] limit os import --- docx2txt/docx_file.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docx2txt/docx_file.py b/docx2txt/docx_file.py index 83843d1..99143b9 100644 --- a/docx2txt/docx_file.py +++ b/docx2txt/docx_file.py @@ -1,4 +1,4 @@ -import os +import os.path as os_path import re import sys import xml.etree.ElementTree as ET @@ -85,12 +85,12 @@ def extract_image(img_bytes, img_dir, fname): Returns: str -- absolute path to extracted image """ - dst_fname = os.path.join(img_dir, os.path.basename(fname)) + dst_fname = os_path.join(img_dir, os_path.basename(fname)) with open(dst_fname, 'wb') as dst_f: dst_f.write(img_bytes) - return os.path.abspath(dst_fname) + return os_path.abspath(dst_fname) def qn(tag): @@ -191,7 +191,7 @@ def parse_docx(path, img_dir): if img_dir is None: doc_data[IMG_KEY] = [ - os.path.basename(fname) + os_path.basename(fname) for fname in paths.get(IMG_KEY, [])] else: doc_data[IMG_KEY] = [ @@ -224,7 +224,7 @@ def get_path(path): """ # simple filesystem path string try: - return os.path.abspath(str(path)) + return os_path.abspath(str(path)) except TypeError: pass From 31eec562415d207d09b119d3bdc22f50ebfc5883 Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 15 Feb 2019 12:05:10 -0800 Subject: [PATCH 21/22] add context management to zipfile correctly locate document relationships nomenclature fixes refactor utility functions limit imports protect document properties remove header and footer from document text (it's different from page to page) --- docx2txt/dict_util.py | 18 +++ docx2txt/docx_file.py | 282 +++++++++++++++++++++++------------------- docx2txt/xml_util.py | 37 ++++++ 3 files changed, 213 insertions(+), 124 deletions(-) create mode 100644 docx2txt/dict_util.py create mode 100644 docx2txt/xml_util.py diff --git a/docx2txt/dict_util.py b/docx2txt/dict_util.py new file mode 100644 index 0000000..4616fc1 --- /dev/null +++ b/docx2txt/dict_util.py @@ -0,0 +1,18 @@ +"""Dictionary Utilities""" + + +def merge(dicts): + # type: (list) -> dict + merged = {} # type: dict + for d in dicts: + merged.update(d) + + return merged + + +def filter_key(src_dict, key_test, getter=dict.values): + return getter({ + key: val + for key, val + in src_dict.items() + if key_test(key)}) diff --git a/docx2txt/docx_file.py b/docx2txt/docx_file.py index 99143b9..2a3bcc2 100644 --- a/docx2txt/docx_file.py +++ b/docx2txt/docx_file.py @@ -1,76 +1,114 @@ +import errno import os.path as os_path -import re import sys -import xml.etree.ElementTree as ET -import zipfile +from os import makedirs +from zipfile import ZipFile +from . import dict_util, xml_util -def get_rel_key(attrib): + +def get_rel_key(attribs): # type: (dict) -> str - """Get dictionary key for XML node + attr = attribs.get('Type', '') - Arguments: - attrib {dict} -- relationship node attributes + return os_path.basename(attr) - Returns: - str -- simplified key name - """ - node_type = attrib.get('Type', '') - key = str(re.sub(r'.+[\/\-]+', '', node_type)) - return key +def get_rel_path(attribs): + # type: (dict) -> str + attr = attribs.get('Target', '') -def get_rel_path(parent, attrib): - # type: (str, dict) -> str - """Get path to relationship in REL file + return attr.lstrip('/') - Arguments: - parent {str} -- parent directory of relationship - attrib {dict} -- relationship node attributes - Returns: - str -- full path to relationship - """ - target = attrib.get('Target', '') - path = (parent + target).lstrip('/') +def get_package_rels(pkg_xml): + # type: (bytes) -> dict + rels = xml_util.parse(pkg_xml) - return path + return { + get_rel_key(rel.attrib): get_rel_path(rel.attrib) + for rel + in rels.iter()} -def find_rels(name_list): - # type: (list) -> list - """Filter rels with list of paths in ``name_list`` +def parse_properties(prop_xml): + # type: (bytes) -> dict + props = xml_util.parse(prop_xml) - Returns: - list[str] -- existing paths - """ - rel_base = 'word/rels/document{}.xml.rels' - candidates = [rel_base.format(''), rel_base.format('2')] + return {xml_util.unquote(prop.tag): prop.text for prop in props.iter()} + + +def is_property_rel(kind): + # type: (str) -> bool + return kind.endswith('-properties') - return ['_rels/.rels'] + [rel for rel in candidates if rel in name_list] +def get_package_properties(pkg, pkg_rels): + # type: (ZipFile, dict) -> dict + prop_dicts = [ + parse_properties(pkg.read(path)) + for path + in dict_util.filter_key(pkg_rels, is_property_rel)] -def load_rels(xml, fname): - # type: (bytes, str) -> dict - """Parse document REL file + return dict_util.merge(prop_dicts) + + +def get_document_rels_path(doc_path): + # type: (str) -> str + path_comps = [ + os_path.dirname(doc_path).lstrip('/'), + '_rels', + os_path.basename(doc_path) + '.rels'] + + return '/'.join(path_comps) + + +def get_document_rels(pkg, doc_key, doc_path): + # type: (ZipFile, str, str) -> dict + """Parse document relationships Arguments: - xml {bytes} -- contents of XML file - fname {str} -- path to XML file + pkg {zipfile.ZipFile} -- package ZipFile + doc_key {str} -- key to store path of officeDocument part + doc_path {str} -- path to officeDocument part in package Returns: dict -- dictionary of XML data """ - root = ET.fromstring(xml) - base_path = str(re.sub(r'_rels/.+', '', fname)) - data = {} # type: dict + base_path = os_path.dirname(doc_path).lstrip('/') + rels_path = get_document_rels_path(doc_path) + rel_nodes = xml_util.parse(pkg.read(rels_path)) - for node in root.iter(): - key = get_rel_key(node.attrib) - path = get_rel_path(base_path, node.attrib) - data[key] = data.get(key, []) + [path] + rels = {} # type: dict + for rel_node in rel_nodes.iter(): + key = get_rel_key(rel_node.attrib) + path = '/'.join([base_path, rel_node.attrib.get('Target', '')]) - return data + rels[key] = rels.get(key, []) + [path] + + rels.update({doc_key: [doc_path]}) + + return rels + + +def get_package_info(pkg, doc_type): + # type: (ZipFile, str) -> tuple + pkg_rels = get_package_rels(pkg.read('_rels/.rels')) + doc_path = pkg_rels.get(doc_type, 'word/document.xml') + doc_rels = get_document_rels(pkg, doc_type, doc_path) + + return pkg_rels, doc_rels + + +def mkdir_p(path): + # type: (str) -> None + try: + makedirs(path) + except OSError as err: + if err.errno == errno.EEXIST and os_path.isdir(path): + pass + else: + raise def extract_image(img_bytes, img_dir, fname): @@ -93,35 +131,6 @@ def extract_image(img_bytes, img_dir, fname): return os_path.abspath(dst_fname) -def qn(tag): - """ - Stands for 'qualified name', a utility function to turn a namespace - prefixed tag name into a Clark-notation qualified tag name for lxml. For - example, ``qn('p:cSld')`` returns ``'{http://schemas.../main}cSld'``. - Source: https://github.com/python-openxml/python-docx/ - """ - nsmap = { - 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} - prefix, tagroot = tag.split(':') - uri = nsmap[prefix] - return '{{{}}}{}'.format(uri, tagroot) - - -def un_qn(tag): - # type: (str) -> str - """Stands for 'unqualified name'. Removes namespace from prefixed tag. - - See: [Python issue 18304](https://bugs.python.org/issue18304) - - Arguments: - tag {str} -- (possibly-)namespaced tag - - Returns: - str -- tag name without namespace - """ - return tag.split('}').pop() - - def xml2text(xml): """ A string representing the textual content of this run, with content @@ -130,13 +139,13 @@ def xml2text(xml): Adapted from: https://github.com/python-openxml/python-docx/ """ text = u'' - root = ET.fromstring(xml) + root = xml_util.parse(xml) whitespace_tags = { - qn('w:tab'): '\t', - qn('w:br'): '\n', - qn('w:cr'): '\n', - qn('w:p'): '\n\n', } - text_tag = qn('w:t') + xml_util.quote('w:tab'): '\t', + xml_util.quote('w:br'): '\n', + xml_util.quote('w:cr'): '\n', + xml_util.quote('w:p'): '\n\n', } + text_tag = xml_util.quote('w:t') for child in root.iter(): text += whitespace_tags.get(child.tag, '') if child.tag == text_tag and child.text is not None: @@ -154,14 +163,14 @@ def xml2dict(xml): Returns: dict -- dictionary of {node.tagName: node.text} """ - root = ET.fromstring(xml) + root = xml_util.parse(xml) data = { - un_qn(child.tag): child.text + xml_util.unquote(child.tag): child.text for child in root.iter()} return data -def parse_docx(path, img_dir): +def read_docx(path, img_dir): # type: (str, str) -> dict """Load and parse contents of file at ``path`` @@ -174,42 +183,39 @@ def parse_docx(path, img_dir): Returns: dict -- header, main, footer, images, and properties of DOCX file """ - TEXT_KEYS = ['header', 'officeDocument', 'footer'] - PROP_KEY = 'properties' + HEAD_KEY = 'header' + MAIN_KEY = 'officeDocument' + FOOT_KEY = 'footer' IMG_KEY = 'image' - paths = {} - zipf = zipfile.ZipFile(path) - for fname in find_rels(zipf.namelist()): - paths.update(load_rels(zipf.read(fname), fname)) - - doc_data = { - key: ''.join([ - xml2text(zipf.read(fname)) - for fname in paths.get(key, [])]) - for key in TEXT_KEYS} # type: dict + with ZipFile(path) as pkg: + pkg_rels, doc_rels = get_package_info(pkg, MAIN_KEY) - if img_dir is None: - doc_data[IMG_KEY] = [ - os_path.basename(fname) - for fname in paths.get(IMG_KEY, [])] - else: - doc_data[IMG_KEY] = [ - extract_image(zipf.read(fname), img_dir, fname) - for fname in paths.get(IMG_KEY, [])] + text = { + key: ''.join([ + xml2text(pkg.read(fname)) + for fname in doc_rels.get(key, [])]) + for key in [HEAD_KEY, MAIN_KEY, FOOT_KEY]} # type: dict - doc_data[PROP_KEY] = {} - for fname in paths[PROP_KEY]: - doc_data[PROP_KEY].update(xml2dict(zipf.read(fname))) + images = [] # type: list + if img_dir is None: + images += [ + os_path.basename(fname) + for fname in doc_rels.get(IMG_KEY, [])] + else: + mkdir_p(img_dir) + images += [ + extract_image(pkg.read(fname), img_dir, fname) + for fname in doc_rels.get(IMG_KEY, [])] - zipf.close() + props = get_package_properties(pkg, pkg_rels) return { - 'header': doc_data[TEXT_KEYS[0]], - 'main': doc_data[TEXT_KEYS[1]], - 'footer': doc_data[TEXT_KEYS[2]], - 'images': doc_data[IMG_KEY], - PROP_KEY: doc_data[PROP_KEY], } + 'header': text.get(HEAD_KEY), + 'main': text.get(MAIN_KEY), + 'footer': text.get(FOOT_KEY), + 'images': images, + 'properties': props, } def get_path(path): @@ -238,18 +244,18 @@ def get_path(path): class DocxFile(object): def __init__(self, file, img_dir=None): - doc_data = parse_docx(file, img_dir) + doc_data = read_docx(file, img_dir) - self.path = get_path(file) # type: str - self.img_dir = img_dir # type: str - self.header = doc_data['header'] # type: str - self.main = doc_data['main'] # type: str - self.footer = doc_data['footer'] # type: str - self.images = doc_data['images'] # type: list - self.properties = doc_data['properties'] # type: dict + self._path = get_path(file) # type: str + self._img_dir = img_dir # type: str + self._header = str(doc_data['header']).strip() # type: str + self._main = str(doc_data['main']).strip() # type: str + self._footer = str(doc_data['footer']).strip() # type: str + self._images = doc_data['images'] # type: list + self._properties = doc_data['properties'] # type: dict def __str__(self): - str_val = ''.join([self.header, self.main, self.footer]) + str_val = ''.join(self._main) if sys.version_info[0] < 3: return str_val.encode('utf-8') @@ -257,7 +263,35 @@ def __str__(self): return str_val def __repr__(self): - return 'DocxFile({!r}, {!r})'.format(self.path, self.img_dir) + return 'DocxFile({!r}, {!r})'.format(self._path, self._img_dir) + + @property + def path(self): + return self._path + + @property + def img_dir(self): + return self._img_dir + + @property + def header(self): + return self._header + + @property + def main(self): + return self._main + + @property + def footer(self): + return self._footer + + @property + def images(self): + return self._images + + @property + def properties(self): + return self._properties @property def text(self): diff --git a/docx2txt/xml_util.py b/docx2txt/xml_util.py new file mode 100644 index 0000000..7b4d061 --- /dev/null +++ b/docx2txt/xml_util.py @@ -0,0 +1,37 @@ +"""XML Utilities""" + +import xml.etree.ElementTree as ET + + +def quote(tag): + """ + Turn a namespace + prefixed tag name into a Clark-notation qualified tag name for lxml. For + example, ``qn('p:cSld')`` returns ``'{http://schemas.../main}cSld'``. + Source: https://github.com/python-openxml/python-docx/ + """ + nsmap = { + 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} + prefix, tagroot = tag.split(':') + uri = nsmap[prefix] + return '{{{}}}{}'.format(uri, tagroot) + + +def unquote(tag): + # type: (str) -> str + """Remove namespace from prefixed tag. + + See: [Python issue 18304](https://bugs.python.org/issue18304) + + Arguments: + tag {str} -- (possibly-)namespaced tag + + Returns: + str -- tag name without namespace + """ + return tag.split('}').pop() + + +def parse(xml_bytes): + # type: (bytes) -> ET.Element + return ET.fromstring(xml_bytes) From 045571ae62223b3c8996011d42c8e18918a4f242 Mon Sep 17 00:00:00 2001 From: Andrew Champion Date: Fri, 15 Feb 2019 12:37:41 -0800 Subject: [PATCH 22/22] documentation nomenclature --- docx2txt/docx_file.py | 143 +++++++++++++++++++++++++++++++----------- 1 file changed, 105 insertions(+), 38 deletions(-) diff --git a/docx2txt/docx_file.py b/docx2txt/docx_file.py index 2a3bcc2..0664a1e 100644 --- a/docx2txt/docx_file.py +++ b/docx2txt/docx_file.py @@ -7,15 +7,31 @@ from . import dict_util, xml_util -def get_rel_key(attribs): +def simplify_rel(attribs): # type: (dict) -> str + """Simplify Type of a Relationship node + + Arguments: + attribs {dict} -- attributes of Relationship node + + Returns: + str - salient portion of rel type (officeDocument, image, etc.) + """ attr = attribs.get('Type', '') return os_path.basename(attr) -def get_rel_path(attribs): +def locate_rel(attribs): # type: (dict) -> str + """Get path to file in Relationship node + + Arguments: + attribs {dict} -- attributes of Relationship node + + Returns: + str -- path of Target within package + """ attr = attribs.get('Target', '') return attr.lstrip('/') @@ -23,28 +39,61 @@ def get_rel_path(attribs): def get_package_rels(pkg_xml): # type: (bytes) -> dict + """Get package relationships + + Arguments: + pkg_xml {bytes} -- top level relationships XML (_rels/.rels) + + Returns: + dict -- property and document paths + """ rels = xml_util.parse(pkg_xml) return { - get_rel_key(rel.attrib): get_rel_path(rel.attrib) + simplify_rel(rel.attrib): locate_rel(rel.attrib) for rel in rels.iter()} def parse_properties(prop_xml): # type: (bytes) -> dict + """Parse XML for document metadata + + Arguments: + prop_xml {bytes} -- property XML file (docProps XML) + + Returns: + dict -- document metadata + """ props = xml_util.parse(prop_xml) return {xml_util.unquote(prop.tag): prop.text for prop in props.iter()} -def is_property_rel(kind): +def is_property_rel(rel_type): # type: (str) -> bool - return kind.endswith('-properties') + """Test for string indicating a property relationship + + Arguments: + rel_type {str} -- relationship type + + Returns: + bool -- relationship is a property + """ + return rel_type.endswith('-properties') -def get_package_properties(pkg, pkg_rels): +def get_package_props(pkg, pkg_rels): # type: (ZipFile, dict) -> dict + """Get all properties of package + + Arguments: + pkg {ZipFile} -- package as ZipFile + pkg_rels {dict} -- all package relationships + + Returns: + dict -- properties of package + """ prop_dicts = [ parse_properties(pkg.read(path)) for path @@ -53,61 +102,81 @@ def get_package_properties(pkg, pkg_rels): return dict_util.merge(prop_dicts) -def get_document_rels_path(doc_path): +def get_part_rels_path(part_path): # type: (str) -> str + """Get path to document relationships + + Arguments: + part_path {str} -- path to officeDocument relationship + + Returns: + str -- path to relationships for ``part_path`` + """ path_comps = [ - os_path.dirname(doc_path).lstrip('/'), + os_path.dirname(part_path).lstrip('/'), '_rels', - os_path.basename(doc_path) + '.rels'] + os_path.basename(part_path) + '.rels'] return '/'.join(path_comps) -def get_document_rels(pkg, doc_key, doc_path): +def get_part_rels(pkg, part_key, part_path): # type: (ZipFile, str, str) -> dict - """Parse document relationships + """Parse relationships of part (document) Arguments: pkg {zipfile.ZipFile} -- package ZipFile - doc_key {str} -- key to store path of officeDocument part - doc_path {str} -- path to officeDocument part in package + part_key {str} -- key to store path of officeDocument part + part_path {str} -- path to officeDocument part in package Returns: dict -- dictionary of XML data """ - base_path = os_path.dirname(doc_path).lstrip('/') - rels_path = get_document_rels_path(doc_path) + base_path = os_path.dirname(part_path).lstrip('/') + rels_path = get_part_rels_path(part_path) rel_nodes = xml_util.parse(pkg.read(rels_path)) rels = {} # type: dict for rel_node in rel_nodes.iter(): - key = get_rel_key(rel_node.attrib) + key = simplify_rel(rel_node.attrib) path = '/'.join([base_path, rel_node.attrib.get('Target', '')]) rels[key] = rels.get(key, []) + [path] - rels.update({doc_key: [doc_path]}) + rels.update({part_key: [part_path]}) return rels -def get_package_info(pkg, doc_type): +def get_all_rels(pkg, part_type): # type: (ZipFile, str) -> tuple + """Get relationships for package and part of ``part_type`` + + Arguments: + pkg {ZipFile} -- package as ZipFile + part_type {str} -- type of 'part' to locate (officeDocument) + + Returns: + tuple -- package relationships, part relationships + """ pkg_rels = get_package_rels(pkg.read('_rels/.rels')) - doc_path = pkg_rels.get(doc_type, 'word/document.xml') - doc_rels = get_document_rels(pkg, doc_type, doc_path) + part_path = pkg_rels.get(part_type, 'word/document.xml') + part_rels = get_part_rels(pkg, part_type, part_path) - return pkg_rels, doc_rels + return pkg_rels, part_rels def mkdir_p(path): # type: (str) -> None + """Recursively create directory at ``path`` + + Arguments: + path {str} -- directory to create + """ try: makedirs(path) except OSError as err: - if err.errno == errno.EEXIST and os_path.isdir(path): - pass - else: + if err.errno != errno.EEXIST or not os_path.isdir(path): raise @@ -178,7 +247,7 @@ def read_docx(path, img_dir): path {str} -- path to DOCX file Keyword Arguments: - img_dir {str} -- save images in specififed directory (default: {None}) + img_dir {str} -- save images in specififed directory Returns: dict -- header, main, footer, images, and properties of DOCX file @@ -189,7 +258,7 @@ def read_docx(path, img_dir): IMG_KEY = 'image' with ZipFile(path) as pkg: - pkg_rels, doc_rels = get_package_info(pkg, MAIN_KEY) + pkg_rels, doc_rels = get_all_rels(pkg, MAIN_KEY) text = { key: ''.join([ @@ -208,7 +277,7 @@ def read_docx(path, img_dir): extract_image(pkg.read(fname), img_dir, fname) for fname in doc_rels.get(IMG_KEY, [])] - props = get_package_properties(pkg, pkg_rels) + props = get_package_props(pkg, pkg_rels) return { 'header': text.get(HEAD_KEY), @@ -246,21 +315,19 @@ class DocxFile(object): def __init__(self, file, img_dir=None): doc_data = read_docx(file, img_dir) - self._path = get_path(file) # type: str - self._img_dir = img_dir # type: str - self._header = str(doc_data['header']).strip() # type: str - self._main = str(doc_data['main']).strip() # type: str - self._footer = str(doc_data['footer']).strip() # type: str - self._images = doc_data['images'] # type: list - self._properties = doc_data['properties'] # type: dict + self._path = get_path(file) # type: str + self._img_dir = img_dir # type: str + self._header = doc_data['header'] # type: str + self._main = doc_data['main'] # type: str + self._footer = doc_data['footer'] # type: str + self._images = doc_data['images'] # type: list + self._properties = doc_data['properties'] # type: dict def __str__(self): - str_val = ''.join(self._main) - if sys.version_info[0] < 3: - return str_val.encode('utf-8') + return self._main.encode('utf-8') - return str_val + return self._main def __repr__(self): return 'DocxFile({!r}, {!r})'.format(self._path, self._img_dir)