Skip to content

Commit 5fb204f

Browse files
Allow docx files to include document files with document[nn].xml form
I don't know whether this deviates from the standard, but I have seen one example of this in the real world.
1 parent 4a11b49 commit 5fb204f

File tree

1 file changed

+5
-3
lines changed

1 file changed

+5
-3
lines changed

cardinal_pythonlib/extract_text.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -555,10 +555,10 @@ def availability_pdf() -> bool:
555555
# -----------------------------------------------------------------------------
556556
# In a D.I.Y. fashion
557557
# -----------------------------------------------------------------------------
558-
# DOCX specification: http://www.ecma-international.org/news/TC45_current_work/TC45_available_docs.htm # noqa: E501
558+
# DOCX specification: https://ecma-international.org/publications-and-standards/standards/ecma-376/ # noqa: E501
559559

560560
DOCX_HEADER_FILE_REGEX = re.compile("word/header[0-9]*.xml")
561-
DOCX_DOC_FILE = "word/document.xml"
561+
DOCX_DOCUMENT_FILE_REGEX = re.compile("word/document[0-9]*.xml")
562562
DOCX_FOOTER_FILE_REGEX = re.compile("word/footer[0-9]*.xml")
563563
DOCX_SCHEMA_URL = (
564564
"http://schemas.openxmlformats.org/wordprocessingml/2006/main"
@@ -601,7 +601,9 @@ def gen_xml_files_from_docx(fp: BinaryIO) -> Iterator[str]:
601601
for filename in filelist:
602602
if DOCX_HEADER_FILE_REGEX.match(filename):
603603
yield z.read(filename).decode("utf8")
604-
yield z.read(DOCX_DOC_FILE)
604+
for filename in filelist:
605+
if DOCX_DOCUMENT_FILE_REGEX.match(filename):
606+
yield z.read(filename)
605607
for filename in filelist:
606608
if DOCX_FOOTER_FILE_REGEX.match(filename):
607609
yield z.read(filename).decode("utf8")

0 commit comments

Comments
 (0)