diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
index 8c71ba9232..3bd9950d94 100644
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@@ -54,6 +54,39 @@
 from unstructured.partition.utils.constants import PartitionStrategy
 from unstructured.utils import is_temp_file_path, lazyproperty
 
+STYLE_TO_ELEMENT_MAPPING = {
+    "Caption": Text,  # TODO(robinson) - add caption element type
+    "Heading 1": Title,
+    "Heading 2": Title,
+    "Heading 3": Title,
+    "Heading 4": Title,
+    "Heading 5": Title,
+    "Heading 6": Title,
+    "Heading 7": Title,
+    "Heading 8": Title,
+    "Heading 9": Title,
+    "Intense Quote": Text,  # TODO(robinson) - add quote element type
+    "List": ListItem,
+    "List 2": ListItem,
+    "List 3": ListItem,
+    "List Bullet": ListItem,
+    "List Bullet 2": ListItem,
+    "List Bullet 3": ListItem,
+    "List Continue": ListItem,
+    "List Continue 2": ListItem,
+    "List Continue 3": ListItem,
+    "List Number": ListItem,
+    "List Number 2": ListItem,
+    "List Number 3": ListItem,
+    "List Paragraph": ListItem,
+    "Macro Text": Text,
+    "No Spacing": Text,
+    "Quote": Text,  # TODO(robinson) - add quote element type
+    "Subtitle": Title,
+    "TOCHeading": Title,
+    "Title": Title,
+}
+
 DETECTION_ORIGIN: str = "docx"
 # -- CT_* stands for "complex-type", an XML element type in docx parlance --
 BlockElement: TypeAlias = "CT_P | CT_Tbl"
@@ -925,38 +958,6 @@ def _style_based_element_type(self, paragraph: Paragraph) -> Type[Text] | None:
         # NOTE(robinson) - documentation on built-in styles at the link below:
         # https://python-docx.readthedocs.io/en/latest/user/styles-understanding.html \
         # #paragraph-styles-in-default-template
-        STYLE_TO_ELEMENT_MAPPING = {
-            "Caption": Text,  # TODO(robinson) - add caption element type
-            "Heading 1": Title,
-            "Heading 2": Title,
-            "Heading 3": Title,
-            "Heading 4": Title,
-            "Heading 5": Title,
-            "Heading 6": Title,
-            "Heading 7": Title,
-            "Heading 8": Title,
-            "Heading 9": Title,
-            "Intense Quote": Text,  # TODO(robinson) - add quote element type
-            "List": ListItem,
-            "List 2": ListItem,
-            "List 3": ListItem,
-            "List Bullet": ListItem,
-            "List Bullet 2": ListItem,
-            "List Bullet 3": ListItem,
-            "List Continue": ListItem,
-            "List Continue 2": ListItem,
-            "List Continue 3": ListItem,
-            "List Number": ListItem,
-            "List Number 2": ListItem,
-            "List Number 3": ListItem,
-            "List Paragraph": ListItem,
-            "Macro Text": Text,
-            "No Spacing": Text,
-            "Quote": Text,  # TODO(robinson) - add quote element type
-            "Subtitle": Title,
-            "TOCHeading": Title,
-            "Title": Title,
-        }
 
         # -- paragraph.style can be None in rare cases, so can style.name. That's going
         # -- to mean default style which is equivalent to "Normal" for our purposes.