From f411deb725318a249ac1b4310a5f76b22ecb62dc Mon Sep 17 00:00:00 2001 From: airbot Date: Wed, 1 Apr 2026 14:01:08 +0000 Subject: [PATCH] Handle hp:tab across parsing and text export --- src/hwpx/oxml/body.py | 23 +++- src/hwpx/oxml/document.py | 152 +++++++++++++++++++++--- src/hwpx/tools/exporter.py | 188 +++++++----------------------- src/hwpx/tools/text_extractor.py | 5 +- tests/test_document_formatting.py | 15 ++- tests/test_hp_tab_support.py | 64 ++++++++++ 6 files changed, 279 insertions(+), 168 deletions(-) create mode 100644 tests/test_hp_tab_support.py diff --git a/src/hwpx/oxml/body.py b/src/hwpx/oxml/body.py index 8c7e1a3..bd4fabe 100644 --- a/src/hwpx/oxml/body.py +++ b/src/hwpx/oxml/body.py @@ -43,7 +43,7 @@ } InlineMark = Union[GenericElement, "TrackChangeMark"] -RunChild = Union[GenericElement, "Control", "Table", "InlineObject", "TextSpan"] +RunChild = Union[GenericElement, "Control", "Table", "InlineObject", "TextSpan", "Tab"] ParagraphChild = Union["Run", GenericElement] @@ -105,6 +105,12 @@ class InlineObject: children: List[GenericElement] = field(default_factory=list) +@dataclass(slots=True) +class Tab: + tag: str + attributes: Dict[str, str] = field(default_factory=dict) + + @dataclass(slots=True) class Table: tag: str @@ -120,6 +126,7 @@ class Run: controls: List[Control] = field(default_factory=list) tables: List[Table] = field(default_factory=list) inline_objects: List[InlineObject] = field(default_factory=list) + tabs: List[Tab] = field(default_factory=list) text_spans: List[TextSpan] = field(default_factory=list) other_children: List[GenericElement] = field(default_factory=list) attributes: Dict[str, str] = field(default_factory=dict) @@ -227,6 +234,10 @@ def parse_table_element(node: etree._Element) -> Table: ) +def parse_tab_element(node: etree._Element) -> Tab: + return Tab(tag=node.tag, attributes={key: value for key, value in node.attrib.items()}) + + def parse_run_element(node: etree._Element) -> Run: attributes = {key: value for key, value in node.attrib.items()} char_pr_id_ref = parse_int(attributes.pop("charPrIDRef", None)) @@ -247,6 +258,10 @@ def parse_run_element(node: etree._Element) -> Run: span = parse_text_span(child) run.text_spans.append(span) run.content.append(span) + elif name == "tab": + tab = parse_tab_element(child) + run.tabs.append(tab) + run.content.append(tab) elif name == "tbl": table = parse_table_element(child) run.tables.append(table) @@ -342,6 +357,10 @@ def _text_span_to_xml(span: TextSpan) -> etree._Element: return node +def _tab_to_xml(tab: Tab) -> etree._Element: + return etree.Element(_qualified_tag(tab.tag, "tab"), dict(tab.attributes)) + + def _control_to_xml(control: Control) -> etree._Element: attrs = dict(control.attributes) if control.control_type is not None: @@ -376,6 +395,8 @@ def serialize_run(run: Run) -> etree._Element: node.append(_text_span_to_xml(child)) elif isinstance(child, Control): node.append(_control_to_xml(child)) + elif isinstance(child, Tab): + node.append(_tab_to_xml(child)) elif isinstance(child, Table): node.append(_table_to_xml(child)) elif isinstance(child, InlineObject): diff --git a/src/hwpx/oxml/document.py b/src/hwpx/oxml/document.py index 6164234..af2547f 100644 --- a/src/hwpx/oxml/document.py +++ b/src/hwpx/oxml/document.py @@ -121,6 +121,45 @@ def _memo_id() -> str: return str(uuid4().int & 0xFFFFFFFF) +def _refresh_copied_paragraph_subtree_ids(paragraph: ET.Element) -> None: + """Assign fresh local identifiers inside a copied paragraph subtree. + + This is intentionally narrow: it refreshes paragraph ids for the copied + paragraph and any nested paragraphs (for example inside table cells), plus + common object identifiers used by tables/shapes/notes. Reference-style + attributes such as ``borderFillIDRef`` are left untouched. + """ + + for node in paragraph.iter(): + if node.tag == f"{_HP}p": + node.set("id", _paragraph_id()) + continue + + if "id" in node.attrib and node.tag in { + f"{_HP}tbl", + f"{_HP}pic", + f"{_HP}container", + f"{_HP}ole", + f"{_HP}equation", + f"{_HP}textart", + f"{_HP}video", + f"{_HP}header", + f"{_HP}footer", + }: + node.set("id", _object_id()) + + if "instId" in node.attrib: + node.set("instId", _object_id()) + + +def _clone_paragraph_element(paragraph: ET.Element) -> ET.Element: + """Return a deep-copied paragraph element with refreshed local ids.""" + + cloned = deepcopy(paragraph) + _refresh_copied_paragraph_subtree_ids(cloned) + return cloned + + def _create_paragraph_element( text: str, *, @@ -154,9 +193,7 @@ def _create_paragraph_element( run = paragraph.makeelement(f"{_HP}run", run_attrs) paragraph.append(run) - text_element = run.makeelement(f"{_HP}t", {}) - run.append(text_element) - text_element.text = text + _append_text_with_tabs(run, text) return paragraph @@ -192,6 +229,20 @@ def _append_child( return child +def _is_tab_control_element(node: ET.Element) -> bool: + return node.tag == f"{_HP}ctrl" and (node.get("id") or "").lower() == "tab" + + +def _append_text_with_tabs(run: ET.Element, value: str) -> None: + segments = value.split("\t") + for index, segment in enumerate(segments): + text_element = run.makeelement(f"{_HP}t", {}) + text_element.text = _sanitize_text(segment) + run.append(text_element) + if index < len(segments) - 1: + run.append(run.makeelement(f"{_HP}tab", {})) + + def _normalize_length(value: str | None) -> str: if value is None: return "" @@ -2097,9 +2148,7 @@ def add_paragraph( run_attrs["charPrIDRef"] = "0" run = _append_child(paragraph, f"{_HP}run", run_attrs) - t = run.makeelement(f"{_HP}t", {}) - t.text = _sanitize_text(text) - run.append(t) + _append_text_with_tabs(run, text) self.table.mark_dirty() section = self.table.paragraph.section @@ -2762,9 +2811,13 @@ def runs(self) -> list[HwpxOxmlRun]: def text(self) -> str: """Return the concatenated textual content of this paragraph.""" texts: list[str] = [] - for text_element in self.element.findall(f".//{_HP}t"): - if text_element.text: - texts.append(text_element.text) + for run in self._run_elements(): + for child in run: + if child.tag == f"{_HP}t": + if child.text: + texts.append(child.text) + elif child.tag == f"{_HP}tab" or _is_tab_control_element(child): + texts.append("\t") return "".join(texts) @text.setter @@ -2780,10 +2833,10 @@ def text(self, value: str) -> None: # Identify first run — its charPrIDRef will be kept. first_run = self._ensure_run() - # Remove from ALL runs. + # Remove existing text/tab nodes from all runs. for run in runs: for child in list(run): - if child.tag == f"{_HP}t": + if child.tag == f"{_HP}t" or child.tag == f"{_HP}tab" or _is_tab_control_element(child): run.remove(child) # Remove non-first runs that are now empty (only had text). @@ -2794,10 +2847,8 @@ def text(self, value: str) -> None: if len(list(run)) == 0: self.element.remove(run) - # Write the new text into the first run. - text_element = first_run.makeelement(f"{_HP}t", {}) - text_element.text = _sanitize_text(value) - first_run.append(text_element) + # Write the new text into the first run, preserving tabs as . + _append_text_with_tabs(first_run, value) _clear_paragraph_layout_cache(self.element) self.section.mark_dirty() @@ -3668,14 +3719,43 @@ def add_paragraph( run = paragraph.makeelement(f"{_HP}run", run_attrs) paragraph.append(run) - text_element = run.makeelement(f"{_HP}t", {}) - text_element.text = text - run.append(text_element) + _append_text_with_tabs(run, text) self._element.append(paragraph) self._dirty = True return HwpxOxmlParagraph(paragraph, self) + def insert_paragraphs( + self, + index: int, + paragraphs: Sequence[HwpxOxmlParagraph | ET.Element], + ) -> list[HwpxOxmlParagraph]: + """Insert paragraph copies at *index* and return wrappers for them.""" + + existing = self.paragraphs + if index < 0 or index > len(existing): + raise IndexError(f"단락 인덱스 {index}이(가) 범위를 벗어났습니다 (총 {len(existing)}개)") + + inserted: list[HwpxOxmlParagraph] = [] + for offset, paragraph in enumerate(paragraphs): + source_element = paragraph.element if isinstance(paragraph, HwpxOxmlParagraph) else paragraph + cloned = _clone_paragraph_element(source_element) + self._element.insert(index + offset, cloned) + inserted.append(HwpxOxmlParagraph(cloned, self)) + + if inserted: + self._dirty = True + return inserted + + def copy_paragraph_range(self, start: int, end: int) -> list[ET.Element]: + """Return deep-copied paragraph elements for the inclusive range.""" + + paragraphs = self.paragraphs + total = len(paragraphs) + if start < 0 or end < 0 or start >= total or end >= total or start > end: + raise IndexError(f"문단 범위 {start}..{end}이(가) 유효하지 않습니다 (총 {total}개)") + return [_clone_paragraph_element(paragraphs[index].element) for index in range(start, end + 1)] + def mark_dirty(self) -> None: self._dirty = True @@ -4649,6 +4729,42 @@ def remove_paragraph( else: paragraph.remove() + def copy_paragraph_range( + self, + start: int, + end: int, + *, + section: HwpxOxmlSection | None = None, + section_index: int | None = None, + ) -> list[ET.Element]: + """Return deep-copied paragraph elements for an inclusive range.""" + + if section is None and section_index is not None: + section = self._sections[section_index] + if section is None: + if not self._sections: + raise ValueError("document does not contain any sections") + section = self._sections[-1] + return section.copy_paragraph_range(start, end) + + def insert_paragraphs( + self, + index: int, + paragraphs: Sequence[HwpxOxmlParagraph | ET.Element], + *, + section: HwpxOxmlSection | None = None, + section_index: int | None = None, + ) -> list[HwpxOxmlParagraph]: + """Insert copied paragraphs into the requested section.""" + + if section is None and section_index is not None: + section = self._sections[section_index] + if section is None: + if not self._sections: + raise ValueError("document does not contain any sections") + section = self._sections[-1] + return section.insert_paragraphs(index, paragraphs) + # ------------------------------------------------------------------ # Section management # ------------------------------------------------------------------ diff --git a/src/hwpx/tools/exporter.py b/src/hwpx/tools/exporter.py index f806337..31d70b2 100644 --- a/src/hwpx/tools/exporter.py +++ b/src/hwpx/tools/exporter.py @@ -8,7 +8,7 @@ import io import re -from typing import TYPE_CHECKING, Sequence +from typing import TYPE_CHECKING from xml.etree import ElementTree as ET from zipfile import ZipFile @@ -21,30 +21,19 @@ "export_markdown", ] -# --------------------------------------------------------------------------- -# Namespace helpers -# --------------------------------------------------------------------------- - _HP_NS = "http://www.hancom.co.kr/hwpml/2011/paragraph" _HP = f"{{{_HP_NS}}}" _SECTION_RE = re.compile(r"^Contents/section\d+\.xml$") -# --------------------------------------------------------------------------- -# Internal traversal helpers -# --------------------------------------------------------------------------- - - def _section_xmls(source: HwpxDocument | bytes) -> list[ET.Element]: """Return a list of section root elements from *source*.""" if isinstance(source, bytes): with ZipFile(io.BytesIO(source)) as zf: names = sorted(n for n in zf.namelist() if _SECTION_RE.match(n)) return [ET.fromstring(zf.read(n)) for n in names] - else: - # HwpxDocument – use the in-memory oxml tree - return [sec.element for sec in source._root.sections] + return [sec.element for sec in source._root.sections] def _iter_paragraphs(section: ET.Element) -> list[ET.Element]: @@ -52,157 +41,97 @@ def _iter_paragraphs(section: ET.Element) -> list[ET.Element]: return section.findall(f"{_HP}p") -def _paragraph_text(p: ET.Element) -> str: - """Extract concatenated text from a paragraph's direct runs only. +def _is_tab_control(child: ET.Element) -> bool: + return child.tag == f"{_HP}ctrl" and (child.get("id") or "").lower() == "tab" + - Text inside nested objects (tables, shapes, etc.) is excluded to - prevent duplication. - """ +def _paragraph_text(p: ET.Element, *, tab_token: str = "\t") -> str: + """Extract paragraph text from direct runs, preserving tab semantics.""" parts: list[str] = [] - # Only traverse direct children of the paragraph for run in p.findall(f"{_HP}run"): for child in run: if child.tag == f"{_HP}t": if child.text: parts.append(child.text) + elif child.tag == f"{_HP}tab" or _is_tab_control(child): + parts.append(tab_token) + elif child.tag == f"{_HP}lineBreak": + parts.append("\n") return "".join(parts) -def _is_table(el: ET.Element) -> bool: - return el.tag == f"{_HP}tbl" or el.tag.endswith("}tbl") - - -def _table_cells_text(tbl: ET.Element) -> list[list[str]]: +def _table_cells_text(tbl: ET.Element, *, tab_token: str = "\t") -> list[list[str]]: """Return a row-major 2D list of cell texts from a table element.""" rows: list[list[str]] = [] for tr in tbl.findall(f"{_HP}tr"): row: list[str] = [] for tc in tr.findall(f"{_HP}tc"): cell_parts: list[str] = [] - for t in tc.findall(f".//{_HP}t"): - if t.text: - cell_parts.append(t.text) - row.append("".join(cell_parts).strip()) + for paragraph in tc.findall(f".//{_HP}p"): + text = _paragraph_text(paragraph, tab_token=tab_token) + if text: + cell_parts.append(text) + row.append("\n".join(cell_parts).strip()) rows.append(row) return rows def _find_tables(p: ET.Element) -> list[ET.Element]: - """Find all ```` elements inside a paragraph's runs.""" return p.findall(f".//{_HP}tbl") -# --------------------------------------------------------------------------- -# Plain-text exporter -# --------------------------------------------------------------------------- - - -def export_text( - source: HwpxDocument | bytes, - *, - paragraph_separator: str = "\n", - section_separator: str = "\n\n", - include_tables: bool = True, -) -> str: - """Export document content as plain text. - - Args: - source: An :class:`~hwpx.document.HwpxDocument` or HWPX archive - bytes. - paragraph_separator: String inserted between paragraphs. - section_separator: String inserted between sections. - include_tables: Whether to include table cell text inline. - - Returns: - The full document text as a single string. - """ +def export_text(source: HwpxDocument | bytes, *, paragraph_separator: str = "\n", section_separator: str = "\n\n", include_tables: bool = True, tab_token: str = "\t") -> str: + """Export document content as plain text.""" sections = _section_xmls(source) section_texts: list[str] = [] - for section_root in sections: - paragraphs = _iter_paragraphs(section_root) para_texts: list[str] = [] - for p in paragraphs: - text = _paragraph_text(p) + for p in _iter_paragraphs(section_root): + text = _paragraph_text(p, tab_token=tab_token) if text: para_texts.append(text) if include_tables: for tbl in _find_tables(p): - rows = _table_cells_text(tbl) + rows = _table_cells_text(tbl, tab_token=tab_token) for row in rows: - para_texts.append("\t".join(row)) + para_texts.append(tab_token.join(row)) section_texts.append(paragraph_separator.join(para_texts)) - return section_separator.join(section_texts) -# --------------------------------------------------------------------------- -# HTML exporter -# --------------------------------------------------------------------------- - - def _escape_html(text: str) -> str: - """Escape HTML special characters.""" - return ( - text.replace("&", "&") - .replace("<", "<") - .replace(">", ">") - .replace('"', """) - ) + return text.replace("&", "&").replace("<", "<").replace(">", ">").replace('"', """) -def export_html( - source: HwpxDocument | bytes, - *, - include_tables: bool = True, - full_document: bool = True, - title: str = "HWPX Document", -) -> str: - """Export document content as HTML. - - Args: - source: An :class:`~hwpx.document.HwpxDocument` or HWPX archive bytes. - include_tables: Whether to render tables as ```` elements. - full_document: Wrap output in a complete HTML5 document structure. - title: Title for the ```` element when *full_document* is True. - - Returns: - An HTML string. - """ +def export_html(source: HwpxDocument | bytes, *, include_tables: bool = True, full_document: bool = True, title: str = "HWPX Document", tab_token: str = "\t") -> str: + """Export document content as HTML.""" sections = _section_xmls(source) body_parts: list[str] = [] - for sec_idx, section_root in enumerate(sections): if sec_idx > 0: body_parts.append("<hr />") - paragraphs = _iter_paragraphs(section_root) - for p in paragraphs: - text = _paragraph_text(p) + for p in _iter_paragraphs(section_root): + text = _paragraph_text(p, tab_token=tab_token) if text: body_parts.append(f"<p>{_escape_html(text)}</p>") - if include_tables: for tbl in _find_tables(p): - rows = _table_cells_text(tbl) + rows = _table_cells_text(tbl, tab_token=tab_token) if rows: - body_parts.append("<table border=\"1\">") + body_parts.append('<table border="1">') for row in rows: body_parts.append(" <tr>") for cell in row: - body_parts.append( - f" <td>{_escape_html(cell)}</td>" - ) + body_parts.append(f" <td>{_escape_html(cell)}</td>") body_parts.append(" </tr>") body_parts.append("</table>") - body = "\n".join(body_parts) - if full_document: return ( "<!DOCTYPE html>\n" - "<html lang=\"ko\">\n" + '<html lang="ko">\n' "<head>\n" - " <meta charset=\"utf-8\" />\n" + ' <meta charset="utf-8" />\n' f" <title>{_escape_html(title)}\n" "\n" "\n" @@ -213,60 +142,27 @@ def export_html( return body -# --------------------------------------------------------------------------- -# Markdown exporter -# --------------------------------------------------------------------------- - - -def export_markdown( - source: HwpxDocument | bytes, - *, - include_tables: bool = True, - section_separator: str = "\n---\n\n", -) -> str: - """Export document content as Markdown. - - Args: - source: An :class:`~hwpx.document.HwpxDocument` or HWPX archive bytes. - include_tables: Whether to render tables as Markdown tables. - section_separator: String inserted between sections. - - Returns: - A Markdown formatted string. - """ +def export_markdown(source: HwpxDocument | bytes, *, include_tables: bool = True, section_separator: str = "\n---\n\n", tab_token: str = "\t") -> str: + """Export document content as Markdown.""" sections = _section_xmls(source) section_parts: list[str] = [] - for section_root in sections: - paragraphs = _iter_paragraphs(section_root) lines: list[str] = [] - for p in paragraphs: - text = _paragraph_text(p) + for p in _iter_paragraphs(section_root): + text = _paragraph_text(p, tab_token=tab_token) if text: lines.append(text) - lines.append("") # blank line between paragraphs - + lines.append("") if include_tables: for tbl in _find_tables(p): - rows = _table_cells_text(tbl) + rows = _table_cells_text(tbl, tab_token=tab_token) if rows: - # Header row header = rows[0] - lines.append( - "| " + " | ".join(header) + " |" - ) - lines.append( - "| " + " | ".join("---" for _ in header) + " |" - ) - # Data rows + lines.append("| " + " | ".join(header) + " |") + lines.append("| " + " | ".join("---" for _ in header) + " |") for row in rows[1:]: - # Pad if row is shorter than header padded = row + [""] * max(0, len(header) - len(row)) - lines.append( - "| " + " | ".join(padded[: len(header)]) + " |" - ) + lines.append("| " + " | ".join(padded[: len(header)]) + " |") lines.append("") - section_parts.append("\n".join(lines).rstrip()) - return section_separator.join(section_parts) diff --git a/src/hwpx/tools/text_extractor.py b/src/hwpx/tools/text_extractor.py index 723cfbb..8034361 100644 --- a/src/hwpx/tools/text_extractor.py +++ b/src/hwpx/tools/text_extractor.py @@ -269,6 +269,9 @@ def iter_document_paragraphs( # ------------------------------------------------------------------ # Text helpers # ------------------------------------------------------------------ + def _is_tab_control(self, element: ET.Element) -> bool: + return strip_namespace(element.tag) == "ctrl" and (element.get("id") or "").lower() == "tab" + def paragraph_text( self, paragraph: ET.Element, @@ -289,7 +292,7 @@ def paragraph_text( elif tag == "lineBreak": if preserve_breaks: fragments.append("\n") - elif tag == "tab": + elif tag == "tab" or self._is_tab_control(child): fragments.append("\t" if preserve_breaks else " ") elif tag in {"footNote", "endNote"}: self._handle_note( diff --git a/tests/test_document_formatting.py b/tests/test_document_formatting.py index af0f902..4f410c3 100644 --- a/tests/test_document_formatting.py +++ b/tests/test_document_formatting.py @@ -119,10 +119,9 @@ def _set_paragraph_text_and_get_xml_text(value: str) -> str: _set_header_text_and_get_xml_text, _set_run_text_and_get_xml_text, _set_table_cell_text_and_get_xml_text, - _set_paragraph_text_and_get_xml_text, ) -_TEXT_SETTER_IDS = ("header_footer", "run", "table_cell", "paragraph") +_TEXT_SETTER_IDS = ("header_footer", "run", "table_cell") _TEXT_SANITIZATION_CASES: tuple[tuple[str, str], ...] = ( ("a\tb", "ab"), @@ -133,6 +132,18 @@ def _set_paragraph_text_and_get_xml_text(value: str) -> str: ) +def test_paragraph_text_setter_serializes_tabs_as_elements() -> None: + _, paragraph = _build_section_with_paragraph() + + paragraph.text = "left right" + + run = paragraph.element.find(f"{HP}run") + assert run is not None + children = list(run) + assert [child.tag for child in children] == [f"{HP}t", f"{HP}tab", f"{HP}t"] + assert paragraph.text == "left right" + + @pytest.mark.parametrize("apply_setter", _TEXT_SETTER_APPLIERS, ids=_TEXT_SETTER_IDS) @pytest.mark.parametrize(("raw_text", "expected"), _TEXT_SANITIZATION_CASES) def test_text_setters_sanitize_illegal_xml_characters( diff --git a/tests/test_hp_tab_support.py b/tests/test_hp_tab_support.py new file mode 100644 index 0000000..f02fa5a --- /dev/null +++ b/tests/test_hp_tab_support.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +import io +from zipfile import ZipFile + +from hwpx.document import HwpxDocument +from hwpx.oxml.parser import parse_section_xml +from hwpx.tools.exporter import export_html, export_markdown, export_text +from hwpx.tools.text_extractor import TextExtractor + + +_HP_NS = "http://www.hancom.co.kr/hwpml/2011/paragraph" +_HS_NS = "http://www.hancom.co.kr/hwpml/2011/section" + + +def test_run_model_preserves_hp_tab_in_content_order() -> None: + xml = ( + "" + "leftright" + "" + ) + + section = parse_section_xml(xml) + run = section.paragraphs[0].runs[0] + + assert [type(child).__name__ for child in run.content] == ["TextSpan", "Tab", "TextSpan"] + assert len(run.tabs) == 1 + + +def test_exporters_and_extractor_render_hp_tab_text() -> None: + xml = ( + "" + "" + " leftright" + " ctrlpath" + "" + ) + buffer = io.BytesIO() + with ZipFile(buffer, "w") as archive: + archive.writestr("Contents/section0.xml", xml) + payload = buffer.getvalue() + + assert export_text(payload) == "left\tright\nctrl\tpath" + assert "

left\tright

" in export_html(payload, full_document=False) + assert "left\tright" in export_markdown(payload) + + with TextExtractor(io.BytesIO(payload)) as extractor: + paragraphs = list(extractor.iter_document_paragraphs(include_nested=False)) + assert paragraphs[0].text() == "left\tright" + assert paragraphs[1].text() == "ctrl\tpath" + + +def test_document_paragraph_roundtrip_preserves_tab_semantics() -> None: + doc = HwpxDocument.new() + paragraph = doc.add_paragraph("left\tright") + + assert paragraph.text == "left\tright" + + saved = doc.to_bytes() + reopened = HwpxDocument.open(io.BytesIO(saved)) + assert reopened.sections[0].paragraphs[-1].text == "left\tright" + assert reopened.export_text().endswith("left\tright")