From 5862f0b0769181d52e902744eae8e6bfc4f4fa33 Mon Sep 17 00:00:00 2001 From: airmang Date: Tue, 10 Mar 2026 10:06:52 +0900 Subject: [PATCH 1/2] chore: align license metadata --- CHANGELOG.md | 6 +++ DevDoc/license-alignment-audit.md | 41 +++++++++++++++ DevDoc/license-metadata-policy.md | 28 +++++++++++ README.md | 10 ++-- pyproject.toml | 9 ++-- tests/test_packaging_license_metadata.py | 63 ++++++++++++++++++++++++ 6 files changed, 149 insertions(+), 8 deletions(-) create mode 100644 DevDoc/license-alignment-audit.md create mode 100644 DevDoc/license-metadata-policy.md create mode 100644 tests/test_packaging_license_metadata.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e86e568..a4d169c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ 모든 중요한 변경 사항은 이 문서에 기록됩니다. 형식은 [Keep a Changelog](https://keepachangelog.com/ko/1.1.0/)과 [Semantic Versioning](https://semver.org/lang/ko/)을 따릅니다. +## [2.8.3] - 2026-03-10 +### 변경 +- 저장소와 배포 메타데이터의 라이선스 표기를 실제 `LICENSE` 파일과 일치하도록 정렬했습니다. +- `pyproject.toml`을 PEP 639 방식의 `LicenseRef-python-hwpx-NonCommercial` + `license-files` 구성으로 갱신하고, 잘못된 MIT 분류자를 제거했습니다. +- README 라이선스 배지/섹션을 커스텀 비상업적 라이선스 기준으로 수정하고, wheel/sdist 산출물의 라이선스 메타데이터를 검증하는 회귀 테스트를 추가했습니다. + ## [2.8.2] - 2026-03-08 ### 변경 - README를 현재 공개 API와 CLI 범위에 맞춰 정리했습니다. Quick start, 텍스트 추출, 객체 검색 예시를 실제 호출 방식 기준으로 수정했습니다. diff --git a/DevDoc/license-alignment-audit.md b/DevDoc/license-alignment-audit.md new file mode 100644 index 0000000..bded63a --- /dev/null +++ b/DevDoc/license-alignment-audit.md @@ -0,0 +1,41 @@ +# License Alignment Audit + +Date: 2026-03-10 + +## Files inspected + +- `LICENSE` +- `README.md` +- `pyproject.toml` +- `docs/conf.py` +- `CONTRIBUTING.md` +- `.github/workflows/release.yml` +- `.github/workflows/tests.yml` +- `scripts/build-and-publish.sh` +- `tests/test_packaging_py_typed.py` +- Repo-wide searches across `docs/`, `.github/`, `DevDoc/`, `CHANGELOG.md`, and the repository root for license-related metadata and MIT references + +## Contradictions found before this change + +- `LICENSE` defined a custom non-commercial license and named `python-hwpx Maintainers` as the copyright holder. +- `README.md` showed an MIT badge and an MIT license section, which contradicted the actual license text. +- `README.md` attributed the license line to `고규현 (Kyuhyun Koh)`, while the `LICENSE` file and package metadata used `python-hwpx Maintainers`. +- `pyproject.toml` used the legacy `license = { file = "LICENSE" }` form and also published the classifier `License :: OSI Approved :: MIT License`, which falsely represented the distribution as MIT-licensed. + +## Source of truth + +- The repository root `LICENSE` file is the source of truth for license terms. +- This audit treats the project as remaining under its existing custom non-commercial license. No evidence of an intentional relicensing to MIT was found elsewhere in the repository. + +## Decision summary + +- Preserve the current non-commercial custom license. +- Align public-facing metadata and README wording to that license. +- Use modern packaging metadata that points built distributions back to the root `LICENSE` file without inventing an OSI identifier. +- Remove conflicting MIT wording and the MIT trove classifier rather than replacing it with another potentially ambiguous license classifier. + +## Notes on surfaces inspected + +- `docs/conf.py` already used `python-hwpx Maintainers` and did not restate MIT licensing. +- No GitHub Pages or docs markdown pages were found to restate the project license. +- The release workflow already builds distributions and runs `twine check`, so it was left in place and used for verification after the metadata update. diff --git a/DevDoc/license-metadata-policy.md b/DevDoc/license-metadata-policy.md new file mode 100644 index 0000000..0b46573 --- /dev/null +++ b/DevDoc/license-metadata-policy.md @@ -0,0 +1,28 @@ +# License Metadata Policy + +## Source of truth + +- The root `LICENSE` file defines the project's license terms. +- Metadata changes must reflect the current `LICENSE` text. Do not treat README text, badges, or historical PyPI metadata as authoritative. + +## Packaging rule + +- `pyproject.toml` must represent the current custom license with `project.license = "LicenseRef-python-hwpx-NonCommercial"`. +- `pyproject.toml` must list `project.license-files = ["LICENSE"]` so both `sdist` and `wheel` carry the license file. +- Keep the build backend compatible with that metadata format by requiring `setuptools>=77.0.0`. + +## Classifier rule + +- Do not add `License ::` trove classifiers for this project unless the `LICENSE` file changes to a classifier-backed license and the classifier is verified to be accurate. +- For the current custom non-commercial license, leaving license classifiers unset is less ambiguous than picking an approximate classifier. + +## README rule + +- The README badge and license section must describe the project as using a custom non-commercial license and link to `LICENSE`. +- If contact information is updated, keep it distinct from the copyright/licensing line unless the `LICENSE` file is updated too. + +## Verification rule + +- Before release or after touching license metadata, run `python -m build` and `twine check dist/*`. +- Inspect built `PKG-INFO` and wheel `METADATA` for `License-Expression: LicenseRef-python-hwpx-NonCommercial` and `License-File: LICENSE`. +- Confirm the wheel contains `.dist-info/licenses/LICENSE` and the sdist contains the root `LICENSE` file. diff --git a/README.md b/README.md index 9153e84..b90bd97 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@

PyPI Python - License + License: Custom Non-Commercial Docs

@@ -286,13 +286,15 @@ pytest ## License -[MIT](LICENSE) © 고규현 (Kyuhyun Koh) +[Custom Non-Commercial License](LICENSE) © python-hwpx Maintainers + +Commercial use requires separate permission from the copyright holders.
-## Author +## Maintainer -**고규현** — 광교고등학교 정보·컴퓨터 교사 +Primary maintainer/contact: **고규현** — 광교고등학교 정보·컴퓨터 교사 - ✉️ [kokyuhyun@hotmail.com](mailto:kokyuhyun@hotmail.com) - 🐙 [@airmang](https://github.com/airmang) diff --git a/pyproject.toml b/pyproject.toml index 844c0b6..85ed3d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,14 @@ [build-system] -requires = ["setuptools", "wheel"] +requires = ["setuptools>=77.0.0", "wheel"] build-backend = "setuptools.build_meta" [project] name = "python-hwpx" -version = "2.8.2" +version = "2.8.3" description = "Hancom HWPX 패키지를 로드하고 편집하기 위한 Python 유틸리티 모음" readme = { file = "README.md", content-type = "text/markdown" } -license = { file = "LICENSE" } +license = "LicenseRef-python-hwpx-NonCommercial" +license-files = ["LICENSE"] requires-python = ">=3.10" authors = [ { name = "python-hwpx Maintainers" }, @@ -16,7 +17,6 @@ keywords = ["hwp", "hwpx", "hancom", "opc", "xml"] classifiers = [ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", - "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -35,6 +35,7 @@ dev = [ "pytest>=7.4", ] test = [ + "build>=1.0", "pytest>=7.4", "pytest-cov>=5.0", ] diff --git a/tests/test_packaging_license_metadata.py b/tests/test_packaging_license_metadata.py new file mode 100644 index 0000000..9f7ab19 --- /dev/null +++ b/tests/test_packaging_license_metadata.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +import subprocess +import sys +import tarfile +from pathlib import Path +from zipfile import ZipFile + +import pytest + + +LICENSE_EXPRESSION = "LicenseRef-python-hwpx-NonCommercial" + + +def _build_distribution(tmp_path: Path, distribution: str) -> Path: + pytest.importorskip("build") + + project_root = Path(__file__).resolve().parents[1] + build_args = [ + sys.executable, + "-m", + "build", + f"--{distribution}", + "--outdir", + str(tmp_path), + ] + subprocess.run(build_args, cwd=project_root, check=True) + + pattern = "*.whl" if distribution == "wheel" else "*.tar.gz" + return next(tmp_path.glob(pattern)) + + +@pytest.mark.parametrize("distribution", ["wheel", "sdist"]) +def test_built_distributions_expose_custom_license_metadata( + tmp_path: Path, distribution: str +) -> None: + artifact = _build_distribution(tmp_path, distribution) + + if distribution == "wheel": + with ZipFile(artifact) as wheel_archive: + members = set(wheel_archive.namelist()) + metadata_name = next( + name for name in members if name.endswith(".dist-info/METADATA") + ) + metadata = wheel_archive.read(metadata_name).decode("utf-8") + + assert f"License-Expression: {LICENSE_EXPRESSION}" in metadata + assert "License-File: LICENSE" in metadata + assert "Classifier: License ::" not in metadata + assert any(name.endswith(".dist-info/licenses/LICENSE") for name in members) + return + + with tarfile.open(artifact, "r:gz") as sdist_archive: + members = sdist_archive.getnames() + pkg_info_name = next(name for name in members if name.endswith("/PKG-INFO")) + pkg_info_member = sdist_archive.extractfile(pkg_info_name) + assert pkg_info_member is not None + metadata = pkg_info_member.read().decode("utf-8") + + assert f"License-Expression: {LICENSE_EXPRESSION}" in metadata + assert "License-File: LICENSE" in metadata + assert "Classifier: License ::" not in metadata + assert any(name.endswith("/LICENSE") for name in members) From 3e20156059a3562eb54001649162ae7c2eac4255 Mon Sep 17 00:00:00 2001 From: airmang Date: Thu, 2 Apr 2026 12:32:43 +0900 Subject: [PATCH 2/2] feat: add table navigation helpers --- CHANGELOG.md | 9 + README.md | 12 + docs/api_reference.md | 6 + pyproject.toml | 2 +- src/hwpx/__init__.py | 6 +- src/hwpx/document.py | 33 ++- src/hwpx/tools/__init__.py | 24 ++ src/hwpx/tools/table_navigation.py | 457 +++++++++++++++++++++++++++++ tests/test_table_navigation.py | 183 ++++++++++++ 9 files changed, 729 insertions(+), 3 deletions(-) create mode 100644 src/hwpx/tools/table_navigation.py create mode 100644 tests/test_table_navigation.py diff --git a/CHANGELOG.md b/CHANGELOG.md index a4d169c..24e35fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,15 @@ 모든 중요한 변경 사항은 이 문서에 기록됩니다. 형식은 [Keep a Changelog](https://keepachangelog.com/ko/1.1.0/)과 [Semantic Versioning](https://semver.org/lang/ko/)을 따릅니다. +## [2.9.0] - 2026-04-02 +### 추가 +- `HwpxDocument.get_table_map()`, `find_cell_by_label()`, `fill_by_path()`를 추가해 HWPX 양식/템플릿 표를 문서 순서 기반으로 탐색하고 채울 수 있게 했습니다. +- `hwpx.tools.table_navigation` 모듈을 추가해 엔진 레벨에서 재사용 가능한 표 탐색, 라벨 정규화, 방향 이동, 배치 채우기 helper를 공개했습니다. + +### 변경 +- 라벨 매칭이 공백 축약, 대소문자 무시, 후행 콜론 허용 규칙을 따르도록 정규화 로직을 추가했습니다. +- 표 자동화 API에 대한 회귀 테스트와 README/API 레퍼런스 문서를 추가했습니다. + ## [2.8.3] - 2026-03-10 ### 변경 - 저장소와 배포 메타데이터의 라이선스 표기를 실제 `LICENSE` 파일과 일치하도록 정렬했습니다. diff --git a/README.md b/README.md index b90bd97..d110c41 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,7 @@ doc.save_to_path("결과물.hwpx") | 📝 **단락** | 추가/삭제/편집/서식 | 텍스트 설정, 단락 삭제(`remove_paragraph`), 스타일 참조 | | ✏️ **Run** | 텍스트 조각 | 추가, 교체, 볼드/이탤릭/밑줄/색상 서식 | | 📊 **표(Table)** | 생성/편집/병합 | N×M 표 생성, 셀 텍스트, 셀 병합/분할, 중첩 테이블 | +| 🧭 **표 자동화** | 탐색/채우기 | 테이블 맵, 라벨 기반 셀 탐색, 경로 기반 배치 채우기 | | 📑 **섹션** | 추가/삭제 | `add_section(after=)`, `remove_section()`, manifest 자동 관리 | | 🖼️ **이미지** | 임베드/삭제 | 바이너리 데이터 관리, manifest 자동 등록 | | ✏️ **도형** | 선/사각형/타원 | OWPML 명세 준수 도형 삽입 | @@ -126,6 +127,17 @@ doc.set_footer_text("1 / 10", page_type="BOTH") # 표 셀 병합·분할 table.merge_cells(0, 0, 1, 1) # (0,0)~(1,1) 병합 table.set_cell_text(0, 0, "병합된 셀", logical=True, split_merged=True) + +# 양식형 표 자동 채우기 +form = doc.add_table(2, 2) +form.cell(0, 0).text = "성명:" +form.cell(1, 0).text = "소속" + +doc.find_cell_by_label("성명") # {"matches": [...], "count": 1} +doc.fill_by_path({ + "성명 > right": "홍길동", + "소속 > right": "플랫폼팀", +}) ``` ### 🔍 텍스트 추출 & 검색 diff --git a/docs/api_reference.md b/docs/api_reference.md index 6bd6d31..0924f2a 100644 --- a/docs/api_reference.md +++ b/docs/api_reference.md @@ -128,6 +128,12 @@ - 섹션을 삭제합니다. 인스턴스 또는 인덱스를 받습니다. 마지막 섹션 삭제 시 `ValueError`가 발생합니다. - `add_table(rows, cols, ...) -> HwpxOxmlTable` - 단락을 삽입하고 그 안에 표 인라인 객체를 생성한 후, 표 래퍼를 반환합니다. `border_fill_id_ref`를 생략하면 헤더 참조 목록에 기본 실선 `borderFill`을 생성하고 표와 셀에 자동으로 연결합니다. +- `get_table_map() -> dict` + - 문서 순서대로 표를 스캔하고 `table_index`, `paragraph_index`, 행·열 수, 추정 헤더 텍스트, 첫 행 미리보기, 빈 표 여부를 반환합니다. +- `find_cell_by_label(label_text, direction="right") -> dict` + - 모든 표를 순회하며 라벨 셀을 찾고, `right`/`down` 방향으로 인접한 타깃 셀 정보를 모두 반환합니다. 라벨 매칭은 공백·대소문자·후행 콜론을 정규화합니다. +- `fill_by_path(mappings) -> dict` + - `"라벨 > 방향 > 방향"` 형식의 경로를 해석해 셀 값을 일괄 기록합니다. 라벨 미발견, 다중 후보, 범위 초과는 개별 실패 항목으로 보고하고 나머지 매핑은 계속 처리합니다. - `add_shape(shape_type, ...) -> HwpxOxmlInlineObject` - 새 단락에 태그 이름을 사용하여 인라인 그리기 요소를 삽입합니다. - `add_control(...) -> HwpxOxmlInlineObject` diff --git a/pyproject.toml b/pyproject.toml index 85ed3d5..f3bb140 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "python-hwpx" -version = "2.8.3" +version = "2.9.0" description = "Hancom HWPX 패키지를 로드하고 편집하기 위한 Python 유틸리티 모음" readme = { file = "README.md", content-type = "text/markdown" } license = "LicenseRef-python-hwpx-NonCommercial" diff --git a/src/hwpx/__init__.py b/src/hwpx/__init__.py index 0a12491..593d43f 100644 --- a/src/hwpx/__init__.py +++ b/src/hwpx/__init__.py @@ -10,8 +10,12 @@ def _resolve_version() -> str: except PackageNotFoundError: return "0+unknown" +def __getattr__(name: str) -> object: + """Resolve dynamic module attributes.""" -__version__ = _resolve_version() + if name == "__version__": + return _resolve_version() + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") from .tools.text_extractor import ( DEFAULT_NAMESPACES, diff --git a/src/hwpx/document.py b/src/hwpx/document.py index 99138f8..38701af 100644 --- a/src/hwpx/document.py +++ b/src/hwpx/document.py @@ -10,7 +10,7 @@ import uuid from os import PathLike -from typing import Any, BinaryIO, Iterator, Sequence, overload +from typing import TYPE_CHECKING, Any, BinaryIO, Iterator, Mapping, Sequence, overload from lxml import etree @@ -53,6 +53,9 @@ logger = logging.getLogger(__name__) +if TYPE_CHECKING: + from .tools.table_navigation import TableFillResult, TableLabelSearchResult, TableMapResult + def _append_element( parent: Any, @@ -741,6 +744,34 @@ def add_table( char_pr_id_ref=char_pr_id_ref, ) + def get_table_map(self) -> TableMapResult: + """Return compact metadata for every table in document order.""" + + from .tools.table_navigation import get_table_map + + return get_table_map(self) + + def find_cell_by_label( + self, + label_text: str, + direction: str = "right", + ) -> TableLabelSearchResult: + """Return every label/target cell pair that matches *label_text*.""" + + from .tools.table_navigation import find_cell_by_label + + return find_cell_by_label(self, label_text, direction=direction) + + def fill_by_path( + self, + mappings: Mapping[str, str], + ) -> TableFillResult: + """Fill table cells using ``label > direction > ...`` navigation paths.""" + + from .tools.table_navigation import fill_by_path + + return fill_by_path(self, mappings) + def add_shape( self, shape_type: str, diff --git a/src/hwpx/tools/__init__.py b/src/hwpx/tools/__init__.py index 40880db..80c329c 100644 --- a/src/hwpx/tools/__init__.py +++ b/src/hwpx/tools/__init__.py @@ -25,6 +25,19 @@ describe_element_path, strip_namespace, ) +from .table_navigation import ( + TableCellReference, + TableFillApplied, + TableFillFailed, + TableFillResult, + TableLabelMatch, + TableLabelSearchResult, + TableMapEntry, + TableMapResult, + fill_by_path, + find_cell_by_label, + get_table_map, +) from .validator import ( DocumentSchemas, ValidationIssue, @@ -41,6 +54,17 @@ "build_parent_map", "describe_element_path", "strip_namespace", + "TableCellReference", + "TableFillApplied", + "TableFillFailed", + "TableFillResult", + "TableLabelMatch", + "TableLabelSearchResult", + "TableMapEntry", + "TableMapResult", + "fill_by_path", + "find_cell_by_label", + "get_table_map", "FoundElement", "ObjectFinder", "PackageValidationIssue", diff --git a/src/hwpx/tools/table_navigation.py b/src/hwpx/tools/table_navigation.py new file mode 100644 index 0000000..681d4d9 --- /dev/null +++ b/src/hwpx/tools/table_navigation.py @@ -0,0 +1,457 @@ +"""Reusable helpers for HWPX table discovery and form-like navigation.""" + +from __future__ import annotations + +from dataclasses import dataclass +import re +from typing import TYPE_CHECKING, Literal, Mapping, TypedDict + +from ..oxml import HwpxOxmlParagraph, HwpxOxmlTable + +if TYPE_CHECKING: + from ..document import HwpxDocument + +__all__ = [ + "SearchDirection", + "PathDirection", + "TableCellReference", + "TableFillApplied", + "TableFillFailed", + "TableFillResult", + "TableLabelMatch", + "TableLabelSearchResult", + "TableMapEntry", + "TableMapResult", + "fill_by_path", + "find_cell_by_label", + "get_table_map", +] + +_HP_NS = "http://www.hancom.co.kr/hwpml/2011/paragraph" +_HP = f"{{{_HP_NS}}}" +_WHITESPACE_RE = re.compile(r"\s+") + +SearchDirection = Literal["right", "down"] +PathDirection = Literal["left", "right", "up", "down"] + + +class TableMapEntry(TypedDict): + """Compact metadata describing a table in document order.""" + + table_index: int + paragraph_index: int + rows: int + cols: int + header_text: str + first_row_preview: list[str] + is_empty: bool + + +class TableMapResult(TypedDict): + """Collection of table metadata entries.""" + + tables: list[TableMapEntry] + + +class TableCellReference(TypedDict): + """A logical table cell position and its current text.""" + + row: int + col: int + text: str + + +class TableLabelMatch(TypedDict): + """A label cell and the cell reached from it.""" + + table_index: int + label_cell: TableCellReference + target_cell: TableCellReference + + +class TableLabelSearchResult(TypedDict): + """Result payload returned by :func:`find_cell_by_label`.""" + + matches: list[TableLabelMatch] + count: int + + +class TableFillApplied(TypedDict): + """A successfully applied path-based fill operation.""" + + path: str + table_index: int + row: int + col: int + value: str + + +class TableFillFailed(TypedDict): + """A failed path-based fill operation and its reason.""" + + path: str + reason: str + + +class TableFillResult(TypedDict): + """Batch fill summary for :func:`fill_by_path`.""" + + applied: list[TableFillApplied] + failed: list[TableFillFailed] + applied_count: int + failed_count: int + + +@dataclass(frozen=True, slots=True) +class _AnchoredTable: + table: HwpxOxmlTable + paragraph_index: int + header_text: str + + +@dataclass(frozen=True, slots=True) +class _IndexedTable: + table_index: int + table: HwpxOxmlTable + paragraph_index: int + header_text: str + + +@dataclass(frozen=True, slots=True) +class _LabelCandidate: + table_index: int + table: HwpxOxmlTable + row: int + col: int + text: str + + +def _collapse_whitespace(value: str) -> str: + return _WHITESPACE_RE.sub(" ", value).strip() + + +def _normalize_label_text(value: str) -> str: + normalized = _collapse_whitespace(value).casefold() + while normalized.endswith((":", ":")): + normalized = normalized[:-1].rstrip() + return normalized + + +def _direct_paragraph_text(paragraph: HwpxOxmlParagraph) -> str: + parts: list[str] = [] + for run in paragraph.element.findall(f"{_HP}run"): + for child in run: + if child.tag == f"{_HP}t" and child.text: + parts.append(child.text) + return _collapse_whitespace("".join(parts)) + + +def _collect_tables_from_table( + table: HwpxOxmlTable, + *, + anchor_paragraph_index: int, + inherited_header_text: str, + sink: list[_AnchoredTable], +) -> str: + last_header_text = inherited_header_text + for row in table.rows: + for cell in row.cells: + for paragraph in cell.paragraphs: + last_header_text = _collect_tables_from_paragraph( + paragraph, + anchor_paragraph_index=anchor_paragraph_index, + inherited_header_text=last_header_text, + sink=sink, + ) + return last_header_text + + +def _collect_tables_from_paragraph( + paragraph: HwpxOxmlParagraph, + *, + anchor_paragraph_index: int, + inherited_header_text: str, + sink: list[_AnchoredTable], +) -> str: + paragraph_text_parts: list[str] = [] + last_header_text = inherited_header_text + + for run in paragraph.element.findall(f"{_HP}run"): + for child in run: + if child.tag == f"{_HP}t": + if child.text: + paragraph_text_parts.append(child.text) + continue + if child.tag != f"{_HP}tbl": + continue + + paragraph_prefix_text = _collapse_whitespace("".join(paragraph_text_parts)) + header_text = paragraph_prefix_text or last_header_text + table = HwpxOxmlTable(child, paragraph) + sink.append( + _AnchoredTable( + table=table, + paragraph_index=anchor_paragraph_index, + header_text=header_text, + ) + ) + last_header_text = _collect_tables_from_table( + table, + anchor_paragraph_index=anchor_paragraph_index, + inherited_header_text=header_text, + sink=sink, + ) + + paragraph_text = _collapse_whitespace("".join(paragraph_text_parts)) + return paragraph_text or last_header_text + + +def _collect_document_tables(document: HwpxDocument) -> list[_IndexedTable]: + anchored_tables: list[_AnchoredTable] = [] + last_top_level_text = "" + + for paragraph_index, paragraph in enumerate(document.paragraphs): + _collect_tables_from_paragraph( + paragraph, + anchor_paragraph_index=paragraph_index, + inherited_header_text=last_top_level_text, + sink=anchored_tables, + ) + paragraph_text = _direct_paragraph_text(paragraph) + if paragraph_text: + last_top_level_text = paragraph_text + + return [ + _IndexedTable( + table_index=table_index, + table=item.table, + paragraph_index=item.paragraph_index, + header_text=item.header_text, + ) + for table_index, item in enumerate(anchored_tables) + ] + + +def _cell_text(table: HwpxOxmlTable, row_index: int, col_index: int) -> str: + return table.cell(row_index, col_index).text + + +def _table_is_empty(table: HwpxOxmlTable) -> bool: + for row_index in range(table.row_count): + for col_index in range(table.column_count): + if _cell_text(table, row_index, col_index).strip(): + return False + return True + + +def _first_row_preview(table: HwpxOxmlTable) -> list[str]: + if table.row_count == 0: + return [] + return [_cell_text(table, 0, col_index) for col_index in range(table.column_count)] + + +def _direction_delta(direction: PathDirection) -> tuple[int, int]: + if direction == "right": + return (0, 1) + if direction == "left": + return (0, -1) + if direction == "down": + return (1, 0) + return (-1, 0) + + +def _move( + table: HwpxOxmlTable, + row_index: int, + col_index: int, + direction: PathDirection, +) -> tuple[int, int] | None: + row_delta, col_delta = _direction_delta(direction) + target_row = row_index + row_delta + target_col = col_index + col_delta + if target_row < 0 or target_col < 0: + return None + if target_row >= table.row_count or target_col >= table.column_count: + return None + return (target_row, target_col) + + +def _find_label_candidates( + tables: list[_IndexedTable], + label_text: str, +) -> list[_LabelCandidate]: + normalized_label = _normalize_label_text(label_text) + if not normalized_label: + raise ValueError("label_text must contain at least one non-whitespace character") + + candidates: list[_LabelCandidate] = [] + for table_ref in tables: + for row_index in range(table_ref.table.row_count): + for col_index in range(table_ref.table.column_count): + cell_text = _cell_text(table_ref.table, row_index, col_index) + if _normalize_label_text(cell_text) != normalized_label: + continue + candidates.append( + _LabelCandidate( + table_index=table_ref.table_index, + table=table_ref.table, + row=row_index, + col=col_index, + text=cell_text, + ) + ) + return candidates + + +def _cell_reference( + table: HwpxOxmlTable, + row_index: int, + col_index: int, +) -> TableCellReference: + return { + "row": row_index, + "col": col_index, + "text": _cell_text(table, row_index, col_index), + } + + +def _parse_path(path: str) -> tuple[str | None, list[str], str | None]: + tokens = [token.strip() for token in path.split(">")] + if not tokens or not tokens[0]: + return (None, [], "path must start with a label") + + label_text = tokens[0] + raw_directions = [token for token in tokens[1:] if token] + if not raw_directions: + return (label_text, [], "path must include at least one direction") + return (label_text, raw_directions, None) + + +def get_table_map(document: HwpxDocument) -> TableMapResult: + """Return compact metadata for every table in document order.""" + + tables: list[TableMapEntry] = [] + for table_ref in _collect_document_tables(document): + tables.append( + { + "table_index": table_ref.table_index, + "paragraph_index": table_ref.paragraph_index, + "rows": table_ref.table.row_count, + "cols": table_ref.table.column_count, + "header_text": table_ref.header_text, + "first_row_preview": _first_row_preview(table_ref.table), + "is_empty": _table_is_empty(table_ref.table), + } + ) + return {"tables": tables} + + +def find_cell_by_label( + document: HwpxDocument, + label_text: str, + direction: SearchDirection = "right", +) -> TableLabelSearchResult: + """Find label cells and return the adjacent target cells that remain in bounds.""" + + if direction not in {"right", "down"}: + raise ValueError("direction must be one of: right, down") + + matches: list[TableLabelMatch] = [] + for candidate in _find_label_candidates(_collect_document_tables(document), label_text): + target = _move(candidate.table, candidate.row, candidate.col, direction) + if target is None: + continue + target_row, target_col = target + matches.append( + { + "table_index": candidate.table_index, + "label_cell": { + "row": candidate.row, + "col": candidate.col, + "text": candidate.text, + }, + "target_cell": _cell_reference(candidate.table, target_row, target_col), + } + ) + + return { + "matches": matches, + "count": len(matches), + } + + +def fill_by_path( + document: HwpxDocument, + mappings: Mapping[str, str], +) -> TableFillResult: + """Fill multiple table cells using label-based navigation paths.""" + + indexed_tables = _collect_document_tables(document) + applied: list[TableFillApplied] = [] + failed: list[TableFillFailed] = [] + + for path, value in mappings.items(): + label_text, raw_directions, path_error = _parse_path(path) + if path_error is not None or label_text is None: + failed.append({"path": path, "reason": path_error or "invalid path"}) + continue + + try: + candidates = _find_label_candidates(indexed_tables, label_text) + except ValueError as exc: + failed.append({"path": path, "reason": str(exc)}) + continue + + if not candidates: + failed.append({"path": path, "reason": "label not found"}) + continue + if len(candidates) > 1: + failed.append({"path": path, "reason": "ambiguous label"}) + continue + + candidate = candidates[0] + current_row = candidate.row + current_col = candidate.col + navigation_failed = False + + for raw_direction in raw_directions: + direction = raw_direction.casefold() + if direction not in {"left", "right", "up", "down"}: + failed.append( + { + "path": path, + "reason": f"unsupported direction: {raw_direction}", + } + ) + navigation_failed = True + break + + next_position = _move(candidate.table, current_row, current_col, direction) + if next_position is None: + failed.append({"path": path, "reason": "navigation out of bounds"}) + navigation_failed = True + break + + current_row, current_col = next_position + + if navigation_failed: + continue + + text_value = str(value) + candidate.table.set_cell_text(current_row, current_col, text_value, logical=True) + applied.append( + { + "path": path, + "table_index": candidate.table_index, + "row": current_row, + "col": current_col, + "value": text_value, + } + ) + + return { + "applied": applied, + "failed": failed, + "applied_count": len(applied), + "failed_count": len(failed), + } diff --git a/tests/test_table_navigation.py b/tests/test_table_navigation.py new file mode 100644 index 0000000..aed26d7 --- /dev/null +++ b/tests/test_table_navigation.py @@ -0,0 +1,183 @@ +from __future__ import annotations + +from hwpx import HwpxDocument + + +def _paragraph_index(document: HwpxDocument, target) -> int: + for index, paragraph in enumerate(document.paragraphs): + if paragraph.element is target.element: + return index + raise AssertionError("target paragraph was not found in document order") + + +def test_fill_by_path_handles_unique_labels_in_a_single_table() -> None: + document = HwpxDocument.new() + document.add_paragraph("1. 기본 현황") + table = document.add_table(2, 2) + table.cell(0, 0).text = "성명:" + table.cell(1, 0).text = "소속" + + result = document.fill_by_path( + { + "성명 > right": "홍길동", + "소속 > right": "플랫폼팀", + } + ) + + assert result["applied_count"] == 2 + assert result["failed_count"] == 0 + assert table.cell(0, 1).text == "홍길동" + assert table.cell(1, 1).text == "플랫폼팀" + + +def test_find_cell_by_label_normalizes_trailing_colons() -> None: + document = HwpxDocument.new() + document.add_paragraph("기본 정보") + table = document.add_table(1, 2) + table.cell(0, 0).text = "성명:" + + result = document.find_cell_by_label("성명") + + assert result["count"] == 1 + assert result["matches"][0]["table_index"] == 0 + assert result["matches"][0]["label_cell"] == { + "row": 0, + "col": 0, + "text": "성명:", + } + assert result["matches"][0]["target_cell"] == { + "row": 0, + "col": 1, + "text": "", + } + + +def test_multiple_tables_with_the_same_label_return_all_matches_and_make_fill_ambiguous() -> None: + document = HwpxDocument.new() + document.add_paragraph("1. 신청인") + first = document.add_table(1, 2) + first.cell(0, 0).text = "성명" + + document.add_paragraph("2. 보호자") + second = document.add_table(1, 2) + second.cell(0, 0).text = "성명" + + matches = document.find_cell_by_label("성명") + fill_result = document.fill_by_path({"성명 > right": "홍길동"}) + + assert matches["count"] == 2 + assert [match["table_index"] for match in matches["matches"]] == [0, 1] + assert fill_result["applied_count"] == 0 + assert fill_result["failed_count"] == 1 + assert fill_result["failed"][0] == { + "path": "성명 > right", + "reason": "ambiguous label", + } + assert first.cell(0, 1).text == "" + assert second.cell(0, 1).text == "" + + +def test_out_of_bounds_candidates_are_skipped_and_reported_for_batch_fill() -> None: + document = HwpxDocument.new() + document.add_paragraph("경계 값") + table = document.add_table(2, 2) + table.cell(0, 1).text = "마지막열" + table.cell(1, 0).text = "마지막행" + + right_matches = document.find_cell_by_label("마지막열", direction="right") + down_matches = document.find_cell_by_label("마지막행", direction="down") + fill_result = document.fill_by_path( + { + "마지막열 > right": "실패", + "마지막행 > down": "실패", + } + ) + + assert right_matches["count"] == 0 + assert down_matches["count"] == 0 + assert fill_result["applied_count"] == 0 + assert fill_result["failed_count"] == 2 + assert fill_result["failed"] == [ + {"path": "마지막열 > right", "reason": "navigation out of bounds"}, + {"path": "마지막행 > down", "reason": "navigation out of bounds"}, + ] + + +def test_fill_by_path_supports_multi_step_navigation() -> None: + document = HwpxDocument.new() + document.add_paragraph("정산표") + table = document.add_table(3, 2) + table.cell(0, 0).text = "합계" + + result = document.fill_by_path({"합계 > down > right": "100"}) + + assert result["applied"] == [ + { + "path": "합계 > down > right", + "table_index": 0, + "row": 1, + "col": 1, + "value": "100", + } + ] + assert result["failed"] == [] + assert table.cell(1, 1).text == "100" + + +def test_get_table_map_reports_stable_order_shape_and_header_text() -> None: + document = HwpxDocument.new() + document.add_paragraph("1. 기본 현황") + first = document.add_table(2, 4) + first.cell(0, 0).text = "성명" + first.cell(0, 1).text = "소속" + first.cell(0, 2).text = "직위" + first.cell(0, 3).text = "연락처" + first.cell(1, 0).text = "홍길동" + + document.add_paragraph("2. 비고") + second = document.add_table(1, 2) + second.cell(0, 0).text = "항목" + second.cell(0, 1).text = "값" + + result = document.get_table_map() + + assert result["tables"] == [ + { + "table_index": 0, + "paragraph_index": _paragraph_index(document, first.paragraph), + "rows": 2, + "cols": 4, + "header_text": "1. 기본 현황", + "first_row_preview": ["성명", "소속", "직위", "연락처"], + "is_empty": False, + }, + { + "table_index": 1, + "paragraph_index": _paragraph_index(document, second.paragraph), + "rows": 1, + "cols": 2, + "header_text": "2. 비고", + "first_row_preview": ["항목", "값"], + "is_empty": False, + }, + ] + + +def test_get_table_map_marks_tables_with_only_empty_strings_as_empty() -> None: + document = HwpxDocument.new() + document.add_paragraph("빈 표") + table = document.add_table(2, 2) + + result = document.get_table_map() + + assert result["tables"] == [ + { + "table_index": 0, + "paragraph_index": _paragraph_index(document, table.paragraph), + "rows": 2, + "cols": 2, + "header_text": "빈 표", + "first_row_preview": ["", ""], + "is_empty": True, + } + ]