3 changes: 1 addition & 2 deletions docling_core/cli/view.py
@@ -69,8 +69,7 @@ def view(
         image_mode=ImageRefMode.EMBEDDED,
         split_page_view=split_view,
     )
-    with open(target_path, "w", encoding="utf-8") as f:
-        f.write(html_output)
+    target_path.write_text(html_output, encoding="utf-8")
     webbrowser.open(url=f"file://{target_path.absolute().resolve()}")
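
The one-line `write_text()` replacement behaves identically for local files, and it is what lets this call site accept `UPath` objects from universal-pathlib. A minimal sketch of the idea, assuming universal-pathlib is installed (the bucket name is hypothetical):

```python
from pathlib import Path

html_output = "<html><body>ok</body></html>"

# pathlib.Path and upath.UPath share the write_text() API, so the same
# call serves local disks and fsspec-backed object stores.
target_path = Path("/tmp/out.html")
target_path.write_text(html_output, encoding="utf-8")

# The remote equivalent (needs universal-pathlib + s3fs and credentials):
# from upath import UPath
# UPath("s3://my-bucket/out.html").write_text(html_output, encoding="utf-8")
```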
108 changes: 58 additions & 50 deletions docling_core/types/doc/document.py
@@ -6,13 +6,12 @@
 import json
 import logging
 import mimetypes
-import os
 import re
 import sys
 import typing
 import warnings
 from enum import Enum
-from io import BytesIO
+from io import BytesIO, StringIO
 from pathlib import Path
 from typing import (
     Any,
@@ -65,7 +64,11 @@
     PictureClassificationLabel,
 )
 from docling_core.types.doc.tokens import DocumentToken, TableToken
-from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
+from docling_core.types.doc.utils import (
+    is_remote_path,
+    parse_otsl_table_content,
+    relative_path,
+)
 
 _logger = logging.getLogger(__name__)
 
@@ -4762,38 +4765,48 @@ def _with_pictures_refs(
         img_count = 0
         image_dir.mkdir(parents=True, exist_ok=True)
 
-        if image_dir.is_dir():
-            for item, level in result.iterate_items(page_no=page_no, with_groups=False):
-                if isinstance(item, PictureItem):
-                    img = item.get_image(doc=self)
-                    if img is not None:
-
-                        hexhash = PictureItem._image_to_hexhash(img)
-
-                        # loc_path = image_dir / f"image_{img_count:06}.png"
-                        if hexhash is not None:
-                            loc_path = image_dir / f"image_{img_count:06}_{hexhash}.png"
-
-                            img.save(loc_path)
-                            if reference_path is not None:
-                                obj_path = relative_path(
-                                    reference_path.resolve(),
-                                    loc_path.resolve(),
-                                )
-                            else:
-                                obj_path = loc_path
+        # Note: Skip is_dir() check for remote paths since S3/cloud storage
+        # doesn't have real directories - mkdir() is a no-op for remote paths
+        for item, level in result.iterate_items(page_no=page_no, with_groups=False):
+            if isinstance(item, PictureItem):
+                img = item.get_image(doc=self)
+                if img is not None:
+
+                    hexhash = PictureItem._image_to_hexhash(img)
+
+                    # loc_path = image_dir / f"image_{img_count:06}.png"
+                    if hexhash is not None:
+                        loc_path = image_dir / f"image_{img_count:06}_{hexhash}.png"
+
+                        # Use BytesIO + write_bytes for UPath compatibility
+                        buf = BytesIO()
+                        img.save(buf, format="PNG")
+                        loc_path.write_bytes(buf.getvalue())
+
+                        # For remote paths, use absolute URI string; for local, compute relative
+                        if is_remote_path(loc_path) or is_remote_path(reference_path):
+                            # Convert to string URI for remote paths (Pydantic can't serialize UPath)
+                            obj_path = str(loc_path)
+                        elif reference_path is not None:
+                            obj_path = relative_path(
+                                reference_path.resolve(),
+                                loc_path.resolve(),
+                            )
+                        else:
+                            obj_path = loc_path
 
-                            if item.image is None:
-                                scale = img.size[0] / item.prov[0].bbox.width
-                                item.image = ImageRef.from_pil(
-                                    image=img, dpi=round(72 * scale)
-                                )
-                            item.image.uri = Path(obj_path)
+                        if item.image is None:
+                            scale = img.size[0] / item.prov[0].bbox.width
+                            item.image = ImageRef.from_pil(
+                                image=img, dpi=round(72 * scale)
+                            )
+                        # For remote paths, store as string URI; for local, store as Path
+                        item.image.uri = obj_path
 
-                            # if item.image._pil is not None:
-                            #     item.image._pil.close()
+                        # if item.image._pil is not None:
+                        #     item.image._pil.close()
 
-                            img_count += 1
+                        img_count += 1
 
         return result
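
The in-memory buffer is the load-bearing change here: `PIL.Image.save()` can write to a file object, so rendering the PNG into a `BytesIO` and then calling `write_bytes()` keeps all filesystem access behind the path API. A minimal sketch of the pattern, assuming Pillow and universal-pathlib are installed (paths are illustrative):

```python
from io import BytesIO
from pathlib import Path

from PIL import Image


def write_png(img: Image.Image, loc_path) -> None:
    """Serialize to PNG in memory, then write through the path object."""
    buf = BytesIO()
    img.save(buf, format="PNG")           # PIL writes into the buffer
    loc_path.write_bytes(buf.getvalue())  # Path/UPath handles local or remote I/O


write_png(Image.new("RGB", (4, 4)), Path("/tmp/image_000000.png"))
# With universal-pathlib + s3fs, the same call accepts e.g.
# UPath("s3://my-bucket/artifacts/image_000000.png").
```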

@@ -4859,7 +4872,7 @@ def save_as_json(
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
 
         if image_mode == ImageRefMode.REFERENCED:
-            os.makedirs(artifacts_dir, exist_ok=True)
+            artifacts_dir.mkdir(parents=True, exist_ok=True)
 
         new_doc = self._make_copy_with_refmode(
             artifacts_dir, image_mode, page_no=None, reference_path=reference_path
@@ -4868,8 +4881,7 @@
         out = new_doc.export_to_dict(
             coord_precision=coord_precision, confid_precision=confid_precision
         )
-        with open(filename, "w", encoding="utf-8") as fw:
-            json.dump(out, fw, indent=indent)
+        filename.write_text(json.dumps(out, indent=indent), encoding="utf-8")
 
     @classmethod
     def load_from_json(cls, filename: Union[str, Path]) -> "DoclingDocument":
@@ -4884,8 +4896,7 @@ def load_from_json(cls, filename: Union[str, Path]) -> "DoclingDocument":
         """
         if isinstance(filename, str):
            filename = Path(filename)
-        with open(filename, "r", encoding="utf-8") as f:
-            return cls.model_validate_json(f.read())
+        return cls.model_validate_json(filename.read_text(encoding="utf-8"))
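
Serializing to a string first (`json.dumps`) and writing once keeps the save/load pair symmetric: `write_text()` on the way out, `read_text()` feeding the Pydantic validator on the way back. A minimal round-trip sketch with an illustrative payload:

```python
import json
from pathlib import Path

doc_dict = {"schema_name": "DoclingDocument", "name": "demo"}  # illustrative payload

out_path = Path("/tmp/doc.json")
out_path.write_text(json.dumps(doc_dict, indent=2), encoding="utf-8")

# load_from_json() mirrors this read on both Path and UPath objects.
assert json.loads(out_path.read_text(encoding="utf-8")) == doc_dict
```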

     def save_as_yaml(
         self,
@@ -4902,7 +4913,7 @@ def save_as_yaml(
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
 
         if image_mode == ImageRefMode.REFERENCED:
-            os.makedirs(artifacts_dir, exist_ok=True)
+            artifacts_dir.mkdir(parents=True, exist_ok=True)
 
         new_doc = self._make_copy_with_refmode(
             artifacts_dir, image_mode, page_no=None, reference_path=reference_path
@@ -4911,8 +4922,9 @@
         out = new_doc.export_to_dict(
             coord_precision=coord_precision, confid_precision=confid_precision
         )
-        with open(filename, "w", encoding="utf-8") as fw:
-            yaml.dump(out, fw, default_flow_style=default_flow_style)
+        stream = StringIO()
+        yaml.dump(out, stream, default_flow_style=default_flow_style)
+        filename.write_text(stream.getvalue(), encoding="utf-8")
 
     @classmethod
     def load_from_yaml(cls, filename: Union[str, Path]) -> "DoclingDocument":
@@ -4926,8 +4938,7 @@ def load_from_yaml(cls, filename: Union[str, Path]) -> "DoclingDocument":
         """
         if isinstance(filename, str):
             filename = Path(filename)
-        with open(filename, encoding="utf-8") as f:
-            data = yaml.load(f, Loader=yaml.SafeLoader)
+        data = yaml.load(filename.read_text(encoding="utf-8"), Loader=yaml.SafeLoader)
         return DoclingDocument.model_validate(data)
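
PyYAML's `yaml.dump()` writes to any object with a `write()` method, so dumping into a `StringIO` and flushing with a single `write_text()` avoids incremental writes against a (possibly remote) file handle. A sketch of the same dance with an illustrative payload; note that `yaml.dump(data)` without a stream would also return a string directly:

```python
from io import StringIO
from pathlib import Path

import yaml

data = {"name": "demo", "pages": [1, 2, 3]}  # illustrative payload

stream = StringIO()
yaml.dump(data, stream, default_flow_style=False)
Path("/tmp/doc.yaml").write_text(stream.getvalue(), encoding="utf-8")

# Loading mirrors load_from_yaml(): read the text, parse with the safe loader.
loaded = yaml.load(
    Path("/tmp/doc.yaml").read_text(encoding="utf-8"), Loader=yaml.SafeLoader
)
assert loaded == data
```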

     def export_to_dict(
@@ -4979,7 +4990,7 @@ def save_as_markdown(
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
 
         if image_mode == ImageRefMode.REFERENCED:
-            os.makedirs(artifacts_dir, exist_ok=True)
+            artifacts_dir.mkdir(parents=True, exist_ok=True)
 
         new_doc = self._make_copy_with_refmode(
             artifacts_dir, image_mode, page_no, reference_path=reference_path
@@ -5005,8 +5016,7 @@
             mark_meta=mark_meta,
         )
 
-        with open(filename, "w", encoding="utf-8") as fw:
-            fw.write(md_out)
+        filename.write_text(md_out, encoding="utf-8")

     def export_to_markdown(  # noqa: C901
         self,
@@ -5185,7 +5195,7 @@ def save_as_html(
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
 
         if image_mode == ImageRefMode.REFERENCED:
-            os.makedirs(artifacts_dir, exist_ok=True)
+            artifacts_dir.mkdir(parents=True, exist_ok=True)
 
         new_doc = self._make_copy_with_refmode(
             artifacts_dir, image_mode, page_no, reference_path=reference_path
@@ -5205,8 +5215,7 @@
             include_annotations=include_annotations,
         )
 
-        with open(filename, "w", encoding="utf-8") as fw:
-            fw.write(html_out)
+        filename.write_text(html_out, encoding="utf-8")

     def _get_output_paths(
         self, filename: Union[str, Path], artifacts_dir: Optional[Path] = None
@@ -5850,8 +5859,7 @@ def save_as_doctags(
             minified=minified,
         )
 
-        with open(filename, "w", encoding="utf-8") as fw:
-            fw.write(out)
+        filename.write_text(out, encoding="utf-8")
 
     @deprecated("Use export_to_doctags() instead.")
     def export_to_document_tokens(self, *args, **kwargs):
18 changes: 6 additions & 12 deletions docling_core/types/doc/page.py
@@ -601,8 +600,7 @@ def save_as_json(
         if isinstance(filename, str):
             filename = Path(filename)
         out = self.export_to_dict()
-        with open(filename, "w", encoding="utf-8") as fw:
-            json.dump(out, fw, indent=indent)
+        filename.write_text(json.dumps(out, indent=indent), encoding="utf-8")
 
     @classmethod
     def load_from_json(cls, filename: Union[str, Path]) -> "SegmentedPdfPage":
@@ -616,8 +615,7 @@ def load_from_json(cls, filename: Union[str, Path]) -> "SegmentedPdfPage":
         """
         if isinstance(filename, str):
             filename = Path(filename)
-        with open(filename, "r", encoding="utf-8") as f:
-            return cls.model_validate_json(f.read())
+        return cls.model_validate_json(filename.read_text(encoding="utf-8"))
 
     def crop_text(
         self, cell_unit: TextCellUnit, bbox: BoundingBox, eps: float = 1.0
@@ -1218,8 +1216,7 @@ def save_as_json(self, filename: Union[str, Path], indent: int = 2):
         if isinstance(filename, str):
             filename = Path(filename)
         out = self.export_to_dict()
-        with open(filename, "w", encoding="utf-8") as fw:
-            json.dump(out, fw, indent=indent)
+        filename.write_text(json.dumps(out, indent=indent), encoding="utf-8")
 
     @classmethod
     def load_from_json(cls, filename: Union[str, Path]) -> "PdfTableOfContents":
@@ -1233,8 +1230,7 @@ def load_from_json(cls, filename: Union[str, Path]) -> "PdfTableOfContents":
         """
         if isinstance(filename, str):
             filename = Path(filename)
-        with open(filename, "r", encoding="utf-8") as f:
-            return cls.model_validate_json(f.read())
+        return cls.model_validate_json(filename.read_text(encoding="utf-8"))
 
 
 class ParsedPdfDocument(BaseModel):
@@ -1280,8 +1276,7 @@ def save_as_json(self, filename: Union[str, Path], indent: int = 2):
         if isinstance(filename, str):
             filename = Path(filename)
         out = self.export_to_dict()
-        with open(filename, "w", encoding="utf-8") as fw:
-            json.dump(out, fw, indent=indent)
+        filename.write_text(json.dumps(out, indent=indent), encoding="utf-8")
 
     @classmethod
     def load_from_json(cls, filename: Union[str, Path]) -> "ParsedPdfDocument":
@@ -1295,5 +1290,4 @@ def load_from_json(cls, filename: Union[str, Path]) -> "ParsedPdfDocument":
         """
         if isinstance(filename, str):
             filename = Path(filename)
-        with open(filename, "r", encoding="utf-8") as f:
-            return cls.model_validate_json(f.read())
+        return cls.model_validate_json(filename.read_text(encoding="utf-8"))
36 changes: 32 additions & 4 deletions docling_core/types/doc/utils.py
@@ -5,15 +5,33 @@
 import re
 import unicodedata
 from pathlib import Path
-from typing import TYPE_CHECKING, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
 
 from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
 
 if TYPE_CHECKING:
     from docling_core.types.doc.document import TableCell, TableData
 
 
-def relative_path(src: Path, target: Path) -> Path:
+def is_remote_path(p: Any) -> bool:
+    """Check if a path is a remote/cloud path (e.g., S3, GCS, Azure).
+
+    Works with UPath objects from universal-pathlib. Local paths return False.
+
+    Args:
+        p: A path object (Path, UPath, or similar)
+
+    Returns:
+        bool: True if the path is a remote/cloud path, False otherwise.
+    """
+    # UPath objects have a 'protocol' attribute
+    protocol = getattr(p, "protocol", None)
+    if protocol is not None and protocol not in ("file", ""):
+        return True
+    return False
+
+
+def relative_path(src: Union[str, Path], target: Union[str, Path]) -> Path:
     """Compute the relative path from `src` to `target`.
 
     Args:
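
A quick usage sketch for the new helper, assuming universal-pathlib is installed (the s3:// example additionally needs an fsspec backend such as s3fs; the bucket name is made up):

```python
from pathlib import Path

from upath import UPath

from docling_core.types.doc.utils import is_remote_path

print(is_remote_path(Path("/tmp/doc.json")))             # False: no `protocol` attribute
print(is_remote_path(UPath("file:///tmp/doc.json")))     # False: "file" counts as local
print(is_remote_path(UPath("s3://my-bucket/doc.json")))  # True: fsspec "s3" protocol
```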
Expand All @@ -25,9 +43,19 @@ def relative_path(src: Path, target: Path) -> Path:

Raises:
ValueError: If either `src` or `target` is not an absolute path.

Note:
For remote paths (UPath with non-file protocols), this function cannot
compute relative paths. Use is_remote_path() to check before calling.
"""
src = Path(src).resolve()
target = Path(target).resolve()
# Convert to Path only if string, otherwise keep original type
if isinstance(src, str):
src = Path(src)
if isinstance(target, str):
target = Path(target)

src = src.resolve()
target = target.resolve()

# Ensure both paths are absolute
if not src.is_absolute():
Expand Down
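
In `_with_pictures_refs` above, this note becomes a three-way guard: remote paths keep an absolute URI string, local pairs get a relative path, and a missing reference falls back to the location itself. A condensed sketch of that decision (the function name is illustrative):

```python
from pathlib import Path

from docling_core.types.doc.utils import is_remote_path, relative_path


def resolve_image_uri(loc_path, reference_path):
    """Pick the URI value to store for a saved image artifact."""
    if is_remote_path(loc_path) or is_remote_path(reference_path):
        # Relative paths are not well-defined across object stores.
        return str(loc_path)
    if reference_path is not None:
        # Both local and absolute: store a path relative to the document.
        return relative_path(reference_path.resolve(), loc_path.resolve())
    return loc_path


print(resolve_image_uri(Path("/data/artifacts/img.png"), Path("/data/doc.json")))
# e.g. artifacts/img.png
```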
17 changes: 9 additions & 8 deletions docling_core/utils/generate_docs.py
@@ -6,7 +6,6 @@
 
 import argparse
 import json
-import os
 from argparse import BooleanOptionalAction
 from pathlib import Path
 from shutil import rmtree
@@ -24,15 +23,16 @@ def _prepare_directory(folder: str, clean: bool = False) -> None:
         folder: The name of the directory.
         clean: Whether any existing content in the directory should be removed.
     """
-    if os.path.isdir(folder):
+    folder_path = Path(folder)
+    if folder_path.is_dir():
         if clean:
-            for path in Path(folder).glob("**/*"):
+            for path in folder_path.glob("**/*"):
                 if path.is_file():
                     path.unlink()
                 elif path.is_dir():
                     rmtree(path)
     else:
-        os.makedirs(folder, exist_ok=True)
+        folder_path.mkdir(parents=True, exist_ok=True)
 
 
 def generate_collection_jsonschema(folder: str):
@@ -41,12 +41,13 @@ def generate_collection_jsonschema(folder: str):
     Args:
         folder: The name of the directory.
     """
+    folder_path = Path(folder)
     for item in MODELS:
         json_schema = generate_json_schema(item)
-        with open(
-            os.path.join(folder, f"{item}.json"), mode="w", encoding="utf8"
-        ) as json_file:
-            json.dump(json_schema, json_file, ensure_ascii=False, indent=2)
+        output_file = folder_path / f"{item}.json"
+        output_file.write_text(
+            json.dumps(json_schema, ensure_ascii=False, indent=2), encoding="utf-8"
+        )
 
 
 def main() -> None:
5 changes: 3 additions & 2 deletions docling_core/utils/validate.py
@@ -32,10 +32,11 @@ def parse_arguments():
 
 def run():
     """Run the validation of a file containing a Document."""
+    from pathlib import Path
+
     file_format, input_file = parse_arguments()
 
-    with open(input_file, "r", encoding="utf-8") as fd:
-        file_ = json.load(fd)
+    file_ = json.loads(Path(input_file).read_text(encoding="utf-8"))
 
     result = (False, "Empty result")
 