3 changes: 1 addition & 2 deletions docling_core/cli/view.py
@@ -69,8 +69,7 @@ def view(
         image_mode=ImageRefMode.EMBEDDED,
         split_page_view=split_view,
     )
-    with open(target_path, "w", encoding="utf-8") as f:
-        f.write(html_output)
+    target_path.write_text(html_output, encoding="utf-8")
     webbrowser.open(url=f"file://{target_path.absolute().resolve()}")
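
The one-line `write_text()` replacement behaves identically for local files, and it is what lets this call site accept `UPath` objects from universal-pathlib. A minimal sketch of the idea, assuming universal-pathlib is installed (the bucket name is hypothetical):

```python
from pathlib import Path

html_output = "<html><body>ok</body></html>"

# pathlib.Path and upath.UPath share the write_text() API, so the same
# call serves local disks and fsspec-backed object stores.
target_path = Path("/tmp/out.html")
target_path.write_text(html_output, encoding="utf-8")

# The remote equivalent (needs universal-pathlib + s3fs and credentials):
# from upath import UPath
# UPath("s3://my-bucket/out.html").write_text(html_output, encoding="utf-8")
```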
108 changes: 58 additions & 50 deletions docling_core/types/doc/document.py
@@ -6,13 +6,12 @@
 import json
 import logging
 import mimetypes
-import os
 import re
 import sys
 import typing
 import warnings
 from enum import Enum
-from io import BytesIO
+from io import BytesIO, StringIO
 from pathlib import Path
 from typing import (
     Any,
@@ -65,7 +64,11 @@
     PictureClassificationLabel,
 )
 from docling_core.types.doc.tokens import DocumentToken, TableToken
-from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
+from docling_core.types.doc.utils import (
+    is_remote_path,
+    parse_otsl_table_content,
+    relative_path,
+)
 
 _logger = logging.getLogger(__name__)
 
@@ -4762,38 +4765,48 @@ def _with_pictures_refs(
         img_count = 0
         image_dir.mkdir(parents=True, exist_ok=True)
 
-        if image_dir.is_dir():
-            for item, level in result.iterate_items(page_no=page_no, with_groups=False):
-                if isinstance(item, PictureItem):
-                    img = item.get_image(doc=self)
-                    if img is not None:
-
-                        hexhash = PictureItem._image_to_hexhash(img)
-
-                        # loc_path = image_dir / f"image_{img_count:06}.png"
-                        if hexhash is not None:
-                            loc_path = image_dir / f"image_{img_count:06}_{hexhash}.png"
-
-                            img.save(loc_path)
-                            if reference_path is not None:
-                                obj_path = relative_path(
-                                    reference_path.resolve(),
-                                    loc_path.resolve(),
-                                )
-                            else:
-                                obj_path = loc_path
+        # Note: Skip is_dir() check for remote paths since S3/cloud storage
+        # doesn't have real directories - mkdir() is a no-op for remote paths
+        for item, level in result.iterate_items(page_no=page_no, with_groups=False):
+            if isinstance(item, PictureItem):
+                img = item.get_image(doc=self)
+                if img is not None:
+
+                    hexhash = PictureItem._image_to_hexhash(img)
+
+                    # loc_path = image_dir / f"image_{img_count:06}.png"
+                    if hexhash is not None:
+                        loc_path = image_dir / f"image_{img_count:06}_{hexhash}.png"
+
+                        # Use BytesIO + write_bytes for UPath compatibility
+                        buf = BytesIO()
+                        img.save(buf, format="PNG")
+                        loc_path.write_bytes(buf.getvalue())
+
+                        # For remote paths, use absolute URI string; for local, compute relative
+                        if is_remote_path(loc_path) or is_remote_path(reference_path):
+                            # Convert to string URI for remote paths (Pydantic can't serialize UPath)
+                            obj_path = str(loc_path)
+                        elif reference_path is not None:
+                            obj_path = relative_path(
+                                reference_path.resolve(),
+                                loc_path.resolve(),
+                            )
+                        else:
+                            obj_path = loc_path
 
-                            if item.image is None:
-                                scale = img.size[0] / item.prov[0].bbox.width
-                                item.image = ImageRef.from_pil(
-                                    image=img, dpi=round(72 * scale)
-                                )
-                            item.image.uri = Path(obj_path)
+                        if item.image is None:
+                            scale = img.size[0] / item.prov[0].bbox.width
+                            item.image = ImageRef.from_pil(
+                                image=img, dpi=round(72 * scale)
+                            )
+                        # For remote paths, store as string URI; for local, store as Path
+                        item.image.uri = obj_path
 
-                            # if item.image._pil is not None:
-                            #     item.image._pil.close()
+                        # if item.image._pil is not None:
+                        #     item.image._pil.close()
 
-                            img_count += 1
+                        img_count += 1
 
         return result
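
The in-memory buffer is the load-bearing change here: `PIL.Image.save()` can write to a file object, so rendering the PNG into a `BytesIO` and then calling `write_bytes()` keeps all filesystem access behind the path API. A minimal sketch of the pattern, assuming Pillow and universal-pathlib are installed (paths are illustrative):

```python
from io import BytesIO
from pathlib import Path

from PIL import Image


def write_png(img: Image.Image, loc_path) -> None:
    """Serialize to PNG in memory, then write through the path object."""
    buf = BytesIO()
    img.save(buf, format="PNG")           # PIL writes into the buffer
    loc_path.write_bytes(buf.getvalue())  # Path/UPath handles local or remote I/O


write_png(Image.new("RGB", (4, 4)), Path("/tmp/image_000000.png"))
# With universal-pathlib + s3fs, the same call accepts e.g.
# UPath("s3://my-bucket/artifacts/image_000000.png").
```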

@@ -4859,7 +4872,7 @@ def save_as_json(
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
 
         if image_mode == ImageRefMode.REFERENCED:
-            os.makedirs(artifacts_dir, exist_ok=True)
+            artifacts_dir.mkdir(parents=True, exist_ok=True)
 
         new_doc = self._make_copy_with_refmode(
             artifacts_dir, image_mode, page_no=None, reference_path=reference_path
@@ -4868,8 +4881,7 @@
         out = new_doc.export_to_dict(
             coord_precision=coord_precision, confid_precision=confid_precision
         )
-        with open(filename, "w", encoding="utf-8") as fw:
-            json.dump(out, fw, indent=indent)
+        filename.write_text(json.dumps(out, indent=indent), encoding="utf-8")
 
     @classmethod
     def load_from_json(cls, filename: Union[str, Path]) -> "DoclingDocument":
@@ -4884,8 +4896,7 @@ def load_from_json(cls, filename: Union[str, Path]) -> "DoclingDocument":
         """
         if isinstance(filename, str):
            filename = Path(filename)
-        with open(filename, "r", encoding="utf-8") as f:
-            return cls.model_validate_json(f.read())
+        return cls.model_validate_json(filename.read_text(encoding="utf-8"))
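
Serializing to a string first (`json.dumps`) and writing once keeps the save/load pair symmetric: `write_text()` on the way out, `read_text()` feeding the Pydantic validator on the way back. A minimal round-trip sketch with an illustrative payload:

```python
import json
from pathlib import Path

doc_dict = {"schema_name": "DoclingDocument", "name": "demo"}  # illustrative payload

out_path = Path("/tmp/doc.json")
out_path.write_text(json.dumps(doc_dict, indent=2), encoding="utf-8")

# load_from_json() mirrors this read on both Path and UPath objects.
assert json.loads(out_path.read_text(encoding="utf-8")) == doc_dict
```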

     def save_as_yaml(
         self,
@@ -4902,7 +4913,7 @@ def save_as_yaml(
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
 
         if image_mode == ImageRefMode.REFERENCED:
-            os.makedirs(artifacts_dir, exist_ok=True)
+            artifacts_dir.mkdir(parents=True, exist_ok=True)
 
         new_doc = self._make_copy_with_refmode(
             artifacts_dir, image_mode, page_no=None, reference_path=reference_path
@@ -4911,8 +4922,9 @@
         out = new_doc.export_to_dict(
             coord_precision=coord_precision, confid_precision=confid_precision
         )
-        with open(filename, "w", encoding="utf-8") as fw:
-            yaml.dump(out, fw, default_flow_style=default_flow_style)
+        stream = StringIO()
+        yaml.dump(out, stream, default_flow_style=default_flow_style)
+        filename.write_text(stream.getvalue(), encoding="utf-8")
 
     @classmethod
     def load_from_yaml(cls, filename: Union[str, Path]) -> "DoclingDocument":
@@ -4926,8 +4938,7 @@ def load_from_yaml(cls, filename: Union[str, Path]) -> "DoclingDocument":
         """
         if isinstance(filename, str):
             filename = Path(filename)
-        with open(filename, encoding="utf-8") as f:
-            data = yaml.load(f, Loader=yaml.SafeLoader)
+        data = yaml.load(filename.read_text(encoding="utf-8"), Loader=yaml.SafeLoader)
         return DoclingDocument.model_validate(data)
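
PyYAML's `yaml.dump()` writes to any object with a `write()` method, so dumping into a `StringIO` and flushing with a single `write_text()` avoids incremental writes against a (possibly remote) file handle. A sketch of the same dance with an illustrative payload; note that `yaml.dump(data)` without a stream would also return a string directly:

```python
from io import StringIO
from pathlib import Path

import yaml

data = {"name": "demo", "pages": [1, 2, 3]}  # illustrative payload

stream = StringIO()
yaml.dump(data, stream, default_flow_style=False)
Path("/tmp/doc.yaml").write_text(stream.getvalue(), encoding="utf-8")

# Loading mirrors load_from_yaml(): read the text, parse with the safe loader.
loaded = yaml.load(
    Path("/tmp/doc.yaml").read_text(encoding="utf-8"), Loader=yaml.SafeLoader
)
assert loaded == data
```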

     def export_to_dict(
@@ -4979,7 +4990,7 @@ def save_as_markdown(
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
 
         if image_mode == ImageRefMode.REFERENCED:
-            os.makedirs(artifacts_dir, exist_ok=True)
+            artifacts_dir.mkdir(parents=True, exist_ok=True)
 
         new_doc = self._make_copy_with_refmode(
             artifacts_dir, image_mode, page_no, reference_path=reference_path
@@ -5005,8 +5016,7 @@
             mark_meta=mark_meta,
         )
 
-        with open(filename, "w", encoding="utf-8") as fw:
-            fw.write(md_out)
+        filename.write_text(md_out, encoding="utf-8")

     def export_to_markdown(  # noqa: C901
         self,
@@ -5185,7 +5195,7 @@ def save_as_html(
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
 
         if image_mode == ImageRefMode.REFERENCED:
-            os.makedirs(artifacts_dir, exist_ok=True)
+            artifacts_dir.mkdir(parents=True, exist_ok=True)
 
         new_doc = self._make_copy_with_refmode(
             artifacts_dir, image_mode, page_no, reference_path=reference_path
@@ -5205,8 +5215,7 @@
             include_annotations=include_annotations,
         )
 
-        with open(filename, "w", encoding="utf-8") as fw:
-            fw.write(html_out)
+        filename.write_text(html_out, encoding="utf-8")

     def _get_output_paths(
         self, filename: Union[str, Path], artifacts_dir: Optional[Path] = None
@@ -5850,8 +5859,7 @@ def save_as_doctags(
             minified=minified,
         )
 
-        with open(filename, "w", encoding="utf-8") as fw:
-            fw.write(out)
+        filename.write_text(out, encoding="utf-8")
 
     @deprecated("Use export_to_doctags() instead.")
     def export_to_document_tokens(self, *args, **kwargs):
18 changes: 6 additions & 12 deletions docling_core/types/doc/page.py
@@ -601,8 +600,7 @@ def save_as_json(
         if isinstance(filename, str):
             filename = Path(filename)
         out = self.export_to_dict()
-        with open(filename, "w", encoding="utf-8") as fw:
-            json.dump(out, fw, indent=indent)
+        filename.write_text(json.dumps(out, indent=indent), encoding="utf-8")
 
     @classmethod
     def load_from_json(cls, filename: Union[str, Path]) -> "SegmentedPdfPage":
@@ -616,8 +615,7 @@ def load_from_json(cls, filename: Union[str, Path]) -> "SegmentedPdfPage":
         """
         if isinstance(filename, str):
             filename = Path(filename)
-        with open(filename, "r", encoding="utf-8") as f:
-            return cls.model_validate_json(f.read())
+        return cls.model_validate_json(filename.read_text(encoding="utf-8"))
 
     def crop_text(
         self, cell_unit: TextCellUnit, bbox: BoundingBox, eps: float = 1.0
@@ -1218,8 +1216,7 @@ def save_as_json(self, filename: Union[str, Path], indent: int = 2):
         if isinstance(filename, str):
             filename = Path(filename)
         out = self.export_to_dict()
-        with open(filename, "w", encoding="utf-8") as fw:
-            json.dump(out, fw, indent=indent)
+        filename.write_text(json.dumps(out, indent=indent), encoding="utf-8")
 
     @classmethod
     def load_from_json(cls, filename: Union[str, Path]) -> "PdfTableOfContents":
@@ -1233,8 +1230,7 @@ def load_from_json(cls, filename: Union[str, Path]) -> "PdfTableOfContents":
         """
         if isinstance(filename, str):
             filename = Path(filename)
-        with open(filename, "r", encoding="utf-8") as f:
-            return cls.model_validate_json(f.read())
+        return cls.model_validate_json(filename.read_text(encoding="utf-8"))
 
 
 class ParsedPdfDocument(BaseModel):
@@ -1280,8 +1276,7 @@ def save_as_json(self, filename: Union[str, Path], indent: int = 2):
         if isinstance(filename, str):
             filename = Path(filename)
         out = self.export_to_dict()
-        with open(filename, "w", encoding="utf-8") as fw:
-            json.dump(out, fw, indent=indent)
+        filename.write_text(json.dumps(out, indent=indent), encoding="utf-8")
 
     @classmethod
     def load_from_json(cls, filename: Union[str, Path]) -> "ParsedPdfDocument":
@@ -1295,5 +1290,4 @@ def load_from_json(cls, filename: Union[str, Path]) -> "ParsedPdfDocument":
         """
         if isinstance(filename, str):
             filename = Path(filename)
-        with open(filename, "r", encoding="utf-8") as f:
-            return cls.model_validate_json(f.read())
+        return cls.model_validate_json(filename.read_text(encoding="utf-8"))
36 changes: 32 additions & 4 deletions docling_core/types/doc/utils.py
@@ -5,15 +5,33 @@
 import re
 import unicodedata
 from pathlib import Path
-from typing import TYPE_CHECKING, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
 
 from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
 
 if TYPE_CHECKING:
     from docling_core.types.doc.document import TableCell, TableData
 
 
-def relative_path(src: Path, target: Path) -> Path:
+def is_remote_path(p: Any) -> bool:
+    """Check if a path is a remote/cloud path (e.g., S3, GCS, Azure).
+
+    Works with UPath objects from universal-pathlib. Local paths return False.
+
+    Args:
+        p: A path object (Path, UPath, or similar)
+
+    Returns:
+        bool: True if the path is a remote/cloud path, False otherwise.
+    """
+    # UPath objects have a 'protocol' attribute
+    protocol = getattr(p, "protocol", None)
+    if protocol is not None and protocol not in ("file", ""):
+        return True
+    return False
+
+
+def relative_path(src: Union[str, Path], target: Union[str, Path]) -> Path:
     """Compute the relative path from `src` to `target`.
 
     Args:
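
A quick usage sketch for the new helper, assuming universal-pathlib is installed (the s3:// example additionally needs an fsspec backend such as s3fs; the bucket name is made up):

```python
from pathlib import Path

from upath import UPath

from docling_core.types.doc.utils import is_remote_path

print(is_remote_path(Path("/tmp/doc.json")))             # False: no `protocol` attribute
print(is_remote_path(UPath("file:///tmp/doc.json")))     # False: "file" counts as local
print(is_remote_path(UPath("s3://my-bucket/doc.json")))  # True: fsspec "s3" protocol
```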
Expand All @@ -25,9 +43,19 @@ def relative_path(src: Path, target: Path) -> Path:

Raises:
ValueError: If either `src` or `target` is not an absolute path.

Note:
For remote paths (UPath with non-file protocols), this function cannot
compute relative paths. Use is_remote_path() to check before calling.
"""
src = Path(src).resolve()
target = Path(target).resolve()
# Convert to Path only if string, otherwise keep original type
if isinstance(src, str):
src = Path(src)
if isinstance(target, str):
target = Path(target)

src = src.resolve()
target = target.resolve()

# Ensure both paths are absolute
if not src.is_absolute():
Expand Down
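
In `_with_pictures_refs` above, this note becomes a three-way guard: remote paths keep an absolute URI string, local pairs get a relative path, and a missing reference falls back to the location itself. A condensed sketch of that decision (the function name is illustrative):

```python
from pathlib import Path

from docling_core.types.doc.utils import is_remote_path, relative_path


def resolve_image_uri(loc_path, reference_path):
    """Pick the URI value to store for a saved image artifact."""
    if is_remote_path(loc_path) or is_remote_path(reference_path):
        # Relative paths are not well-defined across object stores.
        return str(loc_path)
    if reference_path is not None:
        # Both local and absolute: store a path relative to the document.
        return relative_path(reference_path.resolve(), loc_path.resolve())
    return loc_path


print(resolve_image_uri(Path("/data/artifacts/img.png"), Path("/data/doc.json")))
# e.g. artifacts/img.png
```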
17 changes: 9 additions & 8 deletions docling_core/utils/generate_docs.py
@@ -6,7 +6,6 @@
 
 import argparse
 import json
-import os
 from argparse import BooleanOptionalAction
 from pathlib import Path
 from shutil import rmtree
@@ -24,15 +23,16 @@ def _prepare_directory(folder: str, clean: bool = False) -> None:
         folder: The name of the directory.
         clean: Whether any existing content in the directory should be removed.
     """
-    if os.path.isdir(folder):
+    folder_path = Path(folder)
+    if folder_path.is_dir():
         if clean:
-            for path in Path(folder).glob("**/*"):
+            for path in folder_path.glob("**/*"):
                 if path.is_file():
                     path.unlink()
                 elif path.is_dir():
                     rmtree(path)
     else:
-        os.makedirs(folder, exist_ok=True)
+        folder_path.mkdir(parents=True, exist_ok=True)
 
 
 def generate_collection_jsonschema(folder: str):
@@ -41,12 +41,13 @@ def generate_collection_jsonschema(folder: str):
     Args:
         folder: The name of the directory.
     """
+    folder_path = Path(folder)
     for item in MODELS:
         json_schema = generate_json_schema(item)
-        with open(
-            os.path.join(folder, f"{item}.json"), mode="w", encoding="utf8"
-        ) as json_file:
-            json.dump(json_schema, json_file, ensure_ascii=False, indent=2)
+        output_file = folder_path / f"{item}.json"
+        output_file.write_text(
+            json.dumps(json_schema, ensure_ascii=False, indent=2), encoding="utf-8"
+        )
 
 
 def main() -> None:
5 changes: 3 additions & 2 deletions docling_core/utils/validate.py
@@ -32,10 +32,11 @@ def parse_arguments():
 
 def run():
     """Run the validation of a file containing a Document."""
+    from pathlib import Path
+
     file_format, input_file = parse_arguments()
 
-    with open(input_file, "r", encoding="utf-8") as fd:
-        file_ = json.load(fd)
+    file_ = json.loads(Path(input_file).read_text(encoding="utf-8"))
 
     result = (False, "Empty result")
 