
Commit 8537f16

feat: Document chunking capabilities to VLM/standard Docling pipelines using Docling's HybridChunker. (opendatahub-io#71)
1 parent b8d0fc3 commit 8537f16

12 files changed: 835 additions & 22 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -4,3 +4,4 @@
 .venv
 .env
 venv
+diagrams/

kubeflow-pipelines/README.md

Lines changed: 44 additions & 8 deletions
@@ -17,28 +17,47 @@ Two KFP pipelines are included:
 - Two customizable pipelines to suit different needs:
   - Standard PDF pipeline (backends, OCR engines, table structure, image export)
   - VLM pipeline (Docling VLM or Granite-Vision pipeline options; remote VLM service supported)
+  - **Optional document chunking** using Docling's HybridChunker
 - Multiple input sources: HTTP/S URLs or S3/S3-compatible APIs like MinIO
 - Secret-based configuration:
   - Remote VLM API configuration via a single mounted Kubernetes Secret
   - S3 endpoint and credentials via a single mounted Kubernetes Secret
 - Tunable performance and quality: threads, timeouts, OCR forcing, table mode, PDF backends, enrichments
 - Works on OpenShift AI/Kubeflow Pipelines
 
+### Pipeline Architecture
+
+The following diagram shows the overall pipeline flow with optional chunking:
+
+![Pipeline Architecture](/assets/pipeline_architecture.png)
+
+**Input path:**
+- **PDF Files** → `import_pdfs` → `docling_convert_standard/vlm` → Markdown + Docling JSON
+
+When `chunk_enabled=True`, the conversion output flows through `docling_chunk` to produce chunked JSON files for RAG workflows.
+
 ## 📦 File Structure
 
 ```bash
 kubeflow-pipelines
 |
-|- docling-standard
-| |- docling_convert_components.py
-| |- docling_convert_pipeline.py
-| |- docling_convert_pipeline_compiled.yaml (generated)
+|- common/
+| |- __init__.py
+| |- components.py          # Shared components (import_pdfs, docling_chunk, etc.)
+| |- constants.py           # Shared constants (base images)
+|
+|- docling-standard/
+| |- standard_components.py
+| |- standard_convert_pipeline.py
+| |- standard_convert_pipeline_compiled.yaml (generated)
+| |- local_run.py           # Local testing script
 | |- requirements.txt
 |
-|- docling-vlm
-   |- docling_convert_components.py
-   |- docling_convert_pipeline.py
-   |- docling_convert_pipeline_compiled.yaml (generated)
+|- docling-vlm/
+   |- vlm_components.py
+   |- vlm_convert_pipeline.py
+   |- vlm_convert_pipeline_compiled.yaml (generated)
+   |- local_run.py          # Local testing script
    |- requirements.txt
 ```
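For orientation, the conditional flow described in the new Pipeline Architecture section could be wired roughly as follows. This is a minimal sketch, not the pipelines' actual code: the sample filenames and base URL are placeholders, required converter arguments (e.g. `artifacts_path`, `pdf_filenames`) are omitted, and it assumes a KFP v2 SDK that provides `dsl.If` for conditional execution.

```python
from kfp import dsl

from common import docling_chunk, import_pdfs
from standard_components import docling_convert_standard


@dsl.pipeline()
def convert_with_optional_chunking(
    docling_chunk_enabled: bool = False,
    docling_chunk_max_tokens: int = 512,
    docling_chunk_merge_peers: bool = True,
):
    # Fetch input PDFs and convert them to Markdown + Docling JSON
    # (model download, splitting, and other required arguments omitted for brevity).
    importer = import_pdfs(
        filenames="sample.pdf",
        base_url="https://example.com/pdfs",
    )
    converter = docling_convert_standard(
        input_path=importer.outputs["output_path"],
    )

    # Gate the chunking step on the pipeline parameter; when enabled,
    # the converter's Docling JSON output flows into docling_chunk.
    with dsl.If(docling_chunk_enabled == True):  # noqa: E712 - KFP needs a literal comparison
        docling_chunk(
            input_path=converter.outputs["output_path"],
            max_tokens=docling_chunk_max_tokens,
            merge_peers=docling_chunk_merge_peers,
        )
```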

@@ -158,6 +177,23 @@ If you'd like to consume documents stored in an S3-compatible object storage rat
 Toggle enrichments via boolean parameters:
 - `docling_enrich_code`, `docling_enrich_formula`, `docling_enrich_picture_classes`, `docling_enrich_picture_description`.
 
+#### 7) Chunking converted documents
+
+Both pipelines support optional document chunking using Docling's [HybridChunker](https://docling-project.github.io/docling/examples/hybrid_chunking/). This splits converted documents into smaller, semantically meaningful chunks ideal for RAG (Retrieval-Augmented Generation) workflows.
+
+**Chunking parameters:**
+- `docling_chunk_enabled`: Set to `True` to enable chunking after conversion (default: `False`).
+- `docling_chunk_max_tokens`: Maximum tokens per chunk (default: `512`). Adjust based on your embedding model's context limit.
+- `docling_chunk_merge_peers`: If `True`, merge adjacent small chunks for better context (default: `True`).
+
+**Tokenizer:** Chunking uses the `sentence-transformers/all-MiniLM-L6-v2` tokenizer for accurate token counting, ensuring chunks are sized appropriately for common embedding models.
+
+**Chunked output location:**
+When chunking is enabled, an additional output file is created for each converted document:
+- **Filename format**: `{original_name}_chunks.jsonl`
+- **Location**: Same output directory as the converted `.json` and `.md` files
+- To find the output location, check the Graph of your pipeline Run, click the _docling-chunk_ box, and look in the _Output artifacts_ section.
+
 ## 🔧 Advanced customizations
 
 - Increase `num_splits` to **parallelize** across more workers (uses KFP `ParallelFor`).
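As a rough sketch of what a downstream consumer sees, the records in each `*_chunks.jsonl` file carry the fields written by the `docling_chunk` component (`timestamp`, `source_document`, `chunk_index`, `chunking_config`, `text`). A RAG ingestion script might read them like this, where `output/` stands in for wherever you downloaded the _docling-chunk_ artifact:

```python
import json
from pathlib import Path

# Placeholder: directory where the docling-chunk output artifact was downloaded.
chunks_dir = Path("output")

for chunks_file in sorted(chunks_dir.glob("*_chunks.jsonl")):
    with open(chunks_file, "r", encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            # record["text"] holds the contextualized chunk text, ready to embed.
            print(record["source_document"], record["chunk_index"], len(record["text"]))
```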

kubeflow-pipelines/common/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -6,13 +6,19 @@
 """
 
 # Import all common components to make them easily accessible
-from .components import create_pdf_splits, download_docling_models, import_pdfs
+from .components import (
+    create_pdf_splits,
+    docling_chunk,
+    download_docling_models,
+    import_pdfs,
+)
 from .constants import DOCLING_BASE_IMAGE, PYTHON_BASE_IMAGE
 
 __all__ = [
     "import_pdfs",
     "create_pdf_splits",
     "download_docling_models",
+    "docling_chunk",
     "PYTHON_BASE_IMAGE",
     "DOCLING_BASE_IMAGE",
 ]
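Pipeline modules under `docling-standard/` and `docling-vlm/` can then pull the shared chunking component straight from this package. A minimal sketch of the import pattern (the same approach `local_run.py` uses further below):

```python
import sys
from pathlib import Path

# Make the sibling `common` package importable when running from a pipeline directory.
sys.path.insert(0, str(Path(__file__).parent.parent))

from common import DOCLING_BASE_IMAGE, docling_chunk, import_pdfs  # noqa: E402
```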

kubeflow-pipelines/common/components.py

Lines changed: 169 additions & 0 deletions
@@ -241,3 +241,172 @@ def download_docling_models(
         raise ValueError(
             f"Invalid pipeline_type: {pipeline_type}. Must be 'standard' or 'vlm'"
         )
+
+
+@dsl.component(
+    base_image=DOCLING_BASE_IMAGE,
+)
+def docling_chunk(
+    input_path: dsl.Input[dsl.Artifact],
+    output_path: dsl.Output[dsl.Artifact],
+    max_tokens: int = 512,
+    merge_peers: bool = True,
+):
+    """
+    Chunk Docling documents using HybridChunker. Takes converted docling JSON files as input
+    and produces chunked JSONL files with semantic chunks suitable for RAG.
+
+    Output format is JSONL (one JSON object per line) for easy inspection and streaming.
+
+    Args:
+        input_path: Path to the input directory containing Docling JSON files
+        output_path: Path to the output directory for the chunked JSONL files
+        max_tokens: Maximum number of tokens per chunk
+        merge_peers: Whether to merge smaller chunks at the same level
+    """
+    import json  # pylint: disable=import-outside-toplevel
+    from datetime import datetime, timezone  # pylint: disable=import-outside-toplevel
+    from pathlib import Path  # pylint: disable=import-outside-toplevel
+
+    # HybridChunker = Docling's smart chunking class that combines:
+    # 1. Document structure awareness
+    # 2. Token-based splitting
+    from docling.chunking import HybridChunker  # pylint: disable=import-outside-toplevel
+    from docling_core.transforms.chunker.tokenizer.huggingface import (
+        HuggingFaceTokenizer,
+    )  # pylint: disable=import-outside-toplevel
+    from docling_core.types import DoclingDocument  # pylint: disable=import-outside-toplevel
+    from transformers import AutoTokenizer  # pylint: disable=import-outside-toplevel
+
+    # Convert KFP artifact paths to Path objects
+    input_path_p = Path(input_path.path)
+    output_path_p = Path(output_path.path)
+    output_path_p.mkdir(parents=True, exist_ok=True)
+
+    # Initialize tokenizer for HybridChunker (new API)
+    # Using a lightweight sentence-transformer model for tokenization
+    EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
+    try:
+        hf_tokenizer = AutoTokenizer.from_pretrained(
+            EMBED_MODEL_ID,
+            resume_download=True,
+            timeout=60,
+        )
+        print(f"docling-chunk: loaded tokenizer from {EMBED_MODEL_ID}", flush=True)
+    except Exception as e:
+        print(f"docling-chunk: ERROR loading tokenizer: {e}", flush=True)
+        raise RuntimeError(
+            f"Failed to load tokenizer model {EMBED_MODEL_ID}. "
+            "Ensure network access to HuggingFace Hub or pre-download the model."
+        ) from e
+
+    tokenizer = HuggingFaceTokenizer(
+        tokenizer=hf_tokenizer,
+        max_tokens=max_tokens,
+    )
+
+    # Initialize Hybrid chunker with user-specified parameters
+    # tokenizer: The tokenizer wrapper to use for counting tokens (includes max_tokens)
+    # merge_peers: if true, smaller adjacent chunks will be merged together
+    chunker = HybridChunker(
+        tokenizer=tokenizer,
+        merge_peers=merge_peers,
+    )
+
+    # Find all JSON files in the input directory
+    json_files = list(input_path_p.glob("*.json"))
+    if not json_files:
+        print(f"docling-chunk: No JSON files found in {input_path_p}", flush=True)
+        return
+
+    print(
+        f"docling-chunk: processing {len(json_files)} files with max_tokens={max_tokens} and merge_peers={merge_peers}",
+        flush=True,
+    )
+
+    # Track processing results
+    processed_count = 0
+    skipped_files = []
+
+    # Process each file
+    for json_file in json_files:
+        print(f"docling-chunk: processing {json_file}", flush=True)
+
+        # Load and validate the JSON file
+        try:
+            with open(json_file, "r", encoding="utf-8") as f:
+                doc_data = json.load(f)
+        except json.JSONDecodeError as e:
+            print(
+                f"docling-chunk: skipping {json_file.name} - invalid JSON: {e}",
+                flush=True,
+            )
+            skipped_files.append((json_file.name, f"invalid JSON: {e}"))
+            continue
+
+        # Parse the JSON data into a DoclingDocument object
+        # This validates that the JSON conforms to the DoclingDocument schema
+        try:
+            doc = DoclingDocument.model_validate(doc_data)
+        except Exception as e:
+            # Catches pydantic.ValidationError and any other validation issues
+            print(
+                f"docling-chunk: skipping {json_file.name} - not a valid DoclingDocument: {e}",
+                flush=True,
+            )
+            skipped_files.append((json_file.name, f"validation failed: {e}"))
+            continue
+
+        # Chunk the document using HybridChunker
+        chunks = list(chunker.chunk(dl_doc=doc))
+
+        # Generate output filename: original_name_chunks.jsonl
+        output_filename = f"{json_file.stem}_chunks.jsonl"
+        output_file = output_path_p / output_filename
+
+        # Get current timestamp in ISO format
+        timestamp = datetime.now(timezone.utc).isoformat()
+
+        # Chunking config (for reproducibility)
+        chunking_config = {
+            "max_tokens": max_tokens,
+            "merge_peers": merge_peers,
+            "tokenizer_model": EMBED_MODEL_ID,
+        }
+
+        # Write chunks as JSONL (one JSON object per line)
+        with open(output_file, "w", encoding="utf-8") as f:
+            for idx, chunk in enumerate(chunks):
+                # Get contextualized text for this chunk
+                chunk_text = chunker.contextualize(chunk=chunk)
+
+                # Build the chunk object
+                chunk_obj = {
+                    "timestamp": timestamp,
+                    "source_document": json_file.name,
+                    "chunk_index": idx,
+                    "chunking_config": chunking_config,
+                    "text": chunk_text,
+                }
+
+                # Write as a single line of JSON
+                f.write(json.dumps(chunk_obj, ensure_ascii=False) + "\n")
+
+        print(
+            f"docling-chunk: saved {len(chunks)} chunks to {output_filename}",
+            flush=True,
+        )
+        processed_count += 1
+
+    # Report summary
+    print(
+        f"docling-chunk: done - processed {processed_count}/{len(json_files)} files",
+        flush=True,
+    )
+    if skipped_files:
+        print(
+            f"docling-chunk: skipped {len(skipped_files)} invalid files:",
+            flush=True,
+        )
+        for filename, reason in skipped_files:
+            print(f"  - {filename}: {reason}", flush=True)

kubeflow-pipelines/docling-standard/README.md

Lines changed: 34 additions & 2 deletions
@@ -27,6 +27,39 @@ The following configuration options are available as KFP parameters when you _Cr
 - `pdf_filenames`: List of PDF file names to process, separated by commas.
 - `pdf_from_s3`: If `True`, PDF files will be fetched from an S3-compatible object storage rather than `pdf_base_url`. A secret must be configured as described in [docs](../README.md).
 
+### Chunking options
+
+Optional document chunking using Docling's [HybridChunker](https://docling-project.github.io/docling/examples/hybrid_chunking/):
+
+- `docling_chunk_enabled`: If `True`, chunk converted documents into smaller pieces (default: `False`).
+- `docling_chunk_max_tokens`: Maximum tokens per chunk (default: `512`).
+- `docling_chunk_merge_peers`: If `True`, merge adjacent small chunks for better context (default: `True`).
+
+Chunking uses the `sentence-transformers/all-MiniLM-L6-v2` tokenizer for accurate token counting.
+
+**Chunked output**: When enabled, creates `{filename}_chunks.jsonl` files (one JSON object per line) in the same output directory as the converted documents. See [main docs](../README.md) for output format details.
+
+## Local testing
+
+You can test the pipeline locally using Docker before deploying to KFP.
+
+### Prerequisites
+
+```bash
+pip install docker kfp
+```
+
+Requires a Docker-compatible daemon (Docker or Podman socket).
+
+### Run locally
+
+```bash
+cd data-processing/kubeflow-pipelines/docling-standard
+python local_run.py
+```
+
+This runs `convert_pipeline_local()` which converts PDFs and chunks the output.
+
 ## Compiling from source
 
 ### Clone repository, create venv, install dependencies
@@ -45,5 +78,4 @@ This generates `standard_convert_pipeline_compiled.yaml`:
 
 ```bash
 python standard_convert_pipeline.py
-```
-
+```
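Runs can also be started programmatically instead of through the _Create run_ UI. A sketch with the KFP SDK, assuming a compiled pipeline file and a reachable endpoint (the host URL is a placeholder; unlisted parameters keep their defaults):

```python
from kfp import Client

# Placeholder endpoint; use your Kubeflow Pipelines / OpenShift AI route and credentials.
client = Client(host="https://<your-kfp-endpoint>")

run = client.create_run_from_pipeline_package(
    "standard_convert_pipeline_compiled.yaml",
    arguments={
        # Chunking parameters documented above.
        "docling_chunk_enabled": True,
        "docling_chunk_max_tokens": 512,
        "docling_chunk_merge_peers": True,
    },
)
print(run.run_id)
```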

kubeflow-pipelines/docling-standard/local_run.py

Lines changed: 16 additions & 3 deletions
@@ -3,7 +3,12 @@
 from typing import List
 
 sys.path.insert(0, str(Path(__file__).parent.parent))
-from common import create_pdf_splits, download_docling_models, import_pdfs
+from common import (
+    create_pdf_splits,
+    docling_chunk,
+    download_docling_models,
+    import_pdfs,
+)
 from kfp import dsl, local
 from standard_components import docling_convert_standard
@@ -15,6 +20,9 @@ def take_first_split(splits: List[List[str]]) -> List[str]:
 
 @dsl.pipeline()
 def convert_pipeline_local():
+    """
+    Local pipeline for testing standard conversion with chunking.
+    """
     importer = import_pdfs(
         filenames="2203.01017v2.pdf,2206.01062.pdf",
         base_url="https://github.com/docling-project/docling/raw/v2.43.0/tests/data/pdf",
@@ -32,15 +40,20 @@ def convert_pipeline_local():
 
     first_split = take_first_split(splits=pdf_splits.output)
 
-    docling_convert_standard(
+    converter = docling_convert_standard(
        input_path=importer.outputs["output_path"],
        artifacts_path=artifacts.outputs["output_path"],
        pdf_filenames=first_split.output,
     )
 
+    docling_chunk(
+        input_path=converter.outputs["output_path"],
+        max_tokens=512,
+        merge_peers=True,
+    )
+
 
 def main() -> None:
-    # Requires: pip install docker; and a Docker-compatible daemon (Docker or Podman socket)
     local.init(runner=local.DockerRunner())
     convert_pipeline_local()
 
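When testing locally it can help to pin the local pipeline root so the `*_chunks.jsonl` artifacts are easy to find after the run. A sketch of a variant `main()`, assuming the installed KFP SDK supports the `pipeline_root` argument of `local.init`:

```python
from kfp import local


def main() -> None:
    # Keep local artifacts (including the docling_chunk output) in a predictable folder.
    local.init(
        runner=local.DockerRunner(),
        pipeline_root="./local_outputs",  # hypothetical path; any writable directory works
    )
    convert_pipeline_local()
```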
