From 6c0010bb3363efc31202b94d91ddf4927c6f4eb0 Mon Sep 17 00:00:00 2001
From: Iliescu Constantin
Date: Wed, 16 Jul 2025 10:52:12 +0200
Subject: [PATCH 1/2] Add example script for ImageToDescriptionProcessor

---
 image_to_description.py        | 51 ++++++++++++++++++++++++++++++++
 image_to_description_script.py | 54 ++++++++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+)
 create mode 100644 image_to_description.py
 create mode 100755 image_to_description_script.py

diff --git a/image_to_description.py b/image_to_description.py
new file mode 100644
index 000000000..f51a154bc
--- /dev/null
+++ b/image_to_description.py
@@ -0,0 +1,51 @@
+from typing import Annotated, List
+
+from pydantic import BaseModel
+
+from marker.processors.llm import BaseLLMSimpleBlockProcessor, PromptData, BlockData
+from marker.schema import BlockTypes
+from marker.schema.document import Document
+
+
+class ImageToDescriptionProcessor(BaseLLMSimpleBlockProcessor):
+    """Simple processor that replaces images with generated text."""
+
+    block_types = (
+        BlockTypes.Picture,
+        BlockTypes.Figure,
+    )
+
+    extract_images: Annotated[bool, "Extract images from the document."] = True
+
+    def inference_blocks(self, document: Document) -> List[BlockData]:
+        blocks = super().inference_blocks(document)
+        if self.extract_images:
+            return []
+        return blocks
+
+    def block_prompts(self, document: Document) -> List[PromptData]:
+        prompts = []
+        for block_data in self.inference_blocks(document):
+            block = block_data["block"]
+            prompts.append(
+                {
+                    "prompt": "",
+                    "image": self.extract_image(document, block),
+                    "block": block,
+                    "schema": ImageDescriptionSchema,
+                    "page": block_data["page"],
+                }
+            )
+        return prompts
+
+    def rewrite_block(
+        self, response: dict, prompt_data: PromptData, document: Document
+    ):
+        block = prompt_data["block"]
+        image_name = block.id.to_path()
+        description = f"This is the image {image_name}"
+        block.html = f"<p>{description}</p>"
+
+
+class ImageDescriptionSchema(BaseModel):
+    description: str
diff --git a/image_to_description_script.py b/image_to_description_script.py
new file mode 100755
index 000000000..1c682422d
--- /dev/null
+++ b/image_to_description_script.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+"""
+Minimal script: run Marker with ImageToDescriptionProcessor via LLMSimpleBlockMetaProcessor.
+"""
+
+from pathlib import Path
+
+from image_to_description import ImageToDescriptionProcessor
+from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.config.parser import ConfigParser
+
+PDF_PATH = Path(
+    "Design Patterns in C# A Hands-on Guide with Real-World Examples - Vaskaran Sarcar.pdf"
+)
+PAGE_RANGE = [31]
+OUTPUT_DIR = Path("test_output")
+
+# Prepare config
+config = {
+    "extract_images": False,  # Must be False for processor to run!
+    "output_dir": str(OUTPUT_DIR),
+    "page_range": ",".join(str(p) for p in PAGE_RANGE),
+    "output_format": "markdown",
+    "use_llm": True,
+    # Add any other LLM config keys needed (e.g., API key)
+    # "gemini_api_key": "your_key_here",
+}
+
+# Instantiate your processor
+image_processor = ImageToDescriptionProcessor(config=config)
+
+# Instantiate your LLM service (replace with your real service)
+# from marker.services.ollama import OllamaService
+# llm_service = OllamaService(config)
+llm_service = "gemma3:4b"  # <-- Replace with your actual LLM service
+
+# Wrap with LLMSimpleBlockMetaProcessor
+meta_processor = LLMSimpleBlockMetaProcessor([image_processor], llm_service, config)
+
+# Create converter with the meta processor
+converter = PdfConverter(
+    artifact_dict=create_model_dict(),
+    config=ConfigParser(config).generate_config_dict(),
+    processor_list=[meta_processor],
+)
+
+# Run conversion and save markdown
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+rendered = converter(str(PDF_PATH))
+final_md_path = OUTPUT_DIR / f"{PDF_PATH.stem}_processed.md"
+final_md_path.write_text(rendered.markdown, encoding="utf-8")
+print(f"✅ Processed markdown saved to: {final_md_path}")
From 41481da225e86d64fe89a996be4da7d043d95cf1 Mon Sep 17 00:00:00 2001
From: Iliescu Constantin
Date: Wed, 16 Jul 2025 11:01:51 +0200
Subject: [PATCH 2/2] Update ImageToDescription usage

---
 image_to_description_script.py | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/image_to_description_script.py b/image_to_description_script.py
index 1c682422d..9429babfd 100755
--- a/image_to_description_script.py
+++ b/image_to_description_script.py
@@ -1,12 +1,11 @@
 #!/usr/bin/env python
 """
-Minimal script: run Marker with ImageToDescriptionProcessor via LLMSimpleBlockMetaProcessor.
+Minimal script: run Marker with ``ImageToDescriptionProcessor``.
 """
 
 from pathlib import Path
 
 from image_to_description import ImageToDescriptionProcessor
-from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor
 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
 from marker.config.parser import ConfigParser
@@ -28,22 +27,13 @@
     # "gemini_api_key": "your_key_here",
 }
 
-# Instantiate your processor
-image_processor = ImageToDescriptionProcessor(config=config)
-
-# Instantiate your LLM service (replace with your real service)
-# from marker.services.ollama import OllamaService
-# llm_service = OllamaService(config)
-llm_service = "gemma3:4b"  # <-- Replace with your actual LLM service
-
-# Wrap with LLMSimpleBlockMetaProcessor
-meta_processor = LLMSimpleBlockMetaProcessor([image_processor], llm_service, config)
-
-# Create converter with the meta processor
+# Create converter with the processor class
 converter = PdfConverter(
     artifact_dict=create_model_dict(),
     config=ConfigParser(config).generate_config_dict(),
-    processor_list=[meta_processor],
+    processor_list=[ImageToDescriptionProcessor],
+    # Replace with your actual LLM service if needed
+    llm_service="marker.services.ollama.OllamaService",
 )
 
 # Run conversion and save markdown