From 7eb9c2866d17c78ad3fa9922d6e2c521dcf56bc3 Mon Sep 17 00:00:00 2001
From: Iliescu Constantin
Date: Tue, 15 Jul 2025 22:25:21 +0200
Subject: [PATCH 1/2] Fix import for get_block_class

---
 marker/processors/image_to_text.py | 118 +++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 marker/processors/image_to_text.py

diff --git a/marker/processors/image_to_text.py b/marker/processors/image_to_text.py
new file mode 100644
index 000000000..8a6f91cc4
--- /dev/null
+++ b/marker/processors/image_to_text.py
@@ -0,0 +1,118 @@
+from __future__ import annotations
+
+import base64
+import io
+from copy import deepcopy
+from typing import Optional
+
+import markdown2
+import requests
+from PIL import Image
+
+from marker.processors import BaseProcessor
+from marker.schema import BlockTypes
+from marker.schema.document import Document
+from marker.schema.registry import get_block_class
+
+
+class ImageToTextProcessor(BaseProcessor):
+    """Replace image blocks with markdown descriptions."""
+
+    block_types = (BlockTypes.Picture, BlockTypes.Figure)
+
+    def __init__(
+        self,
+        config: Optional[dict] = None,
+        ollama_url: str = "http://localhost:11434",
+        ollama_model: str = "llama3.2-vision:11b",
+        vision_prompt: str = (
+            "Describe this image precisely in technical terms. Focus on diagrams, code, text, and technical content."
+        ),
+    ) -> None:
+        super().__init__(config)
+        self.ollama_url = ollama_url.rstrip("/")
+        self.ollama_model = ollama_model
+        self.vision_prompt = vision_prompt
+
+    def process_image_to_text(self, image: Image.Image) -> str:
+        """Convert an image to a markdown description using the Ollama service."""
+        try:
+            buffer = io.BytesIO()
+            image.save(buffer, format="PNG")
+            image_b64 = base64.b64encode(buffer.getvalue()).decode()
+            payload = {
+                "model": self.ollama_model,
+                "prompt": self.vision_prompt,
+                "images": [image_b64],
+                "stream": False,
+            }
+            resp = requests.post(f"{self.ollama_url}/api/generate", json=payload, timeout=120)
+            resp.raise_for_status()
+            result = resp.json()
+            description = result.get("response", "").strip()
+            return f"[Image Description: {description}]"
+        except Exception as e:  # pragma: no cover - external service
+            return f"[Image processing failed: {str(e)}]"
+
+    def __call__(self, document: Document) -> None:
+        TextClass = get_block_class(BlockTypes.Text)
+        for page in document.pages:
+            for block in list(page.contained_blocks(document, self.block_types)):
+                try:
+                    image = block.get_image(document, highres=True)
+                    if image is None:
+                        continue
+                    markdown_text = self.process_image_to_text(image)
+                    html = markdown2.markdown(markdown_text, extras=["tables"])
+                    new_block = TextClass(
+                        polygon=deepcopy(block.polygon),
+                        page_id=block.page_id,
+                        structure=deepcopy(block.structure),
+                        text_extraction_method=block.text_extraction_method,
+                        source="processor",
+                        top_k=block.top_k,
+                        metadata=block.metadata,
+                        html=html,
+                    )
+                    page.replace_block(block, new_block)
+                except Exception:
+                    continue
+
+
+
+def create_converter_with_image_to_text(
+    config: Optional[dict] = None,
+    ollama_url: str = "http://localhost:11434",
+    ollama_model: str = "llama3.2-vision:11b",
+    vision_prompt: str = (
+        "Describe this image precisely in technical terms. Focus on diagrams, code, text, and technical content."
+    ),
+):
+    """Return a PdfConverter that applies ImageToTextProcessor."""
+    from marker.converters.pdf import PdfConverter
+    from marker.models import create_model_dict
+    from marker.config.parser import ConfigParser
+
+    if config is None:
+        config = {"extract_images": True, "output_dir": "output"}
+
+    if "page_range" in config and isinstance(config["page_range"], list):
+        config["page_range"] = ",".join(str(p) for p in config["page_range"])
+
+    config_parser = ConfigParser(config)
+    processors = config_parser.get_processors()
+    processors.append(
+        ImageToTextProcessor(
+            config=config,
+            ollama_url=ollama_url,
+            ollama_model=ollama_model,
+            vision_prompt=vision_prompt,
+        )
+    )
+    converter = PdfConverter(
+        artifact_dict=create_model_dict(),
+        config=config_parser.generate_config_dict(),
+        processor_list=processors,
+        renderer=config_parser.get_renderer(),
+    )
+    return converter

From 41429e41dc073d09cf36a09e548315d67ea7145a Mon Sep 17 00:00:00 2001
From: Iliescu Constantin
Date: Tue, 15 Jul 2025 23:16:23 +0200
Subject: [PATCH 2/2] Fix ImageToTextProcessor and converter helper

---
 marker/processors/image_to_text.py | 46 ++++++++++++++++--------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/marker/processors/image_to_text.py b/marker/processors/image_to_text.py
index 8a6f91cc4..3bc7d9a6f 100644
--- a/marker/processors/image_to_text.py
+++ b/marker/processors/image_to_text.py
@@ -3,7 +3,7 @@
 import base64
 import io
 from copy import deepcopy
-from typing import Optional
+from typing import Optional, List
 
 import markdown2
 import requests
@@ -20,19 +20,16 @@ class ImageToTextProcessor(BaseProcessor):
 
     block_types = (BlockTypes.Picture, BlockTypes.Figure)
 
-    def __init__(
-        self,
-        config: Optional[dict] = None,
-        ollama_url: str = "http://localhost:11434",
-        ollama_model: str = "llama3.2-vision:11b",
-        vision_prompt: str = (
-            "Describe this image precisely in technical terms. Focus on diagrams, code, text, and technical content."
-        ),
-    ) -> None:
+    def __init__(self, config: Optional[dict] = None) -> None:
         super().__init__(config)
-        self.ollama_url = ollama_url.rstrip("/")
-        self.ollama_model = ollama_model
-        self.vision_prompt = vision_prompt
+        if config is None:
+            config = {}
+        self.ollama_url = config.get("ollama_url", "http://localhost:11434").rstrip("/")
+        self.ollama_model = config.get("ollama_model", "llama3.2-vision:11b")
+        self.vision_prompt = config.get(
+            "vision_prompt",
+            "Describe this image precisely in technical terms. Focus on diagrams, code, text, and technical content.",
+        )
 
     def process_image_to_text(self, image: Image.Image) -> str:
         """Convert an image to a markdown description using the Ollama service."""
@@ -87,11 +84,12 @@ def create_converter_with_image_to_text(
     vision_prompt: str = (
         "Describe this image precisely in technical terms. Focus on diagrams, code, text, and technical content."
     ),
-):
+) -> "PdfConverter":
     """Return a PdfConverter that applies ImageToTextProcessor."""
     from marker.converters.pdf import PdfConverter
     from marker.models import create_model_dict
     from marker.config.parser import ConfigParser
+    from marker.util import classes_to_strings
 
     if config is None:
         config = {"extract_images": True, "output_dir": "output"}
@@ -99,16 +97,20 @@ def create_converter_with_image_to_text(
     if "page_range" in config and isinstance(config["page_range"], list):
         config["page_range"] = ",".join(str(p) for p in config["page_range"])
 
+    # Store ollama params so the processor can access them
+    config.update({
+        "ollama_url": ollama_url,
+        "ollama_model": ollama_model,
+        "vision_prompt": vision_prompt,
+    })
+
     config_parser = ConfigParser(config)
     processors = config_parser.get_processors()
-    processors.append(
-        ImageToTextProcessor(
-            config=config,
-            ollama_url=ollama_url,
-            ollama_model=ollama_model,
-            vision_prompt=vision_prompt,
-        )
-    )
+    if processors is None:
+        processors = classes_to_strings(list(PdfConverter.default_processors) + [ImageToTextProcessor])
+    else:
+        processors.append("marker.processors.image_to_text.ImageToTextProcessor")
+
     converter = PdfConverter(
         artifact_dict=create_model_dict(),
         config=config_parser.generate_config_dict(),