diff --git a/image_to_description.py b/image_to_description.py
new file mode 100644
index 000000000..f51a154bc
--- /dev/null
+++ b/image_to_description.py
@@ -0,0 +1,67 @@
+from typing import Annotated, List
+
+from pydantic import BaseModel
+
+from marker.processors.llm import BaseLLMSimpleBlockProcessor, PromptData, BlockData
+from marker.schema import BlockTypes
+from marker.schema.document import Document
+
+
+class ImageToDescriptionProcessor(BaseLLMSimpleBlockProcessor):
+    """Simple processor that replaces images with generated text.
+
+    Targets ``Picture`` and ``Figure`` blocks. The text written by
+    ``rewrite_block`` is currently a fixed placeholder built from the block
+    id; the LLM response is not consulted.
+    """
+
+    block_types = (
+        BlockTypes.Picture,
+        BlockTypes.Figure,
+    )
+
+    extract_images: Annotated[bool, "Extract images from the document."] = True
+
+    def inference_blocks(self, document: Document) -> List[BlockData]:
+        """Return the blocks to process, or nothing when images are extracted."""
+        blocks = super().inference_blocks(document)
+        # With extract_images enabled no block is selected, so block_prompts
+        # builds no prompts for this processor.
+        if self.extract_images:
+            return []
+        return blocks
+
+    def block_prompts(self, document: Document) -> List[PromptData]:
+        """Build one prompt entry per selected image block."""
+        prompts = []
+        for block_data in self.inference_blocks(document):
+            block = block_data["block"]
+            prompts.append(
+                {
+                    # The prompt text is left empty here.
+                    "prompt": "",
+                    "image": self.extract_image(document, block),
+                    "block": block,
+                    "schema": ImageDescriptionSchema,
+                    "page": block_data["page"],
+                }
+            )
+        return prompts
+
+    def rewrite_block(
+        self, response: dict, prompt_data: PromptData, document: Document
+    ):
+        """Replace the block's HTML with a placeholder description.
+
+        ``response`` and ``document`` are not used by this implementation.
+        """
+        block = prompt_data["block"]
+        image_name = block.id.to_path()
+        description = f"This is the image {image_name}"
+        block.html = f"<p>{description}</p>"
+
+
+class ImageDescriptionSchema(BaseModel):
+    """Schema attached to each prompt: a single ``description`` string field."""
+
+    description: str
diff --git a/image_to_description_script.py b/image_to_description_script.py
new file mode 100755
index 000000000..9429babfd
--- /dev/null
+++ b/image_to_description_script.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+"""
+Minimal script: run Marker with ``ImageToDescriptionProcessor``.
+
+Converts the pages in ``PAGE_RANGE`` of ``PDF_PATH`` to markdown and writes
+the result into ``OUTPUT_DIR``.
+"""
+
+from pathlib import Path
+
+from image_to_description import ImageToDescriptionProcessor
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.config.parser import ConfigParser
+
+PDF_PATH = Path(
+    "Design Patterns in C# A Hands-on Guide with Real-World Examples - Vaskaran Sarcar.pdf"
+)
+PAGE_RANGE = [31]
+OUTPUT_DIR = Path("test_output")
+
+# Prepare config
+config = {
+    "extract_images": False,  # Must be False for processor to run!
+    "output_dir": str(OUTPUT_DIR),
+    "page_range": ",".join(str(p) for p in PAGE_RANGE),
+    "output_format": "markdown",
+    "use_llm": True,
+    # Add any other LLM config keys needed (e.g., API key)
+    # "gemini_api_key": "your_key_here",
+}
+
+# Create converter with the processor class
+converter = PdfConverter(
+    artifact_dict=create_model_dict(),
+    config=ConfigParser(config).generate_config_dict(),
+    # NOTE(review): PdfConverter may expect dotted-path strings here rather
+    # than class objects -- confirm against the installed marker version.
+    processor_list=[ImageToDescriptionProcessor],
+    # Replace with your actual LLM service if needed
+    llm_service="marker.services.ollama.OllamaService",
+)
+
+# Run conversion and save markdown
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+rendered = converter(str(PDF_PATH))
+final_md_path = OUTPUT_DIR / f"{PDF_PATH.stem}_processed.md"
+final_md_path.write_text(rendered.markdown, encoding="utf-8")
+print(f"✅ Processed markdown saved to: {final_md_path}")