Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions image_to_description.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from typing import Annotated, List

from pydantic import BaseModel

from marker.processors.llm import BaseLLMSimpleBlockProcessor, PromptData, BlockData
from marker.schema import BlockTypes
from marker.schema.document import Document


class ImageToDescriptionProcessor(BaseLLMSimpleBlockProcessor):
    """Replace Picture and Figure blocks with a short HTML text placeholder.

    One prompt entry with an empty prompt string is built per selected block,
    and ``rewrite_block`` overwrites the block's ``html`` with a fixed sentence
    that names the image path. Nothing is selected while ``extract_images``
    is truthy.
    """

    block_types = (BlockTypes.Picture, BlockTypes.Figure)

    extract_images: Annotated[bool, "Extract images from the document."] = True

    def inference_blocks(self, document: Document) -> List[BlockData]:
        """Return the parent's block selection, or ``[]`` if images are extracted.

        Args:
            document: Document passed through to the parent implementation.

        Returns:
            The parent's list when ``extract_images`` is falsy, otherwise an
            empty list.
        """
        # The parent lookup always runs first; the flag only decides whether
        # its result is handed back.
        candidates = super().inference_blocks(document)
        return [] if self.extract_images else candidates

    def block_prompts(self, document: Document) -> List[PromptData]:
        """Build one prompt entry for every block from ``inference_blocks``.

        Args:
            document: Document the blocks and their images are taken from.

        Returns:
            A list of dicts with the keys ``prompt`` (always an empty string),
            ``image``, ``block``, ``schema`` and ``page``.
        """
        return [
            {
                "prompt": "",
                "image": self.extract_image(document, entry["block"]),
                "block": entry["block"],
                "schema": ImageDescriptionSchema,
                "page": entry["page"],
            }
            for entry in self.inference_blocks(document)
        ]

    def rewrite_block(
        self, response: dict, prompt_data: PromptData, document: Document
    ):
        """Overwrite the prompted block's HTML with a placeholder paragraph.

        Args:
            response: Not consulted; the written text is fixed.
            prompt_data: Entry whose ``"block"`` item is the block to rewrite.
            document: Not used here.
        """
        target = prompt_data["block"]
        image_name = target.id.to_path()
        description = f"This is the image {image_name}"
        target.html = f"<p>{description}</p>"


# Pydantic model that ImageToDescriptionProcessor.block_prompts passes along
# under the "schema" key of every prompt entry. It declares one string field.
class ImageDescriptionSchema(BaseModel):
    # Text describing the image.
    description: str
44 changes: 44 additions & 0 deletions image_to_description_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python
"""
Minimal script: run Marker with ``ImageToDescriptionProcessor``.
"""

from pathlib import Path

from image_to_description import ImageToDescriptionProcessor
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser

# Input document, the page selection, and the directory results are written to.
PDF_PATH = Path(
    "Design Patterns in C# A Hands-on Guide with Real-World Examples - Vaskaran Sarcar.pdf"
)
PAGE_RANGE = [31]
OUTPUT_DIR = Path("test_output")

# Converter settings.
config = dict(
    # Keep this False: ImageToDescriptionProcessor selects no blocks when
    # extract_images is truthy.
    extract_images=False,
    output_dir=str(OUTPUT_DIR),
    page_range=",".join(map(str, PAGE_RANGE)),
    output_format="markdown",
    use_llm=True,
    # Further LLM settings (for example an API key) can be added here:
    # gemini_api_key="your_key_here",
)

# Build the converter with ImageToDescriptionProcessor as its only processor.
# The model dict is created before the config is parsed, as in a single call.
models = create_model_dict()
parsed_config = ConfigParser(config).generate_config_dict()
converter = PdfConverter(
    artifact_dict=models,
    config=parsed_config,
    processor_list=[ImageToDescriptionProcessor],
    # Swap in a different LLM service path here if required.
    llm_service="marker.services.ollama.OllamaService",
)

# Convert the PDF and write the resulting markdown next to the other output.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
rendered = converter(str(PDF_PATH))
final_md_path = OUTPUT_DIR.joinpath(f"{PDF_PATH.stem}_processed.md")
final_md_path.write_text(rendered.markdown, encoding="utf-8")
print(f"✅ Processed markdown saved to: {final_md_path}")