From 6572f2981a81f6f77105a9953d0cf1868a133d52 Mon Sep 17 00:00:00 2001
From: Travor <3488616445@qq.com>
Date: Fri, 22 May 2026 00:37:57 +0800
Subject: [PATCH] Add MiniCPM-V 4.6 examples gallery
---
README.md | 2 +
examples/gallery/README.md | 111 +++++++++++++++
examples/gallery/build_gallery.py | 166 +++++++++++++++++++++++
examples/gallery/gradio_demo.py | 75 ++++++++++
examples/gallery/minicpmv46_inference.py | 132 ++++++++++++++++++
5 files changed, 486 insertions(+)
create mode 100644 examples/gallery/README.md
create mode 100644 examples/gallery/build_gallery.py
create mode 100644 examples/gallery/gradio_demo.py
create mode 100644 examples/gallery/minicpmv46_inference.py
diff --git a/README.md b/README.md
index df3ce199..7ba6a7f5 100644
--- a/README.md
+++ b/README.md
@@ -193,6 +193,8 @@ MiniCPM-V 4.6 can be deployed across three mainstream end-side platforms — **i
pip install "transformers[torch]>=5.7.0" torchvision torchcodec
```
+Runnable minimal scripts are also available in [examples/gallery](./examples/gallery/).
+
> **Note on CUDA compatibility:** `torchcodec` (used for video decoding) may have compatibility issues with certain CUDA versions. For example, `torch>=2.11` bundles CUDA 13.1 by default, while environments with CUDA 12.x may encounter errors such as `RuntimeError: Could not load libtorchcodec`. Two workarounds:
>
> 1. **Replace `torchcodec` with `PyAV`** — supports both image and video inference without CUDA version constraints:
diff --git a/examples/gallery/README.md b/examples/gallery/README.md
new file mode 100644
index 00000000..413abdaf
--- /dev/null
+++ b/examples/gallery/README.md
@@ -0,0 +1,111 @@
+# MiniCPM-V 4.6 Examples Gallery
+
+This gallery contains small, runnable MiniCPM-V 4.6 examples for image, OCR, multi-image, video, low-memory, and local Gradio smoke tests. Each command can optionally write a JSON result that can be assembled into a static HTML gallery.
+
+## Setup
+
+Install the Transformers-based dependencies for MiniCPM-V 4.6:
+
+```bash
+pip install "transformers[torch]>=5.7.0" torchvision av pillow
+```
+
+`av` is a lightweight media decoding option for environments where `torchcodec` has CUDA compatibility issues. If your CUDA and PyTorch versions support `torchcodec`, you can use the installation path from the main README instead.
+
+For the optional local web UI:
+
+```bash
+pip install gradio
+```
+
+## Image Captioning
+
+Run the default refraction image example (all commands in this guide assume the repository root as the working directory):
+
+```bash
+python examples/gallery/minicpmv46_inference.py \
+ --prompt "Describe this image in one sentence." \
+ --output-json outputs/gallery/refraction.json
+```
+
+Expected output is a short description of the image, for example:
+
+```text
+A glass of water with a red pencil stuck inside it.
+```
+
+## OCR
+
+```bash
+python examples/gallery/minicpmv46_inference.py \
+ --image-url assets/hk_OCR.jpg \
+ --prompt "Read the visible text and summarize the scene in two sentences." \
+ --output-json outputs/gallery/ocr.json
+```
+
+For lower-memory GPUs, keep `--downsample-mode 16x` and reduce `--max-slice-nums` if needed. Use `--downsample-mode 4x` when you need finer visual detail and have enough memory.
+
+## Multi-Image Comparison
+
+Pass `--image-url` multiple times:
+
+```bash
+python examples/gallery/minicpmv46_inference.py \
+ --image-url assets/airplane.jpeg \
+ --image-url assets/worldmap_ck.jpg \
+ --prompt "Compare these two images in two concise sentences." \
+ --output-json outputs/gallery/multi_image.json
+```
+
+## Video Question Answering
+
+Run a short video smoke test with a small frame budget:
+
+```bash
+python examples/gallery/minicpmv46_inference.py \
+ --video-url https://huggingface.co/datasets/openbmb/DemoCase/resolve/main/football.mp4 \
+ --prompt "Describe the main action in this video in one sentence." \
+ --max-num-frames 8 \
+ --max-slice-nums 1 \
+ --output-json outputs/gallery/video.json
+```
+
+## Low-Memory Settings
+
+The default examples use `--downsample-mode 16x`, which is the faster and lighter setting. If memory is tight, also reduce image slices and video frames:
+
+```bash
+python examples/gallery/minicpmv46_inference.py \
+ --image-url assets/hk_OCR.jpg \
+ --prompt "Read the visible text and summarize the scene." \
+ --max-slice-nums 1 \
+ --max-new-tokens 64 \
+ --output-json outputs/gallery/ocr_low_memory.json
+```
+
+For more visual detail, switch to `--downsample-mode 4x` when your hardware has enough memory.
+
+## Static Gallery
+
+After writing one or more JSON outputs, build a local HTML gallery:
+
+```bash
+python examples/gallery/build_gallery.py \
+ --result-json outputs/gallery/refraction.json \
+ --result-json outputs/gallery/ocr.json \
+ --result-json outputs/gallery/multi_image.json \
+ --result-json outputs/gallery/video.json \
+ --output outputs/gallery/index.html
+```
+
+Open `outputs/gallery/index.html` to switch between cases and inspect the input media, prompt, and model output.
+
+## Local Gradio Demo
+
+Launch a minimal local UI:
+
+```bash
+python examples/gallery/gradio_demo.py
+```
+
+The Gradio demo loads the model once, then accepts either an image or a video plus a prompt. If both an image and a video are provided, the video is used and the image is ignored. Video inputs use the same frame-budget controls as the command-line script.
diff --git a/examples/gallery/build_gallery.py b/examples/gallery/build_gallery.py
new file mode 100644
index 00000000..c5d31b2e
--- /dev/null
+++ b/examples/gallery/build_gallery.py
@@ -0,0 +1,166 @@
+import argparse
+import html
+import json
+import os
+from pathlib import Path
+
+
+def parse_args():
+    """Parse the command-line options of the static gallery builder.
+
+    Returns the argparse namespace with ``result_json`` (list of Path), ``output`` and ``title``.
+    """
+    cli = argparse.ArgumentParser(description="Build a static MiniCPM-V examples gallery from JSON outputs.")
+    # action="append" gathers every --result-json occurrence into a single list.
+    result_help = "Path to a JSON file written by minicpmv46_inference.py. Pass multiple times."
+    cli.add_argument("--result-json", action="append", required=True, type=Path, help=result_help)
+    cli.add_argument("--output", type=Path, default=Path("outputs/minicpmv46_gallery/index.html"))
+    cli.add_argument("--title", default="MiniCPM-V 4.6 Gallery")
+    return cli.parse_args()
+
+
+def to_display_src(value, output_path):
+    """Return a URL for media ``value`` that the page written to ``output_path`` can load."""
+    from urllib.parse import quote  # local import keeps this fix self-contained
+    if not value or value.startswith(("http://", "https://", "data:")):
+        return value or ""  # empty -> "", remote/data URLs pass through untouched
+    path = Path(value)
+    if not path.exists():
+        return value  # not a visible local file: hand the reference back unchanged
+    try:
+        # Percent-encode like as_uri() below, so spaces or "#" in file names stay part of the path.
+        return quote(Path(os.path.relpath(path.resolve(), output_path.resolve().parent)).as_posix())
+    except ValueError:  # relpath cannot relate the two locations (e.g. different Windows drives)
+        return path.resolve().as_uri()
+
+
+def load_case(path, output_path):
+    """Read one inference result JSON and shape it into a gallery case.
+
+    Args:
+        path: Result file to read (UTF-8 JSON object).
+        output_path: Gallery page location, forwarded to ``to_display_src`` for every media entry.
+
+    Returns:
+        A dict with the keys ``title``, ``prompt``, ``answer``, ``media`` and ``source``.
+    """
+    record = json.loads(path.read_text(encoding="utf-8"))
+    entries = record.get("media")
+    if not entries:
+        # No "media" list: rebuild it from the separate image_url / video_url fields.
+        entries = [{"type": "image", "url": url} for url in record.get("image_url") or []]
+        if record.get("video_url"):
+            entries.append({"type": "video", "url": record["video_url"]})
+    return {
+        "title": path.stem.replace("_", " ").title(),  # e.g. "multi_image" -> "Multi Image"
+        "prompt": record.get("prompt", ""),
+        "answer": record.get("answer", ""),
+        "media": [
+            {"type": entry.get("type", "image"), "src": to_display_src(entry.get("url", ""), output_path)}
+            for entry in entries
+        ],
+        "source": path.as_posix(),
+    }
+
+
+def write_gallery(cases, output_path, title):
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ cases_json = json.dumps(cases, ensure_ascii=False)
+ page = f"""
+
+
+
+
+{html.escape(title)}
+
+
+
+
+{html.escape(title)}
+Local smoke-test inputs and model outputs generated with examples/gallery/minicpmv46_inference.py.
+
+
+
+ Prompt
+
+ Model Output
+
+
+
+
+
+
+"""
+ output_path.write_text(page, encoding="utf-8")
+
+
+def main():
+    """Build the gallery page from every --result-json file and report where it was written."""
+    args = parse_args()
+    write_gallery([load_case(result, args.output) for result in args.result_json], args.output, args.title)
+    print(f"Wrote gallery: {args.output}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/gallery/gradio_demo.py b/examples/gallery/gradio_demo.py
new file mode 100644
index 00000000..e6e30061
--- /dev/null
+++ b/examples/gallery/gradio_demo.py
@@ -0,0 +1,75 @@
+import argparse
+from pathlib import Path
+from types import SimpleNamespace
+
+import gradio as gr
+
+from minicpmv46_inference import DEFAULT_PROMPT, load_model_and_processor, run_inference
+
+
+def parse_args():
+    """Parse the model-loading and server options of the local Gradio demo."""
+    cli = argparse.ArgumentParser(description="Launch a minimal local Gradio demo for MiniCPM-V 4.6.")
+    for opt, val in (("--model-id", "openbmb/MiniCPM-V-4.6"), ("--device-map", "auto"), ("--attn-implementation", None)):
+        cli.add_argument(opt, default=val)
+    cli.add_argument("--local-files-only", action="store_true")
+    cli.add_argument("--server-name", default="127.0.0.1")
+    cli.add_argument("--server-port", type=int, default=7860)
+    return cli.parse_args()
+
+
+def make_infer_fn(model, processor, base_args):
+    """Return the Gradio click handler bound to the already loaded ``model`` and ``processor``."""
+
+    def infer(image, video, prompt, max_new_tokens, downsample_mode, max_slice_nums, max_num_frames):
+        if image is None and video is None:
+            # Without this guard run_inference would silently describe its default remote image.
+            raise gr.Error("Please provide an image or a video.")
+        # A video takes precedence: the image is only forwarded when no video was supplied.
+        video_url = str(Path(video).resolve()) if video is not None else None
+        image_url = str(Path(image).resolve()) if image is not None and video_url is None else None
+        args = SimpleNamespace(
+            model_id=base_args.model_id,
+            image_url=[image_url] if image_url else None,
+            video_url=video_url,
+            prompt=prompt or DEFAULT_PROMPT,
+            max_new_tokens=int(max_new_tokens),
+            downsample_mode=downsample_mode,
+            max_slice_nums=int(max_slice_nums),
+            max_num_frames=int(max_num_frames),
+            stack_frames=1,
+        )
+        return run_inference(model, processor, args)[0]
+
+    return infer
+
+
+def main():
+    """Load the model once, build the demo UI and serve it on the configured host and port."""
+    args = parse_args()
+    infer = make_infer_fn(*load_model_and_processor(args), args)
+
+    with gr.Blocks(title="MiniCPM-V 4.6 Local Demo") as demo:
+        gr.Markdown("# MiniCPM-V 4.6 Local Demo")
+        with gr.Row():
+            media_inputs = [gr.Image(type="filepath", label="Image"), gr.Video(label="Video")]
+        prompt_box = gr.Textbox(value=DEFAULT_PROMPT, label="Prompt")
+        with gr.Row():
+            # Listed in the positional order that infer() expects after the prompt.
+            tuning_inputs = [
+                gr.Slider(16, 512, value=128, step=16, label="Max new tokens"),
+                gr.Radio(["16x", "4x"], value="16x", label="Downsample mode"),
+                gr.Slider(1, 8, value=4, step=1, label="Max slice nums"),
+                gr.Slider(1, 32, value=8, step=1, label="Max video frames"),
+            ]
+        run_button = gr.Button("Run")
+        answer_box = gr.Textbox(label="Model output", lines=6)
+        run_button.click(
+            infer, inputs=[*media_inputs, prompt_box, *tuning_inputs], outputs=answer_box
+        )
+
+    demo.launch(server_name=args.server_name, server_port=args.server_port)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/gallery/minicpmv46_inference.py b/examples/gallery/minicpmv46_inference.py
new file mode 100644
index 00000000..93c8c955
--- /dev/null
+++ b/examples/gallery/minicpmv46_inference.py
@@ -0,0 +1,132 @@
+import argparse
+import json
+from pathlib import Path
+
+import torch
+from transformers import AutoModelForImageTextToText, AutoProcessor
+
+
+DEFAULT_IMAGE_URL = "https://huggingface.co/datasets/openbmb/DemoCase/resolve/main/refract.png"
+DEFAULT_VIDEO_URL = "https://huggingface.co/datasets/openbmb/DemoCase/resolve/main/football.mp4"
+DEFAULT_PROMPT = "Answer in one short sentence: what is shown in this input?"
+
+
+def parse_args():
+    """Parse the command-line options of the minimal inference example."""
+    cli = argparse.ArgumentParser(description="Run a minimal MiniCPM-V 4.6 inference example.")
+    cli.add_argument("--model-id", default="openbmb/MiniCPM-V-4.6", help="Hugging Face model id or local path.")
+    # Repeatable flag: every occurrence is appended to one list (None when the flag is absent).
+    image_help = "Remote image URL or local image path. Pass multiple times for multi-image input."
+    cli.add_argument("--image-url", action="append", help=image_help)
+    cli.add_argument("--video-url", help="Remote video URL or local video path. When set, image inputs are ignored.")
+    cli.add_argument("--prompt", default=DEFAULT_PROMPT, help="Text prompt for the image or video.")
+    cli.add_argument("--max-new-tokens", type=int, default=128)
+    cli.add_argument("--downsample-mode", default="16x", choices=["4x", "16x"])
+    cli.add_argument("--max-slice-nums", type=int, default=4)
+    cli.add_argument("--max-num-frames", type=int, default=16)
+    cli.add_argument("--stack-frames", type=int, default=1)
+    # Model loading and result output.
+    cli.add_argument("--device-map", default="auto")
+    cli.add_argument("--attn-implementation", default=None, help="Optional attention backend, e.g. flash_attention_2.")
+    cli.add_argument("--local-files-only", action="store_true", help="Load model files from the local cache only.")
+    cli.add_argument("--output-json", type=Path, help="Optional path to save the prompt and answer as JSON.")
+    return cli.parse_args()
+
+
+def load_model_and_processor(args):
+    """Load the processor and model for ``args.model_id`` and return them as ``(model, processor)``.
+
+    ``args`` must provide ``model_id``, ``local_files_only``, ``device_map`` and ``attn_implementation``.
+    """
+    processor = AutoProcessor.from_pretrained(args.model_id, local_files_only=args.local_files_only)
+    # "dtype" is the current name of the deprecated "torch_dtype" argument in Transformers.
+    model_kwargs = {"dtype": "auto", "device_map": args.device_map, "local_files_only": args.local_files_only}
+    if args.attn_implementation:
+        # Forwarded only when set, so the library picks its own attention backend otherwise.
+        model_kwargs["attn_implementation"] = args.attn_implementation
+    return AutoModelForImageTextToText.from_pretrained(args.model_id, **model_kwargs), processor
+
+
+def build_media_content(args):
+    """Return the media entries of the chat message as a list of ``{"type", "url"}`` dicts."""
+    if args.video_url:
+        # A video wins over any image inputs, which are then ignored.
+        return [{"type": "video", "url": args.video_url}]
+    # No --image-url given: fall back to the default demo image URL.
+    image_urls = args.image_url or [DEFAULT_IMAGE_URL]
+    return [{"type": "image", "url": image_url} for image_url in image_urls]
+
+
+def run_inference(model, processor, args):
+    """Run one user turn (media plus ``args.prompt``) through the model.
+
+    Args:
+        model: Loaded generation model; the inputs are moved to ``model.device``.
+        processor: Processor used for chat templating and for decoding.
+        args: Namespace carrying the prompt, media and preprocessing/generation options.
+
+    Returns:
+        ``(answer, media_content)``: decoded text of the first sequence and the media entries used.
+    """
+    media_content = build_media_content(args)
+    user_turn = {"role": "user", "content": [*media_content, {"type": "text", "text": args.prompt}]}
+
+    processor_kwargs = {"downsample_mode": args.downsample_mode, "max_slice_nums": args.max_slice_nums}
+    if args.video_url:
+        # Extra preprocessing options that are only passed for video input.
+        processor_kwargs["max_num_frames"] = args.max_num_frames
+        processor_kwargs["stack_frames"] = args.stack_frames
+        processor_kwargs["use_image_id"] = False
+
+    inputs = processor.apply_chat_template(
+        [user_turn],
+        tokenize=True,
+        add_generation_prompt=True,
+        return_dict=True,
+        return_tensors="pt",
+        processor_kwargs=processor_kwargs,
+    )
+    # Move the prepared tensors to the model's device before generating.
+    inputs = inputs.to(model.device)
+
+    with torch.inference_mode():
+        generated_ids = model.generate(
+            **inputs, downsample_mode=args.downsample_mode, max_new_tokens=args.max_new_tokens
+        )
+
+    # Drop the first len(input_ids) tokens of each sequence so only the continuation is decoded.
+    new_token_ids = [full[len(prompt_ids) :] for prompt_ids, full in zip(inputs.input_ids, generated_ids)]
+    decoded = processor.batch_decode(new_token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+    return decoded[0], media_content
+
+
+def write_output_json(args, answer, media_content):
+    """Save the run as JSON at ``args.output_json``; do nothing when no output path is set.
+
+    ``image_url`` lists only the images actually sent to the model, so it stays empty for video runs.
+    """
+    if not args.output_json:
+        return
+    record = {
+        "model_id": args.model_id,
+        "media": media_content,
+        # Derived from media_content: --image-url values overridden by --video-url are not recorded.
+        "image_url": [item["url"] for item in media_content if item["type"] == "image"],
+        "video_url": args.video_url,
+        "prompt": args.prompt,
+        "answer": answer,
+    }
+    args.output_json.parent.mkdir(parents=True, exist_ok=True)
+    args.output_json.write_text(json.dumps(record, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+def main():
+    """Run one inference from the command line, print the answer and optionally save it as JSON."""
+    args = parse_args()
+    answer, media_content = run_inference(*load_model_and_processor(args), args)
+    print(answer)
+    write_output_json(args, answer, media_content)
+
+
+if __name__ == "__main__":
+ main()