From 6572f2981a81f6f77105a9953d0cf1868a133d52 Mon Sep 17 00:00:00 2001
From: Travor <3488616445@qq.com>
Date: Fri, 22 May 2026 00:37:57 +0800
Subject: [PATCH] Add MiniCPM-V 4.6 examples gallery

---
 README.md                                |   2 +
 examples/gallery/README.md               | 111 +++++++++++++++
 examples/gallery/build_gallery.py        | 166 +++++++++++++++++++++++
 examples/gallery/gradio_demo.py          |  75 ++++++++++
 examples/gallery/minicpmv46_inference.py | 132 ++++++++++++++++++
 5 files changed, 486 insertions(+)
 create mode 100644 examples/gallery/README.md
 create mode 100644 examples/gallery/build_gallery.py
 create mode 100644 examples/gallery/gradio_demo.py
 create mode 100644 examples/gallery/minicpmv46_inference.py

diff --git a/README.md b/README.md
index df3ce199..7ba6a7f5 100644
--- a/README.md
+++ b/README.md
@@ -193,6 +193,8 @@ MiniCPM-V 4.6 can be deployed across three mainstream end-side platforms — **i
 pip install "transformers[torch]>=5.7.0" torchvision torchcodec
 ```
 
+Runnable minimal scripts are also available in [examples/gallery](./examples/gallery/).
+
 > **Note on CUDA compatibility:** `torchcodec` (used for video decoding) may have compatibility issues with certain CUDA versions. For example, `torch>=2.11` bundles CUDA 13.1 by default, while environments with CUDA 12.x may encounter errors such as `RuntimeError: Could not load libtorchcodec`. Two workarounds:
 >
 > 1. **Replace `torchcodec` with `PyAV`** — supports both image and video inference without CUDA version constraints:
diff --git a/examples/gallery/README.md b/examples/gallery/README.md
new file mode 100644
index 00000000..413abdaf
--- /dev/null
+++ b/examples/gallery/README.md
@@ -0,0 +1,111 @@
+# MiniCPM-V 4.6 Examples Gallery
+
+This gallery contains small, runnable MiniCPM-V 4.6 examples for image, OCR, multi-image, video, low-memory, and local Gradio smoke tests. Each command can optionally write a JSON result that can be assembled into a static HTML gallery. The commands below use paths relative to the repository root, so run them from there.
+
+## Setup
+
+Install the dependencies for the Transformers inference path of MiniCPM-V 4.6:
+
+```bash
+pip install "transformers[torch]>=5.7.0" torchvision av pillow
+```
+
+`av` is a lightweight media decoding option for environments where `torchcodec` has CUDA compatibility issues. If your CUDA and PyTorch versions support `torchcodec`, you can use the installation path from the main README instead.
+
+For the optional local web UI:
+
+```bash
+pip install gradio
+```
+
+## Image Captioning
+
+Run the default refraction image example:
+
+```bash
+python examples/gallery/minicpmv46_inference.py \
+  --prompt "Describe this image in one sentence." \
+  --output-json outputs/gallery/refraction.json
+```
+
+Expected output is a short description of the image, for example:
+
+```text
+A glass of water with a red pencil stuck inside it.
+```
+
+## OCR
+
+```bash
+python examples/gallery/minicpmv46_inference.py \
+  --image-url assets/hk_OCR.jpg \
+  --prompt "Read the visible text and summarize the scene in two sentences." \
+  --output-json outputs/gallery/ocr.json
+```
+
+For lower-memory GPUs, keep `--downsample-mode 16x` and reduce `--max-slice-nums` if needed. Use `--downsample-mode 4x` when you need finer visual detail and have enough memory.
+
+## Multi-Image Comparison
+
+Pass `--image-url` multiple times:
+
+```bash
+python examples/gallery/minicpmv46_inference.py \
+  --image-url assets/airplane.jpeg \
+  --image-url assets/worldmap_ck.jpg \
+  --prompt "Compare these two images in two concise sentences." \
+  --output-json outputs/gallery/multi_image.json
+```
+
+## Video Question Answering
+
+Run a short video smoke test with a small frame budget:
+
+```bash
+python examples/gallery/minicpmv46_inference.py \
+  --video-url https://huggingface.co/datasets/openbmb/DemoCase/resolve/main/football.mp4 \
+  --prompt "Describe the main action in this video in one sentence." \
+  --max-num-frames 8 \
+  --max-slice-nums 1 \
+  --output-json outputs/gallery/video.json
+```
+
+## Low-Memory Settings
+
+The default examples use `--downsample-mode 16x`, which is the faster and lighter setting. If memory is tight, also reduce image slices and video frames:
+
+```bash
+python examples/gallery/minicpmv46_inference.py \
+  --image-url assets/hk_OCR.jpg \
+  --prompt "Read the visible text and summarize the scene." \
+  --max-slice-nums 1 \
+  --max-new-tokens 64 \
+  --output-json outputs/gallery/ocr_low_memory.json
+```
+
+For more visual detail, switch to `--downsample-mode 4x` when your hardware has enough memory.
+
+## Static Gallery
+
+After writing one or more JSON outputs, build a local HTML gallery:
+
+```bash
+python examples/gallery/build_gallery.py \
+  --result-json outputs/gallery/refraction.json \
+  --result-json outputs/gallery/ocr.json \
+  --result-json outputs/gallery/multi_image.json \
+  --result-json outputs/gallery/video.json \
+  --output outputs/gallery/index.html
+```
+
+Open `outputs/gallery/index.html` to switch between cases and inspect the input media, prompt, and model output.
+
+## Local Gradio Demo
+
+Launch a minimal local UI:
+
+```bash
+python examples/gallery/gradio_demo.py
+```
+
+The Gradio demo loads the model once, then accepts either an image or a video plus a prompt. When both are provided, the video is used and the image is ignored. Video inputs use the same frame-budget controls as the command-line script.
diff --git a/examples/gallery/build_gallery.py b/examples/gallery/build_gallery.py
new file mode 100644
index 00000000..c5d31b2e
--- /dev/null
+++ b/examples/gallery/build_gallery.py
@@ -0,0 +1,166 @@
+import argparse
+import html
+import json
+import os
+from pathlib import Path
+
+
+def parse_args():  # Parse the gallery builder's command-line options.
+    parser = argparse.ArgumentParser(description="Build a static MiniCPM-V examples gallery from JSON outputs.")
+    parser.add_argument(
+        "--result-json",
+        action="append",  # every occurrence is collected, so args.result_json is a list
+        required=True,
+        type=Path,
+        help="Path to a JSON file written by minicpmv46_inference.py. Pass multiple times.",
+    )
+    parser.add_argument("--output", type=Path, default=Path("outputs/minicpmv46_gallery/index.html"))  # HTML file to write
+    parser.add_argument("--title", default="MiniCPM-V 4.6 Gallery")  # used for the page <title> and <h1>
+    return parser.parse_args()
+
+
+def to_display_src(value, output_path):
+    """Turn a stored media reference into a ``src`` value for the gallery page."""
+    if not value:
+        return ""
+    # Remote and data URLs, and paths that do not exist locally, pass through unchanged.
+    if value.startswith(("http://", "https://", "data:")) or not Path(value).exists():
+        return value
+    candidate = Path(value)
+    try:
+        relative = os.path.relpath(candidate.resolve(), output_path.resolve().parent)
+    except ValueError:  # no relative path could be computed; fall back to an absolute file URI
+        return candidate.resolve().as_uri()
+    return Path(relative).as_posix()
+
+
+def load_case(path, output_path):
+    """Read one result JSON file and normalize it into a gallery case dict.
+
+    A non-empty ``media`` list is used as-is; otherwise the entries are rebuilt
+    from the ``image_url`` and ``video_url`` fields.
+    """
+    record = json.loads(path.read_text(encoding="utf-8"))
+    entries = record.get("media")
+    if not entries:
+        entries = [{"type": "image", "url": url} for url in record.get("image_url") or []]
+        if record.get("video_url"):
+            entries.append({"type": "video", "url": record["video_url"]})
+    # Every entry keeps its type and gets a display source from to_display_src.
+    rendered = [
+        {
+            "type": entry.get("type", "image"),
+            "src": to_display_src(entry.get("url", ""), output_path),
+        }
+        for entry in entries
+    ]
+    return {
+        "title": path.stem.replace("_", " ").title(),
+        "prompt": record.get("prompt", ""),
+        "answer": record.get("answer", ""),
+        "media": rendered,
+        "source": path.as_posix(),
+    }
+
+
+def write_gallery(cases, output_path, title):  # Render the case dicts into one self-contained HTML page at output_path.
+    output_path.parent.mkdir(parents=True, exist_ok=True)  # create the output directory if it is missing
+    cases_json = json.dumps(cases, ensure_ascii=False).replace("<", "\\u003c")  # "<" escaped so "</script>" in a prompt/answer cannot end the inline script
+    page = f"""<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8" />
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<title>{html.escape(title)}</title>
+<style>
+body {{ margin: 0; background: #f6f7fb; color: #172033; font-family: Arial, sans-serif; }}
+main {{ max-width: 1120px; margin: 28px auto; padding: 0 18px; }}
+h1 {{ margin: 0 0 8px; font-size: 30px; }}
+.subtitle {{ margin: 0 0 22px; color: #475569; }}
+.card {{ background: white; border: 1px solid #d8dee9; border-radius: 8px; padding: 18px; box-shadow: 0 12px 30px rgba(15, 23, 42, 0.06); }}
+.top {{ display: flex; justify-content: space-between; gap: 16px; align-items: flex-start; margin-bottom: 16px; }}
+.case-title {{ margin: 0; font-size: 22px; }}
+.buttons {{ display: flex; flex-wrap: wrap; justify-content: flex-end; gap: 8px; max-width: 680px; }}
+button {{ border: 1px solid #cbd5e1; border-radius: 6px; background: white; padding: 8px 12px; cursor: pointer; font-size: 14px; }}
+button.active {{ background: #172033; color: white; border-color: #172033; }}
+.media {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(260px, 1fr)); gap: 14px; }}
+.frame {{ background: #eef2f7; border: 1px solid #d8dee9; border-radius: 6px; padding: 10px; }}
+.frame img, .frame video {{ width: 100%; max-height: 560px; object-fit: contain; background: white; border-radius: 5px; }}
+.label {{ margin: 0 0 8px; color: #475569; font-weight: 700; }}
+.section-title {{ margin: 18px 0 8px; color: #475569; font-size: 15px; text-transform: uppercase; letter-spacing: 0.04em; }}
+pre {{ white-space: pre-wrap; background: #f1f5f9; border-radius: 6px; padding: 12px; margin: 0; overflow: auto; }}
+@media (max-width: 760px) {{ .top {{ display: block; }} .buttons {{ justify-content: flex-start; margin-top: 12px; }} }}
+</style>
+</head>
+<body>
+<main>
+<h1>{html.escape(title)}</h1>
+<p class="subtitle">Local smoke-test inputs and model outputs generated with <code>examples/gallery/minicpmv46_inference.py</code>.</p>
+<section class="card">
+  <div class="top">
+    <h2 class="case-title" id="caseTitle"></h2>
+    <div class="buttons" id="caseButtons"></div>
+  </div>
+  <div class="media" id="media"></div>
+  <h3 class="section-title">Prompt</h3>
+  <pre id="prompt"></pre>
+  <h3 class="section-title">Model Output</h3>
+  <pre id="answer"></pre>
+</section>
+</main>
+<script>
+const cases = {cases_json};
+const buttons = document.getElementById('caseButtons');
+const media = document.getElementById('media');
+function renderCase(index) {{
+  const item = cases[index];
+  document.getElementById('caseTitle').textContent = item.title;
+  document.getElementById('prompt').textContent = item.prompt;
+  document.getElementById('answer').textContent = item.answer;
+  media.innerHTML = '';
+  item.media.forEach((entry, mediaIndex) => {{
+    const frame = document.createElement('section');
+    frame.className = 'frame';
+    const label = document.createElement('p');
+    label.className = 'label';
+    label.textContent = entry.type === 'video' ? 'Video' : `Image ${{mediaIndex + 1}}`;
+    frame.appendChild(label);
+    const element = document.createElement(entry.type === 'video' ? 'video' : 'img');
+    element.src = entry.src;
+    if (entry.type === 'video') {{
+      element.controls = true;
+      element.muted = true;
+    }} else {{
+      element.alt = item.title;
+    }}
+    frame.appendChild(element);
+    media.appendChild(frame);
+  }});
+  [...buttons.children].forEach((button, buttonIndex) => button.classList.toggle('active', buttonIndex === index));
+}}
+cases.forEach((item, index) => {{
+  const button = document.createElement('button');
+  button.type = 'button';
+  button.textContent = item.title;
+  button.addEventListener('click', () => renderCase(index));
+  buttons.appendChild(button);
+}});
+if (cases.length) {{
+  renderCase(0);
+}}
+</script>
+</body>
+</html>
+"""
+    output_path.write_text(page, encoding="utf-8")
+
+
+def main():
+    """Build the gallery page from the result files named on the command line."""
+    options = parse_args()
+    write_gallery([load_case(item, options.output) for item in options.result_json], options.output, options.title)
+    print(f"Wrote gallery: {options.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/gallery/gradio_demo.py b/examples/gallery/gradio_demo.py
new file mode 100644
index 00000000..e6e30061
--- /dev/null
+++ b/examples/gallery/gradio_demo.py
@@ -0,0 +1,75 @@
+import argparse
+from pathlib import Path
+from types import SimpleNamespace
+
+import gradio as gr
+
+from minicpmv46_inference import DEFAULT_PROMPT, load_model_and_processor, run_inference
+
+
+def parse_args():  # Parse model-loading and server options for the Gradio demo.
+    parser = argparse.ArgumentParser(description="Launch a minimal local Gradio demo for MiniCPM-V 4.6.")
+    parser.add_argument("--model-id", default="openbmb/MiniCPM-V-4.6")  # this and the next three options are read by load_model_and_processor
+    parser.add_argument("--device-map", default="auto")
+    parser.add_argument("--attn-implementation", default=None)
+    parser.add_argument("--local-files-only", action="store_true")
+    parser.add_argument("--server-name", default="127.0.0.1")  # forwarded to demo.launch(server_name=...)
+    parser.add_argument("--server-port", type=int, default=7860)  # forwarded to demo.launch(server_port=...)
+    return parser.parse_args()
+
+
+def make_infer_fn(model, processor, base_args):
+    """Return the Gradio click handler bound to a loaded model and processor.
+
+    ``base_args`` supplies the model id copied onto every request.
+    """
+    def infer(image, video, prompt, max_new_tokens, downsample_mode, max_slice_nums, max_num_frames):
+        # Inputs are treated as file paths; when a video is present the image is dropped.
+        image_url = None if image is None else str(Path(image).resolve())
+        video_url = None if video is None else str(Path(video).resolve())
+        request = SimpleNamespace(
+            model_id=base_args.model_id,
+            image_url=None if video_url or not image_url else [image_url],
+            video_url=video_url,
+            prompt=prompt or DEFAULT_PROMPT,
+            max_new_tokens=int(max_new_tokens),
+            downsample_mode=downsample_mode,
+            max_slice_nums=int(max_slice_nums),
+            max_num_frames=int(max_num_frames),
+            stack_frames=1,
+        )
+        answer, _media = run_inference(model, processor, request)
+        return answer
+
+    return infer
+
+
+def main():  # Load the model once, build the Blocks UI, and start the local server.
+    args = parse_args()
+    model, processor = load_model_and_processor(args)
+    infer = make_infer_fn(model, processor, args)  # click handler closed over the loaded model and processor
+
+    with gr.Blocks(title="MiniCPM-V 4.6 Local Demo") as demo:
+        gr.Markdown("# MiniCPM-V 4.6 Local Demo")
+        with gr.Row():
+            image = gr.Image(type="filepath", label="Image")  # NOTE(review): presumably yields a path string, which infer wraps in Path() - confirm
+            video = gr.Video(label="Video")
+        prompt = gr.Textbox(value=DEFAULT_PROMPT, label="Prompt")
+        with gr.Row():
+            max_new_tokens = gr.Slider(16, 512, value=128, step=16, label="Max new tokens")
+            downsample_mode = gr.Radio(["16x", "4x"], value="16x", label="Downsample mode")
+            max_slice_nums = gr.Slider(1, 8, value=4, step=1, label="Max slice nums")
+            max_num_frames = gr.Slider(1, 32, value=8, step=1, label="Max video frames")
+        run = gr.Button("Run")
+        output = gr.Textbox(label="Model output", lines=6)
+        run.click(
+            infer,
+            inputs=[image, video, prompt, max_new_tokens, downsample_mode, max_slice_nums, max_num_frames],  # same order as infer's parameters
+            outputs=output,
+        )
+
+    demo.launch(server_name=args.server_name, server_port=args.server_port)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/gallery/minicpmv46_inference.py b/examples/gallery/minicpmv46_inference.py
new file mode 100644
index 00000000..93c8c955
--- /dev/null
+++ b/examples/gallery/minicpmv46_inference.py
@@ -0,0 +1,132 @@
+import argparse
+import json
+from pathlib import Path
+
+import torch
+from transformers import AutoModelForImageTextToText, AutoProcessor
+
+
+DEFAULT_IMAGE_URL = "https://huggingface.co/datasets/openbmb/DemoCase/resolve/main/refract.png"
+DEFAULT_VIDEO_URL = "https://huggingface.co/datasets/openbmb/DemoCase/resolve/main/football.mp4"
+DEFAULT_PROMPT = "Answer in one short sentence: what is shown in this input?"
+
+
+def parse_args():  # Parse the command-line options of the inference example.
+    parser = argparse.ArgumentParser(description="Run a minimal MiniCPM-V 4.6 inference example.")
+    parser.add_argument("--model-id", default="openbmb/MiniCPM-V-4.6", help="Hugging Face model id or local path.")
+    parser.add_argument(
+        "--image-url",
+        action="append",  # every occurrence is collected, so args.image_url is a list (or None when absent)
+        help="Remote image URL or local image path. Pass multiple times for multi-image input.",
+    )
+    parser.add_argument("--video-url", help="Remote video URL or local video path. When set, image inputs are ignored.")
+    parser.add_argument("--prompt", default=DEFAULT_PROMPT, help="Text prompt for the image or video.")
+    parser.add_argument("--max-new-tokens", type=int, default=128)  # passed to model.generate(max_new_tokens=...)
+    parser.add_argument("--downsample-mode", default="16x", choices=["4x", "16x"])  # sent to both the processor and model.generate
+    parser.add_argument("--max-slice-nums", type=int, default=4)  # forwarded in processor_kwargs
+    parser.add_argument("--max-num-frames", type=int, default=16)  # forwarded to the processor only when --video-url is set
+    parser.add_argument("--stack-frames", type=int, default=1)  # forwarded to the processor only when --video-url is set
+    parser.add_argument("--device-map", default="auto")
+    parser.add_argument("--attn-implementation", default=None, help="Optional attention backend, e.g. flash_attention_2.")
+    parser.add_argument("--local-files-only", action="store_true", help="Load model files from the local cache only.")
+    parser.add_argument("--output-json", type=Path, help="Optional path to save the prompt and answer as JSON.")
+    return parser.parse_args()
+
+
+def load_model_and_processor(args):
+    """Load the processor and model named by ``args.model_id``.
+
+    Returns a ``(model, processor)`` tuple.
+    """
+    processor = AutoProcessor.from_pretrained(args.model_id, local_files_only=args.local_files_only)
+    load_options = {"torch_dtype": "auto", "device_map": args.device_map, "local_files_only": args.local_files_only}
+    # The attention backend is forwarded only when one was requested.
+    if args.attn_implementation:
+        load_options["attn_implementation"] = args.attn_implementation
+    model = AutoModelForImageTextToText.from_pretrained(args.model_id, **load_options)
+    return model, processor
+
+
+def build_media_content(args):
+    """Return the media entries placed ahead of the text prompt.
+
+    A video, when given, replaces every image; with neither, the default image is used.
+    """
+    if args.video_url:
+        return [{"type": "video", "url": args.video_url}]
+    return [{"type": "image", "url": url} for url in args.image_url or [DEFAULT_IMAGE_URL]]
+
+
+def run_inference(model, processor, args):  # Run one user turn; returns (answer text, media entries used).
+    media_content = build_media_content(args)
+    messages = [{"role": "user", "content": media_content + [{"type": "text", "text": args.prompt}]}]  # media entries first, then the prompt
+
+    processor_kwargs = {
+        "downsample_mode": args.downsample_mode,
+        "max_slice_nums": args.max_slice_nums,
+    }
+    if args.video_url:  # frame options are forwarded for video inputs only
+        processor_kwargs.update(
+            {
+                "max_num_frames": args.max_num_frames,
+                "stack_frames": args.stack_frames,
+                "use_image_id": False,
+            }
+        )
+    inputs = processor.apply_chat_template(
+        messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_dict=True,
+        return_tensors="pt",
+        processor_kwargs=processor_kwargs,
+    ).to(model.device)  # move the processed inputs to the model's device
+
+    with torch.inference_mode():
+        generated_ids = model.generate(
+            **inputs,
+            downsample_mode=args.downsample_mode,
+            max_new_tokens=args.max_new_tokens,
+        )
+
+    generated_ids_trimmed = [  # drop the first len(input_ids) ids of each output (presumably the echoed prompt - confirm)
+        output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    answer = processor.batch_decode(
+        generated_ids_trimmed,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
+    )[0]  # only the first decoded sequence is returned
+    return answer, media_content
+
+
+def write_output_json(args, answer, media_content):
+    """Write the run's inputs and answer to ``args.output_json``.
+
+    Does nothing when no output path was given; parent directories are created.
+    """
+    if not args.output_json:
+        return
+    record = {
+        "model_id": args.model_id,
+        "media": media_content,
+        # Only the images actually sent to the model; image inputs are ignored for video runs.
+        "image_url": [item["url"] for item in media_content if item.get("type") == "image"],
+        "video_url": args.video_url,
+        "prompt": args.prompt,
+        "answer": answer,
+    }
+    args.output_json.parent.mkdir(parents=True, exist_ok=True)
+    args.output_json.write_text(json.dumps(record, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+def main():
+    """Run one inference from the command line, print it, and optionally save it."""
+    args = parse_args()
+    answer, media_content = run_inference(*load_model_and_processor(args), args)
+    print(answer)
+    write_output_json(args, answer, media_content)
+
+
+if __name__ == "__main__":
+    main()