From 6572f2981a81f6f77105a9953d0cf1868a133d52 Mon Sep 17 00:00:00 2001 From: Travor <3488616445@qq.com> Date: Fri, 22 May 2026 00:37:57 +0800 Subject: [PATCH] Add MiniCPM-V 4.6 examples gallery --- README.md | 2 + examples/gallery/README.md | 111 +++++++++++++++ examples/gallery/build_gallery.py | 166 +++++++++++++++++++++++ examples/gallery/gradio_demo.py | 75 ++++++++++ examples/gallery/minicpmv46_inference.py | 132 ++++++++++++++++++ 5 files changed, 486 insertions(+) create mode 100644 examples/gallery/README.md create mode 100644 examples/gallery/build_gallery.py create mode 100644 examples/gallery/gradio_demo.py create mode 100644 examples/gallery/minicpmv46_inference.py diff --git a/README.md b/README.md index df3ce199..7ba6a7f5 100644 --- a/README.md +++ b/README.md @@ -193,6 +193,8 @@ MiniCPM-V 4.6 can be deployed across three mainstream end-side platforms — **i pip install "transformers[torch]>=5.7.0" torchvision torchcodec ``` +Runnable minimal scripts are also available in [examples/gallery](./examples/gallery/). + > **Note on CUDA compatibility:** `torchcodec` (used for video decoding) may have compatibility issues with certain CUDA versions. For example, `torch>=2.11` bundles CUDA 13.1 by default, while environments with CUDA 12.x may encounter errors such as `RuntimeError: Could not load libtorchcodec`. Two workarounds: > > 1. **Replace `torchcodec` with `PyAV`** — supports both image and video inference without CUDA version constraints: diff --git a/examples/gallery/README.md b/examples/gallery/README.md new file mode 100644 index 00000000..413abdaf --- /dev/null +++ b/examples/gallery/README.md @@ -0,0 +1,111 @@ +# MiniCPM-V 4.6 Examples Gallery + +This gallery contains small, runnable MiniCPM-V 4.6 examples for image, OCR, multi-image, video, low-memory, and local Gradio smoke tests. Each command can optionally write a JSON result that can be assembled into a static HTML gallery. 
+ +## Setup + +Install the current Transformers path for MiniCPM-V 4.6: + +```bash +pip install "transformers[torch]>=5.7.0" torchvision av pillow +``` + +`av` is a lightweight media decoding option for environments where `torchcodec` has CUDA compatibility issues. If your CUDA and PyTorch versions support `torchcodec`, you can use the installation path from the main README instead. + +For the optional local web UI: + +```bash +pip install gradio +``` + +## Image Captioning + +Run the default refraction image example: + +```bash +python examples/gallery/minicpmv46_inference.py \ + --prompt "Describe this image in one sentence." \ + --output-json outputs/gallery/refraction.json +``` + +Expected output is a short description of the image, for example: + +```text +A glass of water with a red pencil stuck inside it. +``` + +## OCR + +```bash +python examples/gallery/minicpmv46_inference.py \ + --image-url assets/hk_OCR.jpg \ + --prompt "Read the visible text and summarize the scene in two sentences." \ + --output-json outputs/gallery/ocr.json +``` + +For lower-memory GPUs, keep `--downsample-mode 16x` and reduce `--max-slice-nums` if needed. Use `--downsample-mode 4x` when you need finer visual detail and have enough memory. + +## Multi-Image Comparison + +Pass `--image-url` multiple times: + +```bash +python examples/gallery/minicpmv46_inference.py \ + --image-url assets/airplane.jpeg \ + --image-url assets/worldmap_ck.jpg \ + --prompt "Compare these two images in two concise sentences." \ + --output-json outputs/gallery/multi_image.json +``` + +## Video Question Answering + +Run a short video smoke test with a small frame budget: + +```bash +python examples/gallery/minicpmv46_inference.py \ + --video-url https://huggingface.co/datasets/openbmb/DemoCase/resolve/main/football.mp4 \ + --prompt "Describe the main action in this video in one sentence." 
\ + --max-num-frames 8 \ + --max-slice-nums 1 \ + --output-json outputs/gallery/video.json +``` + +## Low-Memory Settings + +The default examples use `--downsample-mode 16x`, which is the faster and lighter setting. If memory is tight, also reduce image slices and video frames: + +```bash +python examples/gallery/minicpmv46_inference.py \ + --image-url assets/hk_OCR.jpg \ + --prompt "Read the visible text and summarize the scene." \ + --max-slice-nums 1 \ + --max-new-tokens 64 \ + --output-json outputs/gallery/ocr_low_memory.json +``` + +For more visual detail, switch to `--downsample-mode 4x` when your hardware has enough memory. + +## Static Gallery + +After writing one or more JSON outputs, build a local HTML gallery: + +```bash +python examples/gallery/build_gallery.py \ + --result-json outputs/gallery/refraction.json \ + --result-json outputs/gallery/ocr.json \ + --result-json outputs/gallery/multi_image.json \ + --result-json outputs/gallery/video.json \ + --output outputs/gallery/index.html +``` + +Open `outputs/gallery/index.html` to switch between cases and inspect the input media, prompt, and model output. + +## Local Gradio Demo + +Launch a minimal local UI: + +```bash +python examples/gallery/gradio_demo.py +``` + +The Gradio demo loads the model once, then accepts either an image or a video plus a prompt. Video inputs use the same frame-budget controls as the command-line script. diff --git a/examples/gallery/build_gallery.py b/examples/gallery/build_gallery.py new file mode 100644 index 00000000..c5d31b2e --- /dev/null +++ b/examples/gallery/build_gallery.py @@ -0,0 +1,166 @@ +import argparse +import html +import json +import os +from pathlib import Path + + +def parse_args(): + parser = argparse.ArgumentParser(description="Build a static MiniCPM-V examples gallery from JSON outputs.") + parser.add_argument( + "--result-json", + action="append", + required=True, + type=Path, + help="Path to a JSON file written by minicpmv46_inference.py. 
Pass multiple times.", + ) + parser.add_argument("--output", type=Path, default=Path("outputs/minicpmv46_gallery/index.html")) + parser.add_argument("--title", default="MiniCPM-V 4.6 Gallery") + return parser.parse_args() + + +def to_display_src(value, output_path): + if not value: + return "" + if value.startswith(("http://", "https://", "data:")): + return value + + path = Path(value) + if not path.exists(): + return value + try: + return Path(os.path.relpath(path.resolve(), output_path.resolve().parent)).as_posix() + except ValueError: + return path.resolve().as_uri() + + +def load_case(path, output_path): + data = json.loads(path.read_text(encoding="utf-8")) + media = data.get("media") + if not media: + media = [] + for image_url in data.get("image_url") or []: + media.append({"type": "image", "url": image_url}) + if data.get("video_url"): + media.append({"type": "video", "url": data["video_url"]}) + + title = path.stem.replace("_", " ").title() + normalized_media = [] + for item in media: + normalized_media.append( + { + "type": item.get("type", "image"), + "src": to_display_src(item.get("url", ""), output_path), + } + ) + + return { + "title": title, + "prompt": data.get("prompt", ""), + "answer": data.get("answer", ""), + "media": normalized_media, + "source": path.as_posix(), + } + + +def write_gallery(cases, output_path, title): + output_path.parent.mkdir(parents=True, exist_ok=True) + cases_json = json.dumps(cases, ensure_ascii=False) + page = f""" + + + + +{html.escape(title)} + + + +
+

{html.escape(title)}

+

Local smoke-test inputs and model outputs generated with examples/gallery/minicpmv46_inference.py.

+
+
+

+
+
+
+

Prompt

+

+  

Model Output

+

+
+
+ + + +""" + output_path.write_text(page, encoding="utf-8") + + +def main(): + args = parse_args() + cases = [load_case(path, args.output) for path in args.result_json] + write_gallery(cases, args.output, args.title) + print(f"Wrote gallery: {args.output}") + + +if __name__ == "__main__": + main() diff --git a/examples/gallery/gradio_demo.py b/examples/gallery/gradio_demo.py new file mode 100644 index 00000000..e6e30061 --- /dev/null +++ b/examples/gallery/gradio_demo.py @@ -0,0 +1,75 @@ +import argparse +from pathlib import Path +from types import SimpleNamespace + +import gradio as gr + +from minicpmv46_inference import DEFAULT_PROMPT, load_model_and_processor, run_inference + + +def parse_args(): + parser = argparse.ArgumentParser(description="Launch a minimal local Gradio demo for MiniCPM-V 4.6.") + parser.add_argument("--model-id", default="openbmb/MiniCPM-V-4.6") + parser.add_argument("--device-map", default="auto") + parser.add_argument("--attn-implementation", default=None) + parser.add_argument("--local-files-only", action="store_true") + parser.add_argument("--server-name", default="127.0.0.1") + parser.add_argument("--server-port", type=int, default=7860) + return parser.parse_args() + + +def make_infer_fn(model, processor, base_args): + def infer(image, video, prompt, max_new_tokens, downsample_mode, max_slice_nums, max_num_frames): + image_url = None + if image is not None: + image_url = str(Path(image).resolve()) + video_url = None + if video is not None: + video_url = str(Path(video).resolve()) + + args = SimpleNamespace( + model_id=base_args.model_id, + image_url=[image_url] if image_url and not video_url else None, + video_url=video_url, + prompt=prompt or DEFAULT_PROMPT, + max_new_tokens=int(max_new_tokens), + downsample_mode=downsample_mode, + max_slice_nums=int(max_slice_nums), + max_num_frames=int(max_num_frames), + stack_frames=1, + ) + answer, _ = run_inference(model, processor, args) + return answer + + return infer + + +def main(): + args = 
parse_args() + model, processor = load_model_and_processor(args) + infer = make_infer_fn(model, processor, args) + + with gr.Blocks(title="MiniCPM-V 4.6 Local Demo") as demo: + gr.Markdown("# MiniCPM-V 4.6 Local Demo") + with gr.Row(): + image = gr.Image(type="filepath", label="Image") + video = gr.Video(label="Video") + prompt = gr.Textbox(value=DEFAULT_PROMPT, label="Prompt") + with gr.Row(): + max_new_tokens = gr.Slider(16, 512, value=128, step=16, label="Max new tokens") + downsample_mode = gr.Radio(["16x", "4x"], value="16x", label="Downsample mode") + max_slice_nums = gr.Slider(1, 8, value=4, step=1, label="Max slice nums") + max_num_frames = gr.Slider(1, 32, value=8, step=1, label="Max video frames") + run = gr.Button("Run") + output = gr.Textbox(label="Model output", lines=6) + run.click( + infer, + inputs=[image, video, prompt, max_new_tokens, downsample_mode, max_slice_nums, max_num_frames], + outputs=output, + ) + + demo.launch(server_name=args.server_name, server_port=args.server_port) + + +if __name__ == "__main__": + main() diff --git a/examples/gallery/minicpmv46_inference.py b/examples/gallery/minicpmv46_inference.py new file mode 100644 index 00000000..93c8c955 --- /dev/null +++ b/examples/gallery/minicpmv46_inference.py @@ -0,0 +1,132 @@ +import argparse +import json +from pathlib import Path + +import torch +from transformers import AutoModelForImageTextToText, AutoProcessor + + +DEFAULT_IMAGE_URL = "https://huggingface.co/datasets/openbmb/DemoCase/resolve/main/refract.png" +DEFAULT_VIDEO_URL = "https://huggingface.co/datasets/openbmb/DemoCase/resolve/main/football.mp4" +DEFAULT_PROMPT = "Answer in one short sentence: what is shown in this input?" 
def parse_args(argv=None):
    """Parse command-line options for the minimal MiniCPM-V 4.6 example.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, which matches the behavior of
            calling this function without arguments.

    Returns:
        argparse.Namespace holding the parsed options.
    """
    parser = argparse.ArgumentParser(description="Run a minimal MiniCPM-V 4.6 inference example.")
    parser.add_argument("--model-id", default="openbmb/MiniCPM-V-4.6", help="Hugging Face model id or local path.")
    parser.add_argument(
        "--image-url",
        action="append",
        help="Remote image URL or local image path. Pass multiple times for multi-image input.",
    )
    parser.add_argument("--video-url", help="Remote video URL or local video path. When set, image inputs are ignored.")
    parser.add_argument("--prompt", default=DEFAULT_PROMPT, help="Text prompt for the image or video.")
    parser.add_argument("--max-new-tokens", type=int, default=128)
    parser.add_argument("--downsample-mode", default="16x", choices=["4x", "16x"])
    parser.add_argument("--max-slice-nums", type=int, default=4)
    parser.add_argument("--max-num-frames", type=int, default=16)
    parser.add_argument("--stack-frames", type=int, default=1)
    parser.add_argument("--device-map", default="auto")
    parser.add_argument("--attn-implementation", default=None, help="Optional attention backend, e.g. flash_attention_2.")
    parser.add_argument("--local-files-only", action="store_true", help="Load model files from the local cache only.")
    parser.add_argument("--output-json", type=Path, help="Optional path to save the prompt and answer as JSON.")
    return parser.parse_args(argv)


def load_model_and_processor(args):
    """Load the processor and the image-text-to-text model named by ``args``.

    Args:
        args: Object exposing ``model_id``, ``local_files_only``,
            ``device_map`` and ``attn_implementation`` attributes.

    Returns:
        Tuple ``(model, processor)``.
    """
    processor = AutoProcessor.from_pretrained(args.model_id, local_files_only=args.local_files_only)
    model_kwargs = {
        "torch_dtype": "auto",
        "device_map": args.device_map,
        "local_files_only": args.local_files_only,
    }
    # Only forward the attention backend when the caller chose one, so the
    # library default applies otherwise.
    if args.attn_implementation:
        model_kwargs["attn_implementation"] = args.attn_implementation

    model = AutoModelForImageTextToText.from_pretrained(args.model_id, **model_kwargs)
    return model, processor


def build_media_content(args):
    """Build the media entries of the chat message.

    A video, when given, takes precedence and any image inputs are ignored.
    Without a video, the given images are used, falling back to
    ``DEFAULT_IMAGE_URL`` when no image was supplied.

    Args:
        args: Object exposing ``video_url`` and ``image_url`` attributes.

    Returns:
        List of ``{"type": ..., "url": ...}`` dicts.
    """
    if args.video_url:
        return [{"type": "video", "url": args.video_url}]
    return [{"type": "image", "url": image_url} for image_url in args.image_url or [DEFAULT_IMAGE_URL]]


def run_inference(model, processor, args):
    """Run one generation pass for the media and prompt described by ``args``.

    Args:
        model: Model object providing ``device`` and ``generate``.
        processor: Processor providing ``apply_chat_template`` and
            ``batch_decode``.
        args: Object exposing ``prompt``, ``video_url``, ``image_url``,
            ``downsample_mode``, ``max_slice_nums``, ``max_new_tokens`` and,
            for video input, ``max_num_frames`` and ``stack_frames``.

    Returns:
        Tuple ``(answer, media_content)``: the decoded text of the first
        sequence and the media entries that were sent to the model.
    """
    media_content = build_media_content(args)
    messages = [{"role": "user", "content": media_content + [{"type": "text", "text": args.prompt}]}]

    processor_kwargs = {
        "downsample_mode": args.downsample_mode,
        "max_slice_nums": args.max_slice_nums,
    }
    # Frame-budget options are only passed along for video input.
    if args.video_url:
        processor_kwargs.update(
            {
                "max_num_frames": args.max_num_frames,
                "stack_frames": args.stack_frames,
                "use_image_id": False,
            }
        )
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
        processor_kwargs=processor_kwargs,
    ).to(model.device)

    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs,
            downsample_mode=args.downsample_mode,
            max_new_tokens=args.max_new_tokens,
        )

    # Drop the prompt tokens so that only newly generated tokens are decoded.
    generated_ids_trimmed = [
        output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs.input_ids, generated_ids)
    ]
    answer = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return answer, media_content


def write_output_json(args, answer, media_content):
    """Save the prompt, inputs and answer as JSON when ``args.output_json`` is set.

    Does nothing when ``args.output_json`` is falsy. Parent directories of the
    output path are created as needed.

    Args:
        args: Object exposing ``output_json`` (a ``pathlib.Path`` or ``None``),
            ``model_id``, ``image_url``, ``video_url`` and ``prompt``.
        answer: Model output text to store.
        media_content: Media entries as returned by ``build_media_content``.
    """
    if not args.output_json:
        return

    # Images are ignored whenever a video is given, so record an empty image
    # list in that case to keep "image_url" consistent with "media".
    if args.video_url:
        image_urls = []
    else:
        image_urls = args.image_url or [DEFAULT_IMAGE_URL]

    args.output_json.parent.mkdir(parents=True, exist_ok=True)
    args.output_json.write_text(
        json.dumps(
            {
                "model_id": args.model_id,
                "media": media_content,
                "image_url": image_urls,
                "video_url": args.video_url,
                "prompt": args.prompt,
                "answer": answer,
            },
            ensure_ascii=False,
            indent=2,
        ),
        encoding="utf-8",
    )


def main(argv=None):
    """Parse options, run inference, print the answer and optionally save it.

    Args:
        argv: Optional list of argument strings forwarded to ``parse_args``;
            ``None`` means the process command line.
    """
    args = parse_args(argv)
    model, processor = load_model_and_processor(args)
    answer, media_content = run_inference(model, processor, args)
    print(answer)
    write_output_json(args, answer, media_content)


if __name__ == "__main__":
    main()