From 99c401bf85bb88e2303ba6d50525cea8ee959f12 Mon Sep 17 00:00:00 2001
From: Jinghan Li
Date: Fri, 5 Dec 2025 06:58:06 +0000
Subject: [PATCH] Introduce AutoPipelineForText2Video (simple)

---
Review notes (ignored by `git am`): the class and `from_pretrained` gained docstrings, and the
`__init__` error message no longer advertises `from_pipe`, which this class does not define.
Hunk sizes and the diffstat were updated by hand; the blob ids on the `index` lines still refer
to the original revision of the patch.

 auto_pipeline_test.py                    | 16 ++++++
 src/diffusers/__init__.py                |  1 +
 src/diffusers/pipelines/__init__.py      |  1 +
 src/diffusers/pipelines/auto_pipeline.py | 65 +++++++++++++++++++++++-
 4 files changed, 82 insertions(+), 1 deletion(-)
 create mode 100644 auto_pipeline_test.py

diff --git a/auto_pipeline_test.py b/auto_pipeline_test.py
new file mode 100644
index 000000000000..74a65a9df13e
--- /dev/null
+++ b/auto_pipeline_test.py
@@ -0,0 +1,16 @@
+import torch
+from diffusers import AutoPipelineForText2Video
+from diffusers.utils import export_to_video
+
+wan_list = [
+    "Wan-AI/Wan2.1-T2V-14B-Diffusers",
+    "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers",
+    "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers",
+    "Wan-AI/Wan2.1-VACE-1.3B-diffusers",
+    "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
+]
+
+pipe = AutoPipelineForText2Video.from_pretrained(
+    wan_list[2],
+    torch_dtype=torch.bfloat16,
+)
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index eb8e86c4c89d..fb5c5605bed6 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -303,6 +303,7 @@
             "AutoPipelineForImage2Image",
             "AutoPipelineForInpainting",
             "AutoPipelineForText2Image",
+            "AutoPipelineForText2Video",
             "ConsistencyModelPipeline",
             "DanceDiffusionPipeline",
             "DDIMPipeline",
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 3d669aecf556..5187d79c7a44 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -46,6 +46,7 @@
         "AutoPipelineForImage2Image",
         "AutoPipelineForInpainting",
         "AutoPipelineForText2Image",
+        "AutoPipelineForText2Video",
     ]
     _import_structure["consistency_models"] = ["ConsistencyModelPipeline"]
     _import_structure["dance_diffusion"] = ["DanceDiffusionPipeline"]
diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py
index 044d854390e4..509cb55b6be6 100644
--- a/src/diffusers/pipelines/auto_pipeline.py
+++ b/src/diffusers/pipelines/auto_pipeline.py
@@ -117,7 +117,7 @@
     StableDiffusionXLInpaintPipeline,
     StableDiffusionXLPipeline,
 )
-from .wan import WanImageToVideoPipeline, WanPipeline, WanVideoToVideoPipeline
+from .wan import WanAnimatePipeline, WanImageToVideoPipeline, WanPipeline, WanVACEPipeline, WanVideoToVideoPipeline
 from .wuerstchen import WuerstchenCombinedPipeline, WuerstchenDecoderPipeline
 
 
@@ -218,6 +218,10 @@
 AUTO_TEXT2VIDEO_PIPELINES_MAPPING = OrderedDict(
     [
         ("wan", WanPipeline),
+        ("wan-animate", WanAnimatePipeline),
+        ("wan-image-to-video", WanImageToVideoPipeline),
+        ("wan-vace", WanVACEPipeline),
+        ("wan-video-to-video", WanVideoToVideoPipeline),
     ]
 )
 
@@ -1203,3 +1207,62 @@ def from_pipe(cls, pipeline, **kwargs):
         model.register_to_config(**unused_original_config)
 
         return model
+
+
+class AutoPipelineForText2Video(ConfigMixin):
+    r"""
+    Generic pipeline loader for text-to-video checkpoints.
+
+    The class cannot be instantiated with `__init__()`, which always raises. Use
+    [`~AutoPipelineForText2Video.from_pretrained`] instead: it reads `_class_name` from the checkpoint's
+    `model_index.json` and resolves the pipeline class through `AUTO_TEXT2VIDEO_PIPELINES_MAPPING`.
+    """
+
+    config_name = "model_index.json"
+
+    def __init__(self, *args, **kwargs):
+        raise EnvironmentError(
+            f"{self.__class__.__name__} is designed to be instantiated "
+            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` method."
+        )
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_pretrained(cls, pretrained_model_or_path, **kwargs):
+        r"""
+        Load the pipeline class that matches the checkpoint's `_class_name`.
+
+        Args:
+            pretrained_model_or_path:
+                Checkpoint identifier (e.g. a Hub repo id), passed unchanged to `load_config` and to the
+                resolved pipeline's `from_pretrained`.
+            kwargs:
+                `cache_dir`, `force_download`, `proxies`, `token`, `local_files_only` and `revision` are used to
+                load the config. They and all remaining keyword arguments are forwarded to the resolved
+                pipeline's `from_pretrained`.
+
+        Returns:
+            The object returned by the resolved pipeline class's `from_pretrained`.
+        """
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)
+
+        load_config_kwargs = {
+            "cache_dir": cache_dir,
+            "force_download": force_download,
+            "proxies": proxies,
+            "token": token,
+            "local_files_only": local_files_only,
+            "revision": revision,
+        }
+
+        config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
+        orig_class_name = config["_class_name"]
+        text_to_video_cls = _get_task_class(AUTO_TEXT2VIDEO_PIPELINES_MAPPING, orig_class_name)
+        # The hub arguments popped above are merged back in so the resolved pipeline receives them too.
+        kwargs = {**load_config_kwargs, **kwargs}
+        return text_to_video_cls.from_pretrained(pretrained_model_or_path, **kwargs)