diff --git a/.gitignore b/.gitignore index 96eaa4a..03044ca 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,4 @@ u2net_segm.pth data outputs +veo_test.py \ No newline at end of file diff --git a/README.md b/README.md index dc92873..93dc20f 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ OpenTryOn is an open-source AI toolkit designed for fashion technology and virtu - GPT-Image-1 & GPT-Image-1.5 (OpenAI): High-quality image generation with strong prompt understanding, consistent composition, and reliable visual accuracy. GPT-Image-1.5 offers enhanced quality and better consistency - **Video Generation**: - Luma AI Video Generation Model (Dream Machine): High-quality video generation with text-to-image and image-to-video modes. + - Google Veo 3 Video Generation Model: Generate high-quality, cinematic videos from text or images with realistic motion, temporal consistency, and fine-grained control over style and camera dynamics. - **Datasets Module**: - Fashion-MNIST dataset loader with automatic download - VITON-HD dataset loader with lazy loading via PyTorch DataLoader @@ -61,6 +62,7 @@ OpenTryOn is an open-source AI toolkit designed for fashion technology and virtu - [Image Generation with Luma AI](#luma-ai-image-generation) - [Image Generation with OpenAI](#image-generation-with-gpt-image-1) - [Video Generation with Luma AI](#video-generation-with-luma-ai) + - [Video Generation with Google Veo 3](#video-generation-with-google-veo-3) - [Preprocessing Functions](#preprocessing-functions) - [Demos](#demos) - [Project Structure](#project-structure) @@ -138,13 +140,13 @@ KLING_AI_BASE_URL=https://api-singapore.klingai.com # Optional, defaults to Sin # Segmind Credentials (required for Segmind virtual try-on) SEGMIND_API_KEY=your_segmind_api_key -# Google Gemini Credentials (required for Nano Banana image generation) +# Google Gemini Credentials (required for Nano Banana image generation and Google Veo 3 Video generation) 
GEMINI_API_KEY=your_gemini_api_key # BFL API Credentials (required for FLUX.2 image generation) BFL_API_KEY=your_bfl_api_key -# Luma AI Credentials (required for Luma AI image generation) +# Luma AI Credentials (required for Luma AI image generation and Luma AI Video generation) LUMA_AI_API_KEY=your_luma_ai_api_key # OpenAI Credentials (required for OpenAI GPT-Image-1 image generation) @@ -164,7 +166,7 @@ GOOGLE_API_KEY=your_google_api_key # For Google Gemini - For Kling AI, obtain your API key and secret key from the [Kling AI Developer Portal](https://app.klingai.com/global/dev/document-api/apiReference/model/functionalityTry) - For Segmind, obtain your API key from the [Segmind API Portal](https://www.segmind.com/models/try-on-diffusion/api) -- For Nano Banana, obtain your API key from the [Google AI Studio](https://aistudio.google.com/app/apikey) +- For Nano Banana and Google Veo 3, obtain your API key from the [Google AI Studio](https://aistudio.google.com/app/apikey) - For FLUX.2 models, obtain your API key from [BFL AI](https://docs.bfl.ai/) - For FLUX.2 models, obtain your API key from [BFL AI](https://docs.bfl.ai/) @@ -1621,7 +1623,7 @@ for idx, vid_bytes in enumerate(video_list): #### Supported Features -- **Text to Video**: Generate videos using test descriptions. +- **Text to Video**: Generate videos using text descriptions. - **Image to Video**: Generate videos using keyframes. - **Keyframe Generation**: Generate videos using a start keyframe or an end keyframe or both. - **Duration**: Durations in seconds (5s, 9s, 10s) @@ -1639,6 +1641,138 @@ for idx, vid_bytes in enumerate(video_list): **Reference**: [Luma AI Video Generation Documentation](https://docs.lumalabs.ai/docs/video-generation) +### Video Generation with Google Veo 3 + +Generate high-quality, cinematic videos using Google’s Veo 3 models (Veo 3.0 and Veo 3.1), including (veo-3.1-generate-preview, veo-3.1-fast-generate-preview, veo-3.0-generate-001, and veo-3.0-fast-generate-001). 
These models support text-to-video, image-to-video, reference-images-to-video, and frames-to-video generation for controlled motion, realistic dynamics, and consistent visual quality. + +#### Prerequisites + +1. **Google Gemini Account Setup**: + - Sign up for a Google AI Studio account at [Google AI Studio](https://aistudio.google.com/) + - Obtain your API key from the [API Keys page](https://aistudio.google.com/app/apikey) + - Configure credentials in your `.env` file (see Environment Variables section) + +2. **Model Selection**: + - **veo-3.1-generate-preview**: Generate high-quality cinematic videos with enhanced motion realism and temporal consistency using the latest Veo 3.1 model. + - **veo-3.1-fast-generate-preview**: Create videos quickly with optimized inference speed while retaining strong visual quality and motion coherence. + - **veo-3.0-generate-001**: Produce stable, high-fidelity videos using the proven Veo 3.0 generation model with reliable motion and style control. + - **veo-3.0-fast-generate-001**: Generate videos faster with the Veo 3.0 fast variant, balancing speed and visual quality for rapid iteration. 
+ +#### Command Line Usage + +```bash +# Text to Video with Google Veo 3 +python veo_video.py --provider veo-3.1-generate-preview --mode text --prompt "model at a fashion show" --aspect 16:9 --duration 8 --resolution 1080p --output_dir outputs + +# Video generation with negative prompt +python veo_video.py --provider veo-3.1-generate-preview --mode text --prompt "person with a hat" --resolution 1080p --negative_prompt "cartoon, anime, kids" + +# Image to Video +python veo_video.py --provider veo-3.1-generate-preview --mode image --prompt "model at a fashion show" --images person.jpg --aspect 16:9 --duration 8 --resolution 1080p + +# Video generation with reference images (up to 3) +python veo_video.py --provider veo-3.1-generate-preview --mode reference --prompt "create a fashion week video" --images person1.jpg person2.jpg person3.jpg --resolution 1080p + +# Video generation with frames +python veo_video.py --provider veo-3.1-generate-preview --mode frames --prompt "create a cinematic video" --start_image person1.jpg --end_image person2.jpg --aspect 16:9 --resolution 720p +``` + +#### Python API Usage + +**Google Veo 3** + +```python + +from dotenv import load_dotenv +load_dotenv() + +from pathlib import Path +from tryon.api.veo import VeoAdapter + +adapter = VeoAdapter() +video_list = [] + + +def save_video(video_bytes: bytes, idx: int): + Path("outputs").mkdir(exist_ok=True) + out_path = Path("outputs") / f"generated_{idx}.mp4" + with open(out_path, "wb") as f: + f.write(video_bytes) + print(f"[SAVED] {out_path}") + + +# TEXT → VIDEO +video = adapter.generate_text_to_video( + prompt="A cinematic neon city with cars moving at night", + duration_seconds="4", + aspect_ratio="16:9", + resolution="720p", + model="veo-3.1-generate-preview", +) +video_list.append(video) + + +# IMAGE → VIDEO +video = adapter.generate_image_to_video( + image="model.jpg", + prompt="Two monsters fighting with each other", + duration_seconds="4", + aspect_ratio="16:9", + resolution="720p", 
+ model="veo-3.1-generate-preview", + negative_prompt="cartoon, anime, for kids", +) +video_list.append(video) + + +# REFERENCE IMAGES → VIDEO +video = adapter.generate_video_with_references( + prompt="A fashion model walking on a runway", + reference_images=[ + "test_assets/ref1.jpg", + "test_assets/ref2.jpg", + ], + duration_seconds="8", + aspect_ratio="16:9", + resolution="720p", + model="veo-3.1-generate-preview", +) +video_list.append(video) + + +# FIRST + LAST FRAME → VIDEO +video = adapter.generate_video_with_frames( + prompt="Smooth cinematic transition from grizzly bear to polar bear", + first_image="person1.jpg", + last_image="person2.jpg", + duration_seconds="8", + aspect_ratio="16:9", + resolution="720p", + model="veo-3.1-generate-preview", + negative_prompt="cartoon, anime, kids", +) +video_list.append(video) + + +# SAVE ALL RESULTS +for idx, vid_bytes in enumerate(video_list): + save_video(vid_bytes, idx) + +``` + +#### Supported Features + +- **Text to Video**: Generate Video using text descriptions. +- **Image to Video**: Generate Video using a single image. +- **Video Generation with Reference Images**: Generate Video using reference Images (up to 3). +- **Video Generation with Frames**: Video Generation with first frame and last frame. +- **Duration**: Durations in seconds (4s, 6s, 8s) +- **Resolution**: Quality of the video (720p, 1080p) +- **Aspect Ratio**: Aspect Ratio of videos (16:9, 9:16) +- **Negative Prompt**: Negative Prompt tells the Veo model what to avoid generating in the video. 
+ +**Reference**: [Google Veo 3 Video Generation Documentation](https://ai.google.dev/gemini-api/docs/video) + ### Preprocessing Functions #### Segment Garment diff --git a/tryon/api/__init__.py b/tryon/api/__init__.py index c3eeed2..f731845 100644 --- a/tryon/api/__init__.py +++ b/tryon/api/__init__.py @@ -5,6 +5,7 @@ from .lumaAI import LumaAIAdapter from .flux2 import Flux2ProAdapter, Flux2FlexAdapter from .lumaAI.luma_video_adapter import LumaAIVideoAdapter +from .openAI.image_adapter import GPTImageAdapter __all__ = [ "AmazonNovaCanvasVTONAdapter", @@ -16,4 +17,5 @@ "Flux2ProAdapter", "Flux2FlexAdapter", "LumaAIVideoAdapter", + "GPTImageAdapter", ] \ No newline at end of file diff --git a/tryon/api/veo.py b/tryon/api/veo.py new file mode 100644 index 0000000..43b0c2c --- /dev/null +++ b/tryon/api/veo.py @@ -0,0 +1,873 @@ +""" +Google Veo Video Generation API Adapter + +Adapter for Google's Veo video generation models, providing structured +interfaces for multiple video creation workflows including text-driven +generation, image-conditioned generation, and frame-controlled synthesis. + +Supported Models: +- veo-3.1-generate-preview +- veo-3.1-fast-generate-preview +- veo-3.0-generate-001 +- veo-3.0-fast-generate-001 + +Capabilities: +1) Text-to-Video + Generate high-quality cinematic videos purely from a natural language prompt. + +2) Image-to-Video + Use a single reference image to establish style, composition, and scene + context while the model animates motion forward in time. + +3) Video Generation with Reference Images (up to 3) + Provide one or more guiding images (maximum three). These images help + influence scene structure, visual continuity, and thematic consistency. + +4) First Frame + Last Frame Controlled Generation + Supply both an initial frame and a final target frame. Veo interpolates + motion and visual development between the two frames to create a smooth + evolving sequence. 
+ (Full frame-to-frame control is primarily supported in Veo 3.1 models.) + +Behavior Notes: +- Not all control modes are equally supported across Veo 3.0 vs 3.1. + Veo 3.1 provides better guided control and stability. +- Generation is asynchronous; polling is required until completion. +- Output is returned as raw video bytes for downstream streaming/storage. +- Duration, resolution, and aspect-ratio constraints depend on model config. + +Typical Workflow: +1) Submit generation request with prompt (+ optional frames / references) +2) Poll until operation finishes +3) Download or extract MP4 video bytes + +Reference: +https://ai.google.dev/gemini-api/docs/video + +Usage Examples: + + Text-to-Video: + >>> adapter.generate_text_video( + ... prompt="A cinematic aerial shot of a futuristic neon city at night", + ... model="veo-3.1-generate-preview" + ... ) + + Image-to-Video: + >>> adapter.generate_image_to_video( + ... prompt="Continue motion through a windy canyon", + ... image="start.png", + ... model="veo-3.1-generate-preview" + ... ) + + Reference Images (up to 3): + >>> adapter.generate_video_with_references( + ... prompt="Epic fantasy battlefield reveal", + ... reference_images=["a.png", "b.png", "c.png"], + ... model="veo-3.1-fast-generate-preview" + ... ) + + First + Last Frame Controlled Generation: + >>> adapter.generate_video_with_frames( + ... prompt="Smooth cinematic transition through cyberpunk streets", + ... first_image="start.png", + ... last_image="end.png", + ... model="veo-3.1-generate-preview" + ... 
) +""" + +import time +import os +import io +import base64 +from PIL import Image +from typing import Optional, Union + +try: + from google import genai + from google.genai import types + GEMINI_API_KEY = True +except ImportError: + GEMINI_API_KEY = False + genai = None + +DURATION = {"4", "6", "8"} +ASPECT_RATIO = {"16:9", "9:16"} +RESOLUTION = {"720p", "1080p"} +MODELS = {"veo-3.1-generate-preview", "veo-3.1-fast-generate-preview", "veo-3.0-generate-001", "veo-3.0-fast-generate-001"} + +class VeoAdapter: + + def __init__(self, api_key: Optional[str] = None): + + if not GEMINI_API_KEY: + raise ImportError( + "google-genai library is not available. " \ + "Please install it with 'pip install google-genai'." + ) + + self.api_key = api_key or os.getenv("GEMINI_API_KEY") + if not self.api_key: + raise ValueError("GEMINI API key must be provided either as a parameter or through the GEMINI_API_KEY environment variable.") + + self.client = genai.Client(api_key=self.api_key) + + + def _prepare_image_input(self, image_input: Union[str, io.BytesIO, Image.Image]) -> types.Image: + + """ + Converts various image inputs into a Veo-compatible `types.Image`. + + Supports: + - PIL Image + - file-like objects (BytesIO / file handles) + - strings (URL, local path, or Base64) + + Ensures: + - image is valid + - converted to RGB + - encoded as PNG + - returned as `types.Image(image_bytes, mime_type="image/png")` + + Raises: + ValueError: If the input type is unsupported or the image cannot be decoded. 
+ """ + + # A PIL Image + if isinstance(image_input, Image.Image): + pil = image_input.convert("RGB") + + # File-like object (BytesIO, file handle) + elif hasattr(image_input, "read"): + image_input.seek(0) + image_bytes = image_input.read() + try: + pil = Image.open(io.BytesIO(image_bytes)).convert("RGB") + except Exception as e: + raise ValueError(f"Invalid image data in file-like object: {e}") + + # String (URL, file path, or base64) + elif isinstance(image_input, str): + # URL + if image_input.startswith(("http://", "https://")): + try: + import requests + r = requests.get(image_input, timeout=10) + r.raise_for_status() + image_bytes = r.content + except Exception as e: + raise ValueError(f"Failed to download image from URL: {e}") + + # File path + elif os.path.exists(image_input): + try: + with open(image_input, "rb") as f: + image_bytes = f.read() + except Exception as e: + raise ValueError(f"Failed to read image file: {e}") + + # Base64 string + else: + try: + image_bytes = base64.b64decode(image_input, validate=True) + except Exception as e: + raise ValueError(f"Invalid base64 image string: {e}") + + # Convert bytes to PIL Image + try: + pil = Image.open(io.BytesIO(image_bytes)).convert("RGB") + except Exception as e: + raise ValueError(f"Invalid image data: {e}") + + else: + raise ValueError(f"Unsupported image input type: {type(image_input)}") + + # Convert to PNG bytes + buffer = io.BytesIO() + pil.save(buffer, format="PNG") + png_bytes = buffer.getvalue() + + # Return raw bytes + return types.Image( + image_bytes=png_bytes, + mime_type="image/png", + ) + + + def generate_text_to_video( + self, + prompt: str, + duration_seconds: str = "4", + aspect_ratio: str = "16:9", + resolution: str = "720p", + negative_prompt: Optional[str] = None, + model: str = "veo-3.1-generate-preview", + ) -> bytes: + + """ + Generate a video from a text prompt using Google Veo. 
+ + This function sends a text prompt to a Veo video generation model, polls the + operation until the video is ready, then downloads and returns the raw MP4 bytes. + Supports duration, aspect-ratio, resolution constraints and optional + negative prompts. Automatically validates parameters and ensures only valid + Veo model + configuration combinations are used. + + Args: + prompt: + The primary text prompt describing what the video should depict. + duration_seconds: + Length of the generated clip in seconds. + Supported values depend on the model (commonly: "4", "6", "8"). + aspect_ratio: + Output aspect ratio (e.g., "16:9" or "9:16"). + resolution: + Output resolution preset ("720p" or "1080p", depending on model limits). + negative_prompt: + Optional text describing content to avoid in the generation. + model: + Veo model identifier. + Examples: + - "veo-3.1-generate-preview" + - "veo-3.1-fast-generate-preview" + - "veo-3.0-generate-001" + - "veo-3.0-fast-generate-001" + + Returns: + bytes: + Raw MP4 video bytes + + Raises: + ValueError: + If prompt is missing, configuration is invalid, + or no video was generated. + RuntimeError: + If Veo returns an unexpected structure and video bytes + cannot be extracted. + + Example: + >>> video_bytes = adapter.generate_text_to_video( + ... prompt="A cinematic shot of a dragon flying over a medieval city", + ... duration_seconds="8", + ... aspect_ratio="16:9", + ... resolution="1080p", + ... model="veo-3.1-generate-preview" + ... ) + >>> with open("dragon_city.mp4", "wb") as f: + ... f.write(video_bytes) + """ + + # Validation Check + if not prompt: + raise ValueError("prompt is required") + + if model not in MODELS: + raise ValueError(f"{model} is not a recognized model. 
Available models are {MODELS}") + + if duration_seconds not in DURATION: + raise ValueError("duration_seconds must be one of: 4, 6, 8") + + if aspect_ratio not in ASPECT_RATIO: + raise ValueError("aspect_ratio must be '16:9' or '9:16'") + + if resolution not in RESOLUTION: + raise ValueError("resolution must be '720p' or '1080p'") + + if model in {"veo-3.1-generate-preview", "veo-3.1-fast-generate-preview"}: + if resolution == "1080p" and duration_seconds != "8": + raise ValueError("1080p resolution only supports 8s duration for veo 3.1 models.") + + if model in {"veo-3.0-generate-001", "veo-3.0-fast-generate-001"}: + if resolution == "1080p" and aspect_ratio != "16:9": + raise ValueError("1080p resolution only supportes 16:9 aspect ratio for veo 3 models.") + + # Create Configurations + kwargs = { + "duration_seconds": duration_seconds, + "aspect_ratio": aspect_ratio, + "resolution": resolution, + } + + # Negative Prompt + if negative_prompt: + kwargs["negative_prompt"] = negative_prompt + + # Create a generation object + operation = self.client.models.generate_videos( + model=model, + prompt=prompt, + config=types.GenerateVideosConfig(**kwargs), + ) + + # Polling + while not operation.done: + operation = self.client.operations.get(operation) + time.sleep(1) + + # Check for video error after polling completes + if getattr(operation, "error", None): + raise RuntimeError(f"Video generation failed: {operation.error}") + + if operation.response is None: + raise RuntimeError("Video generation completed but no response was returned") + + generated = getattr(operation.response, "generated_videos", None) + if not generated: + raise RuntimeError( + f"No videos were generated. 
operation.error={getattr(operation, 'error', None)}; " + f"operation_name={getattr(operation, 'name', None)}; response={operation.response}" + ) + + # Extract the generated video + video = operation.response.generated_videos[0].video + if not video: + raise ValueError("No video was generated.") + + + # If Video bytes are returned + if hasattr(video, "video_bytes") and video.video_bytes: + return video.video_bytes + + # Else download the file via the files API + downloaded = self.client.files.download(file=video) + + if isinstance(downloaded, (bytes, bytearray)): + return bytes(downloaded) + + if hasattr(downloaded, "read"): + try: + downloaded.seek(0) + except Exception: + pass + return downloaded.read() + + if hasattr(downloaded, "bytes"): + return downloaded.bytes + + if hasattr(downloaded, "data"): + return downloaded.data + + if hasattr(downloaded, "content"): + return downloaded.content + + raise RuntimeError(f"Error occured. Video cannot be generated...") + + + def generate_image_to_video( + self, + image: Union[str, io.BytesIO, Image.Image], + prompt: str, + duration_seconds: str = "4", + aspect_ratio: str = "16:9", + resolution: str = "720p", + negative_prompt: Optional[str] = None, + model: str = "veo-3.1-generate-preview", + ) -> bytes: + + """ + Animate a still image into a video using Google Veo. + + This function takes a single reference image and a guiding text prompt, sends + them to a Veo image-to-video model, polls until generation completes, and + returns the resulting MP4 video as raw bytes. The animation respects the + specified duration, aspect ratio, and resolution constraints. Supports optional + negative prompts and enforces model capability rules to prevent invalid requests. + + Args: + image: + The input image to animate. Can be one of: + - str: Path to an image file. + - io.BytesIO: In-memory binary image stream. + - PIL.Image.Image: Loaded PIL image object. + prompt: + Text description guiding how the image should animate. 
+ duration_seconds: + Length of the generated clip in seconds. + Supported values depend on the model (commonly: "4", "6", "8"). + aspect_ratio: + Output aspect ratio (e.g., "16:9" or "9:16"). + resolution: + Output video resolution preset ("720p" or "1080p", depending on model limits). + negative_prompt: + Optional text describing what should be avoided in the generated video. + model: + Veo model identifier to use for generation, such as: + - "veo-3.1-generate-preview" + - "veo-3.1-fast-generate-preview" + - "veo-3.0-generate-001" + - "veo-3.0-fast-generate-001" + + Returns: + bytes: + Raw MP4 video bytes + + Raises: + ValueError: + If required parameters are missing, invalid combinations are used, + or no video is returned from Veo. + RuntimeError: + If Veo produces an unexpected response structure and + video bytes cannot be extracted. + + Example: + >>> with open("person.png", "rb") as img: + ... video = adapter.generate_image_to_video( + ... image=img, + ... prompt="The person begins walking through a snowy forest", + ... duration_seconds="8", + ... aspect_ratio="16:9", + ... resolution="1080p", + ... model="veo-3.1-generate-preview", + ... ) + >>> with open("animated_person.mp4", "wb") as f: + ... f.write(video) + """ + + # Validation Check + if not image: + raise ValueError("image is required") + + if not prompt: + raise ValueError("prompt is required") + + if model not in MODELS: + raise ValueError(f"{model} is not a recognized model. 
Available models are {MODELS}") + + if duration_seconds not in DURATION: + raise ValueError("duration_seconds must be one of: 4, 6, 8") + + if aspect_ratio not in ASPECT_RATIO: + raise ValueError("aspect_ratio must be '16:9' or '9:16'") + + if resolution not in RESOLUTION: + raise ValueError("resolution must be '720p' or '1080p'") + + if model in {"veo-3.1-generate-preview", "veo-3.1-fast-generate-preview"}: + if resolution == "1080p" and duration_seconds != "8": + raise ValueError("1080p resolution only supports 8s duration for veo 3.1 models.") + + if model in {"veo-3.0-generate-001", "veo-3.0-fast-generate-001"}: + if resolution == "1080p" and aspect_ratio != "16:9": + raise ValueError("1080p resolution only supportes 16:9 aspect ratio for veo 3 models.") + + + # Create Configurations + kwargs = { + "duration_seconds": duration_seconds, + "aspect_ratio": aspect_ratio, + "resolution": resolution, + } + + # Negative Prompt + if negative_prompt: + kwargs["negative_prompt"] = negative_prompt + + # Create a generation object + operation = self.client.models.generate_videos( + model=model, + prompt=prompt, + image=self._prepare_image_input(image), + config=types.GenerateVideosConfig(**kwargs), + ) + + # Polling + while not operation.done: + operation = self.client.operations.get(operation) + time.sleep(1) + + # Check for video error after polling completes + if getattr(operation, "error", None): + raise RuntimeError(f"Video generation failed: {operation.error}") + + if operation.response is None: + raise RuntimeError("Video generation completed but no response was returned") + + generated = getattr(operation.response, "generated_videos", None) + if not generated: + raise RuntimeError( + f"No videos were generated. 
operation.error={getattr(operation, 'error', None)}; " + f"operation_name={getattr(operation, 'name', None)}; response={operation.response}" + ) + + # Extract the generated video + video = operation.response.generated_videos[0].video + if not video: + raise ValueError("Video was not generated.") + + # If Video bytes are returned + if hasattr(video, 'video_bytes') and video.video_bytes: + return video.video_bytes + + # Else download the file + downloaded = self.client.files.download(file=video) + + if isinstance(downloaded, (bytes, bytearray)): + return bytes(downloaded) + + if hasattr(downloaded, "read"): + try: + downloaded.seek(0) + except Exception: + pass + return downloaded.read() + + if hasattr(downloaded, "bytes"): + return downloaded.bytes + + if hasattr(downloaded, "data"): + return downloaded.data + + if hasattr(downloaded, "content"): + return downloaded.content + + raise ValueError("Error occured. Video cannot be generated...") + + + def generate_video_with_references( + self, + prompt: str, + reference_images: list[Union[str, io.BytesIO, Image.Image]], + duration_seconds: str = "8", + aspect_ratio: str = "16:9", + resolution: str = "720p", + model: str = "veo-3.1-generate-preview", + negative_prompt: Optional[str] = None + ) -> bytes: + + """ + Generate a video using reference images as visual/style guidance with Google Veo. + + This function generates a video guided by one or more reference images. The + images help Veo preserve identity, visual style, composition, or scene + continuity depending on how they are interpreted by the model. Supports up + to three reference images and automatically enforces Veo 3.1 model constraints + (8 seconds, 16:9, supported resolutions). The function polls the generation + operation until completion and returns the final MP4 video bytes. + + Args: + prompt: + Text description guiding how the scene should animate and what should happen. + reference_images: + List of up to three images used as visual guidance. 
+ Each element may be: + - str: Path to an image file + - io.BytesIO: In-memory binary stream + - PIL.Image.Image: Loaded PIL image object + duration_seconds: + Length of the generated clip in seconds. + For reference-guided generation, Veo requires `"8"`. + aspect_ratio: + Output aspect ratio. Reference-guided video currently supports `"16:9"` only. + resolution: + Output resolution preset, typically `"720p"` or `"1080p"` depending on model limits. + model: + Veo model identifier. Reference-guided generation is only supported + on Veo 3.1 preview / fast-preview models, e.g.: + - "veo-3.1-generate-preview" + - "veo-3.1-fast-generate-preview" + negative_prompt: + Optional text describing what should be avoided in the generated video. + + Returns: + bytes: + Raw MP4 video bytes + + Raises: + ValueError: + If required parameters are missing, invalid configurations are used, + unsupported models are selected, or no video is returned. + RuntimeError: + If Veo returns an unexpected response structure and the video + file bytes cannot be extracted. + + Example: + >>> video_bytes = adapter.generate_video_with_references( + ... prompt="A heroic knight walking through a ruined castle courtyard", + ... reference_images=[ + ... "face_ref.png", + ... "armor_style.jpg" + ... ], + ... resolution="1080p", + ... model="veo-3.1-generate-preview" + ... ) + >>> with open("knight_scene.mp4", "wb") as f: + ... f.write(video_bytes) + """ + + # Valdiation Check + if not prompt: + raise ValueError("prompt is required") + + if model not in MODELS: + raise ValueError(f"{model} is not a recognized model. 
Available models are {MODELS}") + + if model in {"veo-3.0-generate-001", "veo-3.0-fast-generate-001"}: + raise ValueError("Video generation using reference images is only supported for veo 3.1 models.") + + if not reference_images: + raise ValueError("At least one reference image is required") + + if len(reference_images) > 3: + raise ValueError("Veo 3.1 supports a maximum of 3 reference images") + + if resolution not in RESOLUTION: + raise ValueError("resolution must be '720p' or '1080p'") + + if duration_seconds != "8": + raise ValueError("Video generation using reference images require duration_seconds='8'") + + if aspect_ratio != "16:9": + raise ValueError("Video generation using reference images only support aspect_ratio='16:9'") + + + # Prepare reference images + refs = [] + + for img in reference_images: + refs.append( + types.VideoGenerationReferenceImage( + image=self._prepare_image_input(img), + reference_type="asset" + ) + ) + + # Create Configurations + kwargs = { + "duration_seconds": duration_seconds, + "aspect_ratio": aspect_ratio, + "resolution": resolution, + "reference_images": refs + } + + # Negative prompt + if negative_prompt: + kwargs["negative_prompt"] = negative_prompt + + # Create a generation object + operation = self.client.models.generate_videos( + model=model, + prompt=prompt, + config=types.GenerateVideosConfig(**kwargs), + ) + + # Polling + while not operation.done: + operation = self.client.operations.get(operation) + time.sleep(1) + + # Check for video error after polling completes + if getattr(operation, "error", None): + raise RuntimeError(f"Video generation failed: {operation.error}") + + if operation.response is None: + raise RuntimeError("Video generation completed but no response was returned") + + generated = getattr(operation.response, "generated_videos", None) + if not generated: + raise RuntimeError( + f"No videos were generated. 
operation.error={getattr(operation, 'error', None)}; " + f"operation_name={getattr(operation, 'name', None)}; response={operation.response}" + ) + + # Extract the generated video + video = operation.response.generated_videos[0].video + if not video: + raise ValueError("No video was generated") + + # If Video bytes are returned + if hasattr(video, "video_bytes") and video.video_bytes: + return video.video_bytes + + # Else download the file + downloaded = self.client.files.download(file=video) + + if isinstance(downloaded, (bytes, bytearray)): + return bytes(downloaded) + + if hasattr(downloaded, "read"): + try: + downloaded.seek(0) + except Exception: + pass + return downloaded.read() + + if hasattr(downloaded, "bytes"): + return downloaded.bytes + + if hasattr(downloaded, "data"): + return downloaded.data + + if hasattr(downloaded, "content"): + return downloaded.content + + raise RuntimeError(f"Error occured. Video cannot be generated...") + + + def generate_video_with_frames( + self, + prompt: str, + first_image: Union[str, io.BytesIO, Image.Image], + last_image: Union[str, io.BytesIO, Image.Image], + duration_seconds: str = "8", + aspect_ratio: str = "16:9", + resolution: str = "720p", + model: str = "veo-3.1-generate-preview", + negative_prompt: Optional[str] = None + ) -> bytes: + + """ + Generate a video using a starting frame and ending frame as guidance with Google Veo. + + This function generates a video using a specified first frame and last frame to + guide motion, composition, and visual consistency throughout the clip. The + first frame determines how the video begins and the last frame influences how + the animation resolves. The generation respects Veo 3.1 constraints + (8-second clips, supported aspect ratios, supported resolutions) and polls the + operation until the output is ready, finally returning raw MP4 bytes. + + Args: + prompt: + Text description guiding what happens between the first and last frames. 
+ first_image: + The starting frame of the video. May be: + - str: Path to an image file + - io.BytesIO: Binary image stream + - PIL.Image.Image: Loaded PIL image + last_image: + The ending frame of the video. Must be provided in the same supported + formats as `first_image`. + duration_seconds: + Duration of the generated clip. Frame-guided generation currently + requires `"8"`. + aspect_ratio: + Output aspect ratio (e.g., `"16:9"` or `"9:16"`, depending on model support). + resolution: + Output video resolution preset such as `"720p"` or `"1080p"`. + model: + Veo model identifier. Frame-guided generation is only supported on + Veo 3.1 preview / fast-preview models, such as: + - "veo-3.1-generate-preview" + - "veo-3.1-fast-generate-preview" + negative_prompt: + Optional text describing what should be avoided in the generation. + + Returns: + bytes: + Raw MP4 video bytes + + Raises: + ValueError: + If required parameters are missing, unsupported models are used, + invalid configurations are passed, or the API does not return a video. + RuntimeError: + If the API response structure is unexpected and the video bytes + cannot be extracted. + + Example: + >>> video = adapter.generate_video_with_frames( + ... prompt="A dramatic camera move through a futuristic city at night", + ... first_image="start_frame.png", + ... last_image="end_frame.png", + ... resolution="1080p", + ... model="veo-3.1-generate-preview" + ... ) + >>> with open("city_transition.mp4", "wb") as f: + ... f.write(video) + """ + + # Validation Check + if not prompt: + raise ValueError("Prompt is required for video generation.") + + if model not in MODELS: + raise ValueError(f"{model} is not a recognized model. 
Available models are {MODELS}") + + if model in {"veo-3.0-generate-001", "veo-3.0-fast-generate-001"}: + raise ValueError("Video generation using first frame and last frame is only supported for veo 3.1 models.") + + if aspect_ratio not in ASPECT_RATIO: + raise ValueError("aspect_ratio must be '16:9' or '9:16'") + + if resolution not in RESOLUTION: + raise ValueError("resolution must be '720p' or '1080p'") + + if duration_seconds != "8": + raise ValueError("Video generation using frames require duration_seconds='8'") + + if not first_image and not last_image: + raise ValueError("Both first frame and last frame are required for video generation.") + + # Preparing images for input + first_frame = self._prepare_image_input(first_image) + last_frame = self._prepare_image_input(last_image) + + # Create Configurations + kwargs = { + "duration_seconds": duration_seconds, + "aspect_ratio": aspect_ratio, + "resolution": resolution, + "last_frame": last_frame, + } + + # Negative prompt + if negative_prompt: + kwargs["negative_prompt"] = negative_prompt + + # Create a generation object + operation = self.client.models.generate_videos( + model=model, + prompt=prompt, + image=first_frame, + config=types.GenerateVideosConfig(**kwargs), + ) + + # Polling + while not operation.done: + operation = self.client.operations.get(operation) + time.sleep(1) + + # Check for video error after polling completes + if getattr(operation, "error", None): + raise RuntimeError(f"Video generation failed: {operation.error}") + + if operation.response is None: + raise RuntimeError("Video generation completed but no response was returned") + + generated = getattr(operation.response, "generated_videos", None) + if not generated: + raise RuntimeError( + f"No videos were generated. 
def build_parser():
    """Build and return the argparse CLI for the Veo video-generation script."""
    p = argparse.ArgumentParser(
        description="Generate Videos using Google Veo Video Generation API",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Backing Veo model.
    p.add_argument(
        "--provider",
        type=str,
        default="veo-3.1-generate-preview",
        choices=[
            "veo-3.1-generate-preview",
            "veo-3.1-fast-generate-preview",
            "veo-3.0-generate-001",
            "veo-3.0-fast-generate-001",
        ],
        help="Veo model to use",
    )

    # Generation mode selector.
    p.add_argument(
        "--mode",
        type=str,
        default="text",
        choices=["text", "image", "reference", "frames"],
        help="Generation Mode: "
             "'text' = text-to-video, "
             "'image' = image-to-video, "
             "'reference' = reference image guided video, "
             "'frames' = first+last frame guided video",
    )

    # Prompt and generation knobs.
    p.add_argument("-p", "--prompt", type=str,
                   help="Prompt (required for all modes)")
    p.add_argument("--duration", type=str, default="8", choices=["4", "6", "8"],
                   help="Video duration (seconds)")
    p.add_argument("--aspect", type=str, default="16:9", choices=["16:9", "9:16"],
                   help="Aspect Ratio")
    p.add_argument("--resolution", type=str, default="720p", choices=["720p", "1080p"],
                   help="Resolution")
    p.add_argument("--negative_prompt", type=str, default=None,
                   help="Optional negative prompt")

    # Image inputs (single image, references, or first/last frames).
    p.add_argument("--images", type=str, nargs="+",
                   help="Images for video generation")
    p.add_argument("--start_image", type=str,
                   help="Start image for frames mode")
    p.add_argument("--end_image", type=str,
                   help="End image for frames mode")

    # Where generated clips are written.
    p.add_argument("--output_dir", type=str, default="outputs",
                   help="Folder to save generated videos")

    return p
def _require_file(path_or_url, label):
    """Raise ValueError unless *path_or_url* is an http(s) URL or an existing local file."""
    if not (path_or_url.startswith("http") or os.path.exists(path_or_url)):
        raise ValueError(f"{label} not found: {path_or_url}")


def _validate_args(args):
    """Validate parsed CLI arguments for the selected mode.

    Raises:
        ValueError: On any missing/inconsistent argument combination.
    """
    if not args.prompt:
        raise ValueError("--prompt is required for all modes")

    # Veo 3.1: 1080p output is only available for full-length 8s clips.
    if args.provider in {"veo-3.1-generate-preview", "veo-3.1-fast-generate-preview"}:
        if args.resolution == "1080p" and args.duration != "8":
            raise ValueError("1080p supports ONLY 8s duration in Veo 3.1")

    if args.mode == "text":
        return  # text-to-video needs only the prompt

    if args.mode == "image":
        if not args.images:
            raise ValueError("image mode requires --images")
        if len(args.images) != 1:
            raise ValueError("image mode requires exactly ONE image")
        _require_file(args.images[0], "Image")

    elif args.mode == "reference":
        if args.provider in {"veo-3.0-generate-001", "veo-3.0-fast-generate-001"}:
            raise ValueError("Reference images supported ONLY in Veo 3.1 models")
        if not args.images:
            raise ValueError("reference mode requires --images")
        if len(args.images) > 3:
            raise ValueError("Maximum 3 reference images supported")
        if args.duration != "8":
            raise ValueError("Reference video requires duration = 8s")
        if args.aspect != "16:9":
            raise ValueError("Reference videos only support 16:9")
        for ref in args.images:
            _require_file(ref, "Reference image")

    elif args.mode == "frames":
        if args.provider in {"veo-3.0-generate-001", "veo-3.0-fast-generate-001"}:
            raise ValueError("Frames mode supported ONLY in Veo 3.1 models")
        if args.images:
            raise ValueError("--images is not supported for frames mode. "
                             "Use --start_image and --end_image")
        if args.duration != "8":
            raise ValueError("Frames mode requires duration = 8s")
        if not args.start_image or not args.end_image:
            raise ValueError("frames mode requires BOTH --start_image AND --end_image")
        _require_file(args.start_image, "Start image")
        _require_file(args.end_image, "End image")


def main():
    """CLI entry point: parse arguments, validate, dispatch to the Veo
    adapter for the chosen mode, and save the resulting MP4.

    Returns:
        int: Process exit status (0 on success).

    Raises:
        ValueError: If GEMINI_API_KEY is unset or arguments are invalid.
    """
    args = build_parser().parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("GEMINI_API_KEY must be set in environment")

    # All mode-specific argument checks live in one helper so the dispatch
    # below can assume a consistent argument set.
    _validate_args(args)

    adapter = VeoAdapter(api_key=api_key)

    # Dispatch to the adapter method matching the requested mode.
    if args.mode == "text":
        video_bytes = adapter.generate_text_to_video(
            prompt=args.prompt,
            duration_seconds=args.duration,
            aspect_ratio=args.aspect,
            resolution=args.resolution,
            model=args.provider,
            negative_prompt=args.negative_prompt,
        )
    elif args.mode == "image":
        video_bytes = adapter.generate_image_to_video(
            image=args.images[0],
            prompt=args.prompt,
            duration_seconds=args.duration,
            aspect_ratio=args.aspect,
            resolution=args.resolution,
            model=args.provider,
            negative_prompt=args.negative_prompt,
        )
    elif args.mode == "reference":
        video_bytes = adapter.generate_video_with_references(
            prompt=args.prompt,
            reference_images=args.images,
            duration_seconds=args.duration,
            aspect_ratio=args.aspect,
            resolution=args.resolution,
            model=args.provider,
            negative_prompt=args.negative_prompt,
        )
    elif args.mode == "frames":
        video_bytes = adapter.generate_video_with_frames(
            prompt=args.prompt,
            first_image=args.start_image,
            last_image=args.end_image,
            duration_seconds=args.duration,
            aspect_ratio=args.aspect,
            resolution=args.resolution,
            model=args.provider,
            negative_prompt=args.negative_prompt,
        )
    else:
        # Unreachable with argparse `choices`, kept as a defensive guard.
        raise ValueError("Invalid mode")

    # Save output with a timestamped, mode-tagged filename.
    out_path = output_dir / f"{args.mode}_{int(time.time())}.mp4"
    out_path.write_bytes(video_bytes)

    print(f"\n✓ Saved: {out_path}\n")
    return 0


if __name__ == "__main__":
    # BUG FIX: propagate main()'s return value as the process exit status;
    # the original called main() bare and discarded the returned 0.
    raise SystemExit(main())