diff --git a/README.md b/README.md
index c0a0c82..d239927 100644
--- a/README.md
+++ b/README.md
@@ -1153,6 +1153,208 @@ with open("output.png", "wb") as f:
 - [GPT-Image-1.5 Model Card](https://platform.openai.com/docs/models/gpt-image-1.5)
 
+### Video Generation with OpenAI Sora
+
+Generate high-quality videos using OpenAI's Sora models (Sora 2 and Sora 2 Pro). These models support text-to-video and image-to-video generation with flexible durations (4, 8, or 12 seconds) and multiple resolutions.
+
+**Available Models:**
+- **Sora 2**: Fast, high-quality video generation (recommended for most use cases)
+- **Sora 2 Pro**: Enhanced quality with superior temporal consistency and prompt understanding
+
+#### Prerequisites
+
+1. **OpenAI Account Setup**:
+   - Sign up for an OpenAI account at [OpenAI Platform](https://platform.openai.com/)
+   - Obtain your API key from the [API Keys page](https://platform.openai.com/settings/organization/api-keys)
+   - Configure credentials in your `.env` file (see Environment Variables section)
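+
+For example, a minimal `.env` file (the CLI below loads it via `python-dotenv`, and the adapter reads the `OPENAI_API_KEY` variable):
+
+```bash
+# .env
+OPENAI_API_KEY=your-api-key-here
+```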
print("✅ Video generation complete!") + +def on_error(error): + print(f"❌ Error: {error}") + +def on_progress(status): + print(f"Status: {status['status']}, Progress: {status.get('progress', 'N/A')}") + +# Start async generation +video_id = adapter.generate_text_to_video_async( + prompt="A person trying on different outfits in a fashion boutique", + duration=8, + resolution="1920x1080", + on_complete=on_complete, + on_error=on_error, + on_progress=on_progress +) + +print(f"Video generation started with ID: {video_id}") +# Script continues immediately, callbacks will be invoked when ready +``` + +**Manual Status Tracking:** + +```python +import time + +# Start generation without waiting +video_id = adapter.generate_text_to_video( + prompt="Fashion runway show with multiple models", + duration=12, + resolution="1920x1080", + wait=False # Return immediately +) + +# Check status manually +while True: + status = adapter.get_video_status(video_id) + print(f"Status: {status['status']}") + + if status['status'] == 'completed': + video_bytes = adapter.download_video(video_id) + with open("runway_show.mp4", "wb") as f: + f.write(video_bytes) + break + elif status['status'] == 'failed': + print(f"Failed: {status.get('error')}") + break + + time.sleep(5) +``` + +#### Supported Features + +- **Text-to-Video**: Generate videos from text descriptions +- **Image-to-Video**: Animate static images with text prompts +- **Durations**: 4, 8, or 12 seconds +- **Resolutions**: + - `720x1280` (9:16 vertical) + - `1280x720` (16:9 horizontal) + - `1080x1920` (9:16 Full HD vertical) + - `1920x1080` (16:9 Full HD horizontal) + - `1024x1792` (tall vertical) + - `1792x1024` (wide horizontal) +- **Wait Modes**: + - Synchronous (blocking, wait for completion) + - Asynchronous (callbacks, non-blocking) + - Manual tracking (custom control flow) +- **Output Format**: MP4 (H.264) + +#### Model Comparison + +| Feature | Sora 2 | Sora 2 Pro | +|---------|--------|------------| +| **Speed** | Fast ⚡ | Slower 🐢 | +| **Quality** | High | Superior | +| **Temporal Consistency** | Good | Excellent | +| **Prompt Understanding** | Good | Superior | +| **Best For** | Rapid iteration, previews | Final production, marketing | + +**References**: +- [OpenAI Video Generation Documentation](https://platform.openai.com/docs/guides/video-generation) +- [Sora Models Overview](https://platform.openai.com/docs/models/sora) + + ### Video Generation with Luma AI Generate smooth, high-fidelity videos using Luma AI’s Ray models (Ray 1.6, Ray 2, and Ray Flash 2). These models support text-to-video and image-to-video generation with optional keyframe interpolation. Image-to-video accepts either a single image or two keyframe images (frame0, frame1) for controlled motion. diff --git a/docs/docs/api-reference/sora-video.md b/docs/docs/api-reference/sora-video.md new file mode 100644 index 0000000..1935d8f --- /dev/null +++ b/docs/docs/api-reference/sora-video.md @@ -0,0 +1,593 @@ +--- +sidebar_position: 8 +title: Sora (OpenAI Video Generation) +description: Generate high-quality videos using OpenAI's Sora models (Sora 2 and Sora 2 Pro) with text-to-video and image-to-video capabilities. +keywords: + - Sora 2 + - Sora 2 Pro + - OpenAI video generation + - video generation + - text to video + - image to video + - video AI + - fashion video +--- + +# Sora (OpenAI Video Generation) + +Generate high-quality videos using OpenAI's **Sora 2** and **Sora 2 Pro** models. 
+
+## Basic Usage
+
+### Text-to-Video Generation
+
+```python
+from tryon.api.openAI import SoraVideoAdapter
+
+# Initialize adapter (uses Sora 2 by default)
+adapter = SoraVideoAdapter()
+
+# Generate a video from text prompt (synchronous)
+video_bytes = adapter.generate_text_to_video(
+    prompt="A fashion model walking down a runway wearing an elegant evening gown",
+    duration=8,  # seconds
+    resolution="1280x720"  # 16:9 HD
+)
+
+# Save the video
+with open("runway_walk.mp4", "wb") as f:
+    f.write(video_bytes)
+```
+
+### Using Sora 2 Pro
+
+```python
+# Initialize with Sora 2 Pro for higher quality
+adapter = SoraVideoAdapter(model_version="sora-2-pro")
+
+video_bytes = adapter.generate_text_to_video(
+    prompt="Cinematic slow-motion shot of fabric flowing in the wind",
+    duration=12,
+    resolution="1792x1024"  # wide format, Sora 2 Pro only
+)
+
+with open("fabric_flow.mp4", "wb") as f:
+    f.write(video_bytes)
+```
+
+### Image-to-Video Generation
+
+```python
+# Animate a static image
+adapter = SoraVideoAdapter()
+
+video_bytes = adapter.generate_image_to_video(
+    image="model_photo.jpg",
+    prompt="The model turns around and smiles at the camera",
+    duration=4,
+    resolution="1280x720"
+)
+
+with open("animated_model.mp4", "wb") as f:
+    f.write(video_bytes)
+```
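+
+Source images rarely match a supported video size exactly. By default (`auto_resize=True`) the adapter resizes the input to the closest supported resolution for the selected model; pass `auto_resize=False` to fail fast instead:
+
+```python
+# Raises ValueError if the image dimensions don't exactly match a
+# supported resolution, instead of silently resizing
+video_bytes = adapter.generate_image_to_video(
+    image="model_photo.jpg",
+    prompt="The model turns around and smiles at the camera",
+    duration=4,
+    resolution="1280x720",
+    auto_resize=False
+)
+```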
+
+## Advanced Usage
+
+### Asynchronous Generation with Callbacks
+
+```python
+adapter = SoraVideoAdapter()
+
+# Define callback functions
+def on_complete(video_bytes):
+    with open("output.mp4", "wb") as f:
+        f.write(video_bytes)
+    print("✅ Video generation complete!")
+
+def on_error(error):
+    print(f"❌ Error: {error}")
+
+def on_progress(status):
+    print(f"Status: {status['status']}, Progress: {status.get('progress', 'N/A')}")
+
+# Start async generation
+video_id = adapter.generate_text_to_video_async(
+    prompt="A person trying on different outfits in a fashion boutique",
+    duration=8,
+    resolution="1280x720",
+    on_complete=on_complete,
+    on_error=on_error,
+    on_progress=on_progress
+)
+
+print(f"Video generation started with ID: {video_id}")
+# Script continues immediately, callbacks will be invoked when ready
+```
+
+### Manual Status Tracking
+
+```python
+# Start generation without waiting
+video_id = adapter.generate_text_to_video(
+    prompt="Fashion runway show with multiple models",
+    duration=12,
+    resolution="1280x720",
+    wait=False  # Return immediately
+)
+
+# Check status manually
+import time
+while True:
+    status = adapter.get_video_status(video_id)
+    print(f"Status: {status['status']}")
+
+    if status['status'] == 'completed':
+        video_bytes = adapter.download_video(video_id)
+        with open("runway_show.mp4", "wb") as f:
+            f.write(video_bytes)
+        break
+    elif status['status'] == 'failed':
+        print(f"Failed: {status.get('error')}")
+        break
+
+    time.sleep(5)
+```
+
+## CLI Usage
+
+The package includes a command-line interface for easy video generation:
+
+### Basic Text-to-Video
+
+```bash
+python sora_video.py --prompt "A fashion model walking in the rain" \
+    --output walk.mp4
+```
+
+### High-Quality with Sora 2 Pro
+
+```bash
+python sora_video.py --prompt "Cinematic fashion runway show" \
+    --model sora-2-pro \
+    --duration 12 \
+    --resolution 1792x1024 \
+    --output runway.mp4
+```
+
+### Image-to-Video
+
+```bash
+python sora_video.py --image model_photo.jpg \
+    --prompt "The model waves and smiles" \
+    --duration 4 \
+    --output animated.mp4
+```
+
+### Asynchronous Mode
+
+```bash
+python sora_video.py --prompt "Fabric flowing in slow motion" \
+    --duration 8 \
+    --async \
+    --output fabric.mp4
+```
+
+## API Reference
+
+### SoraVideoAdapter
+
+#### Constructor
+
+```python
+SoraVideoAdapter(
+    api_key: Optional[str] = None,
+    model_version: str = "sora-2",
+    polling_interval: int = 5,
+    max_polling_time: int = 300
+)
+```
+
+**Parameters:**
+- `api_key` (str, optional): OpenAI API key. Defaults to `OPENAI_API_KEY` environment variable.
+- `model_version` (str, optional): Model to use. Options: `"sora-2"`, `"sora-2-pro"`. Default: `"sora-2"`.
+- `polling_interval` (int, optional): Seconds between status checks. Default: 5.
+- `max_polling_time` (int, optional): Maximum wait time in seconds. Default: 300 (5 minutes).
+
+#### generate_text_to_video()
+
+```python
+generate_text_to_video(
+    prompt: str,
+    duration: int = 4,
+    resolution: str = "1280x720",
+    wait: bool = True
+) -> Union[bytes, str]
+```
+
+Generate a video from a text prompt.
+
+**Parameters:**
+- `prompt` (str): Text description of the video content. **Required**.
+- `duration` (int, optional): Video length in seconds. Options: `4`, `8`, `12`. Default: `4`.
+- `resolution` (str, optional): Output resolution. See [Supported Resolutions](#supported-resolutions). Default: `"1280x720"`.
+- `wait` (bool, optional): If `True`, waits for completion and returns video bytes. If `False`, returns video ID immediately. Default: `True`.
+
+**Returns:**
+- `bytes`: Video data (if `wait=True`)
+- `str`: Video generation ID (if `wait=False`)
+
+**Raises:**
+- `ValueError`: If parameters are invalid
+- `TimeoutError`: If generation exceeds `max_polling_time`
+- `RuntimeError`: If generation fails
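+
+For example, the two return forms:
+
+```python
+# wait=True (default) blocks until completion and returns the video bytes
+video_bytes = adapter.generate_text_to_video(prompt="A fashion model walking")
+
+# wait=False returns a generation ID immediately for manual tracking
+video_id = adapter.generate_text_to_video(
+    prompt="A fashion model walking",
+    wait=False
+)
+```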
+
+#### generate_image_to_video()
+
+```python
+generate_image_to_video(
+    image: Union[str, io.BytesIO, Image.Image],
+    prompt: str,
+    duration: int = 4,
+    resolution: str = "1280x720",
+    wait: bool = True,
+    auto_resize: bool = True
+) -> Union[bytes, str]
+```
+
+Generate a video from an image and text prompt.
+
+**Parameters:**
+- `image` (Union[str, io.BytesIO, Image.Image]): Input image. Can be a file path, BytesIO buffer, or PIL Image. **Required**.
+- `prompt` (str): Animation instructions. **Required**.
+- `duration` (int, optional): Video length in seconds. Options: `4`, `8`, `12`. Default: `4`.
+- `resolution` (str, optional): Output resolution. Default: `"1280x720"`.
+- `wait` (bool, optional): Whether to wait for completion. Default: `True`.
+- `auto_resize` (bool, optional): If `True`, automatically resizes the image to the closest supported resolution when its dimensions don't match. If `False`, a mismatch raises `ValueError`. Default: `True`.
+
+**Returns:**
+- `bytes`: Video data (if `wait=True`)
+- `str`: Video generation ID (if `wait=False`)
+
+#### generate_text_to_video_async()
+
+```python
+generate_text_to_video_async(
+    prompt: str,
+    duration: int = 4,
+    resolution: str = "1280x720",
+    on_complete: Optional[Callable[[bytes], None]] = None,
+    on_error: Optional[Callable[[str], None]] = None,
+    on_progress: Optional[Callable[[Dict[str, Any]], None]] = None
+) -> str
+```
+
+Generate video asynchronously with callback functions.
+
+**Parameters:**
+- `prompt` (str): Text description. **Required**.
+- `duration` (int, optional): Video length in seconds. Default: `4`.
+- `resolution` (str, optional): Output resolution. Default: `"1280x720"`.
+- `on_complete` (Callable, optional): Called with video bytes when complete.
+- `on_error` (Callable, optional): Called with error message if generation fails.
+- `on_progress` (Callable, optional): Called with status dict during generation.
+
+**Returns:**
+- `str`: Video generation ID
+
+#### generate_image_to_video_async()
+
+```python
+generate_image_to_video_async(
+    image: Union[str, io.BytesIO, Image.Image],
+    prompt: str,
+    duration: int = 4,
+    resolution: str = "1280x720",
+    on_complete: Optional[Callable[[bytes], None]] = None,
+    on_error: Optional[Callable[[str], None]] = None,
+    on_progress: Optional[Callable[[Dict[str, Any]], None]] = None,
+    auto_resize: bool = True
+) -> str
+```
+
+Generate video from image asynchronously with callbacks.
+
+#### get_video_status()
+
+```python
+get_video_status(video_id: str) -> Dict[str, Any]
+```
+
+Check the status of a video generation request.
+
+**Returns:**
+```python
+{
+    "status": "queued" | "in_progress" | "completed" | "failed",
+    "id": "video_id",
+    "progress": 0-100,  # Optional
+    "url": "...",       # Only when completed
+    "error": "..."      # Only when failed
+}
+```
+
+#### download_video()
+
+```python
+download_video(video_id: str) -> bytes
+```
+
+Download a completed video by its ID.
+
+**Raises:**
+- `RuntimeError`: If video is not yet completed
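+
+Because the generation ID is a plain string, it can be stored and used to fetch the result later, assuming the generation has finished on the server:
+
+```python
+video_id = adapter.generate_text_to_video(prompt="...", wait=False)
+
+# Later (even from a different process): check before downloading
+if adapter.get_video_status(video_id)["status"] == "completed":
+    with open("video.mp4", "wb") as f:
+        f.write(adapter.download_video(video_id))
+```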
+
+## Configuration Options
+
+### Supported Resolutions
+
+| Resolution | Aspect Ratio | Orientation | Supported By | Use Case |
+|-----------|--------------|-------------|--------------|----------|
+| `720x1280` | 9:16 | Vertical | Sora 2, Sora 2 Pro | Mobile, Stories |
+| `1280x720` | 16:9 | Horizontal | Sora 2, Sora 2 Pro | Standard HD |
+| `1024x1792` | ~9:16 | Tall Vertical | Sora 2 Pro only | Special formats |
+| `1792x1024` | ~16:9 | Wide Horizontal | Sora 2 Pro only | Cinematic |
+
+### Supported Durations
+
+- **4 seconds**: Quick clips, previews, social media snippets
+- **8 seconds**: Standard content, demonstrations, short narratives
+- **12 seconds**: Extended content, detailed animations, storytelling
+
+## Model Comparison
+
+| Feature | Sora 2 | Sora 2 Pro |
+|---------|--------|------------|
+| **Speed** | Fast ⚡ | Slower 🐢 |
+| **Quality** | High | Superior |
+| **Temporal Consistency** | Good | Excellent |
+| **Prompt Understanding** | Good | Superior |
+| **Frame Coherence** | Good | Excellent |
+| **Best For** | Rapid iteration, previews | Final production, marketing |
+| **Cost** | Lower | Higher |
+
+## Wait Mechanisms
+
+### 1. Synchronous (Polling) - Default
+
+**When to use:**
+- Simple scripts
+- One-off generations
+- When you can block and wait
+
+**How it works:**
+```python
+video_bytes = adapter.generate_text_to_video(
+    prompt="...",
+    wait=True  # Blocks until complete
+)
+```
+
+### 2. Asynchronous (Callbacks)
+
+**When to use:**
+- Multiple concurrent generations
+- Long-running applications
+- When you need progress updates
+- Non-blocking workflows
+
+**How it works:**
+```python
+video_id = adapter.generate_text_to_video_async(
+    prompt="...",
+    on_complete=lambda data: save_video(data),
+    on_progress=lambda status: print(status)
+)
+# Continues immediately
+```
+
+### 3. Manual Tracking
+
+**When to use:**
+- Custom control flow
+- Integration with existing systems
+- When you need fine-grained control
+
+**How it works:**
+```python
+# Start without waiting
+video_id = adapter.generate_text_to_video(prompt="...", wait=False)
+
+# Check status anytime
+status = adapter.get_video_status(video_id)
+
+# Download when ready
+if status['status'] == 'completed':
+    video_bytes = adapter.download_video(video_id)
+```
+
+## Common Use Cases
+
+### Fashion Runway Videos
+
+```python
+adapter = SoraVideoAdapter(model_version="sora-2-pro")
+
+video_bytes = adapter.generate_text_to_video(
+    prompt="Professional fashion runway show with model wearing elegant evening gown, "
+           "cinematic lighting, slow motion walk, professional photography",
+    duration=12,
+    resolution="1792x1024"
+)
+```
+
+### Product Animation
+
+```python
+adapter = SoraVideoAdapter()
+
+video_bytes = adapter.generate_image_to_video(
+    image="product_photo.jpg",
+    prompt="360-degree rotation of the product, professional studio lighting",
+    duration=4,
+    resolution="1280x720"
+)
+```
+
+### Fabric Visualization
+
+```python
+video_bytes = adapter.generate_text_to_video(
+    prompt="Luxurious silk fabric flowing gracefully in slow motion, "
+           "golden hour lighting, close-up macro shot",
+    duration=8,
+    resolution="1280x720"
+)
+```
+
+### Model Portfolio Animation
+
+```python
+adapter = SoraVideoAdapter()
+
+video_bytes = adapter.generate_image_to_video(
+    image="model_portrait.jpg",
+    prompt="Model turns head towards camera with confident expression, "
+           "professional studio lighting, fashion photography",
+    duration=4,
+    resolution="720x1280"  # Vertical for mobile
+)
+```
+
+## Error Handling
+
+```python
+from tryon.api.openAI import SoraVideoAdapter
+
+adapter = SoraVideoAdapter()
+
+try:
+    video_bytes = adapter.generate_text_to_video(
+        prompt="A fashion model walking",
+        duration=8,
+        resolution="1280x720"
+    )
+
+    with open("output.mp4", "wb") as f:
+        f.write(video_bytes)
+
+except ValueError as e:
+    print(f"Invalid parameters: {e}")
+except TimeoutError as e:
+    print(f"Generation timeout: {e}")
+except RuntimeError as e:
+    print(f"Generation failed: {e}")
+except Exception as e:
+    print(f"Unexpected error: {e}")
+```
+
+## Performance Tips
+
+1. **Use Sora 2 for iteration**: Start with Sora 2 for rapid prototyping, then switch to Sora 2 Pro for final output.
+
+2. **Optimize duration**: Longer videos take significantly more time. Use 4-second clips for testing.
+
+3. **Async for batch processing**: When generating multiple videos, use async mode to parallelize:
+   ```python
+   for prompt in prompts:
+       adapter.generate_text_to_video_async(
+           prompt=prompt,
+           on_complete=lambda b: save_video(b)
+       )
+   ```
+
+4. **Monitor progress**: Use `on_progress` callbacks to track generation status.
+
+5. **Handle timeouts gracefully**: Set an appropriate `max_polling_time` for your duration and model.
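+
+   For example, a sketch of a more generous polling budget for a long Sora 2 Pro job (actual generation times vary):
+
+   ```python
+   adapter = SoraVideoAdapter(
+       model_version="sora-2-pro",
+       polling_interval=10,   # check status less often
+       max_polling_time=900   # allow up to 15 minutes
+   )
+   ```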
+
+## Limitations
+
+- Maximum video duration: 12 seconds
+- Generation time varies by duration and model (typically 1-5 minutes)
+- API rate limits apply (check OpenAI's documentation)
+- Video format: MP4 (H.264)
+- Frame rate: 24 FPS (default, may vary)
+
+## Related Documentation
+
+- [OpenAI Video Generation Guide](https://platform.openai.com/docs/guides/video-generation)
+- [GPT-Image (OpenAI)](./gpt-image)
+- [FLUX.2 Image Generation](./flux2)
+- [API Reference Overview](./overview)
+
+## Troubleshooting
+
+### "Video generation timeout"
+- Increase the `max_polling_time` parameter
+- Try a shorter duration (4s instead of 12s)
+- Check OpenAI API status
+
+### "Invalid resolution"
+- Use one of the supported resolutions listed above, and note that `1024x1792` and `1792x1024` require `sora-2-pro`
+- Common safe choice: `"1280x720"`
+
+### "API key not found"
+- Set the `OPENAI_API_KEY` environment variable
+- Or pass the `api_key` parameter explicitly
+
+### Poor video quality
+- Try using Sora 2 Pro instead of Sora 2
+- Refine your prompt with more details
+- Check resolution settings
+
+## Support
+
+For issues and questions:
+- OpenAI API Support: [OpenAI Help Center](https://help.openai.com/)
+- OpenTryOn Issues: [GitHub Issues](https://github.com/tryonlabs/opentryon/issues)
diff --git a/docs/docs/getting-started/quickstart.md b/docs/docs/getting-started/quickstart.md
index b1f272a..191fb3c 100644
--- a/docs/docs/getting-started/quickstart.md
+++ b/docs/docs/getting-started/quickstart.md
@@ -136,6 +136,47 @@ edited_images = adapter.generate_image_edit(
 )
 
+**Sora (OpenAI Video Generation):**
+
+```python
+from tryon.api.openAI.video_adapter import SoraVideoAdapter
+
+# Text-to-video generation (uses Sora 2 by default)
+adapter = SoraVideoAdapter()
+video_bytes = adapter.generate_text_to_video(
+    prompt="A fashion model walking down a runway wearing an elegant evening gown",
+    duration=8,  # seconds (4, 8, or 12)
+    resolution="1280x720"  # 16:9 HD
+)
+
+# Save video
+with open("runway_walk.mp4", "wb") as f:
+    f.write(video_bytes)
+
+# Use Sora 2 Pro for higher quality (swap in for `adapter` below)
+adapter_pro = SoraVideoAdapter(model_version="sora-2-pro")
+
+# Image-to-video (animate a static image)
+video_bytes = adapter.generate_image_to_video(
+    image="model_photo.jpg",
+    prompt="The model turns and smiles at the camera",
+    duration=4,
+    resolution="1280x720"
+)
+
+# Asynchronous generation with callbacks
+def on_complete(video_bytes):
+    with open("result.mp4", "wb") as f:
+        f.write(video_bytes)
+    print("Video ready!")
+
+video_id = adapter.generate_text_to_video_async(
+    prompt="Fabric flowing in slow motion",
+    duration=8,
+    on_complete=on_complete
+)
+```
+
 ### 3. Datasets
 
 Load and work with fashion datasets:
@@ -385,6 +426,7 @@ python run_demo.py --name outfit_generator
   - [FLUX.2 Image Generation](../api-reference/flux2)
   - [Nano Banana Image Generation](../api-reference/nano-banana)
   - [GPT-Image (OpenAI)](../api-reference/gpt-image)
+  - [Sora Video (OpenAI)](../api-reference/sora-video)
 
 - **[Datasets Module](../datasets/overview)**: Load and work with fashion datasets
   - [Fashion-MNIST](../datasets/fashion-mnist)
diff --git a/docs/docs/intro.md b/docs/docs/intro.md
index ee266e8..b67e7ca 100644
--- a/docs/docs/intro.md
+++ b/docs/docs/intro.md
@@ -1,7 +1,7 @@
 ---
 slug: /
 title: OpenTryOn
-description: OpenTryOn is an open-source AI toolkit for fashion technology and virtual try-on applications. 
Features virtual try-on APIs (Amazon Nova Canvas, Kling AI, Segmind), image generation APIs (Nano Banana, Nano Banana Pro, FLUX.2, GPT-Image-1), datasets (Fashion-MNIST, VITON-HD, Subjects200K), garment segmentation, pose estimation, and TryOnDiffusion implementation. +description: OpenTryOn is an open-source AI toolkit for fashion technology and virtual try-on applications. Features virtual try-on APIs (Amazon Nova Canvas, Kling AI, Segmind), image generation APIs (Nano Banana, Nano Banana Pro, FLUX.2, GPT-Image-1), video generation APIs (Sora 2), datasets (Fashion-MNIST, VITON-HD, Subjects200K), garment segmentation, pose estimation, and TryOnDiffusion implementation. keywords: - virtual try-on - fashion AI @@ -28,6 +28,9 @@ keywords: - FLUX.2 - GPT-Image-1 - OpenAI + - Sora 2 + - video generation + - AI video image: /img/opentryon-social-card.jpg --- @@ -59,7 +62,7 @@ Easy-to-use interfaces for fashion and virtual try-on datasets: - **Automatic Download**: Built-in download functionality and HuggingFace integration ### API Adapters -Cloud-based virtual try-on and image generation APIs: +Cloud-based virtual try-on, image generation, and video generation APIs: - **Segmind**: Try-On Diffusion API for realistic virtual try-on generation - **Kling AI**: Virtual try-on with asynchronous processing - **Amazon Nova Canvas**: AWS-based virtual try-on service @@ -68,6 +71,7 @@ Cloud-based virtual try-on and image generation APIs: - **FLUX.2 [PRO]**: High-quality image generation with text-to-image, image editing, and multi-image composition - **FLUX.2 [FLEX]**: Flexible image generation with advanced controls (guidance scale, steps, prompt upsampling) - **GPT-Image-1 & GPT-Image-1.5**: OpenAI image generation with strong prompt understanding, transparent backgrounds, and mask-based editing. GPT-Image-1.5 offers enhanced quality and better consistency +- **Sora 2 & Sora 2 Pro**: OpenAI video generation with text-to-video and image-to-video capabilities. Sora 2 Pro offers superior quality with enhanced temporal consistency (4-12 second videos, multiple resolutions) ### Garment Preprocessing - **Garment Segmentation**: U2Net-based segmentation for upper, lower, and dress categories diff --git a/docs/sidebars.ts b/docs/sidebars.ts index ca7907a..ec7ab6e 100644 --- a/docs/sidebars.ts +++ b/docs/sidebars.ts @@ -58,6 +58,7 @@ const sidebars: SidebarsConfig = { 'api-reference/flux2', 'api-reference/nano-banana', 'api-reference/gpt-image', + 'api-reference/sora-video', 'api-reference/utils', ], }, diff --git a/sora_video.py b/sora_video.py new file mode 100644 index 0000000..1a2cc74 --- /dev/null +++ b/sora_video.py @@ -0,0 +1,338 @@ +""" +Sora Video Generation CLI + +Command-line interface for generating videos using OpenAI's Sora models. 
+
+This script provides easy access to Sora 2 and Sora 2 Pro for:
+- Text-to-video generation
+- Image-to-video generation (animate static images)
+- Both synchronous (blocking) and asynchronous (non-blocking) modes
+
+Examples:
+    Text-to-video (synchronous):
+        python sora_video.py --prompt "A fashion model walking down a runway" \\
+            --duration 8 --resolution 1280x720 --output runway.mp4
+
+    Text-to-video (async):
+        python sora_video.py --prompt "Fabric flowing in slow motion" \\
+            --duration 4 --async --output fabric.mp4
+
+    Image-to-video:
+        python sora_video.py --image model.jpg \\
+            --prompt "The model turns and smiles" \\
+            --duration 4 --output animated.mp4
+
+    Using Sora 2 Pro:
+        python sora_video.py --prompt "Cinematic fashion shoot" \\
+            --model sora-2-pro --duration 12 --output cinematic.mp4
+
+Requirements:
+    - OpenAI API key (set OPENAI_API_KEY environment variable or use --api-key)
+    - openai Python package (pip install openai)
+"""
+
+from dotenv import load_dotenv
+load_dotenv()
+
+import argparse
+import os
+import sys
+import threading
+from pathlib import Path
+
+
+def build_parser():
+    """Build the argument parser for the CLI."""
+    parser = argparse.ArgumentParser(
+        description="Generate videos using OpenAI's Sora models (Sora 2 and Sora 2 Pro)",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Basic text-to-video
+  python sora_video.py --prompt "A person walking in the rain" --output walk.mp4
+
+  # High-quality with Sora 2 Pro
+  python sora_video.py --prompt "Fashion runway show" --model sora-2-pro \\
+    --duration 12 --resolution 1792x1024 --output runway.mp4
+
+  # Animate a static image
+  python sora_video.py --image photo.jpg --prompt "The person waves" \\
+    --duration 4 --output animated.mp4
+
+  # Asynchronous generation (non-blocking)
+  python sora_video.py --prompt "Fabric in slow motion" --async --output fabric.mp4
+
+For more information, visit: https://platform.openai.com/docs/guides/video-generation
+        """
+    )
+
+    # Required arguments
+    parser.add_argument(
+        '--prompt',
+        type=str,
+        required=True,
+        help="Text description of the video to generate or animation instructions"
+    )
+
+    parser.add_argument(
+        '--output',
+        '-o',
+        type=str,
+        required=True,
+        help="Output video file path (e.g., output.mp4)"
+    )
+
+    # Optional: Image input for image-to-video
+    parser.add_argument(
+        '--image',
+        '-i',
+        type=str,
+        default=None,
+        help="Input image path for image-to-video generation (optional)"
+    )
+
+    # Model selection
+    parser.add_argument(
+        '--model',
+        type=str,
+        default="sora-2",
+        choices=['sora-2', 'sora-2-pro'],
+        help="Sora model version. Default: sora-2 (fast). Use sora-2-pro for higher quality."
+    )
+
+    # Video parameters
+    parser.add_argument(
+        '--duration',
+        '-d',
+        type=int,
+        default=4,
+        choices=[4, 8, 12],
+        help="Video duration in seconds. Options: 4, 8, 12. Default: 4"
+    )
+
+    parser.add_argument(
+        '--resolution',
+        '-r',
+        type=str,
+        default="1280x720",
+        choices=['720x1280', '1280x720', '1024x1792', '1792x1024'],
+        help="Video resolution. Default: 1280x720 (16:9 horizontal). "
+             "1024x1792 and 1792x1024 require --model sora-2-pro."
+    )
+
+    # Wait mode
+    parser.add_argument(
+        '--async',
+        dest='async_mode',
+        action='store_true',
+        help="Use asynchronous mode (non-blocking, returns immediately)"
+    )
+
+    # API configuration
+    parser.add_argument(
+        '--api-key',
+        type=str,
+        default=None,
+        help="OpenAI API key (optional, reads from OPENAI_API_KEY env var if not provided)"
+    )
+
+    parser.add_argument(
+        '--polling-interval',
+        type=int,
+        default=5,
+        help="Seconds between status checks when polling. Default: 5"
+    )
+
+    parser.add_argument(
+        '--max-wait-time',
+        type=int,
+        default=300,
+        help="Maximum wait time in seconds. Default: 300 (5 minutes)"
+    )
+
+    # Verbosity
+    parser.add_argument(
+        '--verbose',
+        '-v',
+        action='store_true',
+        help="Enable verbose output"
+    )
+
+    return parser
+
+
+def main():
+    """Main CLI entry point."""
+    parser = build_parser()
+    args = parser.parse_args()
+
+    # Import here to avoid loading heavy dependencies if just showing help
+    try:
+        from tryon.api.openAI.video_adapter import SoraVideoAdapter
+    except ImportError as e:
+        print(f"Error: Failed to import SoraVideoAdapter: {e}", file=sys.stderr)
+        print("Please ensure OpenAI SDK is installed: pip install openai", file=sys.stderr)
+        sys.exit(1)
+
+    # Get API key
+    api_key = args.api_key or os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        print("Error: OpenAI API key not found.", file=sys.stderr)
+        print("Please provide --api-key or set OPENAI_API_KEY environment variable.", file=sys.stderr)
+        sys.exit(1)
+
+    # Initialize adapter
+    if args.verbose:
+        print(f"Initializing Sora adapter with model: {args.model}")
+
+    try:
+        adapter = SoraVideoAdapter(
+            api_key=api_key,
+            model_version=args.model,
+            polling_interval=args.polling_interval,
+            max_polling_time=args.max_wait_time
+        )
+    except Exception as e:
+        print(f"Error: Failed to initialize adapter: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # Validate output path
+    output_path = Path(args.output)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Generate video
+    try:
+        if args.image:
+            # Image-to-video
+            if args.verbose:
+                print(f"Generating video from image: {args.image}")
+                print(f"Prompt: {args.prompt}")
+                print(f"Duration: {args.duration}s, Resolution: {args.resolution}")
+
+            if args.async_mode:
+                # Asynchronous mode
+                done = threading.Event()
+                errors = []
+
+                def on_complete(video_bytes):
+                    with open(output_path, "wb") as f:
+                        f.write(video_bytes)
+                    print(f"✅ Video saved to: {output_path}")
+                    done.set()
+
+                def on_error(error):
+                    # sys.exit() from the monitor thread would not stop the main
+                    # thread, so record the failure and signal completion instead
+                    print(f"❌ Video generation failed: {error}", file=sys.stderr)
+                    errors.append(error)
+                    done.set()
+
+                def on_progress(status):
+                    if args.verbose:
+                        progress = status.get('progress', 'processing')
+                        print(f"Status: {status['status']} - {progress}")
+
+                video_id = adapter.generate_image_to_video_async(
+                    image=args.image,
+                    prompt=args.prompt,
+                    duration=args.duration,
+                    resolution=args.resolution,
+                    on_complete=on_complete,
+                    on_error=on_error,
+                    on_progress=on_progress if args.verbose else None
+                )
+
+                print(f"🎬 Video generation started (ID: {video_id})")
+                print("Running in async mode. Monitoring progress in the background...")
+                print("Press Ctrl+C to exit (video will continue generating)")
+
+                # Block until the background monitor signals completion
+                try:
+                    while not done.wait(timeout=1):
+                        pass
+                except KeyboardInterrupt:
+                    print("\nExiting... (video generation continues on server)")
+                    sys.exit(0)
+                if errors:
+                    sys.exit(1)
+
+            else:
+                # Synchronous mode
+                print("🎬 Generating video... (this may take a few minutes)")
+                video_bytes = adapter.generate_image_to_video(
+                    image=args.image,
+                    prompt=args.prompt,
+                    duration=args.duration,
+                    resolution=args.resolution,
+                    wait=True
+                )
+
+                with open(output_path, "wb") as f:
+                    f.write(video_bytes)
+
+                print(f"✅ Video saved to: {output_path}")
+
+        else:
+            # Text-to-video
+            if args.verbose:
+                print("Generating video from text prompt")
+                print(f"Prompt: {args.prompt}")
+                print(f"Duration: {args.duration}s, Resolution: {args.resolution}")
+
+            if args.async_mode:
+                # Asynchronous mode
+                done = threading.Event()
+                errors = []
+
+                def on_complete(video_bytes):
+                    with open(output_path, "wb") as f:
+                        f.write(video_bytes)
+                    print(f"✅ Video saved to: {output_path}")
+                    done.set()
+
+                def on_error(error):
+                    # Record the failure and signal the main thread (see above)
+                    print(f"❌ Video generation failed: {error}", file=sys.stderr)
+                    errors.append(error)
+                    done.set()
+
+                def on_progress(status):
+                    if args.verbose:
+                        progress = status.get('progress', 'processing')
+                        print(f"Status: {status['status']} - {progress}")
+
+                video_id = adapter.generate_text_to_video_async(
+                    prompt=args.prompt,
+                    duration=args.duration,
+                    resolution=args.resolution,
+                    on_complete=on_complete,
+                    on_error=on_error,
+                    on_progress=on_progress if args.verbose else None
+                )
+
+                print(f"🎬 Video generation started (ID: {video_id})")
+                print("Running in async mode. Monitoring progress in the background...")
+                print("Press Ctrl+C to exit (video will continue generating)")
+
+                # Block until the background monitor signals completion
+                try:
+                    while not done.wait(timeout=1):
+                        pass
+                except KeyboardInterrupt:
+                    print("\nExiting... (video generation continues on server)")
+                    sys.exit(0)
+                if errors:
+                    sys.exit(1)
+
+            else:
+                # Synchronous mode
+                print("🎬 Generating video... (this may take a few minutes)")
+                video_bytes = adapter.generate_text_to_video(
+                    prompt=args.prompt,
+                    duration=args.duration,
+                    resolution=args.resolution,
+                    wait=True
+                )
+
+                with open(output_path, "wb") as f:
+                    f.write(video_bytes)
+
+                print(f"✅ Video saved to: {output_path}")
+
+    except KeyboardInterrupt:
+        print("\n⚠️ Interrupted by user", file=sys.stderr)
+        sys.exit(130)
+    except Exception as e:
+        print(f"❌ Error: {e}", file=sys.stderr)
+        if args.verbose:
+            import traceback
+            traceback.print_exc()
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tryon/api/openAI/__init__.py b/tryon/api/openAI/__init__.py
index 3cb55d2..4b87f4b 100644
--- a/tryon/api/openAI/__init__.py
+++ b/tryon/api/openAI/__init__.py
@@ -1,20 +1,31 @@
 """
-GPT Image 1 API Adapter
+OpenAI API Adapters
 
-This module provides an adapter for OpenAI's image generation model:
-- GPT Image 1 (gpt-image-1): High-quality image generation and image editing
+This module provides adapters for OpenAI's generative models:
+
+Image Generation:
+- GPT Image 1 (gpt-image-1): High-quality image generation and editing
+- GPT Image 1.5 (gpt-image-1.5): Enhanced quality and prompt understanding
   with support for text-to-image, image-to-image edits, masks, background
   control, quality settings, and multiple output images.
 
-The adapter normalizes OpenAI SDK responses and returns decoded image bytes
+Video Generation:
+- Sora 2 (sora-2): Fast, high-quality video generation
+- Sora 2 Pro (sora-2-pro): Enhanced quality with superior temporal consistency
+
+Both video models support text-to-video, image-to-video, and multiple wait
+strategies.
+
+The adapters normalize OpenAI SDK responses and return decoded bytes
 suitable for saving, post-processing, or further transformation. 
-Reference:
-https://platform.openai.com/docs/guides/image-generation
+References:
+- Image: https://platform.openai.com/docs/guides/image-generation
+- Video: https://platform.openai.com/docs/guides/video-generation
 """
 
 from .image_adapter import GPTImageAdapter
+from .video_adapter import SoraVideoAdapter
 
 __all__ = [
     'GPTImageAdapter',
+    'SoraVideoAdapter',
 ]
\ No newline at end of file
diff --git a/tryon/api/openAI/video_adapter.py b/tryon/api/openAI/video_adapter.py
new file mode 100644
index 0000000..bab143b
--- /dev/null
+++ b/tryon/api/openAI/video_adapter.py
@@ -0,0 +1,948 @@
+"""
+Sora (OpenAI Video Generation) API Adapter
+
+Adapter for OpenAI's Sora video generation models (Sora 2 and Sora 2 Pro).
+
+These models support high-quality video generation from text prompts and optional
+reference images. This adapter provides a clean, unified interface for the following workflows:
+
+- Text-to-Video: Generate videos from text descriptions
+- Text+Image-to-Video: Generate videos from text and a reference image
+- Multiple wait strategies: Polling (sync) and callback-based (async)
+- Flexible output control: Duration and resolution
+
+The adapter can return video bytes directly or provide status tracking for
+long-running video generation tasks.
+
+Reference:
+https://platform.openai.com/docs/guides/video-generation
+
+Models:
+- Sora 2 (sora-2): Fast, high-quality video generation (recommended for most use cases)
+- Sora 2 Pro (sora-2-pro): Enhanced quality, better temporal consistency, superior prompt understanding
+
+Examples:
+    Text-to-video with latest model:
+        >>> from tryon.api.openAI.video_adapter import SoraVideoAdapter
+        >>> adapter = SoraVideoAdapter()  # Uses sora-2 by default
+        >>> video_bytes = adapter.generate_text_to_video(
+        ...     prompt="A fashion model walking down a runway wearing an elegant evening gown",
+        ...     duration=4,
+        ...     resolution="1280x720"
+        ... )
+        >>> with open("result.mp4", "wb") as f:
+        ...     f.write(video_bytes)
+
+    Using Sora 2 Pro for higher quality:
+        >>> adapter = SoraVideoAdapter(model_version="sora-2-pro")
+        >>> video_bytes = adapter.generate_text_to_video(
+        ...     prompt="A cinematic shot of fabric flowing in slow motion",
+        ...     duration=8,
+        ...     resolution="1792x1024"
+        ... )
+
+    Text+Image-to-video:
+        >>> adapter = SoraVideoAdapter()
+        >>> video_bytes = adapter.generate_image_to_video(
+        ...     image="reference.jpg",
+        ...     prompt="Animate this image with the model turning and smiling",
+        ...     duration=4
+        ... )
+        >>> with open("animated.mp4", "wb") as f:
+        ...     f.write(video_bytes)
+
+    Async generation with callback:
+        >>> def on_complete(video_bytes):
+        ...     print("Video ready!")
+        ...     with open("result.mp4", "wb") as f:
+        ...         f.write(video_bytes)
+        >>>
+        >>> def on_error(error):
+        ...     print(f"Generation failed: {error}")
+        >>>
+        >>> adapter.generate_text_to_video_async(
+        ...     prompt="A person trying on different outfits",
+        ...     duration=8,
+        ...     on_complete=on_complete,
+        ...     on_error=on_error
+        ... 
) +""" + +import io +import os +import time +from typing import Optional, Union, Callable, Dict, Any +from PIL import Image + +try: + from openai import OpenAI + OPENAI_AVAILABLE = True +except ImportError: + OPENAI_AVAILABLE = False + OpenAI = None + +# Valid configuration options +VALID_MODELS = {"sora-2", "sora-2-pro"} + +# Model-specific supported resolutions +MODEL_RESOLUTIONS = { + "sora-2": { + "720x1280", # Vertical (9:16) + "1280x720", # Horizontal (16:9) + }, + "sora-2-pro": { + "720x1280", # Vertical (9:16) + "1280x720", # Horizontal (16:9) + "1024x1792", # Tall vertical + "1792x1024", # Wide horizontal + } +} + +# All supported resolutions (union of all models) +VALID_RESOLUTIONS = set().union(*MODEL_RESOLUTIONS.values()) + +VALID_DURATIONS = {4, 8, 12} # seconds + + +class SoraVideoAdapter: + """ + Adapter for OpenAI Sora Video Generation API (supports Sora 2 and Sora 2 Pro). + + Args: + api_key (str, optional): OpenAI API key. If not provided, reads from OPENAI_API_KEY environment variable. + model_version (str, optional): Model version to use. Options: "sora-2", "sora-2-pro". + Defaults to "sora-2" (fast and high-quality). + polling_interval (int, optional): Seconds to wait between status checks when polling. Defaults to 5. + max_polling_time (int, optional): Maximum time (seconds) to wait for video generation. Defaults to 300 (5 minutes). + + Examples: + >>> # Use default model (Sora 2) + >>> adapter = SoraVideoAdapter() + + >>> # Use Sora 2 Pro for higher quality + >>> adapter = SoraVideoAdapter(model_version="sora-2-pro") + + >>> # With explicit API key + >>> adapter = SoraVideoAdapter(api_key="sk-...", model_version="sora-2") + """ + + def __init__( + self, + api_key: Optional[str] = None, + model_version: str = "sora-2", + polling_interval: int = 5, + max_polling_time: int = 300 + ): + + if not OPENAI_AVAILABLE: + raise ImportError( + "OpenAI SDK is not available. " + "Please install it with 'pip install openai'." + ) + + if model_version not in VALID_MODELS: + raise ValueError( + f"Invalid model_version: {model_version}. " + f"Supported models: {VALID_MODELS}" + ) + + self.api_key = api_key or os.getenv("OPENAI_API_KEY") + if not self.api_key: + raise ValueError( + "OpenAI API key must be provided either as a parameter " + "or through the OPENAI_API_KEY environment variable." + ) + + self.model_version = model_version + self.polling_interval = polling_interval + self.max_polling_time = max_polling_time + self.client = OpenAI(api_key=self.api_key) + + + def get_image_dimensions(self, image: Union[str, io.BytesIO, Image.Image]) -> tuple: + """ + Get the dimensions (width, height) of an image. + + Args: + image: Image as file path, BytesIO buffer, or PIL Image + + Returns: + Tuple of (width, height) + """ + if isinstance(image, str): + with Image.open(image) as img: + return img.size + + if isinstance(image, Image.Image): + return image.size + + if isinstance(image, io.BytesIO): + image.seek(0) + with Image.open(image) as img: + size = img.size + image.seek(0) # Reset position after reading + return size + + raise TypeError(f"Unsupported image type: {type(image)}") + + + def get_matching_resolution(self, width: int, height: int) -> str: + """ + Find the closest valid video resolution that matches the image dimensions. 
+ + Args: + width: Image width in pixels + height: Image height in pixels + + Returns: + Valid resolution string (e.g., "1280x720") + + Raises: + ValueError: If no matching resolution is found + """ + resolution_str = f"{width}x{height}" + + # Get model-specific supported resolutions + model_resolutions = MODEL_RESOLUTIONS.get(self.model_version, VALID_RESOLUTIONS) + + # Check if exact match exists + if resolution_str in model_resolutions: + return resolution_str + + # If not exact match, raise error with suggestions + raise ValueError( + f"Image dimensions {resolution_str} do not match any supported video resolution for {self.model_version}. " + f"Supported resolutions for {self.model_version}: {sorted(model_resolutions)}. " + f"Please resize your image to one of these resolutions before generating video." + ) + + + def get_closest_resolution(self, width: int, height: int) -> str: + """ + Find the closest supported video resolution based on image aspect ratio. + + Args: + width: Image width in pixels + height: Image height in pixels + + Returns: + Closest valid resolution string (e.g., "1280x720") + """ + aspect_ratio = width / height + + # Get model-specific supported resolutions + model_resolutions = MODEL_RESOLUTIONS.get(self.model_version, VALID_RESOLUTIONS) + + # Parse all valid resolutions and find closest aspect ratio match + closest_resolution = None + min_aspect_diff = float('inf') + + for res in model_resolutions: + res_w, res_h = map(int, res.split('x')) + res_aspect = res_w / res_h + aspect_diff = abs(aspect_ratio - res_aspect) + + if aspect_diff < min_aspect_diff: + min_aspect_diff = aspect_diff + closest_resolution = res + + return closest_resolution + + + def resize_image_to_resolution( + self, + image: Union[str, io.BytesIO, Image.Image], + target_resolution: str + ) -> Image.Image: + """ + Resize an image to match a target video resolution. + + Args: + image: Image as file path, BytesIO buffer, or PIL Image + target_resolution: Target resolution string (e.g., "1280x720") + + Returns: + Resized PIL Image + """ + # Load image as PIL Image + if isinstance(image, str): + img = Image.open(image) + elif isinstance(image, io.BytesIO): + image.seek(0) + img = Image.open(image) + elif isinstance(image, Image.Image): + img = image + else: + raise TypeError(f"Unsupported image type: {type(image)}") + + # Parse target resolution + target_width, target_height = map(int, target_resolution.split('x')) + + # Resize image to exact dimensions + resized_img = img.resize((target_width, target_height), Image.Resampling.LANCZOS) + + return resized_img + + + def prepare_image(self, image: Union[str, io.BytesIO, Image.Image]) -> io.BytesIO: + """ + Prepare an image for the API request. 
+ + Args: + image: Image as file path, BytesIO buffer, or PIL Image + + Returns: + BytesIO buffer ready for API submission (file-like object) + """ + if isinstance(image, str): + with open(image, "rb") as f: + buffer = io.BytesIO(f.read()) + buffer.name = "image.png" # Add filename for multipart upload + buffer.seek(0) + return buffer + + if isinstance(image, Image.Image): + buffer = io.BytesIO() + image.save(buffer, format="PNG") + buffer.name = "image.png" # Add filename for multipart upload + buffer.seek(0) + return buffer + + if isinstance(image, io.BytesIO): + image.seek(0) + if not hasattr(image, 'name'): + image.name = "image.png" # Add filename for multipart upload + return image + + raise TypeError(f"Unsupported image type: {type(image)}") + + + def _validate_common_params(self, duration: int, resolution: str) -> None: + """Validate common parameters across generation methods.""" + if duration not in VALID_DURATIONS: + raise ValueError( + f"Invalid duration: {duration}. " + f"Supported durations (seconds): {VALID_DURATIONS}" + ) + + # Check if resolution is supported by the current model + model_supported_resolutions = MODEL_RESOLUTIONS.get(self.model_version, VALID_RESOLUTIONS) + if resolution not in model_supported_resolutions: + raise ValueError( + f"Invalid resolution: {resolution} for model {self.model_version}. " + f"Supported resolutions for {self.model_version}: {sorted(model_supported_resolutions)}" + ) + + + def _poll_video_status(self, video_id: str) -> bytes: + """ + Poll video generation status until completion (synchronous wait). + + Args: + video_id: The ID of the video generation request + + Returns: + Video bytes when generation is complete + + Raises: + TimeoutError: If video generation exceeds max_polling_time + RuntimeError: If video generation fails + """ + start_time = time.time() + + while True: + elapsed = time.time() - start_time + if elapsed > self.max_polling_time: + raise TimeoutError( + f"Video generation exceeded maximum wait time of {self.max_polling_time} seconds" + ) + + # Check status + status_response = self.client.videos.retrieve(video_id) + status = status_response.status + + if status == "completed": + # Download the video using the correct API + content = self.client.videos.download_content(video_id, variant="video") + return content.read() + + elif status == "failed": + error_msg = getattr(status_response, 'error', 'Unknown error') + raise RuntimeError(f"Video generation failed: {error_msg}") + + elif status in ["queued", "in_progress"]: + # Still generating, wait and retry + time.sleep(self.polling_interval) + continue + + else: + raise RuntimeError(f"Unknown video status: {status}") + + + def _start_video_generation( + self, + prompt: str, + duration: int, + resolution: str, + image: Optional[io.BytesIO] = None, + ) -> str: + """ + Start a video generation request and return the video ID. 
+
+        Args:
+            prompt: Text prompt describing the video
+            duration: Video length in seconds
+            resolution: Video resolution (e.g., "1280x720")
+            image: Optional reference image file-like object for image-to-video
+
+        Returns:
+            Video generation ID for status tracking
+        """
+        kwargs = {
+            "model": self.model_version,
+            "prompt": prompt,
+            "seconds": str(duration),  # API expects string: "4", "8", or "12"
+            "size": resolution,
+        }
+
+        # Add image if provided (for image-to-video)
+        if image is not None:
+            # Pass the file-like object directly as input_reference
+            image.seek(0)  # Ensure we're at the start of the buffer
+            kwargs["input_reference"] = image
+
+        response = self.client.videos.create(**kwargs)
+        return response.id
+
+
+    def generate_text_to_video(
+        self,
+        prompt: str,
+        duration: int = 4,
+        resolution: str = "1280x720",
+        wait: bool = True,
+    ) -> Union[bytes, str]:
+        """
+        Generate a video from a text prompt using OpenAI's Sora model.
+
+        This method performs text-to-video generation and can either wait for completion
+        (returning video bytes) or return immediately with a video ID for manual tracking.
+
+        Args:
+            prompt (str):
+                Text description of the desired video content.
+                Must be a non-empty string with clear, detailed instructions.
+
+            duration (int, optional):
+                Video length in seconds.
+                Allowed values: {4, 8, 12}.
+                Defaults to 4.
+
+            resolution (str, optional):
+                Output video resolution.
+                Allowed values: see MODEL_RESOLUTIONS for the current model.
+                Common options: "1280x720" (16:9) and "720x1280" (9:16);
+                "1792x1024" and "1024x1792" require sora-2-pro.
+                Defaults to "1280x720".
+
+            wait (bool, optional):
+                If True, polls the API until video generation completes and returns video bytes.
+                If False, returns the video ID immediately for manual status tracking.
+                Defaults to True.
+
+        Returns:
+            Union[bytes, str]:
+                - If wait=True: Video data as bytes (ready to save as .mp4)
+                - If wait=False: Video generation ID (str) for tracking
+
+        Raises:
+            ValueError:
+                - If prompt is empty
+                - If duration or resolution is invalid
+            TimeoutError:
+                - If video generation exceeds max_polling_time (only when wait=True)
+            RuntimeError:
+                - If video generation fails
+
+        Example:
+            >>> adapter = SoraVideoAdapter()
+            >>>
+            >>> # Synchronous generation (wait for completion)
+            >>> video_bytes = adapter.generate_text_to_video(
+            ...     prompt="A fashion model walking down a runway in an elegant dress",
+            ...     duration=8,
+            ...     resolution="1280x720"
+            ... )
+            >>> with open("runway.mp4", "wb") as f:
+            ...     f.write(video_bytes)
+            >>>
+            >>> # Asynchronous generation (manual tracking)
+            >>> video_id = adapter.generate_text_to_video(
+            ...     prompt="Fabric flowing in slow motion",
+            ...     duration=4,
+            ...     wait=False
+            ... )
+            >>> print(f"Video generation started: {video_id}")
+        """
+
+        if not prompt:
+            raise ValueError("Prompt is required for text-to-video generation.")
+
+        self._validate_common_params(duration, resolution)
+
+        # Start generation
+        video_id = self._start_video_generation(
+            prompt=prompt,
+            duration=duration,
+            resolution=resolution,
+            image=None
+        )
+
+        # Wait for completion or return ID
+        if wait:
+            return self._poll_video_status(video_id)
+        else:
+            return video_id
+
+
+    def generate_image_to_video(
+        self,
+        image: Union[str, io.BytesIO, Image.Image],
+        prompt: str,
+        duration: int = 4,
+        resolution: str = "1280x720",
+        wait: bool = True,
+        auto_resize: bool = True,
+    ) -> Union[bytes, str]:
+        """
+        Generate a video from an image and text prompt using OpenAI's Sora model. 
+ + This method performs image+text-to-video generation, animating a reference image + based on the provided text prompt. Can wait for completion or return a video ID. + + Args: + image (Union[str, io.BytesIO, Image.Image]): + Reference image to animate. + Can be a file path, BytesIO buffer, or PIL Image. + + prompt (str): + Text description of how to animate the image. + Must be a non-empty string with clear animation instructions. + + duration (int, optional): + Video length in seconds. + Allowed values: {4, 8, 12}. + Defaults to 4. + + resolution (str, optional): + Output video resolution. + Allowed values: See VALID_RESOLUTIONS. + Defaults to "1280x720". + + wait (bool, optional): + If True, polls until completion and returns video bytes. + If False, returns video ID for manual tracking. + Defaults to True. + + auto_resize (bool, optional): + If True, automatically resizes the image to the closest supported resolution + when image dimensions don't match. If False, raises an error on mismatch. + Defaults to True. + + Returns: + Union[bytes, str]: + - If wait=True: Video data as bytes + - If wait=False: Video generation ID for tracking + + Raises: + ValueError: + - If prompt is empty + - If duration or resolution is invalid + TypeError: + - If image type is not supported + TimeoutError: + - If generation exceeds max_polling_time (only when wait=True) + RuntimeError: + - If video generation fails + + Example: + >>> adapter = SoraVideoAdapter() + >>> + >>> # Animate a static image + >>> video_bytes = adapter.generate_image_to_video( + ... image="model_photo.jpg", + ... prompt="The model turns and smiles at the camera", + ... duration=4, + ... resolution="1280x720" + ... ) + >>> with open("animated_model.mp4", "wb") as f: + ... f.write(video_bytes) + """ + + if not prompt: + raise ValueError("Prompt is required for image-to-video generation.") + + # Get image dimensions to validate/auto-detect resolution + width, height = self.get_image_dimensions(image) + image_resolution = f"{width}x{height}" + + # Get model-specific supported resolutions + model_resolutions = MODEL_RESOLUTIONS.get(self.model_version, VALID_RESOLUTIONS) + + # Check if image dimensions match a supported resolution + if image_resolution in model_resolutions: + # Exact match - use image dimensions + if resolution == "1280x720": # Default resolution + resolution = image_resolution + print(f"✓ Using image resolution: {resolution}") + elif resolution != image_resolution: + # User specified different resolution than image + if auto_resize: + print(f"⚠ Image is {image_resolution}, resizing to requested {resolution}...") + image = self.resize_image_to_resolution(image, resolution) + else: + raise ValueError( + f"Image dimensions ({image_resolution}) don't match requested resolution ({resolution}). " + f"Set auto_resize=True to automatically resize, or resize your image manually." + ) + else: + # Image dimensions don't match any supported resolution + if auto_resize: + # Find closest supported resolution + target_resolution = self.get_closest_resolution(width, height) + print(f"⚠ Image is {image_resolution}, auto-resizing to closest supported resolution: {target_resolution}") + image = self.resize_image_to_resolution(image, target_resolution) + resolution = target_resolution + else: + raise ValueError( + f"Image dimensions {image_resolution} do not match any supported video resolution. " + f"Supported resolutions: {VALID_RESOLUTIONS}. " + f"Set auto_resize=True to automatically resize, or resize your image manually." 
+ ) + + self._validate_common_params(duration, resolution) + + # Prepare image (returns BytesIO file-like object) + image_buffer = self.prepare_image(image) + + # Start generation + video_id = self._start_video_generation( + prompt=prompt, + duration=duration, + resolution=resolution, + image=image_buffer + ) + + # Wait for completion or return ID + if wait: + return self._poll_video_status(video_id) + else: + return video_id + + + def get_video_status(self, video_id: str) -> Dict[str, Any]: + """ + Check the status of a video generation request. + + Args: + video_id: The ID returned from a video generation request + + Returns: + Dict containing status information: + - status: "queued", "in_progress", "completed", or "failed" + - progress: Optional progress percentage (0-100) + - url: Video URL (only when status is "completed") + - error: Error message (only when status is "failed") + + Example: + >>> video_id = adapter.generate_text_to_video(prompt="...", wait=False) + >>> status = adapter.get_video_status(video_id) + >>> print(f"Status: {status['status']}") + """ + response = self.client.videos.retrieve(video_id) + + result = { + "status": response.status, + "id": video_id, + } + + if hasattr(response, 'progress'): + result["progress"] = response.progress + + if response.status == "completed": + result["url"] = response.url + + if response.status == "failed": + result["error"] = getattr(response, 'error', 'Unknown error') + + return result + + + def download_video(self, video_id: str) -> bytes: + """ + Download a completed video by its ID. + + Args: + video_id: The ID of a completed video generation + + Returns: + Video bytes ready to save + + Raises: + RuntimeError: If video is not yet completed or has failed + + Example: + >>> video_id = adapter.generate_text_to_video(prompt="...", wait=False) + >>> # ... wait some time or check status ... + >>> video_bytes = adapter.download_video(video_id) + >>> with open("video.mp4", "wb") as f: + ... f.write(video_bytes) + """ + status = self.get_video_status(video_id) + + if status["status"] != "completed": + raise RuntimeError( + f"Video is not ready for download. Current status: {status['status']}" + ) + + # Download using the correct API + content = self.client.videos.download_content(video_id, variant="video") + return content.read() + + + def generate_text_to_video_async( + self, + prompt: str, + duration: int = 4, + resolution: str = "1280x720", + on_complete: Optional[Callable[[bytes], None]] = None, + on_error: Optional[Callable[[str], None]] = None, + on_progress: Optional[Callable[[Dict[str, Any]], None]] = None, + ) -> str: + """ + Generate a video asynchronously with callback functions. + + This method starts video generation and immediately returns the video ID. + It spawns a background thread to monitor progress and invoke callbacks. + + Args: + prompt: Text description of the video + duration: Video length in seconds + resolution: Output resolution + on_complete: Callback function called with video bytes when ready + on_error: Callback function called with error message if generation fails + on_progress: Callback function called with status dict during generation + + Returns: + Video generation ID for reference + + Example: + >>> def handle_complete(video_bytes): + ... with open("output.mp4", "wb") as f: + ... f.write(video_bytes) + ... print("Video saved!") + >>> + >>> def handle_error(error): + ... print(f"Error: {error}") + >>> + >>> def handle_progress(status): + ... 

    def generate_text_to_video_async(
        self,
        prompt: str,
        duration: int = 4,
        resolution: str = "1280x720",
        on_complete: Optional[Callable[[bytes], None]] = None,
        on_error: Optional[Callable[[str], None]] = None,
        on_progress: Optional[Callable[[Dict[str, Any]], None]] = None,
    ) -> str:
        """
        Generate a video asynchronously with callback functions.

        This method starts video generation and immediately returns the video ID.
        It spawns a background thread to monitor progress and invoke callbacks.

        Args:
            prompt: Text description of the video
            duration: Video length in seconds
            resolution: Output resolution
            on_complete: Callback invoked with the video bytes when ready
            on_error: Callback invoked with an error message if generation fails
            on_progress: Callback invoked with a status dict during generation

        Returns:
            Video generation ID for reference

        Example:
            >>> def handle_complete(video_bytes):
            ...     with open("output.mp4", "wb") as f:
            ...         f.write(video_bytes)
            ...     print("Video saved!")
            >>>
            >>> def handle_error(error):
            ...     print(f"Error: {error}")
            >>>
            >>> def handle_progress(status):
            ...     print(f"Progress: {status.get('progress', 'processing')}")
            >>>
            >>> video_id = adapter.generate_text_to_video_async(
            ...     prompt="A cinematic fashion show",
            ...     duration=8,
            ...     on_complete=handle_complete,
            ...     on_error=handle_error,
            ...     on_progress=handle_progress
            ... )
        """
        import threading

        if not prompt:
            raise ValueError("Prompt is required for text-to-video generation.")

        self._validate_common_params(duration, resolution)

        # Start generation
        video_id = self._start_video_generation(
            prompt=prompt,
            duration=duration,
            resolution=resolution,
            image=None
        )

        # Background monitoring thread
        def monitor():
            try:
                start_time = time.time()
                while True:
                    elapsed = time.time() - start_time
                    if elapsed > self.max_polling_time:
                        if on_error:
                            on_error(f"Timeout after {self.max_polling_time} seconds")
                        return

                    status = self.get_video_status(video_id)

                    if on_progress:
                        on_progress(status)

                    if status["status"] == "completed":
                        video_bytes = self.download_video(video_id)
                        if on_complete:
                            on_complete(video_bytes)
                        return

                    elif status["status"] == "failed":
                        if on_error:
                            on_error(status.get("error", "Unknown error"))
                        return

                    time.sleep(self.polling_interval)

            except Exception as e:
                if on_error:
                    on_error(str(e))

        thread = threading.Thread(target=monitor, daemon=True)
        thread.start()

        return video_id
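
    # --- Illustrative refactor (not part of the original adapter) --------------
    # The monitor closures in the two *_async methods above and below are
    # identical. A deduplicated sketch: one shared watcher that polls any
    # video_id on a daemon thread. The method name is an assumption.
    def _watch_generation(
        self,
        video_id: str,
        on_complete: Optional[Callable[[bytes], None]] = None,
        on_error: Optional[Callable[[str], None]] = None,
        on_progress: Optional[Callable[[Dict[str, Any]], None]] = None,
    ) -> None:
        """Sketch: poll video_id in the background and invoke callbacks."""
        import threading

        def monitor():
            try:
                start_time = time.time()
                while time.time() - start_time <= self.max_polling_time:
                    status = self.get_video_status(video_id)
                    if on_progress:
                        on_progress(status)
                    if status["status"] == "completed":
                        if on_complete:
                            on_complete(self.download_video(video_id))
                        return
                    if status["status"] == "failed":
                        if on_error:
                            on_error(status.get("error", "Unknown error"))
                        return
                    time.sleep(self.polling_interval)
                # Loop exhausted without a terminal state: report a timeout
                if on_error:
                    on_error(f"Timeout after {self.max_polling_time} seconds")
            except Exception as e:
                if on_error:
                    on_error(str(e))

        threading.Thread(target=monitor, daemon=True).start()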

    def generate_image_to_video_async(
        self,
        image: Union[str, io.BytesIO, Image.Image],
        prompt: str,
        duration: int = 4,
        resolution: str = "1280x720",
        on_complete: Optional[Callable[[bytes], None]] = None,
        on_error: Optional[Callable[[str], None]] = None,
        on_progress: Optional[Callable[[Dict[str, Any]], None]] = None,
        auto_resize: bool = True,
    ) -> str:
        """
        Generate a video from an image asynchronously with callback functions.

        This method starts image-to-video generation and immediately returns the
        video ID. It spawns a background thread to monitor progress and invoke
        callbacks.

        Args:
            image: Reference image to animate
            prompt: Text description of the animation
            duration: Video length in seconds
            resolution: Output resolution
            on_complete: Callback invoked with the video bytes when ready
            on_error: Callback invoked with an error message if generation fails
            on_progress: Callback invoked with a status dict during generation
            auto_resize: If True, automatically resize the image to a supported
                resolution on mismatch; if False, raise ValueError instead

        Returns:
            Video generation ID for reference

        Example:
            >>> adapter = SoraVideoAdapter()
            >>>
            >>> def save_video(video_bytes):
            ...     with open("animated.mp4", "wb") as f:
            ...         f.write(video_bytes)
            >>>
            >>> video_id = adapter.generate_image_to_video_async(
            ...     image="photo.jpg",
            ...     prompt="The person waves and smiles",
            ...     duration=4,
            ...     on_complete=save_video
            ... )
        """
        import threading

        if not prompt:
            raise ValueError("Prompt is required for image-to-video generation.")

        # Resolution handling below mirrors generate_image_to_video
        width, height = self.get_image_dimensions(image)
        image_resolution = f"{width}x{height}"

        # Get model-specific supported resolutions
        model_resolutions = MODEL_RESOLUTIONS.get(self.model_version, VALID_RESOLUTIONS)

        # Check if image dimensions match a supported resolution
        if image_resolution in model_resolutions:
            # Exact match: use the image's dimensions. As in the synchronous
            # method, an explicit "1280x720" request is indistinguishable from
            # the default, so the image resolution takes precedence.
            if resolution == "1280x720":  # Default resolution
                resolution = image_resolution
                print(f"✓ Using image resolution: {resolution}")
            elif resolution != image_resolution:
                # User specified a different resolution than the image
                if auto_resize:
                    print(f"⚠ Image is {image_resolution}, resizing to requested {resolution}...")
                    image = self.resize_image_to_resolution(image, resolution)
                else:
                    raise ValueError(
                        f"Image dimensions ({image_resolution}) don't match requested resolution ({resolution}). "
                        f"Set auto_resize=True to automatically resize, or resize your image manually."
                    )
        else:
            # Image dimensions don't match any supported resolution
            if auto_resize:
                # Find the closest supported resolution
                target_resolution = self.get_closest_resolution(width, height)
                print(f"⚠ Image is {image_resolution}, auto-resizing to closest supported resolution: {target_resolution}")
                image = self.resize_image_to_resolution(image, target_resolution)
                resolution = target_resolution
            else:
                raise ValueError(
                    f"Image dimensions {image_resolution} do not match any supported video resolution. "
                    f"Supported resolutions for {self.model_version}: {sorted(model_resolutions)}. "
                    f"Set auto_resize=True to automatically resize, or resize your image manually."
                )

        self._validate_common_params(duration, resolution)

        # Prepare image (returns a BytesIO file-like object)
        image_buffer = self.prepare_image(image)

        # Start generation
        video_id = self._start_video_generation(
            prompt=prompt,
            duration=duration,
            resolution=resolution,
            image=image_buffer
        )

        # Background monitoring thread (same logic as text-to-video;
        # compare the deduplication sketch above)
        def monitor():
            try:
                start_time = time.time()
                while True:
                    elapsed = time.time() - start_time
                    if elapsed > self.max_polling_time:
                        if on_error:
                            on_error(f"Timeout after {self.max_polling_time} seconds")
                        return

                    status = self.get_video_status(video_id)

                    if on_progress:
                        on_progress(status)

                    if status["status"] == "completed":
                        video_bytes = self.download_video(video_id)
                        if on_complete:
                            on_complete(video_bytes)
                        return

                    elif status["status"] == "failed":
                        if on_error:
                            on_error(status.get("error", "Unknown error"))
                        return

                    time.sleep(self.polling_interval)

            except Exception as e:
                if on_error:
                    on_error(str(e))

        thread = threading.Thread(target=monitor, daemon=True)
        thread.start()

        return video_id
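
# --- Illustrative usage (not part of the original module) -----------------------
# Minimal smoke-test sketch. Assumes OPENAI_API_KEY is configured in the
# environment (e.g. via the .env setup described in the README); the prompt and
# output filename are placeholders.
if __name__ == "__main__":
    adapter = SoraVideoAdapter()
    clip = adapter.generate_text_to_video(
        prompt="A fashion model walking down a runway",
        duration=4,
        resolution="1280x720",
    )
    with open("demo.mp4", "wb") as f:
        f.write(clip)
    print(f"Saved demo.mp4 ({len(clip)} bytes)")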