From 2304a737b3f02dfa303ed45b0a58ddaa5a637c85 Mon Sep 17 00:00:00 2001 From: Kailash Date: Wed, 24 Dec 2025 16:08:48 +0530 Subject: [PATCH] Model swap agent added --- README.md | 272 ++++++++++++- docs/docs/agents/model-swap-agent.md | 453 +++++++++++++++++++++ docs/sidebars.ts | 1 + model_swap_agent.py | 328 +++++++++++++++ tryon/agents/__init__.py | 2 + tryon/agents/model_swap/__init__.py | 15 + tryon/agents/model_swap/agent.py | 537 +++++++++++++++++++++++++ tryon/agents/model_swap/tools.py | 580 +++++++++++++++++++++++++++ 8 files changed, 2185 insertions(+), 3 deletions(-) create mode 100644 docs/docs/agents/model-swap-agent.md create mode 100644 model_swap_agent.py create mode 100644 tryon/agents/model_swap/__init__.py create mode 100644 tryon/agents/model_swap/agent.py create mode 100644 tryon/agents/model_swap/tools.py diff --git a/README.md b/README.md index dc92873..239e3c6 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,9 @@ OpenTryOn is an open-source AI toolkit designed for fashion technology and virtu - **Model Swap**: Swap garments on different models - **Interactive Demos**: Gradio-based web interfaces for all features - **Preprocessing Pipeline**: Complete preprocessing pipeline for training and inference -- **AI Agents**: LangChain-based agents for intelligent virtual try-on operations +- **AI Agents**: + - Virtual Try-On Agent: LangChain-based agent for intelligent virtual try-on operations + - Model Swap Agent: AI agent for replacing models while preserving outfits using multiple AI models (Nano Banana, Nano Banana Pro, FLUX 2 Pro, FLUX 2 Flex) ## 📋 Table of Contents @@ -56,6 +58,7 @@ OpenTryOn is an open-source AI toolkit designed for fashion technology and virtu - [Virtual Try-On with Kling AI](#virtual-try-on-with-kling-ai) - [Virtual Try-On with Segmind](#virtual-try-on-with-segmind) - [Virtual Try-On Agent](#virtual-try-on-agent) + - [Model Swap Agent](#model-swap-agent) - [Image Generation with Nano Banana](#image-generation-with-nano-banana) - [Image Generation with FLUX.2](#image-generation-with-flux2) - [Image Generation with Luma AI](#luma-ai-image-generation) @@ -795,13 +798,276 @@ else: - **Amazon Nova Canvas**: AWS Bedrock-based virtual try-on with automatic garment detection - **Segmind**: Fast and efficient virtual try-on generation +--- + +### Model Swap Agent + +A LangChain-based AI agent that intelligently replaces models/people in images while preserving outfits and styling. Perfect for e-commerce sellers and fashion brands to create professional product imagery with diverse models. + +#### Overview + +The Model Swap Agent: +- **Extracts person attributes** from natural language prompts (gender, age, ethnicity, body type, pose) +- **Generates professional model-swapped images** while preserving exact outfit details +- **Supports multiple AI models**: Nano Banana, Nano Banana Pro (default), FLUX 2 Pro, and FLUX 2 Flex +- **Maintains high-quality photography** with up to 4K resolution support + +#### Prerequisites + +1. **LangChain Installation**: + ```bash + pip install langchain langchain-openai langchain-anthropic langchain-google-genai + ``` + +2. 
**API Keys Required**: + ```bash + # For Nano Banana models (Gemini API key) + export GEMINI_API_KEY="your_gemini_api_key" + + # For FLUX 2 models (BFL API key) + export BFL_API_KEY="your_bfl_api_key" + + # LLM provider (choose one) + export OPENAI_API_KEY="your_openai_api_key" # Default + export ANTHROPIC_API_KEY="your_anthropic_api_key" + export GOOGLE_API_KEY="your_google_api_key" + ``` + +#### Command Line Usage + +```bash +# Basic usage - replace with professional male model (uses Nano Banana Pro by default) +python model_swap_agent.py \ + --image model.jpg \ + --prompt "Replace with a professional male model in his 30s, athletic build" + +# Use FLUX 2 Pro for high-quality results +python model_swap_agent.py \ + --image model.jpg \ + --prompt "Replace with a professional female model" \ + --model flux2_pro + +# Use FLUX 2 Flex for advanced control +python model_swap_agent.py \ + --image model.jpg \ + --prompt "Replace with an athletic Asian model" \ + --model flux2_flex + +# Use Nano Banana for fast generation +python model_swap_agent.py \ + --image model.jpg \ + --prompt "Replace with a professional model" \ + --model nano_banana + +# Specify detailed attributes with specific model +python model_swap_agent.py \ + --image outfit.jpg \ + --prompt "Asian female model, mid-20s, athletic, confident pose" \ + --model nano_banana_pro \ + --resolution 4K + +# Use Google Search grounding for style references (Nano Banana Pro only) +python model_swap_agent.py \ + --image model.jpg \ + --prompt "Model like professional fashion runway" \ + --model nano_banana_pro \ + --search-grounding + +# Use different LLM provider +python model_swap_agent.py \ + --image model.jpg \ + --prompt "Plus-size woman, African American, 40s, friendly" \ + --llm-provider anthropic \ + --model flux2_pro + +# Use URLs instead of file paths +python model_swap_agent.py \ + --image https://example.com/model.jpg \ + --prompt "Professional female model in her 30s" \ + --model flux2_pro + +# Verbose output to see agent reasoning +python model_swap_agent.py \ + --image model.jpg \ + --prompt "Male model in 30s" \ + --verbose +``` + +#### Python API Usage + +```python +from tryon.agents.model_swap import ModelSwapAgent + +# Initialize the agent with default Nano Banana Pro +agent = ModelSwapAgent(llm_provider="openai") + +# Generate model swap +result = agent.generate( + image="model_wearing_outfit.jpg", + prompt="Replace with a professional Asian female model in her 30s, athletic build, confident pose", + resolution="4K", # Only for Nano Banana Pro + verbose=True +) + +# Handle results +if result["status"] == "success": + images = result['images'] # List of PIL Images + for idx, image in enumerate(images): + image.save(f"result_{idx}.png") + print(f"Generated {len(images)} images using {result['provider']}") +else: + print(f"Error: {result.get('error')}") + +# Using different models +# FLUX 2 Pro +agent = ModelSwapAgent(llm_provider="openai", model="flux2_pro") +result = agent.generate( + image="model.jpg", + prompt="Replace with a professional male model in his 30s" +) + +# FLUX 2 Flex +agent = ModelSwapAgent(llm_provider="openai", model="flux2_flex") +result = agent.generate( + image="model.jpg", + prompt="Replace with a professional female model" +) + +# Nano Banana (fast) +agent = ModelSwapAgent(llm_provider="openai", model="nano_banana") +result = agent.generate( + image="model.jpg", + prompt="Replace with a professional model" +) +``` + +#### Example Prompts + +**Basic Descriptions:** +```python +"Professional male model 
in his 30s" +"Female model, mid-20s, athletic build" +"Plus-size woman, friendly expression" +``` + +**Detailed Descriptions:** +```python +"Professional Asian female model in her early 30s, athletic build, +confident posture, sharp features, editorial style photography" + +"Athletic male model, African American, late 20s, muscular build, +casual confident pose, commercial photography style" + +"Plus-size woman, Caucasian, 40s, warm friendly expression, +lifestyle photography, natural lighting" +``` + +**Style References:** +```python +"Professional fashion runway model style" +"Commercial lifestyle photography model" +"Editorial high-fashion model aesthetic" +``` + +#### Model Options + +- **Nano Banana**: Fast generation at 1024px resolution, ideal for quick iterations +- **Nano Banana Pro** (default): High-quality up to 4K resolution with search grounding support +- **FLUX 2 Pro**: Professional quality with custom width/height control +- **FLUX 2 Flex**: Advanced controls (guidance scale, steps) for fine-tuned generation + +#### Resolution Options (Nano Banana Pro) + +- **1K (1024px)**: Draft quality, fast generation, testing +- **2K (2048px)**: High-quality, good for web use +- **4K (4096px)**: Professional e-commerce quality (default, recommended) + +#### Advanced Features + +**Search Grounding:** +```python +result = agent.generate( + image="model.jpg", + prompt="Professional fashion runway model", + use_search_grounding=True # Enables Google Search for style references +) +``` + +**Multi-LLM Support:** +```python +# OpenAI GPT (default) +agent = ModelSwapAgent(llm_provider="openai", llm_model="gpt-4") + +# Anthropic Claude +agent = ModelSwapAgent(llm_provider="anthropic", llm_model="claude-3-opus-20240229") + +# Google Gemini +agent = ModelSwapAgent(llm_provider="google", llm_model="gemini-2.5-pro") +``` + +#### Use Cases + +- **E-commerce Sellers**: Create professional product photos with diverse models +- **Fashion Brands**: Showcase clothing on different body types and demographics +- **Clothing Brands**: Generate consistent product imagery across model portfolios +- **Product Photography**: Maintain styling and composition while varying models + +#### How It Works + +1. **Prompt Analysis**: LLM agent extracts person attributes (gender, age, ethnicity, body type, pose, styling) +2. **Prompt Construction**: Agent builds detailed, professional prompt emphasizing outfit preservation +3. **Model Selection**: Uses the specified model (or default Nano Banana Pro) to generate images +4. **Image Generation**: Selected model generates images with perfect outfit preservation (up to 4K with Nano Banana Pro) + +#### Complete Example + +```python +from tryon.agents.model_swap import ModelSwapAgent + +# Initialize agent +agent = ModelSwapAgent( + llm_provider="openai", + llm_model="gpt-4" +) + +# Generate model swap with detailed prompt +result = agent.generate( + image="original_model.jpg", + prompt=( + "Professional Asian female model in her early 30s, " + "athletic build, confident posture, sharp features, " + "editorial style photography" + ), + resolution="4K", + use_search_grounding=False, + verbose=True +) + +# Save results +if result["status"] == "success": + for idx, image in enumerate(result['images']): + image.save(f"swapped_model_{idx}.png") + print(f"Model swap complete! Generated {len(result['images'])} images") + print(f"Model description: {result['model_description']}") +else: + print(f"Error: {result['error']}") +``` + +#### Best Practices + +1. 
**Be Specific**: Include age, gender, ethnicity, body type in prompts +2. **Describe Pose**: Mention confident, casual, professional, etc. +3. **Mention Style**: Editorial, commercial, lifestyle photography +4. **Use 4K Resolution**: For professional e-commerce quality +5. **Trust the Agent**: Outfit preservation is automatic + #### Documentation For complete documentation, API reference, architecture details, and advanced usage examples, see: -📚 **[Virtual Try-On Agent Documentation →](https://tryonlabs.github.io/opentryon/docs/agents/vton-agent)** +📚 **[Model Swap Agent Documentation →](https://tryonlabs.github.io/opentryon/docs/agents/model-swap-agent)** -**Reference**: [Virtual Try-On Agent Documentation](https://tryonlabs.github.io/opentryon/docs/agents/vton-agent) +**Reference**: [Model Swap Agent Documentation](https://tryonlabs.github.io/opentryon/docs/agents/model-swap-agent) ### Image Generation with Nano Banana diff --git a/docs/docs/agents/model-swap-agent.md b/docs/docs/agents/model-swap-agent.md new file mode 100644 index 0000000..fa4ff30 --- /dev/null +++ b/docs/docs/agents/model-swap-agent.md @@ -0,0 +1,453 @@ +--- +title: Model Swap Agent +description: LangChain-based agent that intelligently replaces models/people in images while preserving outfits and styling using multiple AI models. +keywords: + - model swap agent + - langchain agent + - nano banana + - flux2 + - AI agent + - model replacement + - outfit preservation +--- + +# Model Swap Agent + +A LangChain-based AI agent that intelligently replaces models/people in images while perfectly preserving outfits and styling. Perfect for e-commerce sellers and fashion brands to create professional product imagery with diverse models. + +## Overview + +The Model Swap Agent uses LangChain to analyze natural language prompts and extract detailed person attributes, then generates professional model-swapped images while maintaining the exact outfit, clothing details, patterns, and styling. + +### Supported Models + +The agent supports multiple AI models for model swapping: + +- **Nano Banana** (Gemini 2.5 Flash Image): Fast generation at 1024px resolution, ideal for quick iterations +- **Nano Banana Pro** (Gemini 3 Pro Image Preview): High-quality 4K resolution with search grounding support (default) +- **FLUX 2 Pro**: Professional quality with custom width/height control +- **FLUX 2 Flex**: Advanced controls (guidance scale, steps) for fine-tuned generation + +## Features + +- **Intelligent Attribute Extraction**: Automatically extracts gender, age, ethnicity, body type, pose, and styling from natural language prompts +- **Perfect Outfit Preservation**: Maintains exact clothing details, colors, patterns, textures, and fit +- **Multi-Model Support**: Choose from 4 different AI models based on your needs +- **High-Resolution Output**: Up to 4K resolution for professional e-commerce quality +- **Natural Language Interface**: Simple prompts like "Replace with a professional Asian female model in her 30s" +- **Multiple LLM Support**: Works with OpenAI, Anthropic Claude, and Google Gemini +- **Professional Quality**: Maintains lighting, background, composition, and photography standards + +## Installation + +```bash +pip install langchain langchain-openai langchain-anthropic langchain-google-genai +``` + +**Note**: This agent uses LangChain 1.x API (`create_agent`). See [LangChain 1.x documentation](https://docs.langchain.com/oss/python/langchain/agents) for details. 
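+
+Before running the Quick Start below, make sure the relevant API keys are exported (see the Environment Variables section further down this page). A minimal pre-flight check is sketched here; it assumes the default OpenAI LLM provider and the default Nano Banana Pro image model, and uses only the environment variable names referenced elsewhere on this page:
+
+```python
+import os
+
+# Keys assumed on this page: OPENAI_API_KEY for the agent LLM and
+# GEMINI_API_KEY for the Nano Banana models. Use BFL_API_KEY instead of
+# GEMINI_API_KEY if you plan to generate with the FLUX 2 models.
+required = ["OPENAI_API_KEY", "GEMINI_API_KEY"]
+missing = [name for name in required if not os.getenv(name)]
+if missing:
+    raise SystemExit(f"Missing environment variables: {', '.join(missing)}")
+```
+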
+ +## Quick Start + +```python +from tryon.agents.model_swap import ModelSwapAgent + +# Initialize agent with default Nano Banana Pro +agent = ModelSwapAgent(llm_provider="openai") + +# Generate model swap +result = agent.generate( + image="person_wearing_outfit.jpg", + prompt="Replace with a professional Asian female model in her 30s, athletic build" +) + +if result["status"] == "success": + for idx, image in enumerate(result['images']): + image.save(f"swapped_model_{idx}.png") +``` + +## Usage + +### Command Line Interface + +The Model Swap Agent includes a command-line interface for easy usage: + +```bash +# Basic usage with default Nano Banana Pro +python model_swap_agent.py \ + --image model.jpg \ + --prompt "Replace with a professional male model in his 30s, athletic build" + +# Use FLUX 2 Pro for high-quality results +python model_swap_agent.py \ + --image model.jpg \ + --prompt "Replace with a professional female model" \ + --model flux2_pro + +# Use FLUX 2 Flex for advanced control +python model_swap_agent.py \ + --image model.jpg \ + --prompt "Replace with an athletic Asian model" \ + --model flux2_flex + +# Use Nano Banana for fast generation +python model_swap_agent.py \ + --image model.jpg \ + --prompt "Replace with a professional model" \ + --model nano_banana + +# Specify resolution for Nano Banana Pro +python model_swap_agent.py \ + --image model.jpg \ + --prompt "Male model in his 30s" \ + --model nano_banana_pro \ + --resolution 2K + +# Use Google Search grounding (Nano Banana Pro only) +python model_swap_agent.py \ + --image model.jpg \ + --prompt "Model like professional fashion runway" \ + --model nano_banana_pro \ + --search-grounding + +# Use different LLM provider +python model_swap_agent.py \ + --image model.jpg \ + --prompt "Plus-size woman, African American, 40s" \ + --llm-provider anthropic \ + --model flux2_pro + +# Use URLs instead of file paths +python model_swap_agent.py \ + --image https://example.com/model.jpg \ + --prompt "Female model, 30s, professional" \ + --model flux2_pro + +# Verbose output to see agent reasoning +python model_swap_agent.py \ + --image model.jpg \ + --prompt "Male model in 30s" \ + --verbose +``` + +#### CLI Arguments + +- `--image`, `-i`: Path or URL to image of person wearing outfit (required) +- `--prompt`: Description of desired model/person (required) +- `--model`: Model to use for swapping (default: `nano_banana_pro`, options: `nano_banana`, `nano_banana_pro`, `flux2_pro`, `flux2_flex`) +- `--resolution`: Output resolution for Nano Banana Pro (default: `4K`, options: `1K`, `2K`, `4K`) +- `--search-grounding`: Use Google Search grounding for real-world references (Nano Banana Pro only) +- `--llm-provider`: LLM provider to use (default: `openai`, options: `openai`, `anthropic`, `google`) +- `--llm-model`: Specific LLM model name (optional, uses default for provider) +- `--llm-temperature`: Temperature for LLM (default: `0.0`) +- `--llm-api-key`: API key for LLM provider (optional, can use environment variables) +- `--output-dir`, `-o`: Directory to save generated images (default: `outputs/`) +- `--save-base64`: Also save Base64 encoded strings to .txt files +- `--verbose`: Print verbose output including agent reasoning steps + +### Python API Usage + +#### Basic Usage + +```python +from tryon.agents.model_swap import ModelSwapAgent + +# Initialize agent with default Nano Banana Pro +agent = ModelSwapAgent(llm_provider="openai") + +# Generate model swap +result = agent.generate( + image="person_wearing_outfit.jpg", + 
prompt="Replace with a professional male model in his 30s, athletic build" +) + +if result["status"] == "success": + images = result['images'] + for idx, image in enumerate(images): + image.save(f"result_{idx}.png") +``` + +#### Using Different Models + +```python +# Nano Banana (fast, 1024px) +agent = ModelSwapAgent(llm_provider="openai", model="nano_banana") +result = agent.generate( + image="model.jpg", + prompt="Replace with a professional model" +) + +# Nano Banana Pro (4K, default) +agent = ModelSwapAgent(llm_provider="openai", model="nano_banana_pro") +result = agent.generate( + image="model.jpg", + prompt="Replace with a professional model", + resolution="4K", + use_search_grounding=False +) + +# FLUX 2 Pro (high quality) +agent = ModelSwapAgent(llm_provider="openai", model="flux2_pro") +result = agent.generate( + image="model.jpg", + prompt="Replace with a professional model" +) + +# FLUX 2 Flex (advanced controls) +agent = ModelSwapAgent(llm_provider="openai", model="flux2_flex") +result = agent.generate( + image="model.jpg", + prompt="Replace with a professional model" +) +``` + +#### Using Different LLM Providers + +```python +# OpenAI (default) +agent = ModelSwapAgent(llm_provider="openai", llm_model="gpt-4") + +# Anthropic Claude +agent = ModelSwapAgent(llm_provider="anthropic", llm_model="claude-3-opus-20240229") + +# Google Gemini +agent = ModelSwapAgent(llm_provider="google", llm_model="gemini-2.5-pro") +``` + +### Attribute Extraction + +The agent automatically extracts the following attributes from your prompts: + +- **Gender**: Male, female, non-binary, or unspecified +- **Age Range**: Teens, 20s, 30s, 40s, 50s+ +- **Ethnicity/Appearance**: Asian, African, Caucasian, Hispanic, Middle Eastern, mixed, diverse, or unspecified +- **Body Type**: Slim, athletic, average, curvy, plus-size, muscular +- **Facial Features**: Sharp features, soft features, distinctive characteristics +- **Pose/Expression**: Confident, casual, professional, friendly, serious, natural +- **Styling Preferences**: Professional, casual, editorial, commercial, lifestyle + +### Example Prompts + +**Basic Descriptions:** +```python +"Professional male model in his 30s" +"Female model, mid-20s, athletic build" +"Plus-size woman, friendly expression" +``` + +**Detailed Descriptions:** +```python +"Professional Asian female model in her early 30s, athletic build, +confident posture, sharp features, editorial style photography" + +"Athletic male model, African American, late 20s, muscular build, +casual confident pose, commercial photography style" + +"Plus-size woman, Caucasian, 40s, warm friendly expression, +lifestyle photography, natural lighting" +``` + +**Style References:** +```python +"Professional fashion runway model style" +"Commercial lifestyle photography model" +"Editorial high-fashion model aesthetic" +``` + +## Model Comparison + +| Model | Resolution | Speed | Quality | Best For | +|-------|-----------|-------|---------|----------| +| **Nano Banana** | 1024px | Fast ⚡ | Good | Quick iterations, testing | +| **Nano Banana Pro** | Up to 4K | Medium | Excellent | Professional e-commerce (default) | +| **FLUX 2 Pro** | Custom | Medium | High | Professional quality with custom dimensions | +| **FLUX 2 Flex** | Custom | Slower | Highest | Fine-tuned control, advanced parameters | + +## Resolution Options (Nano Banana Pro) + +- **1K (1024px)**: Draft quality, fast generation, testing +- **2K (2048px)**: High-quality, good for web use +- **4K (4096px)**: Professional e-commerce quality (default, 
recommended) + +## Environment Variables + +Set the following environment variables for API keys: + +```bash +# For LLM providers (choose one) +export OPENAI_API_KEY="your-openai-api-key" +# OR +export ANTHROPIC_API_KEY="your-anthropic-api-key" +# OR +export GOOGLE_API_KEY="your-google-api-key" + +# For model swapping APIs +export GEMINI_API_KEY="your-gemini-api-key" # For Nano Banana models +export BFL_API_KEY="your-bfl-api-key" # For FLUX 2 models +``` + +## API Reference + +### ModelSwapAgent + +#### `__init__(llm_provider, llm_model=None, temperature=0.0, api_key=None, model=None, **llm_kwargs)` + +Initialize the Model Swap Agent. + +**Parameters:** +- `llm_provider` (str): LLM provider to use. Options: "openai", "anthropic", "google" +- `llm_model` (str, optional): Specific model name. If None, uses default for provider +- `temperature` (float): Temperature for LLM (default: 0.0) +- `api_key` (str, optional): API key for LLM provider +- `model` (str, optional): Model to use for swapping. Options: "nano_banana", "nano_banana_pro", "flux2_pro", "flux2_flex". Default: "nano_banana_pro" +- `**llm_kwargs`: Additional keyword arguments for LLM initialization + +#### `generate(image, prompt, resolution=None, use_search_grounding=False, verbose=False, **kwargs)` + +Generate model-swapped images using the agent. + +**Parameters:** +- `image` (str): Path or URL to the image of person wearing the outfit +- `prompt` (str): Natural language description of desired model/person +- `resolution` (str, optional): Resolution for Nano Banana Pro. Options: "1K", "2K", "4K" (default: "4K") +- `use_search_grounding` (bool): Whether to use Google Search grounding for real-world references (Nano Banana Pro only) +- `verbose` (bool): If True, print debug information about agent reasoning +- `**kwargs`: Additional parameters to pass to the agent + +**Returns:** +- Dictionary containing: + - `status`: "success" or "error" + - `provider`: Model provider used (e.g., "nano_banana_pro", "flux2_pro") + - `images`: List of generated images (PIL Images or base64 strings) + - `model_description`: The detailed prompt used for generation + - `result`: Full agent response + - `error`: Error message (if status is "error") + +## Examples + +### Example 1: Basic Model Swap + +```python +from tryon.agents.model_swap import ModelSwapAgent + +agent = ModelSwapAgent(llm_provider="openai") + +result = agent.generate( + image="model_wearing_outfit.jpg", + prompt="Replace with a professional Asian female model in her 30s, athletic build" +) + +if result["status"] == "success": + for idx, image in enumerate(result['images']): + image.save(f"swapped_model_{idx}.png") + print(f"Generated {len(result['images'])} images") +``` + +### Example 2: High-Quality with FLUX 2 Pro + +```python +agent = ModelSwapAgent( + llm_provider="openai", + model="flux2_pro" +) + +result = agent.generate( + image="model.jpg", + prompt="Professional male model in his 30s, athletic build, confident pose" +) + +if result["status"] == "success": + result['images'][0].save("high_quality_swap.png") +``` + +### Example 3: 4K Professional Quality + +```python +agent = ModelSwapAgent( + llm_provider="openai", + model="nano_banana_pro" +) + +result = agent.generate( + image="outfit.jpg", + prompt=( + "Professional Asian female model in her early 30s, " + "athletic build, confident posture, sharp features, " + "editorial style photography" + ), + resolution="4K", + use_search_grounding=False, + verbose=True +) + +if result["status"] == "success": + for idx, image in 
enumerate(result['images']): + image.save(f"professional_4k_{idx}.png") + print(f"Model description: {result['model_description']}") +``` + +### Example 4: Advanced Control with FLUX 2 Flex + +```python +agent = ModelSwapAgent( + llm_provider="anthropic", + model="flux2_flex" +) + +result = agent.generate( + image="model.jpg", + prompt="Plus-size woman, African American, 40s, friendly expression" +) + +if result["status"] == "success": + result['images'][0].save("flex_result.png") +``` + +## How It Works + +1. **Prompt Analysis**: LLM agent extracts person attributes (gender, age, ethnicity, body type, pose, styling) from natural language prompts +2. **Prompt Construction**: Agent builds detailed, professional prompt emphasizing outfit preservation and maintaining original lighting, background, and composition +3. **Model Selection**: Uses the specified model (or default Nano Banana Pro) to generate images +4. **Image Generation**: Selected model generates up to 4K resolution images with perfect outfit preservation + +## Best Practices + +1. **Be Specific**: Include age, gender, ethnicity, body type in prompts for better results +2. **Describe Pose**: Mention confident, casual, professional, etc. to guide the generation +3. **Mention Style**: Editorial, commercial, lifestyle photography helps set the tone +4. **Use 4K Resolution**: For professional e-commerce quality (Nano Banana Pro) +5. **Trust the Agent**: Outfit preservation is automatic - focus on describing the desired model +6. **Choose the Right Model**: + - Use Nano Banana for quick iterations + - Use Nano Banana Pro for professional 4K quality (default) + - Use FLUX 2 Pro for custom dimensions + - Use FLUX 2 Flex for advanced control + +## Use Cases + +- **E-commerce Sellers**: Create professional product photos with diverse models +- **Fashion Brands**: Showcase clothing on different body types and demographics +- **Clothing Brands**: Generate consistent product imagery across model portfolios +- **Product Photography**: Maintain styling and composition while varying models + +## Limitations + +- Resolution options (1K, 2K, 4K) are only available for Nano Banana Pro +- Search grounding is only available for Nano Banana Pro +- FLUX models don't support resolution presets (use custom width/height via API) +- Agent output parsing may need refinement for complex scenarios + +## Future Enhancements + +- Add support for more image generation models +- Improve prompt understanding for better attribute extraction +- Add support for batch processing +- Implement result caching +- Add support for video model swapping + +## Related Documentation + +- [Agent Ideas](./agent-ideas.md) - Overview of Fashion AI Agents ecosystem +- [Virtual Try-On Agent](./vton-agent.md) - Virtual try-on agent documentation +- [API Reference - Nano Banana](../api-reference/nano-banana.md) - Nano Banana adapter documentation +- [API Reference - FLUX 2](../api-reference/flux2.md) - FLUX 2 adapter documentation + diff --git a/docs/sidebars.ts b/docs/sidebars.ts index bb30f62..50115c7 100644 --- a/docs/sidebars.ts +++ b/docs/sidebars.ts @@ -102,6 +102,7 @@ const sidebars: SidebarsConfig = { 'agents/agent-ideas-summary', 'agents/agent-ideas', 'agents/vton-agent', + 'agents/model-swap-agent', ], }, { diff --git a/model_swap_agent.py b/model_swap_agent.py new file mode 100644 index 0000000..9afbf44 --- /dev/null +++ b/model_swap_agent.py @@ -0,0 +1,328 @@ +from dotenv import load_dotenv +load_dotenv() + +import os +import json +import argparse +import base64 +import io 
+import requests +from pathlib import Path +from PIL import Image +from tryon.agents.model_swap import ModelSwapAgent + +def main(): + parser = argparse.ArgumentParser( + description="Swap models in images using AI agent while preserving outfit and styling", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Basic usage - replace with professional male model (uses Nano Banana Pro by default) + python model_swap_agent.py --image model.jpg --prompt "Replace with a professional male model in his 30s, athletic build" + + # Use FLUX 2 Pro for high-quality results + python model_swap_agent.py --image model.jpg --prompt "Replace with a professional female model" --model flux2_pro + + # Use FLUX 2 Flex for advanced control + python model_swap_agent.py --image model.jpg --prompt "Replace with an athletic Asian model" --model flux2_flex + + # Use Nano Banana for fast generation + python model_swap_agent.py --image model.jpg --prompt "Replace with a professional model" --model nano_banana + + # Specify detailed attributes with specific model + python model_swap_agent.py --image outfit.jpg --prompt "Asian female model, mid-20s, athletic, confident pose" --model nano_banana_pro + + # Use different LLM provider + python model_swap_agent.py --image model.jpg --prompt "Plus-size woman, African American, 40s" --llm-provider anthropic --model flux2_pro + + # Specify resolution (for Nano Banana Pro) + python model_swap_agent.py --image model.jpg --prompt "Male model in his 30s" --resolution 2K --model nano_banana_pro + + # Use Google Search grounding for real-world references (Nano Banana Pro only) + python model_swap_agent.py --image model.jpg --prompt "Model like professional fashion runway" --search-grounding --model nano_banana_pro + + # Use URLs instead of file paths + python model_swap_agent.py --image https://example.com/model.jpg --prompt "Female model, 30s, professional" --model flux2_pro + +Use Cases: + - E-commerce sellers creating professional product imagery + - Fashion brands showcasing clothing on diverse models + - Clothing brands generating model portfolios + - Product photography with consistent styling across models + """ + ) + + # Required arguments + parser.add_argument( + '-i', '--image', + type=str, + required=True, + help='Path or URL to image of person wearing outfit (outfit will be preserved)' + ) + + parser.add_argument( + '--prompt', + type=str, + required=True, + help='Description of desired model/person. Examples: "Professional male model in 30s, athletic", "Asian female, mid-20s, confident pose", "Plus-size woman, friendly expression"' + ) + + # Model swap parameters + parser.add_argument( + '--model', + type=str, + default='nano_banana_pro', + choices=['nano_banana', 'nano_banana_pro', 'flux2_pro', 'flux2_flex'], + help='Model to use for model swapping. Options: nano_banana (fast, 1024px), nano_banana_pro (4K, default), flux2_pro (high-quality), flux2_flex (advanced controls)' + ) + + parser.add_argument( + '--resolution', + type=str, + default='4K', + choices=['1K', '2K', '4K'], + help='Output image resolution (Nano Banana Pro only). 
Options: 1K (draft), 2K (high-quality), 4K (professional, default)' + ) + + parser.add_argument( + '--search-grounding', + action='store_true', + help='Use Google Search grounding for real-world reference images (Nano Banana Pro only, useful for specific style references)' + ) + + # LLM configuration + parser.add_argument( + '--llm-provider', + type=str, + default='openai', + choices=['openai', 'anthropic', 'google'], + help='LLM provider to use for the agent. Options: openai (default), anthropic, google' + ) + + parser.add_argument( + '--llm-model', + type=str, + default=None, + help='Specific LLM model to use (e.g., "gpt-4-turbo-preview", "claude-3-opus-20240229", "gemini-pro"). If not specified, uses default model for the provider' + ) + + parser.add_argument( + '--llm-temperature', + type=float, + default=0.0, + help='Temperature for LLM (default: 0.0 for deterministic behavior). Range: 0.0-2.0' + ) + + parser.add_argument( + '--llm-api-key', + type=str, + default=None, + help='API key for LLM provider (if not set via environment variable). For OpenAI: OPENAI_API_KEY, Anthropic: ANTHROPIC_API_KEY, Google: GOOGLE_API_KEY' + ) + + # Output configuration + parser.add_argument( + '-o', '--output-dir', + type=str, + default='outputs', + help='Directory to save generated images. Default: outputs/' + ) + + parser.add_argument( + '--save-base64', + action='store_true', + help='Also save Base64 encoded strings to .txt files (in addition to PNG images)' + ) + + parser.add_argument( + '--verbose', + action='store_true', + help='Print verbose output including agent reasoning steps' + ) + + args = parser.parse_args() + + # Validate file path (if it's a local file, not URL) + if not args.image.startswith(('http://', 'https://')): + if not os.path.exists(args.image): + raise FileNotFoundError(f"Image not found: {args.image}") + + # Create output directory if it doesn't exist + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Initialize agent + print(f"Initializing Model Swap Agent...") + print(f" Model: {args.model}") + print(f" LLM Provider: {args.llm_provider}") + if args.llm_model: + print(f" LLM Model: {args.llm_model}") + print(f" Temperature: {args.llm_temperature}") + if args.model == 'nano_banana_pro': + print(f" Output Resolution: {args.resolution}") + if args.search_grounding: + print(f" Search Grounding: Enabled") + + try: + agent = ModelSwapAgent( + llm_provider=args.llm_provider, + llm_model=args.llm_model, + temperature=args.llm_temperature, + api_key=args.llm_api_key, + model=args.model + ) + except ValueError as e: + print(f"\nError initializing agent: {e}") + print("\nPlease ensure:") + print(" 1. Required LLM API key is set in environment variables or --llm-api-key") + print(" 2. LangChain dependencies are installed: pip install langchain langchain-openai langchain-anthropic langchain-google-genai") + if args.model in ['nano_banana', 'nano_banana_pro']: + print(f" 3. Gemini API key is set for {args.model}: GEMINI_API_KEY") + elif args.model in ['flux2_pro', 'flux2_flex']: + print(f" 3. 
BFL API key is set for {args.model}: BFL_API_KEY") + return 1 + except Exception as e: + print(f"\nUnexpected error initializing agent: {e}") + return 1 + + # Generate model swap + print(f"\n{'='*60}") + print(f"Starting Model Swap Generation") + print(f"{'='*60}") + print(f" Input image: {args.image}") + print(f" Desired model: {args.prompt}") + print(f" Model: {args.model}") + if args.model == 'nano_banana_pro': + print(f" Resolution: {args.resolution}") + print(f"{'='*60}\n") + + try: + # Generate with agent + # Only pass resolution and search_grounding for Nano Banana Pro + generate_kwargs = { + "image": args.image, + "prompt": args.prompt, + "verbose": True # Always show progress + } + + if args.model == 'nano_banana_pro': + generate_kwargs["resolution"] = args.resolution + generate_kwargs["use_search_grounding"] = args.search_grounding + + result = agent.generate(**generate_kwargs) + + if args.verbose: + print(f"\nFull agent result:") + print(json.dumps(result, indent=2, default=str)) + + if result["status"] == "error": + print(f"\n{'='*60}") + print(f"Error: {result.get('error', 'Unknown error')}") + print(f"{'='*60}") + return 1 + + # Extract images + images = result.get("images", []) + provider = result.get("provider", "nano_banana_pro") + model_description = result.get("model_description", "") + + if not images: + print(f"\n{'='*60}") + print(f"Error: No images generated") + print(f"{'='*60}") + return 1 + + print(f"\n{'='*60}") + print(f"Successfully generated {len(images)} image(s) using {provider}") + print(f"{'='*60}") + + if model_description and args.verbose: + print(f"\nModel Description Used:") + print(f" {model_description[:200]}...") + + # Process and save images + saved_images = [] + for idx, image_data in enumerate(images): + try: + # Handle different image formats + if isinstance(image_data, str): + # Check if it's a URL + if image_data.startswith(('http://', 'https://')): + # Download image from URL + img_response = requests.get(image_data) + img_response.raise_for_status() + image_bytes = img_response.content + image = Image.open(io.BytesIO(image_bytes)) + else: + # Assume it's base64 + image_bytes = base64.b64decode(image_data) + image = Image.open(io.BytesIO(image_bytes)) + elif isinstance(image_data, bytes): + image = Image.open(io.BytesIO(image_data)) + elif isinstance(image_data, Image.Image): + image = image_data + else: + # Try to handle image objects with save method + # (PIL Images, Google Genai Images, etc.) + if hasattr(image_data, 'save'): + # It's an image object with save capability + image = image_data + else: + print(f"Skipping image {idx + 1}: Unsupported format (type: {type(image_data).__name__})") + if args.verbose: + print(f" Debug: {type(image_data)}, has save: {hasattr(image_data, 'save')}") + continue + + # Save PNG + output_path = output_dir / f"model_swap_result_{idx}.png" + image.save(output_path) + saved_images.append(output_path) + print(f"Saved image {idx + 1}/{len(images)}: {output_path}") + + # Optionally save Base64 + if args.save_base64: + buffer = io.BytesIO() + image.save(buffer, format='PNG') + image_bytes = buffer.getvalue() + image_base64 = base64.b64encode(image_bytes).decode('utf-8') + + output_path_txt = output_dir / f"model_swap_result_{idx}.txt" + with open(output_path_txt, 'w') as f: + f.write(image_base64) + print(f"Saved Base64 string {idx + 1}: {output_path_txt}") + + except Exception as e: + print(f"Error processing image {idx + 1}: {e}") + continue + + print(f"\n{'='*60}") + print(f"Complete! 
Saved {len(saved_images)} image(s) to {output_dir}") + print(f"{'='*60}") + + # Print tips + print(f"\nTips:") + if args.model == 'nano_banana_pro': + print(f" • Use --resolution 4K for professional e-commerce quality") + print(f" • Try --search-grounding for style-specific references") + elif args.model in ['flux2_pro', 'flux2_flex']: + print(f" • FLUX models support custom width/height (via API)") + print(f" • FLUX 2 Flex offers advanced controls (guidance, steps)") + print(f" • Be specific in prompts: age, ethnicity, body type, pose") + print(f" • The outfit and styling are automatically preserved") + print(f" • Try different models: nano_banana (fast), nano_banana_pro (4K), flux2_pro (quality), flux2_flex (advanced)") + + return 0 + + except ValueError as e: + print(f"\nError: {e}") + return 1 + except Exception as e: + print(f"\nUnexpected error: {e}") + if args.verbose: + import traceback + traceback.print_exc() + return 1 + +if __name__ == "__main__": + exit(main()) + diff --git a/tryon/agents/__init__.py b/tryon/agents/__init__.py index 3970f7e..cba415b 100644 --- a/tryon/agents/__init__.py +++ b/tryon/agents/__init__.py @@ -5,8 +5,10 @@ """ from .vton.agent import VTOnAgent +from .model_swap.agent import ModelSwapAgent __all__ = [ "VTOnAgent", + "ModelSwapAgent", ] diff --git a/tryon/agents/model_swap/__init__.py b/tryon/agents/model_swap/__init__.py new file mode 100644 index 0000000..b9b5ec1 --- /dev/null +++ b/tryon/agents/model_swap/__init__.py @@ -0,0 +1,15 @@ +""" +Model Swap Agent Module + +This module provides an AI agent for model swapping - replacing the person/model +in an image while preserving the outfit and styling. +""" + +from .agent import ModelSwapAgent +from .tools import get_model_swap_tools + +__all__ = [ + "ModelSwapAgent", + "get_model_swap_tools", +] + diff --git a/tryon/agents/model_swap/agent.py b/tryon/agents/model_swap/agent.py new file mode 100644 index 0000000..4c3400e --- /dev/null +++ b/tryon/agents/model_swap/agent.py @@ -0,0 +1,537 @@ +""" +Model Swap Agent using LangChain + +This agent uses LangChain 1.x to intelligently extract person attributes from +user prompts and perform model swapping using Nano Banana Pro API. The agent +takes an image of a person wearing an outfit and replaces the model based on +natural language descriptions. + +Use Case: E-commerce sellers and clothing brands can create professional quality +product imagery with high-quality fashion models for their online stores. + +Example: + >>> from tryon.agents.model_swap import ModelSwapAgent + >>> + >>> # Using default Nano Banana Pro + >>> agent = ModelSwapAgent(llm_provider="openai") + >>> result = agent.generate( + ... image="person_with_outfit.jpg", + ... prompt="Replace with a professional Asian female model in her 30s, athletic build" + ... ) + >>> print(result) + + >>> # Using FLUX 2 Pro + >>> agent = ModelSwapAgent(llm_provider="openai", model="flux2_pro") + >>> result = agent.generate( + ... image="person_with_outfit.jpg", + ... prompt="Replace with a professional male model in his 30s" + ... 
) +""" + +import json +import asyncio +from typing import Optional, Dict, Any, List + +# LangChain 1.x imports +from langchain.agents import create_agent +from langchain_openai import ChatOpenAI +from langchain_anthropic import ChatAnthropic +from langchain_google_genai import ChatGoogleGenerativeAI +from langchain_core.language_models import BaseChatModel + +from .tools import get_model_swap_tools, get_tool_output_from_cache + + +class ModelSwapAgent: + """ + LangChain-based Model Swap Agent. + + This agent intelligently extracts person attributes from natural language + prompts and uses Nano Banana Pro to swap the model while preserving the + outfit and styling. + + The agent analyzes prompts to extract: + - Gender (male, female, non-binary) + - Age range (20s, 30s, 40s, etc.) + - Ethnicity/appearance + - Body type (athletic, slim, average, plus-size) + - Facial features + - Pose and styling preferences + + Example: + >>> # Using default Nano Banana Pro + >>> agent = ModelSwapAgent(llm_provider="openai") + >>> result = agent.generate( + ... image="model.jpg", + ... prompt="Replace with a professional male model in his 30s, athletic build, confident pose" + ... ) + + >>> # Using FLUX 2 Flex for advanced control + >>> agent = ModelSwapAgent(llm_provider="openai", model="flux2_flex") + >>> result = agent.generate( + ... image="model.jpg", + ... prompt="Replace with a professional female model in her 20s" + ... ) + """ + + def __init__( + self, + llm_provider: str = "openai", + llm_model: Optional[str] = None, + temperature: float = 0.0, + api_key: Optional[str] = None, + model: Optional[str] = None, + **llm_kwargs + ): + """ + Initialize the Model Swap Agent. + + Args: + llm_provider: LLM provider to use. Options: "openai", "anthropic", "google" + llm_model: Specific model name (e.g., "gpt-4", "claude-3-opus-20240229") + If None, uses default model for the provider + temperature: Temperature for LLM (default: 0.0 for deterministic behavior) + api_key: API key for the LLM provider (if not set via environment variable) + model: Model to use for model swapping. Options: "nano_banana", "nano_banana_pro", + "flux2_pro", "flux2_flex". If None, defaults to "nano_banana_pro" + **llm_kwargs: Additional keyword arguments for LLM initialization + + Raises: + ValueError: If llm_provider is not supported + """ + self.llm_provider = llm_provider.lower() + # Normalize model name: handle None, convert to lowercase, replace dashes/spaces with underscores + if model: + self.model = model.lower().replace("-", "_").replace(" ", "_") + else: + self.model = "nano_banana_pro" + self.tools = get_model_swap_tools(model=self.model) + self.llm = self._initialize_llm( + llm_provider=self.llm_provider, + llm_model=llm_model, + temperature=temperature, + api_key=api_key, + **llm_kwargs + ) + self.agent = self._create_agent() + + def _initialize_llm( + self, + llm_provider: str, + llm_model: Optional[str], + temperature: float, + api_key: Optional[str], + **kwargs + ) -> BaseChatModel: + """Initialize the LLM based on provider.""" + if api_key and callable(api_key): + raise ValueError( + "API key must be a string, not a callable. " + "For async operations, use async methods instead." 
+ ) + + if llm_provider == "openai": + model_name = llm_model or "gpt-5.1" + llm_kwargs = { + "model": model_name, + "temperature": temperature, + **kwargs + } + if api_key: + llm_kwargs["api_key"] = api_key + return ChatOpenAI(**llm_kwargs) + elif llm_provider == "anthropic": + model_name = llm_model or "claude-sonnet-4-5-20250929" + llm_kwargs = { + "model": model_name, + "temperature": temperature, + **kwargs + } + if api_key: + llm_kwargs["api_key"] = api_key + return ChatAnthropic(**llm_kwargs) + elif llm_provider == "google": + model_name = llm_model or "gemini-2.5-pro" + llm_kwargs = { + "model": model_name, + "temperature": temperature, + **kwargs + } + if api_key: + llm_kwargs["google_api_key"] = api_key + return ChatGoogleGenerativeAI(**llm_kwargs) + else: + raise ValueError( + f"Unsupported LLM provider: {llm_provider}. " + f"Supported providers: 'openai', 'anthropic', 'google'" + ) + + def _create_agent(self): + """ + Create the LangChain 1.x agent using create_agent. + + The agent is responsible for: + 1. Analyzing user prompts to extract person attributes + 2. Constructing detailed prompts for model swap + 3. Calling Nano Banana Pro API with optimized parameters + """ + # Determine which tool to use based on model + tool_mapping = { + "nano_banana": "nano_banana_model_swap", + "nano_banana_pro": "nano_banana_pro_model_swap", + "flux2_pro": "flux2_pro_model_swap", + "flux2_flex": "flux2_flex_model_swap" + } + tool_name = tool_mapping.get(self.model, "nano_banana_pro_model_swap") + + # Build tool descriptions + tool_descriptions = { + "nano_banana": "nano_banana_model_swap: Uses Google's Gemini 2.5 Flash Image (Nano Banana) for fast model swapping at 1024px resolution. Good for quick iterations.", + "nano_banana_pro": "nano_banana_pro_model_swap: Uses Google's Gemini 3 Pro Image Preview (Nano Banana Pro) for high-quality model swapping. Supports 1K, 2K, and 4K resolutions. Best for professional e-commerce quality.", + "flux2_pro": "flux2_pro_model_swap: Uses FLUX 2 Pro for high-quality model swapping with custom width/height control. Professional quality results.", + "flux2_flex": "flux2_flex_model_swap: Uses FLUX 2 Flex for advanced model swapping with guidance scale and steps control. Highest quality with fine-tuned parameters." + } + tool_description = tool_descriptions.get(self.model, tool_descriptions["nano_banana_pro"]) + + system_prompt = f"""You are an expert fashion photography and model casting assistant. Your task is to analyze +user requests for model swapping and extract detailed person attributes to generate professional product imagery. + +Available tool: +- {tool_description} + +User will provide: +1. An image of a person wearing an outfit +2. A description of the desired model/person to replace them with + +Your task: +1. Analyze the user's prompt to extract person attributes: + - Gender (male, female, non-binary, or unspecified) + - Age range (teens, 20s, 30s, 40s, 50s+) + - Ethnicity/appearance (Asian, African, Caucasian, Hispanic, Middle Eastern, mixed, diverse, or unspecified) + - Body type (slim, athletic, average, curvy, plus-size, muscular) + - Facial features (if specified: sharp features, soft features, distinctive characteristics) + - Pose/expression (confident, casual, professional, friendly, serious, natural) + - Styling preferences (professional, casual, editorial, commercial, lifestyle) + +2. 
Construct a detailed, professional prompt for the model swap that: + - Describes the desired model with extracted attributes + - Emphasizes preserving the exact outfit, clothing, and styling + - Maintains the original lighting, background, and composition + - Ensures high-quality, professional photography standards + +3. Select appropriate generation parameters based on the tool: + - For nano_banana_pro_model_swap: Use "4K" resolution for professional e-commerce (default), "2K" for high-quality, "1K" for draft. Use search grounding if real-world references mentioned. + - For nano_banana_model_swap: Optionally specify aspect ratio if needed. + - For flux2_pro_model_swap and flux2_flex_model_swap: Use width/height if specified, otherwise use model defaults. + +4. Call the {tool_name} tool with: + - image: The input image path/URL + - model_description: Your constructed detailed prompt + - Additional parameters based on the tool (resolution, width/height, etc.) + +5. Return the result as a JSON string with status, provider, and images fields + +IMPORTANT PROMPT CONSTRUCTION GUIDELINES: +- Start with: "Professional fashion photography showing [person description] wearing the exact same outfit..." +- Be specific about preserving: clothing items, colors, patterns, textures, fit, styling +- Maintain: lighting setup, background, composition, camera angle, photo quality +- Emphasize photorealism and professional standards +- Keep the outfit and its details completely unchanged + +Example good prompt construction: +User: "Replace with an athletic Asian woman in her 30s" +Your prompt: "Professional fashion photography showing an athletic Asian woman in her early 30s with +confident posture, wearing the exact same outfit with all clothing details, patterns, and colors preserved +perfectly. Maintain the original lighting, background, and composition. High-end e-commerce quality, +photorealistic, professional model photography." + +For nano_banana_pro, default to 4K resolution for professional quality unless the user specifies otherwise. +""" + + agent = create_agent( + model=self.llm, + tools=self.tools, + system_prompt=system_prompt + ) + + return agent + + def generate( + self, + image: str, + prompt: str, + resolution: Optional[str] = None, + use_search_grounding: bool = False, + verbose: bool = False, + **kwargs + ) -> Dict[str, Any]: + """ + Generate model-swapped images using the agent. + + The agent analyzes the prompt to extract person attributes and calls + Nano Banana Pro to swap the model while preserving the outfit. + + Args: + image: Path or URL to the image of person wearing the outfit + prompt: Natural language description of desired model/person. + Examples: + - "Replace with a professional male model in his 30s" + - "Athletic female model, Asian, mid-20s, confident pose" + - "Plus-size woman, African American, 40s, friendly expression" + resolution: Optional resolution override. Options: "1K", "2K", "4K" (default: "4K") + use_search_grounding: Whether to use Google Search grounding for real-world references + verbose: If True, print debug information about agent reasoning + **kwargs: Additional parameters to pass to the agent + + Returns: + Dictionary containing: + - 'status': 'success' or 'error' + - 'provider': Model provider used (e.g., 'nano_banana_pro', 'flux2_pro', etc.) 
+ - 'images': List of generated images (base64 strings or PIL Images) + - 'model_description': The detailed prompt used for generation + - 'result': Full agent response + - 'error': Error message (if status is 'error') + + Example: + >>> agent = ModelSwapAgent() + >>> result = agent.generate( + ... image="model.jpg", + ... prompt="Replace with a professional female model in her 30s, athletic build" + ... ) + >>> images = result['images'] + """ + # Construct the input message for the agent + user_message = f"""Image: {image} +Model Description: {prompt} +Model to use: {self.model}""" + + if resolution: + user_message += f"\nResolution: {resolution}" + + if use_search_grounding: + user_message += "\nUse search grounding: Yes" + + user_message += f"\n\nPlease perform model swap using {self.model} based on the description provided." + + try: + # Execute the agent with streaming + if verbose: + print("\nAnalyzing prompt and extracting person attributes...") + + result = None + last_message_count = 0 + + async def stream_agent(): + nonlocal result, last_message_count + try: + async for chunk in self.agent.astream( + {"messages": [{"role": "user", "content": user_message}]}, + stream_mode="values", + **kwargs + ): + if isinstance(chunk, dict): + messages = chunk.get("messages", []) + if len(messages) > last_message_count: + for msg in messages[last_message_count:]: + msg_type = getattr(msg, 'type', None) or (msg.get("type") if isinstance(msg, dict) else None) + + if msg_type == "ai": + content = getattr(msg, 'content', None) or (msg.get("content") if isinstance(msg, dict) else "") + if content and verbose: + tool_calls = getattr(msg, 'tool_calls', None) or (msg.get("tool_calls") if isinstance(msg, dict) else []) + if tool_calls: + tool_names = [] + for tc in tool_calls: + if isinstance(tc, dict): + tool_names.append(tc.get("name", "unknown")) + else: + tool_names.append(getattr(tc, 'name', 'unknown')) + if tool_names: + print(f"Calling tool: {', '.join(tool_names)}") + elif content.strip() and len(content.strip()) > 10: + if verbose: + print(f"Agent: {content[:200]}") + + elif msg_type == "tool": + if verbose: + tool_name = getattr(msg, 'name', None) or (msg.get("name") if isinstance(msg, dict) else "unknown") + print(f"Tool '{tool_name}' executing...") + + last_message_count = len(messages) + + result = chunk.copy() if hasattr(chunk, 'copy') else chunk + except Exception as e: + if verbose: + print(f"WARNING: Streaming error: {e}, falling back to standard execution...") + result = await self.agent.ainvoke( + {"messages": [{"role": "user", "content": user_message}]}, + **kwargs + ) + + # Run the streaming agent + asyncio.run(stream_agent()) + + # Fallback if no result + if not result or not result.get("messages"): + if verbose: + print("WARNING: No result from streaming, using standard execution...") + result = asyncio.run( + self.agent.ainvoke( + {"messages": [{"role": "user", "content": user_message}]}, + **kwargs + ) + ) + + # Extract output from messages + if not result: + raise ValueError("Agent execution returned no result") + + messages = result.get("messages", []) + if not messages: + if verbose: + print(f"WARNING: No messages found in result") + if isinstance(result, dict): + for key in ["messages", "output", "state"]: + if key in result: + potential_messages = result[key] + if isinstance(potential_messages, list): + messages = potential_messages + break + + output = "" + tool_output = None + + if verbose: + print(f"\nProcessing {len(messages)} messages...") + + # Extract tool output 
from messages + for message in reversed(messages): + message_type = None + if hasattr(message, 'type'): + message_type = message.type + elif isinstance(message, dict): + message_type = message.get("type") or message.get("message_type") + + if message_type == "tool" or (isinstance(message, dict) and message.get("type") == "tool"): + if hasattr(message, 'content'): + tool_output = message.content + elif isinstance(message, dict): + tool_output = message.get("content", "") + + if tool_output: + if verbose: + print(f"Tool output received") + try: + tool_result = json.loads(tool_output) + provider = tool_result.get("provider", "unknown") + if provider != "unknown": + print(f"Provider: {provider}") + except: + pass + break + + if not output and not tool_output: + if hasattr(message, 'content'): + content = message.content + elif isinstance(message, dict): + content = message.get("content", "") + else: + content = str(message) + + if content and content.strip(): + output = content + + # Prefer tool output + if tool_output: + output = tool_output + + if not output and messages: + output = str(messages[-1]) + + # Parse JSON output + parsed_result = None + try: + parsed_result = json.loads(output) + except (json.JSONDecodeError, TypeError): + try: + if "{" in output and "}" in output: + json_start = output.find("{") + json_end = output.rfind("}") + 1 + json_str = output[json_start:json_end] + parsed_result = json.loads(json_str) + except json.JSONDecodeError: + pass + + # Extract results + if parsed_result: + if parsed_result.get("status") == "error": + return { + "status": "error", + "provider": parsed_result.get("provider", self.model), + "images": [], + "error": parsed_result.get("error", "Unknown error from tool"), + "result": output, + "raw_output": result + } + + # Check for cache key + cache_key = parsed_result.get("cache_key") + if cache_key: + if verbose: + print(f"Retrieving images from cache (key: {cache_key[:8]}...)") + cached_data = get_tool_output_from_cache(cache_key) + if cached_data: + images = cached_data.get("images", []) + provider = cached_data.get("provider", parsed_result.get("provider", self.model)) + model_description = cached_data.get("model_description", "") + if verbose: + print(f"Retrieved {len(images)} image(s) from cache") + else: + if verbose: + print("WARNING: Cache miss, trying alternative extraction...") + images = parsed_result.get("images", []) + provider = parsed_result.get("provider", self.model) + model_description = parsed_result.get("model_description", "") + else: + if verbose: + print("Extracting images from tool output...") + images = parsed_result.get("images", []) + provider = parsed_result.get("provider", self.model) + model_description = parsed_result.get("model_description", "") + + if verbose: + print(f"Successfully generated {len(images)} image(s)") + + return { + "status": "success", + "provider": provider, + "images": images if isinstance(images, list) else [images] if images else [], + "model_description": model_description, + "result": output, + "raw_output": result + } + + # Return error if parsing failed + debug_info = f"Could not parse JSON from output. 
Output type: {type(output)}, Output preview: {str(output)[:200]}" + if verbose: + print(f"[DEBUG] {debug_info}") + + return { + "status": "error", + "provider": self.model, + "images": [], + "error": "Failed to parse tool output", + "result": output, + "raw_output": result, + "debug_info": debug_info + } + + except Exception as e: + return { + "status": "error", + "provider": self.model, + "images": [], + "error": str(e), + "raw_output": None + } + diff --git a/tryon/agents/model_swap/tools.py b/tryon/agents/model_swap/tools.py new file mode 100644 index 0000000..e5b11fc --- /dev/null +++ b/tryon/agents/model_swap/tools.py @@ -0,0 +1,580 @@ +""" +Tools for Model Swap Agent + +This module provides LangChain tools for model swapping using multiple AI models: +- Nano Banana (Gemini 2.5 Flash Image) +- Nano Banana Pro (Gemini 3 Pro Image Preview) +- FLUX 2 Pro +- FLUX 2 Flex + +The tools allow intelligent extraction of person attributes from prompts and generate +professional model-swapped images while preserving outfits. + +Note: Tools store full image data in a global cache to avoid token limit issues. +""" + +import json +from typing import Optional, List +from pydantic import BaseModel, Field +from langchain.tools import tool + +from tryon.api.nano_banana import NanoBananaAdapter, NanoBananaProAdapter +from tryon.api.flux2 import Flux2ProAdapter, Flux2FlexAdapter + +# Global cache to store full tool outputs +_tool_output_cache = {} + + +class NanoBananaProModelSwapToolInput(BaseModel): + """Input schema for Nano Banana Pro model swap tool.""" + image: str = Field( + description="Path or URL to the image of person wearing the outfit to preserve" + ) + model_description: str = Field( + description=( + "Detailed description of the desired model/person to generate. " + "Should include specific attributes like: " + "gender, age range, ethnicity, body type, facial features, pose, expression, " + "and importantly must emphasize preserving the exact outfit, clothing, and styling. " + "Example: 'Professional fashion photography showing an athletic Asian woman in her 30s " + "wearing the exact same outfit with all clothing details preserved, maintaining original " + "lighting and background.'" + ) + ) + resolution: Optional[str] = Field( + default="4K", + description="Output resolution. Options: '1K', '2K', '4K'. Default: '4K' for professional quality" + ) + use_search_grounding: Optional[bool] = Field( + default=False, + description="Whether to use Google Search grounding for real-world reference images" + ) + + +@tool("nano_banana_pro_model_swap", args_schema=NanoBananaProModelSwapToolInput) +def nano_banana_pro_model_swap( + image: str, + model_description: str, + resolution: str = "4K", + use_search_grounding: bool = False +) -> str: + """ + Swap the model/person in an image while preserving the outfit using Nano Banana Pro. + + This tool uses Google's Gemini 3 Pro Image Preview (Nano Banana Pro) to: + 1. Take an image of a person wearing an outfit + 2. Generate a new image with a different model/person based on the description + 3. Preserve the exact outfit, clothing details, patterns, and styling + 4. Maintain professional photography quality and composition + + Perfect for e-commerce and fashion brands to create professional product imagery + with diverse models while keeping the clothing exactly the same. 
+ + Nano Banana Pro Features: + - 4K resolution support for professional-quality output + - Google Search grounding for real-world reference images + - Advanced "thinking" process for refined composition + - Photorealistic results with accurate clothing preservation + + Args: + image: Path or URL to the image of person wearing the outfit + model_description: Detailed prompt describing the desired model and emphasizing + outfit preservation. Should be comprehensive and specific. + resolution: Output resolution ('1K', '2K', or '4K'). Default: '4K' + use_search_grounding: Use Google Search for real-world references. Default: False + + Returns: + JSON string containing: + - status: 'success' or 'error' + - provider: 'nano_banana_pro' + - image_count: Number of images generated + - cache_key: Key to retrieve full image data from cache + - model_description: The prompt used for generation + - message: Status message + + Example Usage by Agent: + When user says: "Replace with a professional male model in his 30s" + Agent constructs: "Professional fashion photography showing a professional male model + in his early 30s with athletic build and confident posture, wearing the exact same + outfit with all clothing details, colors, and patterns preserved perfectly. Maintain + the original lighting, background, and composition. High-end e-commerce quality, + photorealistic." + """ + try: + print("Initializing Nano Banana Pro adapter...") + adapter = NanoBananaProAdapter() + + print("Generating model swap (this may take a moment)...") + print(f"Resolution: {resolution}") + print(f"Prompt: {model_description}") + if use_search_grounding: + print("Using Google Search grounding") + + # Use generate_image_edit to modify the person while preserving outfit + images = adapter.generate_image_edit( + image=image, + prompt=model_description, + resolution=resolution + ) + + print("Nano Banana Pro generation completed") + + # Store full images in cache + import hashlib + cache_key = hashlib.md5( + f"{image}_{model_description}_{resolution}_{use_search_grounding}".encode() + ).hexdigest() + _tool_output_cache[cache_key] = { + "provider": "nano_banana_pro", + "images": images if isinstance(images, list) else [images], + "model_description": model_description + } + + # Return metadata to avoid token limits + result = { + "status": "success", + "provider": "nano_banana_pro", + "image_count": len(images) if isinstance(images, list) else 1, + "cache_key": cache_key, + "model_description": model_description, + "resolution": resolution, + "message": f"Successfully generated {len(images) if isinstance(images, list) else 1} image(s) at {resolution} resolution." + } + return json.dumps(result) + + except Exception as e: + error_message = str(e) + print(f"Error during model swap: {error_message}") + + result = { + "status": "error", + "provider": "nano_banana_pro", + "error": error_message, + "model_description": model_description + } + return json.dumps(result) + + +class NanoBananaModelSwapToolInput(BaseModel): + """Input schema for Nano Banana model swap tool.""" + image: str = Field( + description="Path or URL to the image of person wearing the outfit to preserve" + ) + model_description: str = Field( + description=( + "Detailed description of the desired model/person to generate. " + "Should include specific attributes like: " + "gender, age range, ethnicity, body type, facial features, pose, expression, " + "and importantly must emphasize preserving the exact outfit, clothing, and styling. 
" + "Example: 'Professional fashion photography showing an athletic Asian woman in her 30s " + "wearing the exact same outfit with all clothing details preserved, maintaining original " + "lighting and background.'" + ) + ) + aspect_ratio: Optional[str] = Field( + default=None, + description="Aspect ratio. Options: '1:1', '2:3', '3:2', '3:4', '4:3', '4:5', '5:4', '9:16', '16:9', '21:9'. Default: None (auto)" + ) + + +class Flux2ProModelSwapToolInput(BaseModel): + """Input schema for FLUX 2 Pro model swap tool.""" + image: str = Field( + description="Path or URL to the image of person wearing the outfit to preserve" + ) + model_description: str = Field( + description=( + "Detailed description of the desired model/person to generate. " + "Should include specific attributes like: " + "gender, age range, ethnicity, body type, facial features, pose, expression, " + "and importantly must emphasize preserving the exact outfit, clothing, and styling. " + "Example: 'Professional fashion photography showing an athletic Asian woman in her 30s " + "wearing the exact same outfit with all clothing details preserved, maintaining original " + "lighting and background.'" + ) + ) + width: Optional[int] = Field( + default=None, + description="Width of output image. Minimum: 64. Default: Model default" + ) + height: Optional[int] = Field( + default=None, + description="Height of output image. Minimum: 64. Default: Model default" + ) + seed: Optional[int] = Field( + default=None, + description="Random seed for reproducibility" + ) + + +class Flux2FlexModelSwapToolInput(BaseModel): + """Input schema for FLUX 2 Flex model swap tool.""" + image: str = Field( + description="Path or URL to the image of person wearing the outfit to preserve" + ) + model_description: str = Field( + description=( + "Detailed description of the desired model/person to generate. " + "Should include specific attributes like: " + "gender, age range, ethnicity, body type, facial features, pose, expression, " + "and importantly must emphasize preserving the exact outfit, clothing, and styling. " + "Example: 'Professional fashion photography showing an athletic Asian woman in her 30s " + "wearing the exact same outfit with all clothing details preserved, maintaining original " + "lighting and background.'" + ) + ) + width: Optional[int] = Field( + default=None, + description="Width of output image. Minimum: 64. Default: Model default" + ) + height: Optional[int] = Field( + default=None, + description="Height of output image. Minimum: 64. Default: Model default" + ) + seed: Optional[int] = Field( + default=None, + description="Random seed for reproducibility" + ) + guidance: Optional[float] = Field( + default=3.5, + description="Guidance scale (1.5-10). Higher values = more adherence to prompt. Default: 3.5" + ) + steps: Optional[int] = Field( + default=28, + description="Number of generation steps. More steps = higher quality. Default: 28" + ) + + +@tool("nano_banana_model_swap", args_schema=NanoBananaModelSwapToolInput) +def nano_banana_model_swap( + image: str, + model_description: str, + aspect_ratio: Optional[str] = None +) -> str: + """ + Swap the model/person in an image while preserving the outfit using Nano Banana (Gemini 2.5 Flash Image). + + This tool uses Google's Gemini 2.5 Flash Image (Nano Banana) to: + 1. Take an image of a person wearing an outfit + 2. Generate a new image with a different model/person based on the description + 3. Preserve the exact outfit, clothing details, patterns, and styling + 4. 
Maintain professional photography quality and composition + + Nano Banana Features: + - Fast generation at 1024px resolution + - Efficient for high-volume tasks + - Good quality for quick iterations + + Args: + image: Path or URL to the image of person wearing the outfit + model_description: Detailed prompt describing the desired model and emphasizing + outfit preservation. Should be comprehensive and specific. + aspect_ratio: Optional aspect ratio. Options: '1:1', '2:3', '3:2', '3:4', '4:3', + '4:5', '5:4', '9:16', '16:9', '21:9'. Default: None (auto) + + Returns: + JSON string containing: + - status: 'success' or 'error' + - provider: 'nano_banana' + - image_count: Number of images generated + - cache_key: Key to retrieve full image data from cache + - model_description: The prompt used for generation + - message: Status message + """ + try: + print("Initializing Nano Banana adapter...") + adapter = NanoBananaAdapter() + + print("Generating model swap (this may take a moment)...") + print(f"Prompt: {model_description}") + if aspect_ratio: + print(f"Aspect ratio: {aspect_ratio}") + + # Use generate_image_edit to modify the person while preserving outfit + images = adapter.generate_image_edit( + image=image, + prompt=model_description, + aspect_ratio=aspect_ratio + ) + + print("Nano Banana generation completed") + + # Store full images in cache + import hashlib + cache_key = hashlib.md5( + f"nano_banana_{image}_{model_description}_{aspect_ratio}".encode() + ).hexdigest() + _tool_output_cache[cache_key] = { + "provider": "nano_banana", + "images": images if isinstance(images, list) else [images], + "model_description": model_description + } + + # Return metadata to avoid token limits + result = { + "status": "success", + "provider": "nano_banana", + "image_count": len(images) if isinstance(images, list) else 1, + "cache_key": cache_key, + "model_description": model_description, + "message": f"Successfully generated {len(images) if isinstance(images, list) else 1} image(s) using Nano Banana." + } + return json.dumps(result) + + except Exception as e: + error_message = str(e) + print(f"Error during model swap: {error_message}") + + result = { + "status": "error", + "provider": "nano_banana", + "error": error_message, + "model_description": model_description + } + return json.dumps(result) + + +@tool("flux2_pro_model_swap", args_schema=Flux2ProModelSwapToolInput) +def flux2_pro_model_swap( + image: str, + model_description: str, + width: Optional[int] = None, + height: Optional[int] = None, + seed: Optional[int] = None +) -> str: + """ + Swap the model/person in an image while preserving the outfit using FLUX 2 Pro. + + This tool uses FLUX 2 Pro to: + 1. Take an image of a person wearing an outfit + 2. Generate a new image with a different model/person based on the description + 3. Preserve the exact outfit, clothing details, patterns, and styling + 4. Maintain professional photography quality and composition + + FLUX 2 Pro Features: + - High-quality image generation + - Custom width/height control + - Seed for reproducibility + - Professional quality results + + Args: + image: Path or URL to the image of person wearing the outfit + model_description: Detailed prompt describing the desired model and emphasizing + outfit preservation. Should be comprehensive and specific. + width: Width of output image. Minimum: 64. Default: Model default + height: Height of output image. Minimum: 64. 
Default: Model default + seed: Random seed for reproducibility + + Returns: + JSON string containing: + - status: 'success' or 'error' + - provider: 'flux2_pro' + - image_count: Number of images generated + - cache_key: Key to retrieve full image data from cache + - model_description: The prompt used for generation + - message: Status message + """ + try: + print("Initializing FLUX 2 Pro adapter...") + adapter = Flux2ProAdapter() + + print("Generating model swap (this may take a moment)...") + print(f"Prompt: {model_description}") + if width and height: + print(f"Resolution: {width}x{height}") + if seed: + print(f"Seed: {seed}") + + # Use generate_image_edit to modify the person while preserving outfit + images = adapter.generate_image_edit( + prompt=model_description, + input_image=image, + width=width, + height=height, + seed=seed + ) + + print("FLUX 2 Pro generation completed") + + # Store full images in cache + import hashlib + cache_key = hashlib.md5( + f"flux2_pro_{image}_{model_description}_{width}_{height}_{seed}".encode() + ).hexdigest() + _tool_output_cache[cache_key] = { + "provider": "flux2_pro", + "images": images if isinstance(images, list) else [images], + "model_description": model_description + } + + # Return metadata to avoid token limits + result = { + "status": "success", + "provider": "flux2_pro", + "image_count": len(images) if isinstance(images, list) else 1, + "cache_key": cache_key, + "model_description": model_description, + "message": f"Successfully generated {len(images) if isinstance(images, list) else 1} image(s) using FLUX 2 Pro." + } + return json.dumps(result) + + except Exception as e: + error_message = str(e) + print(f"Error during model swap: {error_message}") + + result = { + "status": "error", + "provider": "flux2_pro", + "error": error_message, + "model_description": model_description + } + return json.dumps(result) + + +@tool("flux2_flex_model_swap", args_schema=Flux2FlexModelSwapToolInput) +def flux2_flex_model_swap( + image: str, + model_description: str, + width: Optional[int] = None, + height: Optional[int] = None, + seed: Optional[int] = None, + guidance: float = 3.5, + steps: int = 28 +) -> str: + """ + Swap the model/person in an image while preserving the outfit using FLUX 2 Flex. + + This tool uses FLUX 2 Flex to: + 1. Take an image of a person wearing an outfit + 2. Generate a new image with a different model/person based on the description + 3. Preserve the exact outfit, clothing details, patterns, and styling + 4. Maintain professional photography quality and composition + + FLUX 2 Flex Features: + - Advanced controls (guidance scale, steps) + - Prompt upsampling for better quality + - Custom width/height control + - Seed for reproducibility + - Highest quality results with fine-tuned parameters + + Args: + image: Path or URL to the image of person wearing the outfit + model_description: Detailed prompt describing the desired model and emphasizing + outfit preservation. Should be comprehensive and specific. + width: Width of output image. Minimum: 64. Default: Model default + height: Height of output image. Minimum: 64. Default: Model default + seed: Random seed for reproducibility + guidance: Guidance scale (1.5-10). Higher values = more adherence to prompt. Default: 3.5 + steps: Number of generation steps. More steps = higher quality. 
Default: 28 + + Returns: + JSON string containing: + - status: 'success' or 'error' + - provider: 'flux2_flex' + - image_count: Number of images generated + - cache_key: Key to retrieve full image data from cache + - model_description: The prompt used for generation + - message: Status message + """ + try: + print("Initializing FLUX 2 Flex adapter...") + adapter = Flux2FlexAdapter() + + print("Generating model swap (this may take a moment)...") + print(f"Prompt: {model_description}") + if width and height: + print(f"Resolution: {width}x{height}") + if seed: + print(f"Seed: {seed}") + print(f"Guidance: {guidance}, Steps: {steps}") + + # Use generate_image_edit to modify the person while preserving outfit + images = adapter.generate_image_edit( + prompt=model_description, + input_image=image, + width=width, + height=height, + seed=seed, + guidance=guidance, + steps=steps + ) + + print("FLUX 2 Flex generation completed") + + # Store full images in cache + import hashlib + cache_key = hashlib.md5( + f"flux2_flex_{image}_{model_description}_{width}_{height}_{seed}_{guidance}_{steps}".encode() + ).hexdigest() + _tool_output_cache[cache_key] = { + "provider": "flux2_flex", + "images": images if isinstance(images, list) else [images], + "model_description": model_description + } + + # Return metadata to avoid token limits + result = { + "status": "success", + "provider": "flux2_flex", + "image_count": len(images) if isinstance(images, list) else 1, + "cache_key": cache_key, + "model_description": model_description, + "message": f"Successfully generated {len(images) if isinstance(images, list) else 1} image(s) using FLUX 2 Flex." + } + return json.dumps(result) + + except Exception as e: + error_message = str(e) + print(f"Error during model swap: {error_message}") + + result = { + "status": "error", + "provider": "flux2_flex", + "error": error_message, + "model_description": model_description + } + return json.dumps(result) + + +def get_model_swap_tools(model: Optional[str] = None) -> List: + """ + Get available model swap tools. + + Args: + model: Specific model to use. Options: 'nano_banana', 'nano_banana_pro', + 'flux2_pro', 'flux2_flex'. If None, returns all tools. + + Returns: + List of LangChain tools for model swapping + """ + all_tools = { + "nano_banana": nano_banana_model_swap, + "nano_banana_pro": nano_banana_pro_model_swap, + "flux2_pro": flux2_pro_model_swap, + "flux2_flex": flux2_flex_model_swap, + } + + if model: + model_lower = model.lower().replace("-", "_").replace(" ", "_") + if model_lower in all_tools: + return [all_tools[model_lower]] + else: + # Default to nano_banana_pro if invalid model specified + return [all_tools["nano_banana_pro"]] + + # Return all tools if no model specified + return list(all_tools.values()) + + +def get_tool_output_from_cache(cache_key: str) -> Optional[dict]: + """ + Retrieve full tool output from cache using cache_key. + + Args: + cache_key: Cache key returned in tool output + + Returns: + Dictionary with provider, images, and model_description, or None if not found + """ + return _tool_output_cache.get(cache_key) +
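
Since the tools above hand back only lightweight JSON metadata plus a `cache_key` (to keep large image payloads out of the LLM's token budget), a caller has to do a second step to fetch the actual images via `get_tool_output_from_cache`. The sketch below is a minimal, hedged illustration of that flow using the functions added in this patch; the input file `model.jpg` is a hypothetical local path, and it assumes the `tryon` package is installed and `GEMINI_API_KEY` is set for the Nano Banana Pro adapter. Depending on the adapter, the cached images may be PIL Images or base64 strings (the README lists both).

```python
# Sketch: consuming the cache_key hand-off used by the model swap tools.
import json

from tryon.agents.model_swap.tools import (
    get_model_swap_tools,
    get_tool_output_from_cache,
)

# 1. Select a single tool; unknown names fall back to Nano Banana Pro.
tool = get_model_swap_tools("nano_banana_pro")[0]

# 2. Invoke the tool. It returns JSON metadata only, never raw image bytes.
raw = tool.invoke({
    "image": "model.jpg",  # hypothetical local file
    "model_description": (
        "Professional fashion photography showing an athletic Asian woman in her 30s "
        "wearing the exact same outfit with all clothing details preserved, "
        "maintaining original lighting and background."
    ),
    "resolution": "2K",
})
metadata = json.loads(raw)

# 3. Retrieve the full images from the in-process cache using the returned key.
if metadata["status"] == "success":
    cached = get_tool_output_from_cache(metadata["cache_key"])
    images = cached["images"] if cached else []
    for idx, img in enumerate(images):
        # PIL Images can be saved directly; base64 strings would need decoding first.
        if hasattr(img, "save"):
            img.save(f"swap_{idx}.png")
else:
    print(f"Model swap failed: {metadata.get('error')}")
```

This mirrors what `ModelSwapAgent.generate()` does internally after the agent loop finishes: parse the tool's JSON output, look up the `cache_key`, and return the cached images in its result dictionary.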