Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 13 additions & 12 deletions configs/ltxv-13b-0.9.7-dev.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

pipeline_type: multi-scale
checkpoint_path: "ltxv-13b-0.9.7-dev.safetensors"
downscale_factor: 0.6666666
Expand All @@ -14,20 +13,22 @@ prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-P
prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
stochastic_sampling: false


first_pass:
guidance_scale: [3]
stg_scale: [1]
rescaling_scale: [0.7]
guidance_timesteps: [1.0]
skip_block_list: [19] # [[1], [1,2], [1,2,3], [27], [28], [28]]
guidance_scale: [1, 1, 6, 8, 6, 1, 1]
stg_scale: [0, 0, 4, 4, 4, 2, 1]
rescaling_scale: [1, 1, 0.5, 0.5, 1, 1, 1]
guidance_timesteps: [1.0, 0.996, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
skip_block_list: [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]
num_inference_steps: 30
skip_final_inference_steps: 3
cfg_star_rescale: true

second_pass:
guidance_scale: [3]
guidance_scale: [1]
stg_scale: [1]
rescaling_scale: [0.7]
rescaling_scale: [1]
guidance_timesteps: [1.0]
skip_block_list: [19] # [[1], [1,2], [1,2,3], [27], [28], [28]]
num_inference_steps: 10
strength: 0.85
skip_block_list: [27]
num_inference_steps: 30
skip_initial_inference_steps: 17
cfg_star_rescale: true
17 changes: 17 additions & 0 deletions configs/ltxv-2b-0.9.1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Inference configuration for the LTX-Video 2B v0.9.1 checkpoint.
pipeline_type: base
checkpoint_path: "ltx-video-2b-v0.9.1.safetensors"
# Guidance settings.
guidance_scale: 3
stg_scale: 1
rescaling_scale: 0.7
skip_block_list: [19]  # presumably block indices skipped by the stg_mode below — TODO confirm
num_inference_steps: 40
stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
decode_timestep: 0.05
decode_noise_scale: 0.025
text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
precision: "bfloat16"
sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
# Prompt-enhancement models; the threshold is a word count — TODO confirm whether it enables above or below.
prompt_enhancement_words_threshold: 120
prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
stochastic_sampling: false
17 changes: 17 additions & 0 deletions configs/ltxv-2b-0.9.5.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Inference configuration for the LTX-Video 2B v0.9.5 checkpoint.
pipeline_type: base
checkpoint_path: "ltx-video-2b-v0.9.5.safetensors"
# Guidance settings.
guidance_scale: 3
stg_scale: 1
rescaling_scale: 0.7
skip_block_list: [19]  # presumably block indices skipped by the stg_mode below — TODO confirm
num_inference_steps: 40
stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
decode_timestep: 0.05
decode_noise_scale: 0.025
text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
precision: "bfloat16"
sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
# Prompt-enhancement models; the threshold is a word count — TODO confirm whether it enables above or below.
prompt_enhancement_words_threshold: 120
prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
stochastic_sampling: false
7 changes: 3 additions & 4 deletions configs/ltxv-2b-0.9.6-distilled.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
pipeline_type: base
checkpoint_path: "ltxv-2b-0.9.6-distilled-04-25.safetensors"
guidance_scale: 3
stg_scale: 1
rescaling_scale: 0.7
skip_block_list: [19]
guidance_scale: 1
stg_scale: 0
rescaling_scale: 1
num_inference_steps: 8
stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
decode_timestep: 0.05
Expand Down
17 changes: 17 additions & 0 deletions configs/ltxv-2b-0.9.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Inference configuration for the LTX-Video 2B v0.9 checkpoint.
pipeline_type: base
checkpoint_path: "ltx-video-2b-v0.9.safetensors"
# Guidance settings.
guidance_scale: 3
stg_scale: 1
rescaling_scale: 0.7
skip_block_list: [19]  # presumably block indices skipped by the stg_mode below — TODO confirm
num_inference_steps: 40
stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
decode_timestep: 0.05
decode_noise_scale: 0.025
text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
precision: "bfloat16"
sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
# Prompt-enhancement models; the threshold is a word count — TODO confirm whether it enables above or below.
prompt_enhancement_words_threshold: 120
prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
stochastic_sampling: false
18 changes: 8 additions & 10 deletions inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import json
import numpy as np
import torch
import cv2
from safetensors import safe_open
from PIL import Image
from transformers import (
Expand All @@ -35,6 +36,7 @@
from ltx_video.schedulers.rf import RectifiedFlowScheduler
from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
from ltx_video.models.autoencoders.latent_upsampler import LatentUpsampler
import ltx_video.pipelines.crf_compressor as crf_compressor

MAX_HEIGHT = 720
MAX_WIDTH = 1280
Expand Down Expand Up @@ -96,7 +98,12 @@ def load_image_to_tensor_with_resize_and_crop(
image = image.crop((x_start, y_start, x_start + new_width, y_start + new_height))
if not just_crop:
image = image.resize((target_width, target_height))
frame_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).float()

image = np.array(image)
image = cv2.GaussianBlur(image, (3, 3), 0)
frame_tensor = torch.from_numpy(image).float()
frame_tensor = crf_compressor.compress(frame_tensor / 255.0) * 255.0
frame_tensor = frame_tensor.permute(2, 0, 1)
frame_tensor = (frame_tensor / 127.5) - 1.0
# Create 5D tensor: (batch_size=1, channels=3, num_frames=1, height, width)
return frame_tensor.unsqueeze(0).unsqueeze(2)
Expand Down Expand Up @@ -266,13 +273,6 @@ def main():
help="Path to the input video (or imaage) to be modified using the video-to-video pipeline",
)

parser.add_argument(
"--strength",
type=float,
default=1.0,
help="Editing strength (noising level) for video-to-video pipeline.",
)

# Conditioning arguments
parser.add_argument(
"--conditioning_media_paths",
Expand Down Expand Up @@ -407,7 +407,6 @@ def infer(
negative_prompt: str,
offload_to_cpu: bool,
input_media_path: Optional[str] = None,
strength: Optional[float] = 1.0,
conditioning_media_paths: Optional[List[str]] = None,
conditioning_strengths: Optional[List[float]] = None,
conditioning_start_frames: Optional[List[int]] = None,
Expand Down Expand Up @@ -614,7 +613,6 @@ def infer(
frame_rate=frame_rate,
**sample,
media_items=media_item,
strength=strength,
conditioning_items=conditioning_items,
is_video=True,
vae_per_channel_normalize=True,
Expand Down
50 changes: 50 additions & 0 deletions ltx_video/pipelines/crf_compressor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import av
import torch
import io
import numpy as np


def _encode_single_frame(output_file, image_array: np.ndarray, crf):
    """Encode one RGB frame into *output_file* as a single-frame H.264/MP4.

    The frame is reformatted to yuv420p before encoding; ``crf`` is passed to
    x264 as the constant-rate-factor (higher means stronger compression).
    """
    height, width = image_array.shape[0], image_array.shape[1]
    # PyAV containers are context managers; __exit__ closes the container.
    with av.open(output_file, "w", format="mp4") as container:
        stream = container.add_stream(
            "libx264", rate=1, options={"crf": str(crf), "preset": "veryfast"}
        )
        stream.height = height
        stream.width = width
        frame = av.VideoFrame.from_ndarray(image_array, format="rgb24")
        frame = frame.reformat(format="yuv420p")
        container.mux(stream.encode(frame))
        container.mux(stream.encode())  # flush any buffered packets


def _decode_single_frame(video_file):
    """Decode the first video frame of *video_file* and return it as an RGB24 ndarray."""
    # PyAV containers are context managers; __exit__ closes the container.
    with av.open(video_file) as container:
        video_stream = next(s for s in container.streams if s.type == "video")
        first_frame = next(container.decode(video_stream))
    return first_frame.to_ndarray(format="rgb24")


def compress(image: torch.Tensor, crf=29):
    """Round-trip *image* through a single-frame H.264 encode/decode.

    Simulates video-compression artifacts on an (H, W, C) float tensor with
    values in [0, 1].  ``crf=0`` is a no-op passthrough that returns the input
    unchanged.  Otherwise H and W are truncated to even sizes (required for
    the yuv420p pixel format), so the output may be one pixel smaller per
    dimension than the input.  The result keeps the input's dtype and device.
    """
    if crf == 0:
        return image

    even_h = (image.shape[0] // 2) * 2
    even_w = (image.shape[1] // 2) * 2
    frame_uint8 = (image[:even_h, :even_w] * 255.0).byte().cpu().numpy()

    with io.BytesIO() as encoded:
        _encode_single_frame(encoded, frame_uint8, crf)
        video_bytes = encoded.getvalue()
    with io.BytesIO(video_bytes) as decoded_stream:
        decoded = _decode_single_frame(decoded_stream)

    return torch.tensor(decoded, dtype=image.dtype, device=image.device) / 255.0
Loading