# YAYD/pyproject.toml (StarDuster/YAYD, master branch)

[project]
# Distribution identity and descriptive metadata.
name = "yet-another-youdub-webui"
version = "0.1.0"
description = "Translate and dub YouTube videos to Chinese using AI"
readme = "README.md"
license = { text = "MIT" }
# Only Python 3.10 and 3.11 are accepted; the upper bound excludes 3.12 and later.
requires-python = ">=3.10,<3.12"
authors = [
    { name = "StarDuster" },
    { name = "liuzhao1225" },
]
keywords = [
    "youtube",
    "translation",
    "dubbing",
    "ai",
    "tts",
    "whisper",
]

# Core runtime dependencies, grouped by purpose.
# onnxruntime is intentionally not listed here; it is provided by the cpu/gpu extras.
dependencies = [
    # Web UI
    "gradio",
    "typer>=0.12,<1.0.0",

    # Logging
    "loguru>=0.7.0,<1.0.0",

    # Configuration
    "pydantic>=2.0.0,<3.0.0",
    "pydantic-settings>=2.0.0,<3.0.0",

    # Video downloading (use the nightly build to pick up the latest YouTube anti-scraping fixes)
    "yt-dlp[default,curl-cffi,secretstorage]>=2026.1.29",
    "curl_cffi",
    "yt-dlp-invidious",
    "bgutil-ytdlp-pot-provider",
    "google-genai>=1.50.0",

    # Audio processing
    "scipy>=1.14.0,<2.0.0",
    "librosa>=0.11.0,<0.12.0",
    "audiostretchy>=1.3.5",
    "soundfile",
    "numpy>=2.0.0,<3.0.0",
    "cmudict>=1.1.2",
    "pypinyin",

    # Vocal separation (offline / GPU-friendly)
    # Use demucs-infer (actively maintained inference-only Demucs fork).
    "demucs-infer>=4.1.2",

    # Speech recognition (offline / GPU-friendly)
    "faster-whisper>=1.2.1",
    # Use CTranslate2 >= 4.5.0 to align with cuDNN 9 (avoid cuDNN 8 split libs like libcudnn_ops_infer.so.8).
    "ctranslate2>=4.5.0,<5.0.0",
    # Qwen3-ASR (offline, transformers backend)
    "qwen-asr>=0.0.4",
    # faster-whisper's VAD uses onnxruntime. It is deliberately not listed here: it lives in the
    # cpu/gpu extras so you can choose the build.

    # Speaker diarization / speaker embedding (used by diarization + bytedance voice matching; optional at runtime)
    "pyannote.audio>=4.0.0,<5.0.0",

    # Local TTS (Qwen3-TTS / Qwen-Audio worker)
    # Pinned exactly; the metadata override in [tool.uv] dependency-metadata is keyed to this version.
    "qwen-tts==0.0.5",

    # Translation API
    "openai>=1.0.0,<2.0.0",
    "python-dotenv>=1.0.0,<2.0.0",
    "huggingface-hub",

    # Image processing
    "Pillow>=10.0.0,<11.0.0",

    # HTTP requests (for bytedance TTS)
    "requests>=2.31.0,<3.0.0",

    # Bilibili upload (no dependency is currently listed for this group)

    # PyTorch ecosystem. On Linux, torch and torchaudio are fetched from the PyTorch wheel index
    # selected in [tool.uv.sources]: CUDA 12.8 with the gpu extra, the CPU index otherwise.
    # Keep as a range so you can upgrade without editing this file again.
    "torch>=2.10.0,<3.0.0",
    "torchaudio>=2.10.0,<3.0.0",
    "torchcodec>=0.10.0",
]

[project.optional-dependencies]
# Contributor tooling: test runner, formatter and linter.
dev = [
    "pytest>=7.0.0",
    "black>=23.0.0",
    "ruff>=0.1.0",
]

# CPU runtime stack. This is the default runtime choice, but it is kept as an extra
# so it can be swapped cleanly for the GPU build below.
# Install with: `uv sync --extra cpu`
cpu = [
    "onnxruntime>=1.17.0,<1.24.0",
]

# Optional GPU acceleration stack (Linux only, per the platform markers).
# Install with: `uv sync --extra gpu`
gpu = [
    # cuDNN wheels: used by our cuDNN preloader (and helps onnxruntime/ctranslate2 GPU providers).
    "nvidia-cudnn-cu12; sys_platform == 'linux'",
    # GPU build of onnxruntime; avoid installing it together with the CPU `onnxruntime` package.
    "onnxruntime-gpu>=1.17.0,<1.24.0; sys_platform == 'linux'",
]

# Flash Attention: optional. It significantly speeds up Transformer inference but takes a
# long time to compile.
# Install with: `uv pip install flash-attn --no-build-isolation`
flash = [
    "flash-attn; sys_platform == 'linux'",
]

[project.scripts]
# Console entry point: the `youdub` command calls `main` from the `youdub.app` module.
youdub = "youdub.app:main"

[build-system]
# setuptools is the only declared build requirement. `wheel` is intentionally not listed:
# the setuptools documentation recommends against declaring it, because the backend
# requests it on its own when a wheel is built, and listing it explicitly also forces
# it onto source-distribution builds.
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
# src-layout package discovery: search under `src/` and include only packages
# whose names match `youdub*`.
where = ["src"]
include = ["youdub*"]

[tool.ruff]
# 120-column limit; target Python 3.10, the lowest version allowed by `requires-python`.
line-length = 120
target-version = "py310"

[tool.black]
# Same line length and target version as [tool.ruff], so the two tools agree.
line-length = 120
target-version = ["py310"]

# --- UV Package Manager Configuration ---

# PyTorch wheel indexes (CPU-only and CUDA 12.8 builds).
# Both are marked `explicit = true`; per uv's index documentation, an explicit index is
# only used for packages that are pinned to it in [tool.uv.sources].
[[tool.uv.index]]
name = "pytorch-cpu"
url = "https://download.pytorch.org/whl/cpu"
explicit = true

[[tool.uv.index]]
name = "pytorch-cu128"
url = "https://download.pytorch.org/whl/cu128"
explicit = true

[tool.uv]
# Allow pre-release versions (nightly builds) to pick up the latest YouTube anti-scraping fixes.
prerelease = "allow"

# faster-whisper / qwen-tts depend on `onnxruntime`. We declare the runtime explicitly in
# [project.optional-dependencies] (cpu/gpu extras), and patch upstream metadata to avoid forcing a specific onnxruntime wheel.
# NOTE(review): each override below is keyed to one exact version. qwen-tts is pinned to 0.0.5 in
# [project] dependencies, but faster-whisper is declared as ">=1.2.1"; a newer faster-whisper release
# would presumably not be covered by this patch — confirm when upgrading.
dependency-metadata = [
    # Patch faster-whisper to avoid forcing a bundled onnxruntime dependency.
    { name = "faster-whisper", version = "1.2.1", provides-extra = ["conversion", "dev"], requires-dist = [
        "ctranslate2<5,>=4.0",
        "huggingface-hub>=0.21",
        "tokenizers<1,>=0.13",
        "av>=11",
        "tqdm",
        "transformers[torch]>=4.23; extra == \"conversion\"",
        "black==23.*; extra == \"dev\"",
        "flake8==6.*; extra == \"dev\"",
        "isort==5.*; extra == \"dev\"",
        "pytest==7.*; extra == \"dev\"",
    ] },
    # Patch qwen-tts to avoid forcing a bundled onnxruntime dependency.
    { name = "qwen-tts", version = "0.0.5", requires-dist = [
        "transformers==4.57.6",
        "accelerate==1.12.0",
        "gradio",
        "librosa",
        "torchaudio",
        "soundfile",
        "sox",
        "einops",
    ] },
]

[tool.uv.sources]
# On Linux, torch and torchaudio come from the PyTorch wheel indexes declared in
# [[tool.uv.index]]: the CUDA 12.8 index when the `gpu` extra is requested, the CPU index
# otherwise. Neither marker matches on other platforms, so presumably the default index
# is used there — confirm.
# NOTE(review): torchcodec is also in [project] dependencies but has no entry here —
# confirm that resolving it from the default index is intended.
torch = [
    { index = "pytorch-cu128", marker = "sys_platform == 'linux' and extra == 'gpu'" },
    { index = "pytorch-cpu", marker = "sys_platform == 'linux' and extra != 'gpu'" },
]
torchaudio = [
    { index = "pytorch-cu128", marker = "sys_platform == 'linux' and extra == 'gpu'" },
    { index = "pytorch-cpu", marker = "sys_platform == 'linux' and extra != 'gpu'" },
]