From a4b9590b82fc538dcdcf489e52f805f656a43876 Mon Sep 17 00:00:00 2001
From: chennan <chennan@qiniu.com>
Date: Thu, 21 May 2026 09:48:23 +0800
Subject: [PATCH 1/3] tts doc & demo

---
 docs/develop/sound-generation/index.zh.md     |  22 ++
 .../sound-generation/tts-phase-1.zh.md        | 272 ++++++++++++++++++
 docs/openapi.yaml                             | 134 +++++++++
 spx-gui/src/apis/aigc.ts                      |  41 ++-
 spx-gui/src/apis/common/index.ts              |   2 +
 spx-gui/src/components/asset/gen/modal.ts     |  11 +
 .../asset/gen/sound/SoundGenModal.vue         |  87 ++++++
 .../src/components/asset/gen/sound/TTSGen.vue | 259 +++++++++++++++++
 spx-gui/src/components/asset/index.ts         |   8 +
 .../editor/stage/sound/AddSoundMenu.vue       |  25 +-
 spx-gui/src/models/spx/gen/aigc-mock.ts       |   7 +
 spx-gui/src/models/spx/gen/common.ts          |   3 +-
 spx-gui/src/models/spx/gen/sound-gen.ts       | 147 ++++++++++
 13 files changed, 1015 insertions(+), 3 deletions(-)
 create mode 100644 docs/develop/sound-generation/index.zh.md
 create mode 100644 docs/develop/sound-generation/tts-phase-1.zh.md
 create mode 100644 spx-gui/src/components/asset/gen/sound/SoundGenModal.vue
 create mode 100644 spx-gui/src/components/asset/gen/sound/TTSGen.vue
 create mode 100644 spx-gui/src/models/spx/gen/sound-gen.ts

diff --git a/docs/develop/sound-generation/index.zh.md b/docs/develop/sound-generation/index.zh.md
new file mode 100644
index 0000000000..2a3306147b
--- /dev/null
+++ b/docs/develop/sound-generation/index.zh.md
@@ -0,0 +1,22 @@
+# Sound 生成
+
+本文档用于承接 XBuilder 中 Sound 生成功能的开发设计与实现拆解。
+
+## 范围
+
+Sound 生成当前规划包含：
+
+- 语音合成（Text-to-Speech, TTS）
+- 音效（Sound Effect）
+- 背景音乐（Background Music）
+
+## 当前实现建议
+
+当前实现上建议 **TTS 优先**，并采用统一的 Sound 生成任务模型，为后续的音效与背景音乐预留扩展空间。
+
+- MVP 优先落地：阿里云 CosyVoice TTS
+- 后续扩展：同一套任务接口继续支持 Sound Effect / Background Music
+
+## 设计文档
+
+- Sound 一期 TTS 接口设计与 `builder-backend` 开发任务拆解：[`./tts-phase-1.zh.md`](tts-phase-1.zh.md)
diff --git a/docs/develop/sound-generation/tts-phase-1.zh.md b/docs/develop/sound-generation/tts-phase-1.zh.md
new file mode 100644
index 0000000000..404457e738
--- /dev/null
+++ b/docs/develop/sound-generation/tts-phase-1.zh.md
@@ -0,0 +1,272 @@
+# Sound 一期 TTS 接口设计
+
+本文档用于对齐 `builder`（前端）与 `builder-backend`（服务端）的 Sound 生成功能设计。
+
+当前方案基于阿里云 CosyVoice v3.5 的真实约束，核心分工如下：
+
+> **前端表达“我要哪类声音、以什么语气说这段话”**  
+> **服务端负责“使用哪个 target model / voice_id / instruction 去实现”**
+
+尤其需要注意：
+
+- `cosyvoice-v3.5-flash` / `cosyvoice-v3.5-plus` **没有系统音色**
+- 必须先通过**声音设计/复刻**生成 voice_id，后续语音合成时再把该 voice_id 作为 `voice` 参数使用
+- 模型本身通常可以根据 `text` 推断基础情绪和表达方式
+- 一期前端只选择 `性别 + 年龄段`，服务端为每个桶位固定维护一个默认音色
+- 前端更适合暴露开放的 `instruction` 作为补充说明，而不是固定枚举的 `emotion` / `useCase`
+- `rate` / `pitch` 虽然底层支持较大范围调节，但前端应收敛为少量合理档位，避免生成结果失真
+
+这样可以最大程度贴合 CosyVoice v3.5 的真实能力边界，同时保持前端心智简洁。
+
+---
+
+## 1. 设计结论
+
+### 1.1 前端暴露“易理解、可复用、不过度绑定供应商”的概念
+
+Sound TTS 的前端公开协议只保留这些字段：
+
+- `name`：生成后的素材名
+- `description`：素材描述
+- `category`：MVP 版本只有 `voice`（前端不暴露 category 选择，运行时固定传 voice）
+- `speechSettings`
+  - `text`：要说的话
+  - `voiceGender`：声音性别（男 / 女）
+  - `voiceAgeGroup`：声音年龄段（儿童 / 青年 / 中年 / 老年）
+  - `instruction`：补充“希望怎么说”“更偏什么感觉”“面向谁说”等开放信息
+  - `rate`：语速档位
+  - `pitch`：音调档位
+
+### 1.2 前端不包含这些底层字段
+
+以下字段都不应该成为前端协议的一部分：
+
+- `provider`
+- `model`
+- 供应商原始 `voice_id`
+- `format`
+- `sampleRate`
+
+原因：
+
+1. 这些都是服务端接供应商时的实现细节。
+2. 它们会把前端交互变成“调底层模型参数”。
+3. 未来更换供应商或调整模型时，不应要求前端同步改协议。
+
+### 1.3 服务端负责按分桶选择固定音色与参数映射
+
+服务端根据前端提交的请求参数自行决定：
+
+- 使用哪个 CosyVoice target model 和 `voice_id`
+- 如何组织最终调用参数
+- 输出什么格式和采样率
+
+---
+
+## 2. MVP 范围
+
+### 2.1 本期必须实现
+
+- 文本转语音（TTS）
+- 手动配置基础音色维度与合成参数
+- 异步生成任务创建 / 查询 / SSE 订阅 / 取消
+- 返回音频 URL
+- 前端将结果落为 XBuilder 的 `Sound` Asset
+
+### 2.2 本期明确不做
+
+- 音效生成（Sound Effect）
+- 背景音乐生成（Background Music）
+- 公开供应商底层参数，如原始 `voice_id`、连续数值级 `rate` / `pitch`
+- 公开 provider / model / `voice_id` 选择
+- 将生成结果自动入库到公共素材库
+
+---
+
+## 3. 核心原则
+
+1. **前端讲用户语言，后端讲供应商语言。**
+2. **前端只选性别与年龄桶位，不直接接触 voice_id。**
+3. **运行时暴露稳定的可控项：性别、年龄段、开放 `instruction`、有限档位的 `rate / pitch`。**
+4. **生成结果仍沿用现有 `/aigc/task` 异步任务体系。**
+5. **前端只保留手动选择，不做本地或服务端推荐。**
+
+---
+
+## 4. 数据模型
+
+### 4.1 基础音色维度（前端可见）
+
+```ts
+export type SoundVoiceGender = 'male' | 'female'
+
+export type SoundVoiceAgeGroup = 'child' | 'youth' | 'middle-aged' | 'senior'
+```
+
+说明：
+
+- 服务端内部按 `男 / 女 × 儿童 / 青年 / 中年 / 老年` 维护 8 个基础桶位
+- 每个桶位在一期只对应一个固定默认音色
+- 前端暴露的是桶位维度，不暴露供应商原始 `voice_id`
+- `rate / pitch` 虽然在协议里仍然使用 number，但前端只提供固定几档可选值
+- 这些固定选项由前端直接内置，不再单独提供“获取可选项”的接口
+- 前端所有可控项都由用户手动选择，不再引入推荐值
+
+### 4.2 生成请求（前端可见）
+
+```ts
+export type SpeechSoundSettings = {
+  name: string
+  description: string
+  category: 'voice'
+  speechSettings: {
+    text: string
+    voiceGender: SoundVoiceGender
+    voiceAgeGroup: SoundVoiceAgeGroup
+    instruction?: string
+    rate?: number
+    pitch?: number
+  }
+}
+```
+
+说明：
+
+- `voiceGender / voiceAgeGroup` 用于选定基础音色桶位
+- `text` 暂定限制为最多 200 个字符，避免长文本导致生成用时过长
+- `instruction` 限制为最多 50 个字符，不超过 CosyVoice 可接受的长度范围
+- 协议字段 `rate / pitch` 使用 number；前端通过固定档位映射到这些数值
+
+前端固定档位映射如下：
+
+| UI 档位 | `rate` | `pitch` |
+| --- | --- | --- |
+| 慢 / 低 | `0.8` | `0.85` |
+| 稍慢 / 稍低 | `0.9` | `0.95` |
+| 标准 | `1.0` | `1.0` |
+| 稍快 / 稍高 | `1.1` | `1.05` |
+| 快 / 高 | `1.2` | `1.15` |
+
+---
+
+## 5. 接口设计
+
+### 5.1 创建 Sound 生成任务
+
+继续沿用统一任务接口：
+
+```http
+POST /aigc/task
+Content-Type: application/json
+```
+
+#### Request
+
+```json
+{
+  "type": "generateSound",
+  "parameters": {
+    "settings": {
+      "name": "hero-hello",
+      "description": "主角的一句开心问候",
+      "category": "voice",
+      "speechSettings": {
+        "text": "你好，我们出发吧！",
+        "voiceGender": "female",
+        "voiceAgeGroup": "youth",
+        "instruction": "像在提醒队友准备出发，语气轻快一点",
+        "rate": 1.1,
+        "pitch": 1.05
+      }
+    }
+  }
+}
+```
+
+### 5.2 TaskResult 设计
+
+```ts
+export type TaskResultGenerateSound = {
+  audioUrl: string
+}
+```
+
+---
+
+## 6. 前端交互
+
+UI 顺序：
+
+1. 输入素材名称
+2. 输入要说的话
+3. 选择声音性别 / 年龄段
+4. 选择语速 / 音调档位
+5. 按需补充 `instruction`
+6. 点击生成
+7. 试听生成结果
+8. 采用到项目
+
+---
+
+## 7. 服务端开发建议
+
+### 任务 1：维护基础音色库存
+
+服务端维护一份基础音色库存，例如：
+
+```text
+builder-backend/internal/aigc/sound/base_voice_inventory.json
+```
+
+库存项可包含：
+
+- `target_model`
+- `voice_gender`
+- `voice_age_group`
+- `voice_id`
+
+其中：
+
+- `voice_id` 来自 CosyVoice 声音设计接口
+- `voice_gender + voice_age_group` 共同决定一个基础音色桶位
+
+建议：
+
+- 一期先按 `男 / 女 × 儿童 / 青年 / 中年 / 老年` 维护 8 个基础桶位
+- 每个桶位只维护一个默认基础音色，优先保证稳定、自然、通用
+- 如果后续某个桶位的默认音色无法覆盖主要场景，再考虑扩展该桶位的候选音色数量
+
+### 任务 2：处理 Sound 生成任务
+
+服务端收到请求后完成：
+
+1. 根据 `voiceGender + voiceAgeGroup` 选定基础音色桶位对应的固定 `voice_id`
+2. 根据 `text / instruction / rate / pitch` 等信息生成最终调用参数
+3. 调用供应商生成音频
+4. 上传对象存储并写回 task result
+
+例如：
+
+- `male + middle-aged` -> `cosyvoice-v3.5-plus` 的中年男声默认 `voice_id`
+- `female + youth` -> `cosyvoice-v3.5-plus` 的青年女声默认 `voice_id`
+
+然后再根据：
+
+- `text`
+- `instruction`
+- `rate`
+- `pitch`
+
+生成最终调用参数。默认值策略：
+
+- `instruction` 为空时，仅基于 `text` 做自然表达
+- `rate / pitch` 缺省时使用 `1.0`
+- 输出格式固定用 `mp3`
+- 其它参数不指定，使用 CosyVoice 默认值
+
+---
+
+## 8. 相关文档
+
+- [阿里百炼语音合成模型用户指南](https://help.aliyun.com/zh/model-studio/tts-model)
+- [CosyVoice 语音合成 API 参考](https://help.aliyun.com/zh/model-studio/non-realtime-cosyvoice-api)
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index fe2929c17c..ab3af9e28c 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -2676,6 +2676,7 @@ paths:
         | `generateAnimationVideo` | Generate animation video |
         | `extractVideoFrames` | Extract frames from video |
         | `generateBackdrop` | Generate backdrop image |
+        | `generateSound` | Generate sound asset (TTS in current MVP) |
 
         Quota and rate limits:
 
@@ -2831,6 +2832,13 @@ paths:
                           default: 1
                           examples:
                             - 4
+                    - title: generateSound
+                      type: object
+                      required:
+                        - settings
+                      properties:
+                        settings:
+                          $ref: "#/components/schemas/AIGCSpeechSoundSettings"
       responses:
         "202":
           description: Task accepted and queued for processing.
@@ -4708,6 +4716,7 @@ components:
             - generateAnimationVideo
             - extractVideoFrames
             - generateBackdrop
+            - generateSound
           examples:
             - removeBackground
         status:
@@ -4776,6 +4785,15 @@ components:
                   items:
                     type: string
                     format: uri
+            - title: generateSound
+              type: object
+              required:
+                - audioUrl
+              properties:
+                audioUrl:
+                  description: Universal URL of the generated audio file (e.g. kodo://bucket/key).
+                  type: string
+                  format: uri
         error:
           description: Error details. Only available when status is failed.
           type: object
@@ -4943,6 +4961,122 @@ components:
                 - ui
                 - unspecified
 
+    AIGCSoundCategory:
+      description: Category of the generated sound asset.
+      type: string
+      enum:
+        - voice
+        - effect
+        - music
+        - ambience
+        - other
+      examples:
+        - voice
+
+    AIGCSoundVoiceGender:
+      description: Gender bucket of the generated voice.
+      type: string
+      enum:
+        - male
+        - female
+      examples:
+        - female
+
+    AIGCSoundVoiceAgeGroup:
+      description: Age bucket of the generated voice.
+      type: string
+      enum:
+        - child
+        - youth
+        - middle-aged
+        - senior
+      examples:
+        - middle-aged
+
+    AIGCSpeechSettings:
+      description: Speech-specific generation settings.
+      type: object
+      required:
+        - text
+        - voiceGender
+        - voiceAgeGroup
+      properties:
+        text:
+          description: Text to synthesize into speech.
+          type: string
+          minLength: 1
+          maxLength: 200
+          examples:
+            - 你好，我们出发吧！
+        voiceGender:
+          $ref: "#/components/schemas/AIGCSoundVoiceGender"
+        voiceAgeGroup:
+          $ref: "#/components/schemas/AIGCSoundVoiceAgeGroup"
+        instruction:
+          description: Optional free-form instruction describing how the speech should be delivered.
+          type: string
+          maxLength: 50
+          examples:
+            - 像在提醒队友准备出发，语气轻快一点
+        rate:
+          description: Optional speech-rate multiplier passed to synthesis. `1.0` means natural speed.
+          type: number
+          format: float
+          minimum: 0.5
+          maximum: 2.0
+          examples:
+            - 1.1
+        pitch:
+          description: Optional pitch multiplier passed to synthesis. `1.0` means natural pitch.
+          type: number
+          format: float
+          minimum: 0.5
+          maximum: 2.0
+          examples:
+            - 1.05
+
+    AIGCSoundBaseSettings:
+      description: Base settings shared by all sound-generation requests.
+      type: object
+      required:
+        - name
+        - description
+        - category
+      properties:
+        name:
+          description: Name of the resulting Sound asset.
+          type: string
+          examples:
+            - hero-hello
+        description:
+          description: Description of the resulting Sound asset.
+          type: string
+          examples:
+            - 主角的一句开心问候
+        category:
+          $ref: "#/components/schemas/AIGCSoundCategory"
+
+    AIGCSpeechSoundSettings:
+      description: |
+        Settings for generating a speech sound asset.
+
+        This is the only supported sound-generation request type in the current MVP.
+      allOf:
+        - $ref: "#/components/schemas/AIGCSoundBaseSettings"
+        - type: object
+          required:
+            - speechSettings
+          properties:
+            category:
+              description: Category of the resulting Sound asset. Fixed to `voice` for speech generation.
+              type: string
+              enum:
+                - voice
+              examples:
+                - voice
+            speechSettings:
+              $ref: "#/components/schemas/AIGCSpeechSettings"
+
     UpInfo:
       description: Upload credentials and configuration.
       type: object
diff --git a/spx-gui/src/apis/aigc.ts b/spx-gui/src/apis/aigc.ts
index aca88b1d52..631447897d 100644
--- a/spx-gui/src/apis/aigc.ts
+++ b/spx-gui/src/apis/aigc.ts
@@ -9,6 +9,7 @@ import {
   client,
   type FileCollection,
   Perspective,
+  SoundCategory,
   SpriteCategory,
   type UniversalUrl
 } from './common'
@@ -65,12 +66,40 @@ export type BackdropSettings = {
   perspective: Perspective
 }
 
+export type SoundVoiceGender = 'male' | 'female'
+
+export type SoundVoiceAgeGroup = 'child' | 'youth' | 'middle-aged' | 'senior'
+
+export type SoundSettingsBase = {
+  name: string
+  description: string
+  category: SoundCategory
+}
+
+export type SpeechSettings = {
+  text: string
+  voiceGender: SoundVoiceGender
+  voiceAgeGroup: SoundVoiceAgeGroup
+  instruction?: string
+  rate?: number
+  pitch?: number
+}
+
+export type SpeechSoundSettings = SoundSettingsBase & {
+  category: SoundCategory.Voice
+  speechSettings: SpeechSettings
+}
+
+// TODO: support more sound types in the future
+export type SoundSettings = SpeechSoundSettings
+
 export const enum TaskType {
   RemoveBackground = 'removeBackground',
   GenerateCostume = 'generateCostume',
   GenerateAnimationVideo = 'generateAnimationVideo',
   ExtractVideoFrames = 'extractVideoFrames',
-  GenerateBackdrop = 'generateBackdrop'
+  GenerateBackdrop = 'generateBackdrop',
+  GenerateSound = 'generateSound'
 }
 
 export const enum TaskStatus {
@@ -120,12 +149,17 @@ export type TaskResultGenerateBackdrop = {
   imageUrls: UniversalUrl[]
 }
 
+export type TaskResultGenerateSound = {
+  audioUrl: UniversalUrl
+}
+
 export type TaskResult<T extends TaskType = TaskType> = {
   [TaskType.RemoveBackground]: TaskResultRemoveBackground
   [TaskType.GenerateCostume]: TaskResultGenerateCostume
   [TaskType.GenerateAnimationVideo]: TaskResultGenerateAnimationVideo
   [TaskType.ExtractVideoFrames]: TaskResultExtractVideoFrames
   [TaskType.GenerateBackdrop]: TaskResultGenerateBackdrop
+  [TaskType.GenerateSound]: TaskResultGenerateSound
 }[T]
 
 export type Task<T extends TaskType = TaskType> = {
@@ -174,12 +208,17 @@ export type TaskParamsGenerateBackdrop = {
   n: number
 }
 
+export type TaskParamsGenerateSound = {
+  settings: SoundSettings
+}
+
 export type TaskParams<T extends TaskType = TaskType> = {
   [TaskType.RemoveBackground]: TaskParamsRemoveBackground
   [TaskType.GenerateCostume]: TaskParamsGenerateCostume
   [TaskType.GenerateAnimationVideo]: TaskParamsGenerateAnimationVideo
   [TaskType.ExtractVideoFrames]: TaskParamsExtractVideoFrames
   [TaskType.GenerateBackdrop]: TaskParamsGenerateBackdrop
+  [TaskType.GenerateSound]: TaskParamsGenerateSound
 }[T]
 
 export function createTask<T extends TaskType>(type: T, params: TaskParams<T>, signal?: AbortSignal): Promise<Task<T>> {
diff --git a/spx-gui/src/apis/common/index.ts b/spx-gui/src/apis/common/index.ts
index b94a728617..6add117101 100644
--- a/spx-gui/src/apis/common/index.ts
+++ b/spx-gui/src/apis/common/index.ts
@@ -88,6 +88,8 @@ export const enum BackdropCategory {
 }
 
 export const enum SoundCategory {
+  /** Voice sounds are spoken or narrated audio assets, such as TTS lines, narration, or dialogue clips. */
+  Voice = 'voice',
   /** Sound effects are audio elements that enhance the gaming experience by providing auditory feedback for actions, events, or interactions within the game. */
   Effect = 'effect',
   /** Music tracks are composed pieces that set the tone, mood, and atmosphere of the game, often playing in the background during gameplay or specific scenes. */
diff --git a/spx-gui/src/components/asset/gen/modal.ts b/spx-gui/src/components/asset/gen/modal.ts
index e38a705e59..57abaf4566 100644
--- a/spx-gui/src/components/asset/gen/modal.ts
+++ b/spx-gui/src/components/asset/gen/modal.ts
@@ -4,6 +4,7 @@ import { I18n } from '@/utils/i18n'
 import type { SpxProject } from '@/models/spx/project'
 import { SpriteGen } from '@/models/spx/gen/sprite-gen'
 import { BackdropGen } from '@/models/spx/gen/backdrop-gen'
+import { SoundGen } from '@/models/spx/gen/sound-gen'
 import type { AssetGenModel } from '@/models/spx/common/asset'
 
 export interface GenHelpers {
@@ -41,3 +42,13 @@ export function initBackdropGen(i18n: I18n, project: SpxProject, onCleanup: OnCl
   })
   return g
 }
+
+/** Init a sound-gen instance for asset generation modals, and handle its lifecycle properly. */
+export function initSoundGen(onCleanup: OnCleanup) {
+  const g = new SoundGen()
+  onCleanup(() => {
+    g.cancel()
+    g.dispose()
+  })
+  return g
+}
diff --git a/spx-gui/src/components/asset/gen/sound/SoundGenModal.vue b/spx-gui/src/components/asset/gen/sound/SoundGenModal.vue
new file mode 100644
index 0000000000..e3dce49475
--- /dev/null
+++ b/spx-gui/src/components/asset/gen/sound/SoundGenModal.vue
@@ -0,0 +1,87 @@
+<script setup lang="ts">
+import { ref } from 'vue'
+import { UITab, UITabs, UIModal, UIModalClose, useConfirmDialog } from '@/components/ui'
+import { useI18n } from '@/utils/i18n'
+import { useWatchResult } from '@/utils/utils'
+import { useMessageHandle } from '@/utils/exception'
+import type { SpxProject } from '@/models/spx/project'
+import type { Sound } from '@/models/spx/sound'
+import { initSoundGen } from '../modal'
+import TTSGenComp from './TTSGen.vue'
+
+const props = defineProps<{
+  visible: boolean
+  project: SpxProject
+}>()
+
+const emit = defineEmits<{
+  resolved: [Sound]
+  cancelled: []
+}>()
+
+const i18n = useI18n()
+const confirm = useConfirmDialog()
+
+type SoundGenerationType = 'tts'
+
+const selectedType = ref<SoundGenerationType>('tts')
+
+const generationTypes: Array<{ value: SoundGenerationType; label: { zh: string; en: string } }> = [
+  {
+    value: 'tts',
+    label: { zh: '语音合成', en: 'Text to Speech' }
+  }
+]
+
+const gen = useWatchResult(
+  () => props.project,
+  (_project, onCleanup) => initSoundGen(onCleanup)
+)
+
+const handleModalClose = useMessageHandle(
+  async () => {
+    await confirm({
+      title: i18n.t({ zh: '退出声音生成？', en: 'Exit sound generation?' }),
+      content: i18n.t({
+        zh: '当前内容不会被保存，确定要退出吗？',
+        en: 'Current progress will not be saved. Are you sure to exit?'
+      }),
+      confirmText: i18n.t({ en: 'Exit', zh: '退出' })
+    })
+    emit('cancelled')
+  },
+  { en: 'Failed to exit modal', zh: '退出失败' }
+).fn
+</script>
+
+<template>
+  <UIModal
+    :radar="{ name: 'Sound generation modal', desc: 'Modal for sound generation' }"
+    style="width: 960px; height: 720px"
+    :visible="visible"
+    mask-closable
+    @update:visible="handleModalClose"
+  >
+    <header class="flex-none h-14 flex items-center justify-between border-b border-grey-400 px-6">
+      <h2 class="text-xl text-title">{{ $t({ zh: '生成声音', en: 'Sound Generator' }) }}</h2>
+      <UIModalClose class="close" @click="handleModalClose" />
+    </header>
+
+    <div class="flex-[1_1_0] min-h-0 px-6 py-5 flex flex-col">
+      <div class="flex-auto overflow-hidden border border-grey-400 rounded-md flex flex-col">
+        <UITabs v-model:value="selectedType" class="border-b border-grey-400">
+          <UITab v-for="type in generationTypes" :key="type.value" :value="type.value">
+            {{ $t(type.label) }}
+          </UITab>
+        </UITabs>
+
+        <TTSGenComp
+          v-if="gen != null && selectedType === 'tts'"
+          class="flex-auto"
+          :gen="gen"
+          @resolved="emit('resolved', $event)"
+        />
+      </div>
+    </div>
+  </UIModal>
+</template>
diff --git a/spx-gui/src/components/asset/gen/sound/TTSGen.vue b/spx-gui/src/components/asset/gen/sound/TTSGen.vue
new file mode 100644
index 0000000000..5c44a67029
--- /dev/null
+++ b/spx-gui/src/components/asset/gen/sound/TTSGen.vue
@@ -0,0 +1,259 @@
+<script setup lang="ts">
+import { computed } from 'vue'
+import { UIButton, UIButtonGroup, UIButtonGroupItem, UITextInput } from '@/components/ui'
+import { type SoundVoiceAgeGroup, type SoundVoiceGender } from '@/apis/aigc'
+import type { Sound } from '@/models/spx/sound'
+import type { SoundGen } from '@/models/spx/gen/sound-gen'
+import { capture, useMessageHandle } from '@/utils/exception'
+import { useFileUrl } from '@/utils/file'
+import SoundPlayer from '@/components/editor/stage/sound/SoundPlayer.vue'
+import type { LocaleMessage } from '@/utils/i18n'
+
+const props = defineProps<{
+  gen: SoundGen
+}>()
+
+const emit = defineEmits<{
+  resolved: [Sound]
+}>()
+
+type SelectOption<T> = {
+  value: T
+  label: LocaleMessage
+}
+
+const maxSpeechTextLength = 200
+const maxInstructionLength = 50
+
+const voiceGenderOptions: Array<SelectOption<SoundVoiceGender>> = [
+  { value: 'male', label: { en: 'Male', zh: '男声' } },
+  { value: 'female', label: { en: 'Female', zh: '女声' } }
+]
+
+const voiceAgeGroupOptions: Array<SelectOption<SoundVoiceAgeGroup>> = [
+  { value: 'child', label: { en: 'Child', zh: '儿童' } },
+  { value: 'youth', label: { en: 'Youth', zh: '青年' } },
+  { value: 'middle-aged', label: { en: 'Middle-aged', zh: '中年' } },
+  { value: 'senior', label: { en: 'Senior', zh: '老年' } }
+]
+
+const presetRateOptions: Array<SelectOption<number>> = [
+  { value: 0.8, label: { en: 'Slow', zh: '慢' } },
+  { value: 0.9, label: { en: 'Slightly slow', zh: '稍慢' } },
+  { value: 1.0, label: { en: 'Standard', zh: '标准' } },
+  { value: 1.1, label: { en: 'Slightly fast', zh: '稍快' } },
+  { value: 1.2, label: { en: 'Fast', zh: '快' } }
+]
+const presetPitchOptions: Array<SelectOption<number>> = [
+  { value: 0.85, label: { en: 'Low', zh: '低' } },
+  { value: 0.95, label: { en: 'Slightly low', zh: '稍低' } },
+  { value: 1.0, label: { en: 'Standard', zh: '标准' } },
+  { value: 1.05, label: { en: 'Slightly high', zh: '稍高' } },
+  { value: 1.15, label: { en: 'High', zh: '高' } }
+]
+
+const [resultSrc] = useFileUrl(() => props.gen.result?.file ?? null)
+
+const canGenerate = computed(
+  () =>
+    props.gen.generateState.status !== 'running' &&
+    props.gen.settings.name.trim() !== '' &&
+    props.gen.settings.speechSettings.text.trim() !== ''
+)
+
+const handleGenerate = useMessageHandle(() => props.gen.generate(), {
+  en: 'Failed to generate sound',
+  zh: '生成声音失败'
+})
+
+const handleUse = useMessageHandle(
+  async () => {
+    const sound = props.gen.result
+    if (sound == null) throw new Error('generated sound expected')
+    props.gen.recordAdoption().catch((err) => {
+      capture(err, 'failed to record sound asset adoption')
+    })
+    emit('resolved', sound)
+  },
+  {
+    en: 'Failed to use sound',
+    zh: '采用声音失败'
+  }
+)
+</script>
+
+<template>
+  <main
+    v-radar="{ name: 'TTS generation', desc: 'Interface for generating speech with text to speech' }"
+    class="flex flex-col min-h-0"
+  >
+    <section class="flex-1 min-h-0 px-6 py-5 overflow-y-auto">
+      <div class="grid gap-5 max-w-5xl">
+        <div class="grid gap-2">
+          <label class="text-base font-semibold">{{ $t({ en: 'Name', zh: '声音名称' }) }}</label>
+          <UITextInput
+            :value="gen.settings.name"
+            :placeholder="$t({ en: 'e.g. greeting', zh: '例如：问候' })"
+            @update:value="gen.setSettings({ name: $event })"
+          />
+        </div>
+
+        <div class="grid gap-2">
+          <div class="flex items-center justify-between">
+            <label class="text-base font-semibold">{{ $t({ en: 'Text', zh: '文本' }) }}</label>
+            <span class="text-xs text-grey-700">
+              {{ gen.settings.speechSettings.text.length }}/{{ maxSpeechTextLength }}
+            </span>
+          </div>
+          <UITextInput
+            type="textarea"
+            :rows="3"
+            :value="gen.settings.speechSettings.text"
+            :placeholder="$t({ en: 'Enter the speech text to synthesize', zh: '输入要合成的语音文本' })"
+            @update:value="gen.setSettings({ speechSettings: { text: $event.slice(0, maxSpeechTextLength) } })"
+          />
+        </div>
+
+        <div class="grid gap-2">
+          <label class="text-base font-semibold">{{ $t({ en: 'Speech settings', zh: '声音设定' }) }}</label>
+          <div class="flex flex-col gap-3">
+            <div class="flex items-start gap-4">
+              <label class="pt-1.5 text-sm text-grey-800">{{ $t({ en: 'Gender', zh: '性别' }) }}</label>
+              <UIButtonGroup
+                :value="gen.settings.speechSettings.voiceGender"
+                @update:value="
+                  (value) => gen.setSettings({ speechSettings: { voiceGender: value as SoundVoiceGender } })
+                "
+              >
+                <UIButtonGroupItem
+                  v-for="option in voiceGenderOptions"
+                  :key="option.value"
+                  :value="option.value"
+                  class="w-auto px-3 text-sm whitespace-nowrap"
+                >
+                  {{ $t(option.label) }}
+                </UIButtonGroupItem>
+              </UIButtonGroup>
+              <label class="ml-4 pt-1.5 text-sm text-grey-800">{{ $t({ en: 'Age', zh: '年龄' }) }}</label>
+              <UIButtonGroup
+                :value="gen.settings.speechSettings.voiceAgeGroup"
+                class="max-w-full"
+                @update:value="
+                  (value) => gen.setSettings({ speechSettings: { voiceAgeGroup: value as SoundVoiceAgeGroup } })
+                "
+              >
+                <UIButtonGroupItem
+                  v-for="option in voiceAgeGroupOptions"
+                  :key="option.value"
+                  :value="option.value"
+                  class="w-auto px-3 text-sm whitespace-nowrap"
+                >
+                  {{ $t(option.label) }}
+                </UIButtonGroupItem>
+              </UIButtonGroup>
+            </div>
+            <div class="flex items-start gap-4">
+              <label class="pt-1.5 text-sm text-grey-800">{{ $t({ en: 'Speed', zh: '语速' }) }}</label>
+              <UIButtonGroup
+                :value="String(gen.settings.speechSettings.rate ?? 1.0)"
+                class="max-w-full"
+                @update:value="(value) => gen.setSettings({ speechSettings: { rate: Number(value) } })"
+              >
+                <UIButtonGroupItem
+                  v-for="option in presetRateOptions"
+                  :key="option.value"
+                  :value="String(option.value)"
+                  class="w-auto px-3 text-sm whitespace-nowrap"
+                >
+                  {{ $t(option.label) }}
+                </UIButtonGroupItem>
+              </UIButtonGroup>
+            </div>
+            <div class="flex items-start gap-4">
+              <label class="pt-1.5 text-sm text-grey-800">{{ $t({ en: 'Pitch', zh: '音调' }) }}</label>
+              <UIButtonGroup
+                :value="String(gen.settings.speechSettings.pitch ?? 1.0)"
+                class="max-w-full"
+                @update:value="(value) => gen.setSettings({ speechSettings: { pitch: Number(value) } })"
+              >
+                <UIButtonGroupItem
+                  v-for="option in presetPitchOptions"
+                  :key="option.value"
+                  :value="String(option.value)"
+                  class="w-auto px-3 text-sm whitespace-nowrap"
+                >
+                  {{ $t(option.label) }}
+                </UIButtonGroupItem>
+              </UIButtonGroup>
+            </div>
+            <div class="flex flex-col gap-2">
+              <div class="flex items-center justify-between">
+                <label class="text-sm text-grey-800">{{ $t({ en: 'Instruction', zh: '补充说明' }) }}</label>
+                <span class="text-xs text-grey-700">
+                  {{ gen.settings.speechSettings.instruction?.length ?? 0 }}/{{ maxInstructionLength }}
+                </span>
+              </div>
+              <UITextInput
+                type="textarea"
+                :rows="2"
+                :value="gen.settings.speechSettings.instruction ?? ''"
+                :placeholder="
+                  $t({
+                    en: 'Add extra information such as role, emotion, context, speaking style, etc. For example: Lively and playful, with a clear smile.',
+                    zh: '可补充角色、情绪、语境、说话风格等信息，例如：语气要显得活泼俏皮，带着明显的笑意，让声音听起来充满朝气与阳光。'
+                  })
+                "
+                @update:value="
+                  gen.setSettings({ speechSettings: { instruction: $event.slice(0, maxInstructionLength) } })
+                "
+              />
+            </div>
+          </div>
+        </div>
+      </div>
+    </section>
+
+    <footer class="flex-none border-t border-grey-400 px-6 py-4 flex items-center justify-between gap-6">
+      <div class="flex-auto flex items-center gap-3">
+        <template v-if="resultSrc != null && gen.result != null">
+          <SoundPlayer :src="resultSrc" />
+          <label class="text-base font-medium"> &lt;- {{ $t({ en: 'Click to preview', zh: '点击试听' }) }}</label>
+        </template>
+        <div v-else class="text-sm text-grey-700">
+          {{
+            $t({
+              en: 'Generate once to preview the synthesized sound here.',
+              zh: '点击生成后，可在这里试听合成结果。'
+            })
+          }}
+        </div>
+      </div>
+
+      <div class="flex-none flex items-center gap-3">
+        <UIButton
+          type="neutral"
+          size="large"
+          :disabled="!canGenerate"
+          :loading="handleGenerate.isLoading.value"
+          @click="handleGenerate.fn"
+        >
+          {{
+            $t({
+              en: gen.generateState.status === 'initial' ? 'Generate' : 'Regenerate',
+              zh: gen.generateState.status === 'initial' ? '生成' : '重新生成'
+            })
+          }}
+        </UIButton>
+        <UIButton
+          type="primary"
+          size="large"
+          :disabled="gen.result == null"
+          :loading="handleUse.isLoading.value"
+          @click="handleUse.fn"
+        >
+          {{ $t({ en: 'Use', zh: '采用' }) }}
+        </UIButton>
+      </div>
+    </footer>
+  </main>
+</template>
diff --git a/spx-gui/src/components/asset/index.ts b/spx-gui/src/components/asset/index.ts
index 715914d908..a05bdfd145 100644
--- a/spx-gui/src/components/asset/index.ts
+++ b/spx-gui/src/components/asset/index.ts
@@ -34,6 +34,7 @@ import GroupCostumesModal from './animation/GroupCostumesModal.vue'
 import AssetLibraryManagementModal from './library/management/AssetLibraryManagementModal.vue'
 import SpriteGenModal from './gen/sprite/SpriteGenModal.vue'
 import BackdropGenModal from './gen/backdrop/BackdropGenModal.vue'
+import SoundGenModal from './gen/sound/SoundGenModal.vue'
 import type { GenHelpers } from './gen/modal'
 
 export function useSpriteGenModal() {
@@ -51,6 +52,13 @@ export function useBackdropGenModal() {
   }
 }
 
+export function useSoundGenModal() {
+  const invokeModal = useModal(SoundGenModal)
+  return function invokeSoundGenModal(project: SpxProject) {
+    return invokeModal({ project })
+  }
+}
+
 export function useAddAssetFromLibrary() {
   const editorCtx = useEditorCtx()
   const genHelpers = useGenHelpers()
diff --git a/spx-gui/src/components/editor/stage/sound/AddSoundMenu.vue b/spx-gui/src/components/editor/stage/sound/AddSoundMenu.vue
index a4f968826a..1ddc979e0e 100644
--- a/spx-gui/src/components/editor/stage/sound/AddSoundMenu.vue
+++ b/spx-gui/src/components/editor/stage/sound/AddSoundMenu.vue
@@ -15,6 +15,9 @@
     <UIMenuItem v-radar="{ name: 'Record sound', desc: 'Click to record a new sound' }" @click="handleRecord">
       {{ $t({ en: 'Record', zh: '录音' }) }}
     </UIMenuItem>
+    <UIMenuItem v-radar="{ name: 'Generate sound', desc: 'Click to generate a sound with AI' }" @click="handleGenerate">
+      {{ $t({ en: 'Generate with AI', zh: '使用 AI 生成' }) }}
+    </UIMenuItem>
   </UIMenu>
 </template>
 
@@ -22,7 +25,12 @@
 import { UIMenu, UIMenuItem } from '@/components/ui'
 import { AssetType } from '@/apis/asset'
 import { useMessageHandle } from '@/utils/exception'
-import { useAddAssetFromLibrary, useAddSoundFromLocalFile, useAddSoundByRecording } from '@/components/asset'
+import {
+  useAddAssetFromLibrary,
+  useAddSoundFromLocalFile,
+  useAddSoundByRecording,
+  useSoundGenModal
+} from '@/components/asset'
 import { useEditorCtx } from '../../EditorContextProvider.vue'
 import type { SoundsEditorState } from './sounds-editor-state'
 
@@ -67,4 +75,19 @@ const handleRecord = useMessageHandle(
     zh: '录音失败'
   }
 ).fn
+
+const invokeSoundGenModal = useSoundGenModal()
+const handleGenerate = useMessageHandle( // Menu handler: obtain a sound from the generation modal and add it to the project
+  async () => {
+    const sound = await invokeSoundGenModal(editorCtx.project) // Awaits the sound the modal resolves with
+    await editorCtx.state.history.doAction({ name: { en: 'Add sound', zh: '添加声音' } }, () => {
+      editorCtx.project.addSound(sound) // Performed inside the 'Add sound' history action
+    })
+    props.state.select(sound.id) // Select the newly added sound in the sounds editor state
+  },
+  {
+    en: 'Failed to generate sound',
+    zh: '生成声音失败'
+  }
+).fn
 </script>
diff --git a/spx-gui/src/models/spx/gen/aigc-mock.ts b/spx-gui/src/models/spx/gen/aigc-mock.ts
index b068acf3ec..235d4c06b9 100644
--- a/spx-gui/src/models/spx/gen/aigc-mock.ts
+++ b/spx-gui/src/models/spx/gen/aigc-mock.ts
@@ -471,6 +471,13 @@ export class MockAigcApis {
           imageUrls: this.range(p.n).map((i) => this.url(`backdrop-${name}-${i + 1}.png`))
         } as TaskResult<T>
       }
+      case TaskType.GenerateSound: {
+        const p = params as TaskParams<TaskType.GenerateSound>
+        const name = this.sanitize(p.settings.name)
+        return {
+          audioUrl: this.url(`sound-${name}.mp3`)
+        } as TaskResult<T>
+      }
       default:
         throw new Error(`unsupported task type: ${type as string}`)
     }
diff --git a/spx-gui/src/models/spx/gen/common.ts b/spx-gui/src/models/spx/gen/common.ts
index 7d493b5054..ce72590af4 100644
--- a/spx-gui/src/models/spx/gen/common.ts
+++ b/spx-gui/src/models/spx/gen/common.ts
@@ -220,7 +220,8 @@ export const taskDurations: Record<TaskType, number> = {
   [TaskType.GenerateCostume]: 15,
   [TaskType.GenerateAnimationVideo]: 180,
   [TaskType.ExtractVideoFrames]: 12,
-  [TaskType.GenerateBackdrop]: 15
+  [TaskType.GenerateBackdrop]: 15,
+  [TaskType.GenerateSound]: 5
 }
 
 export type TaskApis = Pick<typeof aigcApis, 'createTask' | 'cancelTask' | 'subscribeTaskEvents'>
diff --git a/spx-gui/src/models/spx/gen/sound-gen.ts b/spx-gui/src/models/spx/gen/sound-gen.ts
new file mode 100644
index 0000000000..50ef4a0d05
--- /dev/null
+++ b/spx-gui/src/models/spx/gen/sound-gen.ts
@@ -0,0 +1,147 @@
+import { nanoid } from 'nanoid'
+import { reactive } from 'vue'
+import { SoundCategory } from '@/apis/common'
+import {
+  adoptAsset,
+  TaskStatus,
+  TaskType,
+  type SpeechSoundSettings,
+  type SpeechSettings,
+  type TaskResultGenerateSound
+} from '@/apis/aigc'
+import { Disposable } from '@/utils/disposable'
+import { createFileWithUniversalUrl } from '../../common/cloud'
+import { ensureValidSoundName, validateSoundName, type SoundLikeParent } from '../common/asset-name'
+import { sound2Asset } from '../common/asset'
+import { Sound } from '../sound'
+import { Phase, Task } from './common'
+
+export type SoundGenInits = { // Optional initial state for a SoundGen; each field has a fallback in the constructor
+  id?: string // Falls back to a generated nanoid
+  generateTask?: Task<TaskType.GenerateSound> | null // Falls back to null
+  generatePhase?: Phase<Sound> // Falls back to a fresh Phase
+}
+
+type GenerateSpeechSettingsUpdates = Partial<Omit<SpeechSoundSettings, 'speechSettings'>> & {
+  speechSettings?: Partial<SpeechSettings> // Nested settings may be given partially as well
+} // Shape of the partial update accepted by SoundGen.setSettings
+
+export class SoundGen extends Disposable { // Reactive model driving one AI speech-sound generation
+  id: string // From inits, otherwise a fresh nanoid
+  settings: SpeechSoundSettings // Name, description, category and speech settings of the sound to generate
+  private generateTask: Task<TaskType.GenerateSound> | null // Latest generate task, or null when none is tracked
+  private generatePhase: Phase<Sound> // Run state of the generation; its finished result is the Sound
+
+  constructor(inits: SoundGenInits = {}) {
+    super()
+    this.id = inits.id ?? nanoid()
+    this.settings = { // Settings always start from these defaults; inits carries none
+      name: '',
+      description: '',
+      category: SoundCategory.Voice,
+      speechSettings: {
+        text: '',
+        voiceGender: 'male',
+        voiceAgeGroup: 'youth',
+        instruction: '',
+        rate: 1,
+        pitch: 1
+      }
+    }
+    this.generateTask = inits.generateTask ?? null
+    this.generatePhase = inits.generatePhase ?? new Phase({ en: 'generate sound', zh: '生成声音' })
+    return reactive(this) as this // Hand out the Vue reactive proxy rather than the raw instance
+  }
+
+  private parent: SoundLikeParent | null = null // Passed to the name validators; null until setParent is called
+  setParent(parent: SoundLikeParent | null) {
+    this.parent = parent
+  }
+
+  get name() {
+    return this.settings.name
+  }
+  setName(name: string) { // Strict rename: throws when validateSoundName reports an error
+    const err = validateSoundName(name, this.parent)
+    if (err != null) throw new Error(`invalid name ${name}: ${err.en}`)
+    this.settings.name = name
+    this.result?.setName(name) // Keep an already generated sound in sync
+  }
+
+  setSettings(updates: GenerateSpeechSettingsUpdates) { // Merges a partial update into the current settings
+    if (updates.name != null && updates.name !== this.settings.name) {
+      updates = { ...updates, name: ensureValidSoundName(updates.name, this.parent) } // A changed name is first passed through ensureValidSoundName
+    }
+    const { speechSettings, ...rest } = updates // Nested speechSettings is merged separately so unspecified fields survive
+    Object.assign(this.settings, rest)
+    if (speechSettings != null) {
+      Object.assign(this.settings.speechSettings, speechSettings)
+    }
+    if (updates.name != null) this.result?.setName(updates.name) // Keep an already generated sound in sync
+  }
+
+  get generateState() {
+    return this.generatePhase.state
+  }
+
+  get result() { // The generated Sound once the phase is 'finished', otherwise null
+    return this.generatePhase.state.status === 'finished' ? this.generatePhase.state.result : null
+  }
+
+  reset() { // Forgets the tracked task and resets the phase; does not call tryCancel on the task
+    this.generateTask = null
+    this.generatePhase.reset()
+  }
+
+  async generate() {
+    return this.generatePhase.run(async (reporter) => {
+      this.generateTask?.tryCancel()
+      this.generateTask = new Task(TaskType.GenerateSound)
+      await this.generateTask.start({ settings: this.settings })
+      const taskResult = await this.generateTask.untilCompleted(reporter)
+      return this.createSound(taskResult)
+    })
+  }
+
+  private async createSound(taskResult: TaskResultGenerateSound) { // Builds a Sound from the task's audio URL and the current settings
+    const file = createFileWithUniversalUrl(taskResult.audioUrl)
+    const sound = await Sound.create(this.settings.name, file)
+    sound.setAssetMetadata({
+      description: this.settings.description,
+      extraSettings: {
+        category: this.settings.category
+      }
+    })
+    sound.setExtraConfig({ // Record the settings used and the result URL under `builder_soundGen`
+      builder_soundGen: {
+        ...this.settings,
+        result: {
+          audioUrl: taskResult.audioUrl
+        }
+      }
+    })
+    return sound
+  }
+
+  async recordAdoption() { // Reports the generated sound through adoptAsset; throws if there is no result yet
+    const sound = this.result
+    if (sound == null) throw new Error('result sound expected')
+    const taskIds = this.generateTask?.data?.status === TaskStatus.Completed ? [this.generateTask.data.id] : []
+    const assetData = await sound2Asset(sound)
+    return adoptAsset({
+      taskIds, // Empty unless the tracked task has completed
+      asset: {
+        ...assetData,
+        displayName: this.settings.name,
+        description: this.settings.description,
+        extraSettings: {
+          category: this.settings.category
+        }
+      }
+    })
+  }
+
+  cancel() { // Calls tryCancel on the tracked task, if any
+    return this.generateTask?.tryCancel()
+  }
+}

From 6e76b373f4106ab186ee4f79d6406aae150b37fd Mon Sep 17 00:00:00 2001
From: chennan <chennan@qiniu.com>
Date: Thu, 21 May 2026 09:49:19 +0800
Subject: [PATCH 2/3] remove rate & pitch

---
 .../sound-generation/tts-phase-1.zh.md        | 62 ++++--------------
 docs/openapi.yaml                             | 29 ++++-----
 spx-gui/src/apis/aigc.ts                      |  2 -
 .../src/components/asset/gen/sound/TTSGen.vue | 63 +++----------------
 spx-gui/src/models/spx/gen/sound-gen.ts       |  4 +-
 5 files changed, 33 insertions(+), 127 deletions(-)

diff --git a/docs/develop/sound-generation/tts-phase-1.zh.md b/docs/develop/sound-generation/tts-phase-1.zh.md
index 404457e738..80bbdf8021 100644
--- a/docs/develop/sound-generation/tts-phase-1.zh.md
+++ b/docs/develop/sound-generation/tts-phase-1.zh.md
@@ -1,4 +1,4 @@
-# Sound 一期 TTS 接口设计
+# Sound 一期 TTS 功能 & 接口设计
 
 本文档用于对齐 `builder`（前端）与 `builder-backend`（服务端）的 Sound 生成功能设计。
 
@@ -11,10 +11,9 @@
 
 - `cosyvoice-v3.5-flash` / `cosyvoice-v3.5-plus` **没有系统音色**
 - 必须先通过**声音设计/复刻**生成 voice_id，后续语音合成时再把该 voice_id 作为 `voice` 参数使用
-- 模型本身通常可以根据 `text` 推断基础情绪和表达方式
 - 一期前端只选择 `性别 + 年龄段`，服务端为每个桶位固定维护一个默认音色
-- 前端更适合暴露开放的 `instruction` 作为补充说明，而不是固定枚举的 `emotion` / `useCase`
-- `rate` / `pitch` 虽然底层支持较大范围调节，但前端应收敛为少量合理档位，避免生成结果失真
+- 模型本身通常可以根据 `text` 推断基础情绪和表达方式，可以附带可选的 `instruction` 作为补充说明，而不是用固定枚举的 `emotion` / `useCase` 等
+- 暂不开放 `rate / pitch / volume` 等参数，后续根据需求再评估是否增加“基础参数调整”功能
 
 这样可以最大程度贴合 CosyVoice v3.5 的真实能力边界，同时保持前端心智简洁。
 
@@ -34,8 +33,6 @@ Sound TTS 的前端公开协议只保留这些字段：
   - `voiceGender`：声音性别（男 / 女）
   - `voiceAgeGroup`：声音年龄段（儿童 / 青年 / 中年 / 老年）
   - `instruction`：补充“希望怎么说”“更偏什么感觉”“面向谁说”等开放信息
-  - `rate`：语速档位
-  - `pitch`：音调档位
 
 ### 1.2 前端不包含这些底层字段
 
@@ -77,8 +74,6 @@ Sound TTS 的前端公开协议只保留这些字段：
 
 - 音效生成（Sound Effect）
 - 背景音乐生成（Background Music）
-- 公开供应商底层参数，如原始 `voice_id`、连续数值级 `rate` / `pitch`
-- 公开 provider / model / voiceId 选择
 - 将生成结果自动入库到公共素材库
 
 ---
@@ -87,9 +82,8 @@ Sound TTS 的前端公开协议只保留这些字段：
 
 1. **前端讲用户语言，后端讲供应商语言。**
 2. **前端只选性别与年龄桶位，不直接接触 voice_id。**
-3. **运行时暴露稳定的可控项：性别、年龄段、开放 `instruction`、有限档位的 `rate / pitch`。**
-4. **生成结果仍沿用现有 `/aigc/task` 异步任务体系。**
-5. **前端只保留手动选择，不做本地或服务端推荐。**
+3. **前端只暴露最小化的选项：性别、年龄段、开放 `instruction`。**
+4. **生成结果沿用现有 `/aigc/tasks` 异步任务体系。**
 
 ---
 
@@ -108,7 +102,6 @@ export type SoundVoiceAgeGroup = 'child' | 'youth' | 'middle-aged' | 'senior'
 - 服务端内部按 `男 / 女 × 儿童 / 青年 / 中年 / 老年` 维护 8 个基础桶位
 - 每个桶位在一期只对应一个固定默认音色
 - 前端暴露的是桶位维度，不暴露供应商原始 `voice_id`
-- `rate / pitch` 虽然在协议里仍然使用 number，但前端只提供固定几档可选值
 - 这些固定选项由前端直接内置，不再单独提供“获取可选项”的接口
 - 前端所有可控项都由用户手动选择，不再引入推荐值
 
@@ -124,8 +117,6 @@ export type SpeechSoundSettings = {
     voiceGender: SoundVoiceGender
     voiceAgeGroup: SoundVoiceAgeGroup
     instruction?: string
-    rate?: number
-    pitch?: number
   }
 }
 ```
@@ -135,17 +126,6 @@ export type SpeechSoundSettings = {
 - `voiceGender / voiceAgeGroup` 用于选定基础音色桶位
 - `text` 暂定限制为最多 200 个字符，避免长文本导致生成用时过长
 - `instruction` 限制为最多 50 个字符，不超过 CosyVoice 可接受的长度范围
-- 协议字段 `rate / pitch` 使用 number；前端通过固定档位映射到这些数值
-
-前端固定档位映射如下：
-
-| UI 档位 | `rate` | `pitch` |
-| --- | --- | --- |
-| 低 / 慢 | `0.8` | `0.85` |
-| 稍低 / 稍慢 | `0.9` | `0.95` |
-| 标准 | `1.0` | `1.0` |
-| 稍高 / 稍快 | `1.1` | `1.05` |
-| 高 / 快 | `1.2` | `1.15` |
 
 ---
 
@@ -174,9 +154,7 @@ Content-Type: application/json
         "text": "你好，我们出发吧！",
         "voiceGender": "female",
         "voiceAgeGroup": "youth",
-        "instruction": "像在提醒队友准备出发，语气轻快一点",
-        "rate": 1.1,
-        "pitch": 1.05
+        "instruction": "像在提醒队友准备出发，语气轻快一点"
       }
     }
   }
@@ -200,11 +178,10 @@ UI 顺序：
 1. 输入素材名称
 2. 输入要说的话
 3. 选择声音性别 / 年龄段
-4. 选择语速 / 音调档位
-5. 按需补充 `instruction`
-6. 点击生成
-7. 试听生成结果
-8. 采用到项目
+4. 按需补充 `instruction`
+5. 点击生成
+6. 试听生成结果
+7. 采用到项目
 
 ---
 
@@ -227,7 +204,7 @@ builder-backend/internal/aigc/sound/base_voice_inventory.json
 
 其中：
 
-- `voice_id` 来自 CosyVoice 声音设计接口
+- `voice_id` 来自阿里百炼账号下已设计/复刻的可用于 CosyVoice `target_model` 的音色
 - `voice_gender + voice_age_group` 共同决定一个基础音色桶位
 
 建议：
@@ -241,26 +218,13 @@ builder-backend/internal/aigc/sound/base_voice_inventory.json
 服务端收到请求后完成：
 
 1. 根据 `voiceGender + voiceAgeGroup` 选定基础音色桶位对应的固定 `voice_id`
-2. 根据 `text / instruction / rate / pitch` 等信息生成最终调用参数
+2. 根据 `text / instruction` 等信息生成最终调用参数
 3. 调用供应商生成音频
 4. 上传对象存储并写回 task result
 
-例如：
-
-- `male + middle-aged` -> `cosyvoice-v3.5-plus` 的中年男声默认 `voice_id`
-- `female + youth` -> `cosyvoice-v3.5-plus` 的青年女声默认 `voice_id`
-
-然后再根据：
-
-- `text`
-- `instruction`
-- `rate`
-- `pitch`
-
-生成最终调用参数。默认值策略：
+调用参数默认值策略：
 
 - `instruction` 为空时，仅基于 `text` 做自然表达
-- `rate / pitch` 缺省时使用 `1.0`
 - 输出格式固定用 `mp3`
 - 其它参数不指定，使用 CosyVoice 默认值
 
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index ab3af9e28c..553d39e4d3 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -2832,7 +2832,18 @@ paths:
                           default: 1
                           examples:
                             - 4
-                    - title: generateSound
+                - title: generateSound
+                  type: object
+                  required:
+                    - type
+                    - parameters
+                  properties:
+                    type:
+                      type: string
+                      enum:
+                        - generateSound
+                    parameters:
+                      description: Parameters for speech sound generation.
                       type: object
                       required:
                         - settings
@@ -5018,22 +5029,6 @@ components:
           maxLength: 50
           examples:
             - 像在提醒队友准备出发，语气轻快一点
-        rate:
-          description: Optional speech-rate multiplier passed to synthesis. `1.0` means natural speed.
-          type: number
-          format: float
-          minimum: 0.5
-          maximum: 2.0
-          examples:
-            - 1.1
-        pitch:
-          description: Optional pitch multiplier passed to synthesis. `1.0` means natural pitch.
-          type: number
-          format: float
-          minimum: 0.5
-          maximum: 2.0
-          examples:
-            - 1.05
 
     AIGCSoundBaseSettings:
       description: Base settings shared by all sound-generation requests.
diff --git a/spx-gui/src/apis/aigc.ts b/spx-gui/src/apis/aigc.ts
index 631447897d..2f7f5acfdc 100644
--- a/spx-gui/src/apis/aigc.ts
+++ b/spx-gui/src/apis/aigc.ts
@@ -81,8 +81,6 @@ export type SpeechSettings = {
   voiceGender: SoundVoiceGender
   voiceAgeGroup: SoundVoiceAgeGroup
   instruction?: string
-  rate?: number
-  pitch?: number
 }
 
 export type SpeechSoundSettings = SoundSettingsBase & {
diff --git a/spx-gui/src/components/asset/gen/sound/TTSGen.vue b/spx-gui/src/components/asset/gen/sound/TTSGen.vue
index 5c44a67029..02d3d1ecd6 100644
--- a/spx-gui/src/components/asset/gen/sound/TTSGen.vue
+++ b/spx-gui/src/components/asset/gen/sound/TTSGen.vue
@@ -17,14 +17,14 @@ const emit = defineEmits<{
   resolved: [Sound]
 }>()
 
+const maxSpeechTextLength = 200
+const maxInstructionLength = 50
+
 type SelectOption<T> = {
   value: T
   label: LocaleMessage
 }
 
-const maxSpeechTextLength = 200
-const maxInstructionLength = 50
-
 const voiceGenderOptions: Array<SelectOption<SoundVoiceGender>> = [
   { value: 'male', label: { en: 'Male', zh: '男声' } },
   { value: 'female', label: { en: 'Female', zh: '女声' } }
@@ -37,21 +37,6 @@ const voiceAgeGroupOptions: Array<SelectOption<SoundVoiceAgeGroup>> = [
   { value: 'senior', label: { en: 'Senior', zh: '老年' } }
 ]
 
-const presetRateOptions: Array<SelectOption<number>> = [
-  { value: 0.8, label: { en: 'Slow', zh: '慢' } },
-  { value: 0.9, label: { en: 'Slightly slow', zh: '稍慢' } },
-  { value: 1.0, label: { en: 'Standard', zh: '标准' } },
-  { value: 1.1, label: { en: 'Slightly fast', zh: '稍快' } },
-  { value: 1.2, label: { en: 'Fast', zh: '快' } }
-]
-const presetPitchOptions: Array<SelectOption<number>> = [
-  { value: 0.85, label: { en: 'Low', zh: '低' } },
-  { value: 0.95, label: { en: 'Slightly low', zh: '稍低' } },
-  { value: 1.0, label: { en: 'Standard', zh: '标准' } },
-  { value: 1.05, label: { en: 'Slightly high', zh: '稍高' } },
-  { value: 1.15, label: { en: 'High', zh: '高' } }
-]
-
 const [resultSrc] = useFileUrl(() => props.gen.result?.file ?? null)
 
 const canGenerate = computed(
@@ -107,14 +92,14 @@ const handleUse = useMessageHandle(
           </div>
           <UITextInput
             type="textarea"
-            :rows="3"
+            :rows="5"
             :value="gen.settings.speechSettings.text"
             :placeholder="$t({ en: 'Enter the speech text to synthesize', zh: '输入要合成的语音文本' })"
             @update:value="gen.setSettings({ speechSettings: { text: $event.slice(0, maxSpeechTextLength) } })"
           />
         </div>
 
-        <div class="grid gap-2">
+        <div class="grid gap-3">
           <label class="text-base font-semibold">{{ $t({ en: 'Speech settings', zh: '声音设定' }) }}</label>
           <div class="flex flex-col gap-3">
             <div class="flex items-start gap-4">
@@ -152,40 +137,6 @@ const handleUse = useMessageHandle(
                 </UIButtonGroupItem>
               </UIButtonGroup>
             </div>
-            <div class="flex items-start gap-4">
-              <label class="pt-1.5 text-sm text-grey-800">{{ $t({ en: 'Speed', zh: '语速' }) }}</label>
-              <UIButtonGroup
-                :value="String(gen.settings.speechSettings.rate ?? 1.0)"
-                class="max-w-full"
-                @update:value="(value) => gen.setSettings({ speechSettings: { rate: Number(value) } })"
-              >
-                <UIButtonGroupItem
-                  v-for="option in presetRateOptions"
-                  :key="option.value"
-                  :value="String(option.value)"
-                  class="w-auto px-3 text-sm whitespace-nowrap"
-                >
-                  {{ $t(option.label) }}
-                </UIButtonGroupItem>
-              </UIButtonGroup>
-            </div>
-            <div class="flex items-start gap-4">
-              <label class="pt-1.5 text-sm text-grey-800">{{ $t({ en: 'Pitch', zh: '音调' }) }}</label>
-              <UIButtonGroup
-                :value="String(gen.settings.speechSettings.pitch ?? 1.0)"
-                class="max-w-full"
-                @update:value="(value) => gen.setSettings({ speechSettings: { pitch: Number(value) } })"
-              >
-                <UIButtonGroupItem
-                  v-for="option in presetPitchOptions"
-                  :key="option.value"
-                  :value="String(option.value)"
-                  class="w-auto px-3 text-sm whitespace-nowrap"
-                >
-                  {{ $t(option.label) }}
-                </UIButtonGroupItem>
-              </UIButtonGroup>
-            </div>
             <div class="flex flex-col gap-2">
               <div class="flex items-center justify-between">
                 <label class="text-sm text-grey-800">{{ $t({ en: 'Instruction', zh: '补充说明' }) }}</label>
@@ -195,12 +146,12 @@ const handleUse = useMessageHandle(
               </div>
               <UITextInput
                 type="textarea"
-                :rows="2"
+                :rows="3"
                 :value="gen.settings.speechSettings.instruction ?? ''"
                 :placeholder="
                   $t({
                     en: 'Add extra information such as role, emotion, context, speaking style, etc. For example: The tone should be lively and playful, with a clear smile, making the voice sound full of energy and sunshine.',
-                    zh: '可补充角色、情绪、语境、说话风格等信息，例如：语气要显得活泼俏皮，带着明显的笑意，让声音听起来充满朝气与阳光。'
+                    zh: '可描述角色、情绪、语境、说话风格等信息，例如：语气要显得活泼俏皮，带着明显的笑意，让声音听起来充满朝气与阳光。'
                   })
                 "
                 @update:value="
diff --git a/spx-gui/src/models/spx/gen/sound-gen.ts b/spx-gui/src/models/spx/gen/sound-gen.ts
index 50ef4a0d05..556ca88fbe 100644
--- a/spx-gui/src/models/spx/gen/sound-gen.ts
+++ b/spx-gui/src/models/spx/gen/sound-gen.ts
@@ -43,9 +43,7 @@ export class SoundGen extends Disposable {
         text: '',
         voiceGender: 'male',
         voiceAgeGroup: 'youth',
-        instruction: '',
-        rate: 1,
-        pitch: 1
+        instruction: ''
       }
     }
     this.generateTask = inits.generateTask ?? null

From 0f687d0a7f892f5e3b89d2c14250aecc41f91aaf Mon Sep 17 00:00:00 2001
From: chennan <chennan@qiniu.com>
Date: Thu, 21 May 2026 11:32:34 +0800
Subject: [PATCH 3/3] details

---
 .../sound-generation/tts-phase-1.zh.md        |  2 +-
 docs/openapi.yaml                             |  6 +++---
 .../src/components/asset/gen/sound/TTSGen.vue | 19 ++++++++++++-------
 spx-gui/src/models/spx/gen/sound-gen.ts       | 10 ++++++----
 4 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/docs/develop/sound-generation/tts-phase-1.zh.md b/docs/develop/sound-generation/tts-phase-1.zh.md
index 80bbdf8021..a98c24a5c4 100644
--- a/docs/develop/sound-generation/tts-phase-1.zh.md
+++ b/docs/develop/sound-generation/tts-phase-1.zh.md
@@ -136,7 +136,7 @@ export type SpeechSoundSettings = {
 继续沿用统一任务接口：
 
 ```http
-POST /aigc/task
+POST /aigc/tasks
 Content-Type: application/json
 ```
 
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index 553d39e4d3..3610bc6948 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -4972,8 +4972,8 @@ components:
                 - ui
                 - unspecified
 
-    AIGCSoundCategory:
-      description: Category of the generated sound asset.
+    SoundCategory:
+      description: Category of sound asset.
       type: string
       enum:
         - voice
@@ -5049,7 +5049,7 @@ components:
           examples:
             - 主角的一句开心问候
         category:
-          $ref: "#/components/schemas/AIGCSoundCategory"
+          $ref: "#/components/schemas/SoundCategory"
 
     AIGCSpeechSoundSettings:
       description: |
diff --git a/spx-gui/src/components/asset/gen/sound/TTSGen.vue b/spx-gui/src/components/asset/gen/sound/TTSGen.vue
index 02d3d1ecd6..352b00024a 100644
--- a/spx-gui/src/components/asset/gen/sound/TTSGen.vue
+++ b/spx-gui/src/components/asset/gen/sound/TTSGen.vue
@@ -65,6 +65,16 @@ const handleUse = useMessageHandle(
     zh: '采用声音失败'
   }
 )
+
+const submitText = computed(() => { // Localized label for the generate button, derived from the generation state
+  if (props.gen.generateState.status === 'running') {
+    return { en: 'Generating...', zh: '生成中...' }
+  }
+  if (props.gen.result == null) { // No generated sound is available yet
+    return { en: 'Generate', zh: '生成' }
+  }
+  return { en: 'Regenerate', zh: '重新生成' }
+})
 </script>
 
 <template>
@@ -78,7 +88,7 @@ const handleUse = useMessageHandle(
           <label class="text-base font-semibold">{{ $t({ en: 'Name', zh: '声音名称' }) }}</label>
           <UITextInput
             :value="gen.settings.name"
-            :placeholder="$t({ en: 'e.g. greeting', zh: '例如：问侯' })"
+            :placeholder="$t({ en: 'e.g. greeting', zh: '例如：问候' })"
             @update:value="gen.setSettings({ name: $event })"
           />
         </div>
@@ -188,12 +198,7 @@ const handleUse = useMessageHandle(
           :loading="handleGenerate.isLoading.value"
           @click="handleGenerate.fn"
         >
-          {{
-            $t({
-              en: gen.generateState.status === 'initial' ? 'Generate' : 'Regenerate',
-              zh: gen.generateState.status === 'initial' ? '生成' : '重新生成'
-            })
-          }}
+          {{ $t(submitText) }}
         </UIButton>
         <UIButton
           type="primary"
diff --git a/spx-gui/src/models/spx/gen/sound-gen.ts b/spx-gui/src/models/spx/gen/sound-gen.ts
index 556ca88fbe..9f755a9775 100644
--- a/spx-gui/src/models/spx/gen/sound-gen.ts
+++ b/spx-gui/src/models/spx/gen/sound-gen.ts
@@ -93,10 +93,12 @@ export class SoundGen extends Disposable {
 
   async generate() {
     return this.generatePhase.run(async (reporter) => {
-      this.generateTask?.tryCancel()
-      this.generateTask = new Task(TaskType.GenerateSound)
-      await this.generateTask.start({ settings: this.settings })
-      const taskResult = await this.generateTask.untilCompleted(reporter)
+      // Keep a local handle: `reset()` may null `this.generateTask` while we await.
+      this.generateTask?.tryCancel()
+      this.generateTask = new Task(TaskType.GenerateSound)
+      const task = this.generateTask
+      await task.start({ settings: this.settings })
+      const taskResult = await task.untilCompleted(reporter)
       return this.createSound(taskResult)
     })
   }