From a4b9590b82fc538dcdcf489e52f805f656a43876 Mon Sep 17 00:00:00 2001 From: chennan Date: Thu, 21 May 2026 09:48:23 +0800 Subject: [PATCH 1/3] tts doc & demo --- docs/develop/sound-generation/index.zh.md | 22 ++ .../sound-generation/tts-phase-1.zh.md | 272 ++++++++++++++++++ docs/openapi.yaml | 134 +++++++++ spx-gui/src/apis/aigc.ts | 41 ++- spx-gui/src/apis/common/index.ts | 2 + spx-gui/src/components/asset/gen/modal.ts | 11 + .../asset/gen/sound/SoundGenModal.vue | 87 ++++++ .../src/components/asset/gen/sound/TTSGen.vue | 259 +++++++++++++++++ spx-gui/src/components/asset/index.ts | 8 + .../editor/stage/sound/AddSoundMenu.vue | 25 +- spx-gui/src/models/spx/gen/aigc-mock.ts | 7 + spx-gui/src/models/spx/gen/common.ts | 3 +- spx-gui/src/models/spx/gen/sound-gen.ts | 147 ++++++++++ 13 files changed, 1015 insertions(+), 3 deletions(-) create mode 100644 docs/develop/sound-generation/index.zh.md create mode 100644 docs/develop/sound-generation/tts-phase-1.zh.md create mode 100644 spx-gui/src/components/asset/gen/sound/SoundGenModal.vue create mode 100644 spx-gui/src/components/asset/gen/sound/TTSGen.vue create mode 100644 spx-gui/src/models/spx/gen/sound-gen.ts diff --git a/docs/develop/sound-generation/index.zh.md b/docs/develop/sound-generation/index.zh.md new file mode 100644 index 0000000000..2a3306147b --- /dev/null +++ b/docs/develop/sound-generation/index.zh.md @@ -0,0 +1,22 @@ +# Sound 生成 + +本文档用于承接 XBuilder 中 Sound 生成功能的开发设计与实现拆解。 + +## 范围 + +Sound 生成当前规划包含: + +- 语音合成(Text-to-Speech, TTS) +- 音效(Sound Effect) +- 背景音乐(Background Music) + +## 当前实现建议 + +当前实现上建议 **TTS 优先**,并以统一的 Sound 生成任务模型向后兼容音效与背景音乐。 + +- MVP 优先落地:阿里云 CosyVoice TTS +- 后续扩展:同一套任务接口继续支持 Sound Effect / Background Music + +## 设计文档 + +- Sound 一期 TTS 接口设计与 `builder-backend` 开发任务拆解:[`./tts-phase-1.zh.md`](tts-phase-1.zh.md) diff --git a/docs/develop/sound-generation/tts-phase-1.zh.md b/docs/develop/sound-generation/tts-phase-1.zh.md new file mode 100644 index 0000000000..404457e738 --- /dev/null +++ b/docs/develop/sound-generation/tts-phase-1.zh.md @@ -0,0 +1,272 @@ +# Sound 一期 TTS 接口设计 + +本文档用于对齐 `builder`(前端)与 `builder-backend`(服务端)的 Sound 生成功能设计。 + +当前方案基于阿里云 CosyVoice v3.5 的真实约束: + +> **前端表达“我要哪类声音、以什么语气说这段话”** +> **服务端负责“使用哪个 target model / voice_id / instruction 去实现”** + +尤其需要注意: + +- `cosyvoice-v3.5-flash` / `cosyvoice-v3.5-plus` **没有系统音色** +- 必须先通过**声音设计/复刻**生成 voice_id,后续语音合成时再把该 voice_id 作为 `voice` 参数使用 +- 模型本身通常可以根据 `text` 推断基础情绪和表达方式 +- 一期前端只选择 `性别 + 年龄段`,服务端为每个桶位固定维护一个默认音色 +- 前端更适合暴露开放的 `instruction` 作为补充说明,而不是固定枚举的 `emotion` / `useCase` +- `rate` / `pitch` 虽然底层支持较大范围调节,但前端应收敛为少量合理档位,避免生成结果失真 + +这样可以最大程度贴合 CosyVoice v3.5 的真实能力边界,同时保持前端心智简洁。 + +--- + +## 1. 设计结论 + +### 1.1 前端暴露“易理解、可复用、不过度绑定供应商”的概念 + +Sound TTS 的前端公开协议只保留这些字段: + +- `name`:生成后的素材名 +- `description`:素材描述 +- `category`:MVP 版本只有 `voice`(前端不暴露 category 选择,运行时固定传 voice) +- `speechSettings` + - `text`:要说的话 + - `voiceGender`:声音性别(男 / 女) + - `voiceAgeGroup`:声音年龄段(儿童 / 青年 / 中年 / 老年) + - `instruction`:补充“希望怎么说”“更偏什么感觉”“面向谁说”等开放信息 + - `rate`:语速档位 + - `pitch`:音调档位 + +### 1.2 前端不包含这些底层字段 + +以下字段都不应该成为前端协议的一部分: + +- `provider` +- `model` +- 供应商原始 `voice_id` +- `format` +- `sampleRate` + +原因: + +1. 这些都是服务端接供应商时的实现细节。 +2. 它们会把前端交互变成“调底层模型参数”。 +3. 未来更换供应商或调整模型时,不应要求前端同步改协议。 + +### 1.3 服务端负责按分桶选择固定音色与参数映射 + +服务端根据前端提交的请求参数自行决定: + +- 使用哪个 CosyVoice target model 和 `voice_id` +- 如何组织最终调用参数 +- 输出什么格式和采样率 + +--- + +## 2. MVP 范围 + +### 2.1 本期必须实现 + +- 文本转语音(TTS) +- 手动配置基础音色维度与合成参数 +- 异步生成任务创建 / 查询 / SSE 订阅 / 取消 +- 返回音频 URL +- 前端将结果落为 XBuilder 的 `Sound` Asset + +### 2.2 本期明确不做 + +- 音效生成(Sound Effect) +- 背景音乐生成(Background Music) +- 公开供应商底层参数,如原始 `voice_id`、连续数值级 `rate` / `pitch` +- 公开 provider / model / voiceId 选择 +- 将生成结果自动入库到公共素材库 + +--- + +## 3. 核心原则 + +1. **前端讲用户语言,后端讲供应商语言。** +2. **前端只选性别与年龄桶位,不直接接触 voice_id。** +3. **运行时暴露稳定的可控项:性别、年龄段、开放 `instruction`、有限档位的 `rate / pitch`。** +4. **生成结果仍沿用现有 `/aigc/task` 异步任务体系。** +5. **前端只保留手动选择,不做本地或服务端推荐。** + +--- + +## 4. 数据模型 + +### 4.1 基础音色维度(前端可见) + +```ts +export type SoundVoiceGender = 'male' | 'female' + +export type SoundVoiceAgeGroup = 'child' | 'youth' | 'middle-aged' | 'senior' +``` + +说明: + +- 服务端内部按 `男 / 女 × 儿童 / 青年 / 中年 / 老年` 维护 8 个基础桶位 +- 每个桶位在一期只对应一个固定默认音色 +- 前端暴露的是桶位维度,不暴露供应商原始 `voice_id` +- `rate / pitch` 虽然在协议里仍然使用 number,但前端只提供固定几档可选值 +- 这些固定选项由前端直接内置,不再单独提供“获取可选项”的接口 +- 前端所有可控项都由用户手动选择,不再引入推荐值 + +### 4.2 生成请求(前端可见) + +```ts +export type SpeechSoundSettings = { + name: string + description: string + category: 'voice' + speechSettings: { + text: string + voiceGender: SoundVoiceGender + voiceAgeGroup: SoundVoiceAgeGroup + instruction?: string + rate?: number + pitch?: number + } +} +``` + +说明: + +- `voiceGender / voiceAgeGroup` 用于选定基础音色桶位 +- `text` 暂定限制为最多 200 个字符,避免长文本导致生成用时过长 +- `instruction` 限制为最多 50 个字符,不超过 CosyVoice 可接受的长度范围 +- 协议字段 `rate / pitch` 使用 number;前端通过固定档位映射到这些数值 + +前端固定档位映射如下: + +| UI 档位 | `rate` | `pitch` | +| --- | --- | --- | +| 低 / 慢 | `0.8` | `0.85` | +| 稍低 / 稍慢 | `0.9` | `0.95` | +| 标准 | `1.0` | `1.0` | +| 稍高 / 稍快 | `1.1` | `1.05` | +| 高 / 快 | `1.2` | `1.15` | + +--- + +## 5. 接口设计 + +### 5.1 创建 Sound 生成任务 + +继续沿用统一任务接口: + +```http +POST /aigc/task +Content-Type: application/json +``` + +#### Request + +```json +{ + "type": "generateSound", + "parameters": { + "settings": { + "name": "hero-hello", + "description": "主角的一句开心问候", + "category": "voice", + "speechSettings": { + "text": "你好,我们出发吧!", + "voiceGender": "female", + "voiceAgeGroup": "youth", + "instruction": "像在提醒队友准备出发,语气轻快一点", + "rate": 1.1, + "pitch": 1.05 + } + } + } +} +``` + +### 5.2 TaskResult 设计 + +```ts +export type TaskResultGenerateSound = { + audioUrl: string +} +``` + +--- + +## 6. 前端交互 + +UI 顺序: + +1. 输入素材名称 +2. 输入要说的话 +3. 选择声音性别 / 年龄段 +4. 选择语速 / 音调档位 +5. 按需补充 `instruction` +6. 点击生成 +7. 试听生成结果 +8. 采用到项目 + +--- + +## 7. 服务端开发建议 + +### 任务 1:维护基础音色库存 + +服务端维护一份基础音色库存,例如: + +```text +builder-backend/internal/aigc/sound/base_voice_inventory.json +``` + +库存项可包含: + +- `target_model` +- `voice_gender` +- `voice_age_group` +- `voice_id` + +其中: + +- `voice_id` 来自 CosyVoice 声音设计接口 +- `voice_gender + voice_age_group` 共同决定一个基础音色桶位 + +建议: + +- 一期先按 `男 / 女 × 儿童 / 青年 / 中年 / 老年` 维护 8 个基础桶位 +- 每个桶位只维护一个默认基础音色,优先保证稳定、自然、通用 +- 如果后续某个桶位的默认音色无法覆盖主要场景,再考虑扩展该桶位的候选音色数量 + +### 任务 2:处理 Sound 生成任务 + +服务端收到请求后完成: + +1. 根据 `voiceGender + voiceAgeGroup` 选定基础音色桶位对应的固定 `voice_id` +2. 根据 `text / instruction / rate / pitch` 等信息生成最终调用参数 +3. 调用供应商生成音频 +4. 上传对象存储并写回 task result + +例如: + +- `male + middle-aged` -> `cosyvoice-v3.5-plus` 的中年男声默认 `voice_id` +- `female + youth` -> `cosyvoice-v3.5-plus` 的青年女声默认 `voice_id` + +然后再根据: + +- `text` +- `instruction` +- `rate` +- `pitch` + +生成最终调用参数。默认值策略: + +- `instruction` 为空时,仅基于 `text` 做自然表达 +- `rate / pitch` 缺省时使用 `1.0` +- 输出格式固定用 `mp3` +- 其它参数不指定,使用 CosyVoice 默认值 + +--- + +## 8. 相关文档 + +- [阿里百炼语音合成模型用户指南](https://help.aliyun.com/zh/model-studio/tts-model) +- [CosyVoice 语音合成 API 参考](https://help.aliyun.com/zh/model-studio/non-realtime-cosyvoice-api) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index fe2929c17c..ab3af9e28c 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -2676,6 +2676,7 @@ paths: | `generateAnimationVideo` | Generate animation video | | `extractVideoFrames` | Extract frames from video | | `generateBackdrop` | Generate backdrop image | + | `generateSound` | Generate sound asset (TTS in current MVP) | Quota and rate limits: @@ -2831,6 +2832,13 @@ paths: default: 1 examples: - 4 + - title: generateSound + type: object + required: + - settings + properties: + settings: + $ref: "#/components/schemas/AIGCSpeechSoundSettings" responses: "202": description: Task accepted and queued for processing. @@ -4708,6 +4716,7 @@ components: - generateAnimationVideo - extractVideoFrames - generateBackdrop + - generateSound examples: - removeBackground status: @@ -4776,6 +4785,15 @@ components: items: type: string format: uri + - title: generateSound + type: object + required: + - audioUrl + properties: + audioUrl: + description: Universal URL of the generated audio file (e.g. kodo://bucket/key). + type: string + format: uri error: description: Error details. Only available when status is failed. type: object @@ -4943,6 +4961,122 @@ components: - ui - unspecified + AIGCSoundCategory: + description: Category of the generated sound asset. + type: string + enum: + - voice + - effect + - music + - ambience + - other + examples: + - voice + + AIGCSoundVoiceGender: + description: Gender bucket of the generated voice. + type: string + enum: + - male + - female + examples: + - female + + AIGCSoundVoiceAgeGroup: + description: Age bucket of the generated voice. + type: string + enum: + - child + - youth + - middle-aged + - senior + examples: + - middle-aged + + AIGCSpeechSettings: + description: Speech-specific generation settings. + type: object + required: + - text + - voiceGender + - voiceAgeGroup + properties: + text: + description: Text to synthesize into speech. + type: string + minLength: 1 + maxLength: 200 + examples: + - 你好,我们出发吧! + voiceGender: + $ref: "#/components/schemas/AIGCSoundVoiceGender" + voiceAgeGroup: + $ref: "#/components/schemas/AIGCSoundVoiceAgeGroup" + instruction: + description: Optional free-form instruction describing how the speech should be delivered. + type: string + maxLength: 50 + examples: + - 像在提醒队友准备出发,语气轻快一点 + rate: + description: Optional speech-rate multiplier passed to synthesis. `1.0` means natural speed. + type: number + format: float + minimum: 0.5 + maximum: 2.0 + examples: + - 1.1 + pitch: + description: Optional pitch multiplier passed to synthesis. `1.0` means natural pitch. + type: number + format: float + minimum: 0.5 + maximum: 2.0 + examples: + - 1.05 + + AIGCSoundBaseSettings: + description: Base settings shared by all sound-generation requests. + type: object + required: + - name + - description + - category + properties: + name: + description: Name of the resulting Sound asset. + type: string + examples: + - hero-hello + description: + description: Description of the resulting Sound asset. + type: string + examples: + - 主角的一句开心问候 + category: + $ref: "#/components/schemas/AIGCSoundCategory" + + AIGCSpeechSoundSettings: + description: | + Settings for generating a speech sound asset. + + This is the only supported sound-generation request type in the current MVP. + allOf: + - $ref: "#/components/schemas/AIGCSoundBaseSettings" + - type: object + required: + - speechSettings + properties: + category: + description: Category of the resulting Sound asset. Fixed to `voice` for speech generation. + type: string + enum: + - voice + examples: + - voice + speechSettings: + $ref: "#/components/schemas/AIGCSpeechSettings" + UpInfo: description: Upload credentials and configuration. type: object diff --git a/spx-gui/src/apis/aigc.ts b/spx-gui/src/apis/aigc.ts index aca88b1d52..631447897d 100644 --- a/spx-gui/src/apis/aigc.ts +++ b/spx-gui/src/apis/aigc.ts @@ -9,6 +9,7 @@ import { client, type FileCollection, Perspective, + SoundCategory, SpriteCategory, type UniversalUrl } from './common' @@ -65,12 +66,40 @@ export type BackdropSettings = { perspective: Perspective } +export type SoundVoiceGender = 'male' | 'female' + +export type SoundVoiceAgeGroup = 'child' | 'youth' | 'middle-aged' | 'senior' + +export type SoundSettingsBase = { + name: string + description: string + category: SoundCategory +} + +export type SpeechSettings = { + text: string + voiceGender: SoundVoiceGender + voiceAgeGroup: SoundVoiceAgeGroup + instruction?: string + rate?: number + pitch?: number +} + +export type SpeechSoundSettings = SoundSettingsBase & { + category: SoundCategory.Voice + speechSettings: SpeechSettings +} + +// TODO: support more sound types in the future +export type SoundSettings = SpeechSoundSettings + export const enum TaskType { RemoveBackground = 'removeBackground', GenerateCostume = 'generateCostume', GenerateAnimationVideo = 'generateAnimationVideo', ExtractVideoFrames = 'extractVideoFrames', - GenerateBackdrop = 'generateBackdrop' + GenerateBackdrop = 'generateBackdrop', + GenerateSound = 'generateSound' } export const enum TaskStatus { @@ -120,12 +149,17 @@ export type TaskResultGenerateBackdrop = { imageUrls: UniversalUrl[] } +export type TaskResultGenerateSound = { + audioUrl: UniversalUrl +} + export type TaskResult = { [TaskType.RemoveBackground]: TaskResultRemoveBackground [TaskType.GenerateCostume]: TaskResultGenerateCostume [TaskType.GenerateAnimationVideo]: TaskResultGenerateAnimationVideo [TaskType.ExtractVideoFrames]: TaskResultExtractVideoFrames [TaskType.GenerateBackdrop]: TaskResultGenerateBackdrop + [TaskType.GenerateSound]: TaskResultGenerateSound }[T] export type Task = { @@ -174,12 +208,17 @@ export type TaskParamsGenerateBackdrop = { n: number } +export type TaskParamsGenerateSound = { + settings: SoundSettings +} + export type TaskParams = { [TaskType.RemoveBackground]: TaskParamsRemoveBackground [TaskType.GenerateCostume]: TaskParamsGenerateCostume [TaskType.GenerateAnimationVideo]: TaskParamsGenerateAnimationVideo [TaskType.ExtractVideoFrames]: TaskParamsExtractVideoFrames [TaskType.GenerateBackdrop]: TaskParamsGenerateBackdrop + [TaskType.GenerateSound]: TaskParamsGenerateSound }[T] export function createTask(type: T, params: TaskParams, signal?: AbortSignal): Promise> { diff --git a/spx-gui/src/apis/common/index.ts b/spx-gui/src/apis/common/index.ts index b94a728617..6add117101 100644 --- a/spx-gui/src/apis/common/index.ts +++ b/spx-gui/src/apis/common/index.ts @@ -88,6 +88,8 @@ export const enum BackdropCategory { } export const enum SoundCategory { + /** Voice sounds are spoken or narrated audio assets, such as TTS lines, narration, or dialogue clips. */ + Voice = 'voice', /** Sound effects are audio elements that enhance the gaming experience by providing auditory feedback for actions, events, or interactions within the game. */ Effect = 'effect', /** Music tracks are composed pieces that set the tone, mood, and atmosphere of the game, often playing in the background during gameplay or specific scenes. */ diff --git a/spx-gui/src/components/asset/gen/modal.ts b/spx-gui/src/components/asset/gen/modal.ts index e38a705e59..57abaf4566 100644 --- a/spx-gui/src/components/asset/gen/modal.ts +++ b/spx-gui/src/components/asset/gen/modal.ts @@ -4,6 +4,7 @@ import { I18n } from '@/utils/i18n' import type { SpxProject } from '@/models/spx/project' import { SpriteGen } from '@/models/spx/gen/sprite-gen' import { BackdropGen } from '@/models/spx/gen/backdrop-gen' +import { SoundGen } from '@/models/spx/gen/sound-gen' import type { AssetGenModel } from '@/models/spx/common/asset' export interface GenHelpers { @@ -41,3 +42,13 @@ export function initBackdropGen(i18n: I18n, project: SpxProject, onCleanup: OnCl }) return g } + +/** Init a sound-gen instance for asset generation modals, and handle its lifecycle properly. */ +export function initSoundGen(onCleanup: OnCleanup) { + const g = new SoundGen() + onCleanup(() => { + g.cancel() + g.dispose() + }) + return g +} diff --git a/spx-gui/src/components/asset/gen/sound/SoundGenModal.vue b/spx-gui/src/components/asset/gen/sound/SoundGenModal.vue new file mode 100644 index 0000000000..e3dce49475 --- /dev/null +++ b/spx-gui/src/components/asset/gen/sound/SoundGenModal.vue @@ -0,0 +1,87 @@ + + + diff --git a/spx-gui/src/components/asset/gen/sound/TTSGen.vue b/spx-gui/src/components/asset/gen/sound/TTSGen.vue new file mode 100644 index 0000000000..5c44a67029 --- /dev/null +++ b/spx-gui/src/components/asset/gen/sound/TTSGen.vue @@ -0,0 +1,259 @@ + + + diff --git a/spx-gui/src/components/asset/index.ts b/spx-gui/src/components/asset/index.ts index 715914d908..a05bdfd145 100644 --- a/spx-gui/src/components/asset/index.ts +++ b/spx-gui/src/components/asset/index.ts @@ -34,6 +34,7 @@ import GroupCostumesModal from './animation/GroupCostumesModal.vue' import AssetLibraryManagementModal from './library/management/AssetLibraryManagementModal.vue' import SpriteGenModal from './gen/sprite/SpriteGenModal.vue' import BackdropGenModal from './gen/backdrop/BackdropGenModal.vue' +import SoundGenModal from './gen/sound/SoundGenModal.vue' import type { GenHelpers } from './gen/modal' export function useSpriteGenModal() { @@ -51,6 +52,13 @@ export function useBackdropGenModal() { } } +export function useSoundGenModal() { + const invokeModal = useModal(SoundGenModal) + return function invokeSoundGenModal(project: SpxProject) { + return invokeModal({ project }) + } +} + export function useAddAssetFromLibrary() { const editorCtx = useEditorCtx() const genHelpers = useGenHelpers() diff --git a/spx-gui/src/components/editor/stage/sound/AddSoundMenu.vue b/spx-gui/src/components/editor/stage/sound/AddSoundMenu.vue index a4f968826a..1ddc979e0e 100644 --- a/spx-gui/src/components/editor/stage/sound/AddSoundMenu.vue +++ b/spx-gui/src/components/editor/stage/sound/AddSoundMenu.vue @@ -15,6 +15,9 @@ {{ $t({ en: 'Record', zh: '录音' }) }} + + {{ $t({ en: 'Generate with AI', zh: '使用 AI 生成' }) }} + @@ -22,7 +25,12 @@ import { UIMenu, UIMenuItem } from '@/components/ui' import { AssetType } from '@/apis/asset' import { useMessageHandle } from '@/utils/exception' -import { useAddAssetFromLibrary, useAddSoundFromLocalFile, useAddSoundByRecording } from '@/components/asset' +import { + useAddAssetFromLibrary, + useAddSoundFromLocalFile, + useAddSoundByRecording, + useSoundGenModal +} from '@/components/asset' import { useEditorCtx } from '../../EditorContextProvider.vue' import type { SoundsEditorState } from './sounds-editor-state' @@ -67,4 +75,19 @@ const handleRecord = useMessageHandle( zh: '录音失败' } ).fn + +const invokeSoundGenModal = useSoundGenModal() +const handleGenerate = useMessageHandle( + async () => { + const sound = await invokeSoundGenModal(editorCtx.project) + await editorCtx.state.history.doAction({ name: { en: 'Add sound', zh: '添加声音' } }, () => { + editorCtx.project.addSound(sound) + }) + props.state.select(sound.id) + }, + { + en: 'Failed to generate sound', + zh: '生成声音失败' + } +).fn diff --git a/spx-gui/src/models/spx/gen/aigc-mock.ts b/spx-gui/src/models/spx/gen/aigc-mock.ts index b068acf3ec..235d4c06b9 100644 --- a/spx-gui/src/models/spx/gen/aigc-mock.ts +++ b/spx-gui/src/models/spx/gen/aigc-mock.ts @@ -471,6 +471,13 @@ export class MockAigcApis { imageUrls: this.range(p.n).map((i) => this.url(`backdrop-${name}-${i + 1}.png`)) } as TaskResult } + case TaskType.GenerateSound: { + const p = params as TaskParams + const name = this.sanitize(p.settings.name) + return { + audioUrl: this.url(`sound-${name}.mp3`) + } as TaskResult + } default: throw new Error(`unsupported task type: ${type as string}`) } diff --git a/spx-gui/src/models/spx/gen/common.ts b/spx-gui/src/models/spx/gen/common.ts index 7d493b5054..ce72590af4 100644 --- a/spx-gui/src/models/spx/gen/common.ts +++ b/spx-gui/src/models/spx/gen/common.ts @@ -220,7 +220,8 @@ export const taskDurations: Record = { [TaskType.GenerateCostume]: 15, [TaskType.GenerateAnimationVideo]: 180, [TaskType.ExtractVideoFrames]: 12, - [TaskType.GenerateBackdrop]: 15 + [TaskType.GenerateBackdrop]: 15, + [TaskType.GenerateSound]: 5 } export type TaskApis = Pick diff --git a/spx-gui/src/models/spx/gen/sound-gen.ts b/spx-gui/src/models/spx/gen/sound-gen.ts new file mode 100644 index 0000000000..50ef4a0d05 --- /dev/null +++ b/spx-gui/src/models/spx/gen/sound-gen.ts @@ -0,0 +1,147 @@ +import { nanoid } from 'nanoid' +import { reactive } from 'vue' +import { SoundCategory } from '@/apis/common' +import { + adoptAsset, + TaskStatus, + TaskType, + type SpeechSoundSettings, + type SpeechSettings, + type TaskResultGenerateSound +} from '@/apis/aigc' +import { Disposable } from '@/utils/disposable' +import { createFileWithUniversalUrl } from '../../common/cloud' +import { ensureValidSoundName, validateSoundName, type SoundLikeParent } from '../common/asset-name' +import { sound2Asset } from '../common/asset' +import { Sound } from '../sound' +import { Phase, Task } from './common' + +export type SoundGenInits = { + id?: string + generateTask?: Task | null + generatePhase?: Phase +} + +type GenerateSpeechSettingsUpdates = Partial> & { + speechSettings?: Partial +} + +export class SoundGen extends Disposable { + id: string + settings: SpeechSoundSettings + private generateTask: Task | null + private generatePhase: Phase + + constructor(inits: SoundGenInits = {}) { + super() + this.id = inits.id ?? nanoid() + this.settings = { + name: '', + description: '', + category: SoundCategory.Voice, + speechSettings: { + text: '', + voiceGender: 'male', + voiceAgeGroup: 'youth', + instruction: '', + rate: 1, + pitch: 1 + } + } + this.generateTask = inits.generateTask ?? null + this.generatePhase = inits.generatePhase ?? new Phase({ en: 'generate sound', zh: '生成声音' }) + return reactive(this) as this + } + + private parent: SoundLikeParent | null = null + setParent(parent: SoundLikeParent | null) { + this.parent = parent + } + + get name() { + return this.settings.name + } + setName(name: string) { + const err = validateSoundName(name, this.parent) + if (err != null) throw new Error(`invalid name ${name}: ${err.en}`) + this.settings.name = name + this.result?.setName(name) + } + + setSettings(updates: GenerateSpeechSettingsUpdates) { + if (updates.name != null && updates.name !== this.settings.name) { + updates = { ...updates, name: ensureValidSoundName(updates.name, this.parent) } + } + const { speechSettings, ...rest } = updates + Object.assign(this.settings, rest) + if (speechSettings != null) { + Object.assign(this.settings.speechSettings, speechSettings) + } + if (updates.name != null) this.result?.setName(updates.name) + } + + get generateState() { + return this.generatePhase.state + } + + get result() { + return this.generatePhase.state.status === 'finished' ? this.generatePhase.state.result : null + } + + reset() { + this.generateTask = null + this.generatePhase.reset() + } + + async generate() { + return this.generatePhase.run(async (reporter) => { + this.generateTask?.tryCancel() + this.generateTask = new Task(TaskType.GenerateSound) + await this.generateTask.start({ settings: this.settings }) + const taskResult = await this.generateTask.untilCompleted(reporter) + return this.createSound(taskResult) + }) + } + + private async createSound(taskResult: TaskResultGenerateSound) { + const file = createFileWithUniversalUrl(taskResult.audioUrl) + const sound = await Sound.create(this.settings.name, file) + sound.setAssetMetadata({ + description: this.settings.description, + extraSettings: { + category: this.settings.category + } + }) + sound.setExtraConfig({ + builder_soundGen: { + ...this.settings, + result: { + audioUrl: taskResult.audioUrl + } + } + }) + return sound + } + + async recordAdoption() { + const sound = this.result + if (sound == null) throw new Error('result sound expected') + const taskIds = this.generateTask?.data?.status === TaskStatus.Completed ? [this.generateTask.data.id] : [] + const assetData = await sound2Asset(sound) + return adoptAsset({ + taskIds, + asset: { + ...assetData, + displayName: this.settings.name, + description: this.settings.description, + extraSettings: { + category: this.settings.category + } + } + }) + } + + cancel() { + return this.generateTask?.tryCancel() + } +} From 6e76b373f4106ab186ee4f79d6406aae150b37fd Mon Sep 17 00:00:00 2001 From: chennan Date: Thu, 21 May 2026 09:49:19 +0800 Subject: [PATCH 2/3] remove rate & pitch --- .../sound-generation/tts-phase-1.zh.md | 62 ++++-------------- docs/openapi.yaml | 29 ++++----- spx-gui/src/apis/aigc.ts | 2 - .../src/components/asset/gen/sound/TTSGen.vue | 63 +++---------------- spx-gui/src/models/spx/gen/sound-gen.ts | 4 +- 5 files changed, 33 insertions(+), 127 deletions(-) diff --git a/docs/develop/sound-generation/tts-phase-1.zh.md b/docs/develop/sound-generation/tts-phase-1.zh.md index 404457e738..80bbdf8021 100644 --- a/docs/develop/sound-generation/tts-phase-1.zh.md +++ b/docs/develop/sound-generation/tts-phase-1.zh.md @@ -1,4 +1,4 @@ -# Sound 一期 TTS 接口设计 +# Sound 一期 TTS 功能 & 接口设计 本文档用于对齐 `builder`(前端)与 `builder-backend`(服务端)的 Sound 生成功能设计。 @@ -11,10 +11,9 @@ - `cosyvoice-v3.5-flash` / `cosyvoice-v3.5-plus` **没有系统音色** - 必须先通过**声音设计/复刻**生成 voice_id,后续语音合成时再把该 voice_id 作为 `voice` 参数使用 -- 模型本身通常可以根据 `text` 推断基础情绪和表达方式 - 一期前端只选择 `性别 + 年龄段`,服务端为每个桶位固定维护一个默认音色 -- 前端更适合暴露开放的 `instruction` 作为补充说明,而不是固定枚举的 `emotion` / `useCase` -- `rate` / `pitch` 虽然底层支持较大范围调节,但前端应收敛为少量合理档位,避免生成结果失真 +- 模型本身通常可以根据 `text` 推断基础情绪和表达方式,可以附带可选的 `instruction` 作为补充说明,而不是用固定枚举的 `emotion` / `useCase` 等 +- 暂不开放 `rate / pitch / volume` 等参数,后续根据需求再评估是否增加“基础参数调整”功能 这样可以最大程度贴合 CosyVoice v3.5 的真实能力边界,同时保持前端心智简洁。 @@ -34,8 +33,6 @@ Sound TTS 的前端公开协议只保留这些字段: - `voiceGender`:声音性别(男 / 女) - `voiceAgeGroup`:声音年龄段(儿童 / 青年 / 中年 / 老年) - `instruction`:补充“希望怎么说”“更偏什么感觉”“面向谁说”等开放信息 - - `rate`:语速档位 - - `pitch`:音调档位 ### 1.2 前端不包含这些底层字段 @@ -77,8 +74,6 @@ Sound TTS 的前端公开协议只保留这些字段: - 音效生成(Sound Effect) - 背景音乐生成(Background Music) -- 公开供应商底层参数,如原始 `voice_id`、连续数值级 `rate` / `pitch` -- 公开 provider / model / voiceId 选择 - 将生成结果自动入库到公共素材库 --- @@ -87,9 +82,8 @@ Sound TTS 的前端公开协议只保留这些字段: 1. **前端讲用户语言,后端讲供应商语言。** 2. **前端只选性别与年龄桶位,不直接接触 voice_id。** -3. **运行时暴露稳定的可控项:性别、年龄段、开放 `instruction`、有限档位的 `rate / pitch`。** -4. **生成结果仍沿用现有 `/aigc/task` 异步任务体系。** -5. **前端只保留手动选择,不做本地或服务端推荐。** +3. **前端只暴露最小化的选项:性别、年龄段、开放 `instruction`。** +4. **生成结果沿用现有 `/aigc/task` 异步任务体系。** --- @@ -108,7 +102,6 @@ export type SoundVoiceAgeGroup = 'child' | 'youth' | 'middle-aged' | 'senior' - 服务端内部按 `男 / 女 × 儿童 / 青年 / 中年 / 老年` 维护 8 个基础桶位 - 每个桶位在一期只对应一个固定默认音色 - 前端暴露的是桶位维度,不暴露供应商原始 `voice_id` -- `rate / pitch` 虽然在协议里仍然使用 number,但前端只提供固定几档可选值 - 这些固定选项由前端直接内置,不再单独提供“获取可选项”的接口 - 前端所有可控项都由用户手动选择,不再引入推荐值 @@ -124,8 +117,6 @@ export type SpeechSoundSettings = { voiceGender: SoundVoiceGender voiceAgeGroup: SoundVoiceAgeGroup instruction?: string - rate?: number - pitch?: number } } ``` @@ -135,17 +126,6 @@ export type SpeechSoundSettings = { - `voiceGender / voiceAgeGroup` 用于选定基础音色桶位 - `text` 暂定限制为最多 200 个字符,避免长文本导致生成用时过长 - `instruction` 限制为最多 50 个字符,不超过 CosyVoice 可接受的长度范围 -- 协议字段 `rate / pitch` 使用 number;前端通过固定档位映射到这些数值 - -前端固定档位映射如下: - -| UI 档位 | `rate` | `pitch` | -| --- | --- | --- | -| 低 / 慢 | `0.8` | `0.85` | -| 稍低 / 稍慢 | `0.9` | `0.95` | -| 标准 | `1.0` | `1.0` | -| 稍高 / 稍快 | `1.1` | `1.05` | -| 高 / 快 | `1.2` | `1.15` | --- @@ -174,9 +154,7 @@ Content-Type: application/json "text": "你好,我们出发吧!", "voiceGender": "female", "voiceAgeGroup": "youth", - "instruction": "像在提醒队友准备出发,语气轻快一点", - "rate": 1.1, - "pitch": 1.05 + "instruction": "像在提醒队友准备出发,语气轻快一点" } } } @@ -200,11 +178,10 @@ UI 顺序: 1. 输入素材名称 2. 输入要说的话 3. 选择声音性别 / 年龄段 -4. 选择语速 / 音调档位 -5. 按需补充 `instruction` -6. 点击生成 -7. 试听生成结果 -8. 采用到项目 +4. 按需补充 `instruction` +5. 点击生成 +6. 试听生成结果 +7. 采用到项目 --- @@ -227,7 +204,7 @@ builder-backend/internal/aigc/sound/base_voice_inventory.json 其中: -- `voice_id` 来自 CosyVoice 声音设计接口 +- `voice_id` 来自阿里百炼账号下已设计/复刻的可用于 CosyVoice `target_model` 的音色 - `voice_gender + voice_age_group` 共同决定一个基础音色桶位 建议: @@ -241,26 +218,13 @@ builder-backend/internal/aigc/sound/base_voice_inventory.json 服务端收到请求后完成: 1. 根据 `voiceGender + voiceAgeGroup` 选定基础音色桶位对应的固定 `voice_id` -2. 根据 `text / instruction / rate / pitch` 等信息生成最终调用参数 +2. 根据 `text / instruction` 等信息生成最终调用参数 3. 调用供应商生成音频 4. 上传对象存储并写回 task result -例如: - -- `male + middle-aged` -> `cosyvoice-v3.5-plus` 的中年男声默认 `voice_id` -- `female + youth` -> `cosyvoice-v3.5-plus` 的青年女声默认 `voice_id` - -然后再根据: - -- `text` -- `instruction` -- `rate` -- `pitch` - -生成最终调用参数。默认值策略: +调用参数默认值策略: - `instruction` 为空时,仅基于 `text` 做自然表达 -- `rate / pitch` 缺省时使用 `1.0` - 输出格式固定用 `mp3` - 其它参数不指定,使用 CosyVoice 默认值 diff --git a/docs/openapi.yaml b/docs/openapi.yaml index ab3af9e28c..553d39e4d3 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -2832,7 +2832,18 @@ paths: default: 1 examples: - 4 - - title: generateSound + - title: generateSound + type: object + required: + - type + - parameters + properties: + type: + type: string + enum: + - generateSound + parameters: + description: Parameters for speech sound generation. type: object required: - settings @@ -5018,22 +5029,6 @@ components: maxLength: 50 examples: - 像在提醒队友准备出发,语气轻快一点 - rate: - description: Optional speech-rate multiplier passed to synthesis. `1.0` means natural speed. - type: number - format: float - minimum: 0.5 - maximum: 2.0 - examples: - - 1.1 - pitch: - description: Optional pitch multiplier passed to synthesis. `1.0` means natural pitch. - type: number - format: float - minimum: 0.5 - maximum: 2.0 - examples: - - 1.05 AIGCSoundBaseSettings: description: Base settings shared by all sound-generation requests. diff --git a/spx-gui/src/apis/aigc.ts b/spx-gui/src/apis/aigc.ts index 631447897d..2f7f5acfdc 100644 --- a/spx-gui/src/apis/aigc.ts +++ b/spx-gui/src/apis/aigc.ts @@ -81,8 +81,6 @@ export type SpeechSettings = { voiceGender: SoundVoiceGender voiceAgeGroup: SoundVoiceAgeGroup instruction?: string - rate?: number - pitch?: number } export type SpeechSoundSettings = SoundSettingsBase & { diff --git a/spx-gui/src/components/asset/gen/sound/TTSGen.vue b/spx-gui/src/components/asset/gen/sound/TTSGen.vue index 5c44a67029..02d3d1ecd6 100644 --- a/spx-gui/src/components/asset/gen/sound/TTSGen.vue +++ b/spx-gui/src/components/asset/gen/sound/TTSGen.vue @@ -17,14 +17,14 @@ const emit = defineEmits<{ resolved: [Sound] }>() +const maxSpeechTextLength = 200 +const maxInstructionLength = 50 + type SelectOption = { value: T label: LocaleMessage } -const maxSpeechTextLength = 200 -const maxInstructionLength = 50 - const voiceGenderOptions: Array> = [ { value: 'male', label: { en: 'Male', zh: '男声' } }, { value: 'female', label: { en: 'Female', zh: '女声' } } @@ -37,21 +37,6 @@ const voiceAgeGroupOptions: Array> = [ { value: 'senior', label: { en: 'Senior', zh: '老年' } } ] -const presetRateOptions: Array> = [ - { value: 0.8, label: { en: 'Slow', zh: '慢' } }, - { value: 0.9, label: { en: 'Slightly slow', zh: '稍慢' } }, - { value: 1.0, label: { en: 'Standard', zh: '标准' } }, - { value: 1.1, label: { en: 'Slightly fast', zh: '稍快' } }, - { value: 1.2, label: { en: 'Fast', zh: '快' } } -] -const presetPitchOptions: Array> = [ - { value: 0.85, label: { en: 'Low', zh: '低' } }, - { value: 0.95, label: { en: 'Slightly low', zh: '稍低' } }, - { value: 1.0, label: { en: 'Standard', zh: '标准' } }, - { value: 1.05, label: { en: 'Slightly high', zh: '稍高' } }, - { value: 1.15, label: { en: 'High', zh: '高' } } -] - const [resultSrc] = useFileUrl(() => props.gen.result?.file ?? null) const canGenerate = computed( @@ -107,14 +92,14 @@ const handleUse = useMessageHandle( -
+
@@ -152,40 +137,6 @@ const handleUse = useMessageHandle(
-
- - - - {{ $t(option.label) }} - - -
-
- - - - {{ $t(option.label) }} - - -
@@ -195,12 +146,12 @@ const handleUse = useMessageHandle(
{ + if (props.gen.generateState.status === 'running') { + return { en: 'Generating...', zh: '生成中...' } + } + if (props.gen.result == null) { + return { en: 'Generate', zh: '生成' } + } + return { en: 'Regenerate', zh: '重新生成' } +})