From ac4e329eb5ff422d83466bfc1be3068056397a98 Mon Sep 17 00:00:00 2001 From: Neil Ruaro Date: Tue, 3 Mar 2026 15:40:16 +0800 Subject: [PATCH 1/2] feat: add CAMB AI API compatibility layer Add CAMB AI-compatible API endpoints to LocalAI, enabling apps using the CAMB AI SDK/API to use LocalAI as a drop-in local replacement. Follows the existing ElevenLabs integration pattern (schema structs, endpoint handlers, route registration). Includes e2e tests covering all endpoints via the mock backend. --- core/http/app.go | 1 + .../http/endpoints/cambai/audio_separation.go | 17 + .../http/endpoints/cambai/sound_generation.go | 92 ++++++ core/http/endpoints/cambai/transcription.go | 85 +++++ core/http/endpoints/cambai/translation.go | 191 ++++++++++++ core/http/endpoints/cambai/tts.go | 130 ++++++++ core/http/endpoints/cambai/voice.go | 93 ++++++ core/http/routes/cambai.go | 72 +++++ core/schema/cambai.go | 290 ++++++++++++++++++ tests/e2e/cambai_test.go | 275 +++++++++++++++++ tests/e2e/e2e_suite_test.go | 7 + 11 files changed, 1253 insertions(+) create mode 100644 core/http/endpoints/cambai/audio_separation.go create mode 100644 core/http/endpoints/cambai/sound_generation.go create mode 100644 core/http/endpoints/cambai/transcription.go create mode 100644 core/http/endpoints/cambai/translation.go create mode 100644 core/http/endpoints/cambai/tts.go create mode 100644 core/http/endpoints/cambai/voice.go create mode 100644 core/http/routes/cambai.go create mode 100644 core/schema/cambai.go create mode 100644 tests/e2e/cambai_test.go diff --git a/core/http/app.go b/core/http/app.go index 437d524135f0..e5a985e5789f 100644 --- a/core/http/app.go +++ b/core/http/app.go @@ -215,6 +215,7 @@ func API(application *application.Application) (*echo.Echo, error) { requestExtractor := httpMiddleware.NewRequestExtractor(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig()) routes.RegisterElevenLabsRoutes(e, requestExtractor, 
application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig()) + routes.RegisterCambAIRoutes(e, requestExtractor, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig()) // Create opcache for tracking UI operations (used by both UI and LocalAI routes) var opcache *services.OpCache
diff --git a/core/http/endpoints/cambai/audio_separation.go b/core/http/endpoints/cambai/audio_separation.go
new file mode 100644
index 000000000000..b51bef607418
--- /dev/null
+++ b/core/http/endpoints/cambai/audio_separation.go
@@ -0,0 +1,17 @@
+package cambai
+
+import (
+	"net/http"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/schema"
+)
+
+// AudioSeparationEndpoint is a stub: it always answers 501 Not Implemented with a CAMB AI error body.
+func AudioSeparationEndpoint() echo.HandlerFunc {
+	// The payload is the same for every request, so it is built once.
+	notSupported := schema.CambAIErrorResponse{Detail: "Audio separation is not currently supported. No backend available."}
+	return func(c echo.Context) error {
+		return c.JSON(http.StatusNotImplemented, notSupported)
+	}
+}
diff --git a/core/http/endpoints/cambai/sound_generation.go b/core/http/endpoints/cambai/sound_generation.go
new file mode 100644
index 000000000000..650fd5894ddd
--- /dev/null
+++ b/core/http/endpoints/cambai/sound_generation.go
@@ -0,0 +1,92 @@
+package cambai
+
+import (
+	"net/http"
+	"path/filepath"
+
+	"github.com/google/uuid"
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/middleware"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/audio"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/xlog"
+)
+
+// SoundGenerationEndpoint handles CAMB AI text-to-sound (POST /apis/text-to-sound).
+func SoundGenerationEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+	// The parsed request and the model config are both read from the echo context.
+	return func(c echo.Context) error {
+		req, hasReq := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITextToSoundRequest)
+		if !hasReq {
+			return echo.ErrBadRequest
+		}
+		modelCfg, hasCfg := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
+		if !hasCfg || modelCfg == nil {
+			return echo.ErrBadRequest
+		}
+		xlog.Debug("CAMB AI text-to-sound request received", "model", req.Model)
+
+		// Only the prompt and duration come from the request; the remaining
+		// generation options are passed as nil or empty strings.
+		rawPath, _, err := backend.SoundGeneration(
+			req.Prompt, req.Duration, nil, nil,
+			nil, nil,
+			nil, "", "", nil, "",
+			"", "",
+			nil,
+			ml, appConfig, *modelCfg)
+		if err != nil {
+			return err
+		}
+		audioPath, contentType := audio.NormalizeAudioFile(rawPath)
+
+		// The audio is returned directly; the task headers carry a fresh UUID
+		// and a fixed SUCCESS status.
+		headers := c.Response().Header()
+		headers.Set("X-Task-ID", uuid.New().String())
+		headers.Set("X-Task-Status", "SUCCESS")
+		if contentType != "" {
+			headers.Set("Content-Type", contentType)
+		}
+		return c.Attachment(audioPath, filepath.Base(audioPath))
+	}
+}
+
+// SoundGenerationAsyncEndpoint returns results in CAMB AI async task format.
+func SoundGenerationAsyncEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		req, hasReq := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITextToSoundRequest)
+		if !hasReq {
+			return echo.ErrBadRequest
+		}
+		modelCfg, hasCfg := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
+		if !hasCfg || modelCfg == nil {
+			return echo.ErrBadRequest
+		}
+		xlog.Debug("CAMB AI text-to-sound async request received", "model", req.Model)
+
+		// Generation runs to completion before the response is written. Both
+		// non-error results of the call are discarded by this handler.
+		if _, _, err := backend.SoundGeneration(
+			req.Prompt, req.Duration, nil, nil,
+			nil, nil,
+			nil, "", "", nil, "",
+			"", "",
+			nil,
+			ml, appConfig, *modelCfg); err != nil {
+			return err
+		}
+
+		// A single fresh UUID serves as both the task ID and the run ID.
+		id := uuid.New().String()
+		task := schema.CambAITaskResponse{
+			TaskID: id,
+			Status: "SUCCESS",
+			RunID:  id,
+		}
+		return c.JSON(http.StatusOK, task)
+	}
+}
diff --git a/core/http/endpoints/cambai/transcription.go b/core/http/endpoints/cambai/transcription.go
new file mode 100644
index 000000000000..2b633f44485a
--- /dev/null
+++ b/core/http/endpoints/cambai/transcription.go
@@ -0,0 +1,85 @@
+package cambai
+
+import (
+	"io"
+	"net/http"
+	"os"
+	"path"
+	"path/filepath"
+
+	"github.com/google/uuid"
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/middleware"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/xlog"
+)
+
+// TranscriptionEndpoint handles CAMB AI transcription (POST /apis/transcribe).
+// Runs synchronously but returns results in CAMB AI's async task format.
+func TranscriptionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
+		if !ok || cfg == nil {
+			return echo.ErrBadRequest
+		}
+
+		// The parsed request is optional: without it, or without a positive
+		// language ID, the language hint stays empty.
+		input, _ := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITranscriptionRequest)
+		language := ""
+		if input != nil && input.LanguageID > 0 {
+			language = schema.CambAILanguageCodeFromID(input.LanguageID)
+		}
+
+		file, err := c.FormFile("file")
+		if err != nil {
+			return c.JSON(http.StatusBadRequest, schema.CambAIErrorResponse{
+				Detail: "Audio file is required. Upload as multipart form field 'file'.",
+			})
+		}
+		f, err := file.Open()
+		if err != nil {
+			return err
+		}
+		defer f.Close()
+
+		// Stage the upload in a temporary directory that is removed on return.
+		dir, err := os.MkdirTemp("", "cambai-transcribe")
+		if err != nil {
+			return err
+		}
+		defer os.RemoveAll(dir)
+
+		dst := filepath.Join(dir, path.Base(file.Filename))
+		dstFile, err := os.Create(dst)
+		if err != nil {
+			return err
+		}
+		if _, err := io.Copy(dstFile, f); err != nil {
+			dstFile.Close() // release the handle; the copy error is the one reported
+			xlog.Debug("Audio file copying error", "filename", file.Filename, "dst", dst, "error", err)
+			return err
+		}
+		// Close can surface a deferred write error, so it is checked before the file is used.
+		if err := dstFile.Close(); err != nil {
+			return err
+		}
+		xlog.Debug("CAMB AI transcription request", "file", dst, "language", language)
+
+		tr, err := backend.ModelTranscription(dst, language, false, false, "", ml, *cfg, appConfig)
+		if err != nil {
+			return err
+		}
+		return c.JSON(http.StatusOK, schema.CambAITaskStatusResponse{
+			Status: "SUCCESS",
+			RunID:  uuid.New().String(),
+			Output: schema.CambAITranscriptionResponse{
+				Text:     tr.Text,
+				Language: language,
+			},
+		})
+	}
+}
diff --git a/core/http/endpoints/cambai/translation.go b/core/http/endpoints/cambai/translation.go
new file mode 100644
index 000000000000..d62b41fca078
--- /dev/null
+++ b/core/http/endpoints/cambai/translation.go
@@ -0,0 +1,191 @@
+package cambai
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"strings"
+
+	"github.com/google/uuid"
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/middleware"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/xlog"
+)
+
+// buildTranslationPrompt renders the instruction sent to the LLM: translate text
+// from sourceLang to targetLang and output nothing but the translation.
+func buildTranslationPrompt(text, sourceLang, targetLang string) string {
+	return fmt.Sprintf(
+		"Translate the following text from %s to %s. Output ONLY the translation, nothing else.\n\n%s",
+		sourceLang, targetLang, text,
+	)
+}
+
+// TranslationEndpoint handles CAMB AI translation (POST /apis/translate).
+// Uses an LLM chat backend to perform translation.
+func TranslationEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITranslationRequest)
+		if !ok {
+			return echo.ErrBadRequest
+		}
+
+		cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
+		if !ok || cfg == nil {
+			return echo.ErrBadRequest
+		}
+
+		xlog.Debug("CAMB AI translation request received", "model", input.Model)
+
+		sourceLang := schema.CambAILanguageCodeFromID(input.SourceLanguageID)
+		targetLang := schema.CambAILanguageCodeFromID(input.TargetLanguageID)
+
+		// Start from an empty, non-nil slice so a request without texts is
+		// answered with "translation": [] rather than null.
+		translations := make([]string, 0, len(input.Texts))
+		for _, text := range input.Texts {
+			prompt := buildTranslationPrompt(text, sourceLang, targetLang)
+			fn, err := backend.ModelInference(
+				c.Request().Context(), prompt, nil, nil, nil, nil,
+				ml, cfg, cl, appConfig, nil, "", "", nil, nil, nil,
+			)
+			if err != nil {
+				return err
+			}
+
+			resp, err := fn()
+			if err != nil {
+				return err
+			}
+			translations = append(translations, strings.TrimSpace(resp.Response))
+		}
+
+		return c.JSON(http.StatusOK, schema.CambAITaskStatusResponse{
+			Status: "SUCCESS",
+			RunID:  uuid.New().String(),
+			Output: schema.CambAITranslationResponse{
+				Translation: translations,
+				SourceLang:  input.SourceLanguageID,
+				TargetLang:  input.TargetLanguageID,
+			},
+		})
+	}
+}
+
+// TranslationStreamEndpoint handles CAMB AI streaming translation (POST /apis/translation/stream).
+func TranslationStreamEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITranslationStreamRequest)
+		if !ok {
+			return echo.ErrBadRequest
+		}
+		cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
+		if !ok || cfg == nil {
+			return echo.ErrBadRequest
+		}
+		xlog.Debug("CAMB AI translation stream request received", "model", input.Model)
+		sourceLang := schema.CambAILanguageCodeFromID(input.SourceLanguageID)
+		targetLang := schema.CambAILanguageCodeFromID(input.TargetLanguageID)
+		prompt := buildTranslationPrompt(input.Text, sourceLang, targetLang)
+
+		c.Response().Header().Set("Content-Type", "text/plain; charset=utf-8")
+		c.Response().Header().Set("Transfer-Encoding", "chunked")
+		c.Response().Header().Set("Cache-Control", "no-cache")
+		c.Response().Header().Set("Connection", "keep-alive")
+
+		// Tie inference to the request, as the non-streaming endpoints do, and
+		// cancel it explicitly once a chunk can no longer be delivered.
+		ctx, cancel := context.WithCancel(c.Request().Context())
+		defer cancel()
+		fn, err := backend.ModelInference(
+			ctx, prompt, nil, nil, nil, nil,
+			ml, cfg, cl, appConfig,
+			func(token string, _ backend.TokenUsage) bool {
+				if _, writeErr := c.Response().Write([]byte(token)); writeErr != nil {
+					cancel()
+					return true // result left unchanged; the cancelled context is the stop signal
+				}
+				c.Response().Flush()
+				return true
+			},
+			"", "", nil, nil, nil,
+		)
+		if err != nil {
+			return err
+		}
+		// Call fn to complete inference
+		_, err = fn()
+		return err
+	}
+}
+
+// TranslatedTTSEndpoint handles CAMB AI translated TTS (POST /apis/translated-tts).
+// First translates text via LLM, then synthesizes speech from the translation.
+func TranslatedTTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITranslatedTTSRequest)
+		if !ok {
+			return echo.ErrBadRequest
+		}
+
+		cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
+		if !ok || cfg == nil {
+			return echo.ErrBadRequest
+		}
+
+		xlog.Debug("CAMB AI translated TTS request received", "model", input.Model)
+
+		// Resolve the TTS model first so a deployment without one is rejected
+		// before any LLM inference is spent on the translation.
+		ttsConfigs := cl.GetModelConfigsByFilter(config.BuildUsecaseFilterFn(config.FLAG_TTS))
+		if len(ttsConfigs) == 0 {
+			return c.JSON(http.StatusServiceUnavailable, schema.CambAIErrorResponse{
+				Detail: "No TTS model configured. Configure a TTS model to use translated TTS.",
+			})
+		}
+		ttsCfg := ttsConfigs[0]
+
+		sourceLang := schema.CambAILanguageCodeFromID(input.SourceLanguageID)
+		targetLang := schema.CambAILanguageCodeFromID(input.TargetLanguageID)
+		prompt := buildTranslationPrompt(input.Text, sourceLang, targetLang)
+
+		// Step 1: translate using the model config resolved for this request.
+		fn, err := backend.ModelInference(
+			c.Request().Context(), prompt, nil, nil, nil, nil,
+			ml, cfg, cl, appConfig, nil, "", "", nil, nil, nil,
+		)
+		if err != nil {
+			return err
+		}
+
+		resp, err := fn()
+		if err != nil {
+			return err
+		}
+
+		translatedText := strings.TrimSpace(resp.Response)
+
+		// Step 2: synthesize the translation. The numeric voice ID is passed as
+		// its decimal string and the target language code as the language.
+		voice := fmt.Sprintf("%d", input.VoiceID)
+		filePath, _, err := backend.ModelTTS(translatedText, voice, targetLang, ml, appConfig, ttsCfg)
+		if err != nil {
+			return err
+		}
+
+		// NOTE(review): audio_path is the path returned by ModelTTS, presumably a
+		// server-local file; confirm that clients are expected to read it.
+		return c.JSON(http.StatusOK, schema.CambAITaskStatusResponse{
+			Status: "SUCCESS",
+			RunID:  uuid.New().String(),
+			Output: map[string]string{
+				"translation": translatedText,
+				"audio_path":  filePath,
+			},
+		})
+	}
+}
diff --git a/core/http/endpoints/cambai/tts.go b/core/http/endpoints/cambai/tts.go
new file mode 100644
index 000000000000..bc4786ddf90d
--- /dev/null
+++ b/core/http/endpoints/cambai/tts.go
@@ -0,0 +1,130 @@
+package cambai
+
+import (
+	"fmt"
+	"net/http"
+	"path/filepath"
+	"sync"
+
+	"github.com/google/uuid"
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/middleware"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/audio"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/xlog"
+)
+
+// ttsTaskResults maps async TTS task IDs to generated audio file paths; nothing in this file deletes entries.
+var ttsTaskResults = sync.Map{}
+
+// TTSStreamEndpoint handles CAMB AI streaming TTS (POST /apis/tts-stream).
+func TTSStreamEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITTSStreamRequest)
+		if !ok || input.SpeechModel == "" || input.Text == "" {
+			return echo.ErrBadRequest
+		}
+		cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
+		if !ok || cfg == nil {
+			return echo.ErrBadRequest
+		}
+		xlog.Debug("CAMB AI TTS stream request received", "model", input.SpeechModel)
+		voice := fmt.Sprintf("%d", input.VoiceID)
+		language := input.Language
+
+		c.Response().Header().Set("Content-Type", "audio/wav")
+		c.Response().Header().Set("Transfer-Encoding", "chunked")
+		c.Response().Header().Set("Cache-Control", "no-cache")
+		c.Response().Header().Set("Connection", "keep-alive")
+
+		err := backend.ModelTTSStream(input.Text, voice, language, ml, appConfig, *cfg, func(audioChunk []byte) error {
+			if _, writeErr := c.Response().Write(audioChunk); writeErr != nil {
+				return writeErr
+			}
+			c.Response().Flush()
+			return nil
+		})
+		if err == nil {
+			return nil
+		}
+		// Once audio has been written the response cannot be restarted, so only fall back while nothing was sent.
+		if c.Response().Committed {
+			return err
+		}
+		// Fallback to non-streaming TTS; a whole file is served, so the chunked framing header is dropped.
+		xlog.Debug("Streaming TTS not supported, falling back to non-streaming", "error", err)
+		filePath, _, ttsErr := backend.ModelTTS(input.Text, voice, language, ml, appConfig, *cfg)
+		if ttsErr != nil {
+			return ttsErr
+		}
+		c.Response().Header().Del("Transfer-Encoding")
+		filePath, contentType := audio.NormalizeAudioFile(filePath)
+		if contentType != "" {
+			c.Response().Header().Set("Content-Type", contentType)
+		}
+		return c.Attachment(filePath, filepath.Base(filePath))
+	}
+}
+
+// TTSEndpoint handles CAMB AI async TTS (POST /apis/tts).
+func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		req, hasReq := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITTSRequest)
+		if !hasReq {
+			return echo.ErrBadRequest
+		}
+
+		modelCfg, hasCfg := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
+		if !hasCfg || modelCfg == nil {
+			return echo.ErrBadRequest
+		}
+
+		xlog.Debug("CAMB AI TTS request received", "model", req.Model)
+
+		// Synthesis completes before the response is sent. The numeric voice ID
+		// is forwarded as its decimal string and the language ID as its code.
+		voice, language := fmt.Sprintf("%d", req.VoiceID), schema.CambAILanguageCodeFromID(req.LanguageID)
+		audioPath, _, err := backend.ModelTTS(req.Text, voice, language, ml, appConfig, *modelCfg)
+		if err != nil {
+			return err
+		}
+
+		// Record the audio path under a fresh UUID used as both task ID and run ID.
+		id := uuid.New().String()
+		ttsTaskResults.Store(id, audioPath)
+		return c.JSON(http.StatusOK, schema.CambAITaskResponse{
+			TaskID: id,
+			Status: "SUCCESS",
+			RunID:  id,
+		})
+	}
+}
+
+// TTSTaskStatusEndpoint handles polling for async TTS results (GET /apis/tts/:task_id).
+func TTSTaskStatusEndpoint() echo.HandlerFunc {
+	return func(c echo.Context) error {
+		// Results are looked up in ttsTaskResults by the task_id path parameter.
+		stored, found := ttsTaskResults.Load(c.Param("task_id"))
+		if !found {
+			return c.JSON(http.StatusNotFound, schema.CambAIErrorResponse{
+				Detail: "Task not found",
+			})
+		}
+		rawPath, isPath := stored.(string)
+		if !isPath {
+			return c.JSON(http.StatusInternalServerError, schema.CambAIErrorResponse{
+				Detail: "Invalid task result",
+			})
+		}
+
+		// Serve the file returned by NormalizeAudioFile, with its content type when one is reported.
+		audioPath, contentType := audio.NormalizeAudioFile(rawPath)
+		if contentType != "" {
+			c.Response().Header().Set("Content-Type", contentType)
+		}
+		return c.Attachment(audioPath, filepath.Base(audioPath))
+	}
+}
diff --git a/core/http/endpoints/cambai/voice.go b/core/http/endpoints/cambai/voice.go
new file mode 100644
index 000000000000..577dbc2f73d4
--- /dev/null
+++ b/core/http/endpoints/cambai/voice.go
@@ -0,0 +1,93 @@
+package cambai
+
+import (
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"path/filepath"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/xlog"
+)
+
+// ListVoicesEndpoint handles CAMB AI list voices (GET /apis/list-voices).
+func ListVoicesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		ttsConfigs := cl.GetModelConfigsByFilter(config.BuildUsecaseFilterFn(config.FLAG_TTS))
+
+		// One voice per TTS model config; IDs are 1-based positions in that list.
+		voices := make([]schema.CambAIVoice, 0, len(ttsConfigs))
+		for idx, ttsCfg := range ttsConfigs {
+			name := ttsCfg.Name
+			if ttsCfg.Voice != "" {
+				name = fmt.Sprintf("%s (%s)", ttsCfg.Name, ttsCfg.Voice)
+			}
+			voices = append(voices, schema.CambAIVoice{
+				VoiceID: idx + 1,
+				Name:    name,
+			})
+		}
+		return c.JSON(http.StatusOK, schema.CambAIListVoicesResponse{
+			Voices: voices,
+		})
+	}
+}
+
+// CreateCustomVoiceEndpoint handles CAMB AI custom voice creation (POST /apis/create-custom-voice).
+// Accepts an audio file upload and saves it for voice cloning.
+func CreateCustomVoiceEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		voiceName := c.FormValue("voice_name")
+		if voiceName == "" {
+			return c.JSON(http.StatusBadRequest, schema.CambAIErrorResponse{
+				Detail: "voice_name is required",
+			})
+		}
+		file, err := c.FormFile("file")
+		if err != nil {
+			return c.JSON(http.StatusBadRequest, schema.CambAIErrorResponse{
+				Detail: "Audio file is required. Upload as multipart form field 'file'.",
+			})
+		}
+		ext := filepath.Ext(file.Filename)
+		if ext == "" {
+			ext = ".wav"
+		}
+		// voice_name is client-supplied: reject any value that would place the
+		// file anywhere other than directly inside the voices directory.
+		voiceDir := filepath.Join(ml.ModelPath, "voices")
+		dstPath := filepath.Join(voiceDir, voiceName+ext)
+		if filepath.Dir(dstPath) != voiceDir {
+			return c.JSON(http.StatusBadRequest, schema.CambAIErrorResponse{
+				Detail: "voice_name must be a plain name without path separators",
+			})
+		}
+
+		f, err := file.Open()
+		if err != nil {
+			return err
+		}
+		defer f.Close()
+		// Save audio file to models directory for voice cloning
+		if err := os.MkdirAll(voiceDir, 0750); err != nil {
+			return err
+		}
+		dst, err := os.Create(dstPath)
+		if err != nil {
+			return err
+		}
+		defer dst.Close()
+		if _, err := io.Copy(dst, f); err != nil {
+			return err
+		}
+		xlog.Info("Custom voice audio saved", "name", voiceName, "path", dstPath)
+		return c.JSON(http.StatusOK, schema.CambAIVoice{
+			VoiceID: 0,
+			Name:    voiceName,
+		})
+	}
+}
diff --git a/core/http/routes/cambai.go b/core/http/routes/cambai.go
new file mode 100644
index 000000000000..73246849f3b5
--- /dev/null
+++ b/core/http/routes/cambai.go
@@ -0,0 +1,72 @@
+package routes
+
+import (
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/cambai"
+	"github.com/mudler/LocalAI/core/http/middleware"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/model"
+)
+
+func RegisterCambAIRoutes(app *echo.Echo,
+	re *middleware.RequestExtractor,
+	cl *config.ModelConfigLoader,
+	ml *model.ModelLoader,
+	appConfig *config.ApplicationConfig) {
+
+	// TTS streaming (POST /apis/tts-stream)
+	app.POST("/apis/tts-stream",
+		cambai.TTSStreamEndpoint(cl, ml, appConfig),
+		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TTS)),
+		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.CambAITTSStreamRequest) }))
+
+	// TTS async (POST /apis/tts)
+	app.POST("/apis/tts",
+		cambai.TTSEndpoint(cl, ml, appConfig),
+		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TTS)),
+		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.CambAITTSRequest) }))
+
+	// TTS task status (GET /apis/tts/:task_id)
+	app.GET("/apis/tts/:task_id", cambai.TTSTaskStatusEndpoint())
+
+	// Translated TTS (POST /apis/translated-tts)
+	app.POST("/apis/translated-tts",
+		cambai.TranslatedTTSEndpoint(cl, ml, appConfig),
+		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)),
+		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.CambAITranslatedTTSRequest) }))
+
+	// Translation (POST /apis/translate)
+	app.POST("/apis/translate",
+		cambai.TranslationEndpoint(cl, ml, appConfig),
+		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)),
+		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.CambAITranslationRequest) }))
+
+	// Translation streaming (POST /apis/translation/stream)
+	app.POST("/apis/translation/stream",
+		cambai.TranslationStreamEndpoint(cl, ml, appConfig),
+		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)),
+		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.CambAITranslationStreamRequest) }))
+
+	// Transcription (POST /apis/transcribe)
+	app.POST("/apis/transcribe",
+		cambai.TranscriptionEndpoint(cl, ml, appConfig),
+		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TRANSCRIPT)),
+		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.CambAITranscriptionRequest) }))
+
+	// Text-to-sound (POST /apis/text-to-sound)
+	app.POST("/apis/text-to-sound",
+		cambai.SoundGenerationEndpoint(cl, ml, appConfig),
+		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_SOUND_GENERATION)),
+		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.CambAITextToSoundRequest) }))
+
+	// List voices (GET /apis/list-voices)
+	app.GET("/apis/list-voices", cambai.ListVoicesEndpoint(cl, ml, appConfig))
+
+	// Create custom voice (POST /apis/create-custom-voice)
+	app.POST("/apis/create-custom-voice",
+		cambai.CreateCustomVoiceEndpoint(cl, ml, appConfig))
+
+	// Audio separation stub (POST /apis/audio-separation)
+	app.POST("/apis/audio-separation", cambai.AudioSeparationEndpoint())
+}
diff --git a/core/schema/cambai.go b/core/schema/cambai.go
new file mode 100644
index 000000000000..09380ba36455
--- /dev/null
+++ b/core/schema/cambai.go
@@ -0,0 +1,290 @@
+package schema
+
+import "fmt"
+
+// CambAI TTS streaming request (POST /apis/tts-stream)
+type CambAITTSStreamRequest struct {
+	Text                string                     `json:"text"`
+	VoiceID             int                        `json:"voice_id"`
+	Language            string                     `json:"language"`
+	SpeechModel         string                     `json:"speech_model"`
+	OutputConfiguration *CambAIOutputConfiguration `json:"output_configuration,omitempty"`
+	InferenceOptions    *CambAITTSInferenceOptions `json:"inference_options,omitempty"`
+}
+
+type CambAIOutputConfiguration struct {
+	Format     string `json:"format,omitempty"`
+	SampleRate int    `json:"sample_rate,omitempty"`
+}
+
+type CambAITTSInferenceOptions struct {
+	Speed       *float32 `json:"speed,omitempty"`
+	Pitch       *float32 `json:"pitch,omitempty"`
+	Temperature *float32 `json:"temperature,omitempty"`
+}
+
+func (r *CambAITTSStreamRequest) ModelName(s *string) string {
+	if s != nil {
+		r.SpeechModel = *s
+	}
+	return r.SpeechModel
+}
+
+// CambAI async TTS request (POST /apis/tts)
+type CambAITTSRequest struct {
+	Text       string `json:"text"`
+	VoiceID    int    `json:"voice_id"`
+	LanguageID int    `json:"language"`
+	Model      string `json:"model,omitempty"`
+}
+
+func (r *CambAITTSRequest) ModelName(s *string) string {
+	if s != nil {
+		r.Model = *s
+	}
+	return r.Model
+}
+
+// CambAI translated TTS request (POST /apis/translated-tts)
+type CambAITranslatedTTSRequest struct {
+	Text             string `json:"text"`
+	VoiceID          int    `json:"voice_id"`
+	SourceLanguageID int    `json:"source_language"`
+	TargetLanguageID int    `json:"target_language"`
+	Model            string `json:"model,omitempty"`
+}
+
+func (r *CambAITranslatedTTSRequest) ModelName(s *string) string {
+	if s != nil {
+		r.Model = *s
+	}
+	return r.Model
+}
+
+// CambAI translation request (POST /apis/translate)
+type CambAITranslationRequest struct {
+	Texts            []string `json:"texts"`
+	SourceLanguageID int      `json:"source_language"`
+	TargetLanguageID int      `json:"target_language"`
+	Model            string   `json:"model,omitempty"`
+}
+
+func (r *CambAITranslationRequest) ModelName(s *string) string {
+	if s != nil {
+		r.Model = *s
+	}
+	return r.Model
+}
+
+// CambAI translation stream request (POST /apis/translation/stream)
+type CambAITranslationStreamRequest struct {
+	Text             string `json:"text"`
+	SourceLanguageID int    `json:"source_language"`
+	TargetLanguageID int    `json:"target_language"`
+	Model            string `json:"model,omitempty"`
+}
+
+func (r *CambAITranslationStreamRequest) ModelName(s *string) string {
+	if s != nil {
+		r.Model = *s
+	}
+	return r.Model
+}
+
+// CambAI transcription request (POST /apis/transcribe)
+type CambAITranscriptionRequest struct {
+	LanguageID int    `json:"language,omitempty"`
+	MediaURL   string `json:"media_url,omitempty"`
+	Model      string `json:"model,omitempty"`
+}
+
+func (r *CambAITranscriptionRequest) ModelName(s *string) string {
+	if s != nil {
+		r.Model = *s
+	}
+	return r.Model
+}
+
+// CambAI text-to-sound request (POST /apis/text-to-sound)
+type CambAITextToSoundRequest struct {
+	Prompt   string   `json:"prompt"`
+	Duration *float32 `json:"duration,omitempty"`
+	Model    string   `json:"model,omitempty"`
+}
+
+func (r *CambAITextToSoundRequest) ModelName(s *string) string {
+	if s != nil {
+		r.Model = *s
+	}
+	return r.Model
+}
+
+// CambAI create custom voice request (POST /apis/create-custom-voice)
+type CambAICreateCustomVoiceRequest struct {
+	VoiceName string `json:"voice_name"`
+	Model     string `json:"model,omitempty"`
+}
+
+func (r *CambAICreateCustomVoiceRequest) ModelName(s *string) string {
+	if s != nil {
+		r.Model = *s
+	}
+	return r.Model
+}
+
+// Response types
+
+type CambAITaskResponse struct {
+	TaskID string `json:"task_id"`
+	Status string `json:"status"`
+	RunID  string `json:"run_id,omitempty"`
+}
+
+type CambAITaskStatusResponse struct {
+	Status string `json:"status"`
+	RunID  string `json:"run_id,omitempty"`
+	Output any    `json:"output,omitempty"`
+}
+
+type CambAIVoice struct {
+	VoiceID int    `json:"voice_id"`
+	Name    string `json:"voice_name"`
+	Gender  string `json:"gender,omitempty"`
+	Age     string `json:"age,omitempty"`
+}
+
+type CambAIListVoicesResponse struct {
+	Voices []CambAIVoice `json:"voices"`
+}
+
+type CambAIErrorResponse struct {
+	Detail string `json:"detail"`
+}
+
+type CambAITranslationResponse struct {
+	Translation []string `json:"translation"`
+	SourceLang  int      `json:"source_language"`
+	TargetLang  int      `json:"target_language"`
+}
+
+type CambAITranscriptionResponse struct {
+	Text     string `json:"text"`
+	Language string `json:"language,omitempty"`
+}
+
+// CambAILanguageIDToCode maps CAMB AI integer language IDs to BCP-47 codes.
+// This is a subset covering the most common languages.
+var CambAILanguageIDToCode = map[int]string{
+	1:   "en",
+	2:   "ko",
+	3:   "nl",
+	4:   "tr",
+	5:   "uk",
+	6:   "pl",
+	7:   "ta",
+	8:   "vi",
+	9:   "sv",
+	10:  "id",
+	11:  "ms",
+	12:  "ja",
+	13:  "zh",
+	14:  "bn",
+	15:  "th",
+	16:  "tl",
+	17:  "he",
+	18:  "pt-br",
+	19:  "pt",
+	20:  "ru",
+	21:  "ca",
+	22:  "te",
+	23:  "ml",
+	24:  "kn",
+	25:  "gu",
+	26:  "mr",
+	27:  "hi",
+	28:  "da",
+	29:  "fi",
+	30:  "no",
+	31:  "hu",
+	32:  "sk",
+	33:  "cs",
+	34:  "el",
+	35:  "ro",
+	36:  "bg",
+	37:  "sr",
+	38:  "hr",
+	39:  "sl",
+	40:  "mk",
+	41:  "et",
+	42:  "lt",
+	43:  "lv",
+	44:  "sw",
+	45:  "ar",
+	46:  "ur",
+	47:  "fa",
+	48:  "af",
+	49:  "my",
+	50:  "bs",
+	51:  "si",
+	52:  "ne",
+	53:  "km",
+	54:  "es",
+	55:  "cy",
+	56:  "is",
+	57:  "pa",
+	58:  "as",
+	59:  "ga",
+	60:  "am",
+	61:  "az",
+	62:  "uz",
+	63:  "ka",
+	64:  "sq",
+	65:  "mn",
+	66:  "la",
+	67:  "gl",
+	68:  "eu",
+	69:  "it",
+	70:  "de",
+	71:  "nn",
+	72:  "lo",
+	73:  "yo",
+	74:  "ig",
+	75:  "ha", // NOTE(review): the same code is also mapped from ID 94
+	76:  "fr",
+	77:  "zu",
+	78:  "xh",
+	79:  "so",
+	80:  "mt",
+	81:  "eo",
+	82:  "jw",
+	83:  "su",
+	84:  "ps",
+	85:  "sd",
+	86:  "mg",
+	87:  "hy",
+	88:  "lb",
+	89:  "be",
+	90:  "tt",
+	91:  "tg",
+	92:  "ky",
+	93:  "tk",
+	94:  "ha", // NOTE(review): duplicates the code at ID 75; confirm against the CAMB AI language list
+	95:  "sn",
+	96:  "ln",
+	97:  "rw",
+	98:  "ny",
+	99:  "ts",
+	100: "tn",
+	101: "st",
+	102: "ss",
+	103: "nd",
+	104: "ve",
+}
+
+// CambAILanguageCodeFromID returns the code mapped to id in CambAILanguageIDToCode, or the placeholder "lang-<id>" (not a BCP-47 code) when id is not in the map.
+func CambAILanguageCodeFromID(id int) string {
+	if code, ok := CambAILanguageIDToCode[id]; ok {
+		return code
+	}
+	return fmt.Sprintf("lang-%d", id)
+}
diff --git a/tests/e2e/cambai_test.go b/tests/e2e/cambai_test.go
new file mode 100644
index 000000000000..7bb30e5a703c
--- /dev/null
+++ b/tests/e2e/cambai_test.go
@@ -0,0 +1,275 @@
+package e2e_test
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+
+	"github.com/mudler/LocalAI/core/schema"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// cambaiURL returns the base URL for CAMB AI endpoints (no /v1 prefix).
+func cambaiURL() string { + return fmt.Sprintf("http://127.0.0.1:%d", apiPort) +} + +var _ = Describe("CAMB AI API Compatibility Tests", Label("CambAI"), func() { + var httpClient *http.Client + + BeforeEach(func() { + httpClient = &http.Client{Timeout: 30 * time.Second} + }) + + Describe("TTS Streaming API", func() { + It("should stream audio from /apis/tts-stream", func() { + body := `{ + "text": "Hello world from CAMB AI streaming", + "voice_id": 1, + "language": "en", + "speech_model": "mock-model" + }` + req, err := http.NewRequest("POST", cambaiURL()+"/apis/tts-stream", strings.NewReader(body)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + Expect(resp.StatusCode).To(Equal(200)) + Expect(resp.Header.Get("Content-Type")).To(HavePrefix("audio/")) + + data, err := io.ReadAll(resp.Body) + Expect(err).ToNot(HaveOccurred()) + Expect(len(data)).To(BeNumerically(">", 0), "TTS stream response body should be non-empty") + }) + + It("should return 400 for empty request", func() { + body := `{}` + req, err := http.NewRequest("POST", cambaiURL()+"/apis/tts-stream", strings.NewReader(body)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + // Should fail because text is empty + Expect(resp.StatusCode).To(BeNumerically(">=", 400)) + }) + }) + + Describe("TTS Async API", func() { + It("should return a task response from /apis/tts", func() { + body := `{ + "text": "Hello from async TTS", + "voice_id": 1, + "language": 1 + }` + req, err := http.NewRequest("POST", cambaiURL()+"/apis/tts", strings.NewReader(body)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() 
+ + Expect(resp.StatusCode).To(Equal(200)) + + var taskResp schema.CambAITaskResponse + err = json.NewDecoder(resp.Body).Decode(&taskResp) + Expect(err).ToNot(HaveOccurred()) + Expect(taskResp.TaskID).ToNot(BeEmpty()) + Expect(taskResp.Status).To(Equal("SUCCESS")) + }) + + It("should return audio when polling task status", func() { + // First create a TTS task + body := `{ + "text": "Task polling test", + "voice_id": 1, + "language": 1 + }` + req, err := http.NewRequest("POST", cambaiURL()+"/apis/tts", strings.NewReader(body)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + Expect(resp.StatusCode).To(Equal(200)) + + var taskResp schema.CambAITaskResponse + err = json.NewDecoder(resp.Body).Decode(&taskResp) + Expect(err).ToNot(HaveOccurred()) + + // Poll the task + pollReq, err := http.NewRequest("GET", cambaiURL()+"/apis/tts/"+taskResp.TaskID, nil) + Expect(err).ToNot(HaveOccurred()) + + pollResp, err := httpClient.Do(pollReq) + Expect(err).ToNot(HaveOccurred()) + defer pollResp.Body.Close() + + Expect(pollResp.StatusCode).To(Equal(200)) + Expect(pollResp.Header.Get("Content-Type")).To(HavePrefix("audio/")) + + data, err := io.ReadAll(pollResp.Body) + Expect(err).ToNot(HaveOccurred()) + Expect(len(data)).To(BeNumerically(">", 0)) + }) + + It("should return 404 for unknown task ID", func() { + req, err := http.NewRequest("GET", cambaiURL()+"/apis/tts/nonexistent-task-id", nil) + Expect(err).ToNot(HaveOccurred()) + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + Expect(resp.StatusCode).To(Equal(404)) + }) + }) + + Describe("Translation API", func() { + It("should translate text via /apis/translate", func() { + body := `{ + "texts": ["Hello"], + "source_language": 1, + "target_language": 54 + }` + req, err := http.NewRequest("POST", cambaiURL()+"/apis/translate", 
strings.NewReader(body)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + Expect(resp.StatusCode).To(Equal(200)) + + var result schema.CambAITaskStatusResponse + err = json.NewDecoder(resp.Body).Decode(&result) + Expect(err).ToNot(HaveOccurred()) + Expect(result.Status).To(Equal("SUCCESS")) + Expect(result.Output).ToNot(BeNil()) + }) + + It("should stream translation via /apis/translation/stream", func() { + body := `{ + "text": "Hello world", + "source_language": 1, + "target_language": 54 + }` + req, err := http.NewRequest("POST", cambaiURL()+"/apis/translation/stream", strings.NewReader(body)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + Expect(resp.StatusCode).To(Equal(200)) + + data, err := io.ReadAll(resp.Body) + Expect(err).ToNot(HaveOccurred()) + Expect(len(data)).To(BeNumerically(">", 0), "Stream should return some text") + }) + }) + + Describe("Sound Generation API", func() { + It("should generate sound via /apis/text-to-sound", func() { + body := `{ + "prompt": "rain falling on a tin roof" + }` + req, err := http.NewRequest("POST", cambaiURL()+"/apis/text-to-sound", strings.NewReader(body)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + Expect(resp.StatusCode).To(Equal(200)) + Expect(resp.Header.Get("Content-Type")).To(HavePrefix("audio/")) + + data, err := io.ReadAll(resp.Body) + Expect(err).ToNot(HaveOccurred()) + Expect(len(data)).To(BeNumerically(">", 0)) + }) + }) + + Describe("Voice Management API", func() { + It("should list voices via /apis/list-voices", func() { + req, err := http.NewRequest("GET", 
cambaiURL()+"/apis/list-voices", nil) + Expect(err).ToNot(HaveOccurred()) + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + Expect(resp.StatusCode).To(Equal(200)) + + var result schema.CambAIListVoicesResponse + err = json.NewDecoder(resp.Body).Decode(&result) + Expect(err).ToNot(HaveOccurred()) + // voices list may be empty if no TTS models are flagged, but the endpoint should work + Expect(result.Voices).ToNot(BeNil()) + }) + }) + + Describe("Audio Separation API (stub)", func() { + It("should return 501 Not Implemented", func() { + req, err := http.NewRequest("POST", cambaiURL()+"/apis/audio-separation", strings.NewReader(`{}`)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + Expect(resp.StatusCode).To(Equal(501)) + + var result schema.CambAIErrorResponse + err = json.NewDecoder(resp.Body).Decode(&result) + Expect(err).ToNot(HaveOccurred()) + Expect(result.Detail).To(ContainSubstring("not currently supported")) + }) + }) + + Describe("Transcription API", func() { + It("should reject request without audio file", func() { + req, err := http.NewRequest("POST", cambaiURL()+"/apis/transcribe", strings.NewReader(`{}`)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + // Should fail because no file was uploaded + Expect(resp.StatusCode).To(BeNumerically(">=", 400)) + }) + }) + + Describe("Language ID Mapping", func() { + It("should map known language IDs correctly", func() { + Expect(schema.CambAILanguageCodeFromID(1)).To(Equal("en")) + Expect(schema.CambAILanguageCodeFromID(54)).To(Equal("es")) + Expect(schema.CambAILanguageCodeFromID(76)).To(Equal("fr")) + Expect(schema.CambAILanguageCodeFromID(70)).To(Equal("de")) + 
Expect(schema.CambAILanguageCodeFromID(12)).To(Equal("ja")) + Expect(schema.CambAILanguageCodeFromID(13)).To(Equal("zh")) + }) + + It("should return fallback for unknown language IDs", func() { + result := schema.CambAILanguageCodeFromID(9999) + Expect(result).To(Equal("lang-9999")) + }) + }) +}) diff --git a/tests/e2e/e2e_suite_test.go b/tests/e2e/e2e_suite_test.go index 66d9d6cd7ffb..d2a93b8cb3b8 100644 --- a/tests/e2e/e2e_suite_test.go +++ b/tests/e2e/e2e_suite_test.go @@ -93,6 +93,13 @@ var _ = BeforeSuite(func() { "parameters": map[string]interface{}{ "model": "mock-model.bin", }, + "known_usecases": []string{ + "FLAG_CHAT", + "FLAG_COMPLETION", + "FLAG_TTS", + "FLAG_TRANSCRIPT", + "FLAG_SOUND_GENERATION", + }, } configPath = filepath.Join(modelsPath, "mock-model.yaml") configYAML, err := yaml.Marshal(modelConfig) From 733d573660687ec0a8bebed7f0346543791acb01 Mon Sep 17 00:00:00 2001 From: Neil Ruaro Date: Tue, 3 Mar 2026 16:23:28 +0800 Subject: [PATCH 2/2] fix: align CAMB AI endpoint responses with SDK expectations Tested against the real camb-sdk Python package. 
Fixes: - list-voices returns flat array (SDK expects list, not wrapped object) - text-to-sound returns task_id JSON (SDK expects OrchestratorPipelineCallResult) - translated-tts returns task_id JSON (SDK expects CreateTranslatedTtsOut) - translation/stream returns JSON (SDK parses response as JSON) - transcribe accepts media_url form field without requiring file upload --- .../http/endpoints/cambai/sound_generation.go | 42 --------- core/http/endpoints/cambai/transcription.go | 91 ++++++++++++------- core/http/endpoints/cambai/translation.go | 37 +++----- core/http/endpoints/cambai/voice.go | 11 +-- core/schema/cambai.go | 12 ++- tests/e2e/cambai_test.go | 19 ++-- 6 files changed, 96 insertions(+), 116 deletions(-) diff --git a/core/http/endpoints/cambai/sound_generation.go b/core/http/endpoints/cambai/sound_generation.go index 650fd5894ddd..124484c77db9 100644 --- a/core/http/endpoints/cambai/sound_generation.go +++ b/core/http/endpoints/cambai/sound_generation.go @@ -2,7 +2,6 @@ package cambai import ( "net/http" - "path/filepath" "github.com/google/uuid" "github.com/labstack/echo/v4" @@ -10,7 +9,6 @@ import ( "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/http/middleware" "github.com/mudler/LocalAI/core/schema" - "github.com/mudler/LocalAI/pkg/audio" "github.com/mudler/LocalAI/pkg/model" "github.com/mudler/xlog" ) @@ -30,46 +28,6 @@ func SoundGenerationEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader xlog.Debug("CAMB AI text-to-sound request received", "model", input.Model) - filePath, _, err := backend.SoundGeneration( - input.Prompt, input.Duration, nil, nil, - nil, nil, - nil, "", "", nil, "", - "", "", - nil, - ml, appConfig, *cfg) - if err != nil { - return err - } - - filePath, contentType := audio.NormalizeAudioFile(filePath) - - taskID := uuid.New().String() - - // Return audio file directly with task metadata headers - c.Response().Header().Set("X-Task-ID", taskID) - c.Response().Header().Set("X-Task-Status", 
"SUCCESS") - if contentType != "" { - c.Response().Header().Set("Content-Type", contentType) - } - return c.Attachment(filePath, filepath.Base(filePath)) - } -} - -// SoundGenerationAsyncEndpoint returns results in CAMB AI async task format. -func SoundGenerationAsyncEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { - return func(c echo.Context) error { - input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITextToSoundRequest) - if !ok { - return echo.ErrBadRequest - } - - cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig) - if !ok || cfg == nil { - return echo.ErrBadRequest - } - - xlog.Debug("CAMB AI text-to-sound async request received", "model", input.Model) - _, _, err := backend.SoundGeneration( input.Prompt, input.Duration, nil, nil, nil, nil, diff --git a/core/http/endpoints/cambai/transcription.go b/core/http/endpoints/cambai/transcription.go index 2b633f44485a..12c5c4e5d3f9 100644 --- a/core/http/endpoints/cambai/transcription.go +++ b/core/http/endpoints/cambai/transcription.go @@ -6,6 +6,7 @@ import ( "os" "path" "path/filepath" + "sync" "github.com/google/uuid" "github.com/labstack/echo/v4" @@ -17,8 +18,11 @@ import ( "github.com/mudler/xlog" ) +var transcriptionTaskResults = sync.Map{} + // TranscriptionEndpoint handles CAMB AI transcription (POST /apis/transcribe). -// Runs synchronously but returns results in CAMB AI's async task format. +// The SDK sends multipart form with optional file upload and/or media_url. +// Returns {"task_id": "..."} matching OrchestratorPipelineCallResult. 
func TranscriptionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { return func(c echo.Context) error { cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig) @@ -32,54 +36,79 @@ func TranscriptionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, if input != nil && input.LanguageID > 0 { language = schema.CambAILanguageCodeFromID(input.LanguageID) } - - file, err := c.FormFile("file") - if err != nil { - return c.JSON(http.StatusBadRequest, schema.CambAIErrorResponse{ - Detail: "Audio file is required. Upload as multipart form field 'file'.", - }) + // SDK sends language as multipart form field too + if language == "" { + if langField := c.FormValue("language"); langField != "" { + language = langField + } } - f, err := file.Open() - if err != nil { - return err - } - defer f.Close() + // Try file upload first (field "file" or "media_file") + var audioPath string + for _, fieldName := range []string{"file", "media_file"} { + file, err := c.FormFile(fieldName) + if err != nil { + continue + } - dir, err := os.MkdirTemp("", "cambai-transcribe") - if err != nil { - return err + f, err := file.Open() + if err != nil { + return err + } + defer f.Close() + + dir, err := os.MkdirTemp("", "cambai-transcribe") + if err != nil { + return err + } + defer os.RemoveAll(dir) + + dst := filepath.Join(dir, path.Base(file.Filename)) + dstFile, err := os.Create(dst) + if err != nil { + return err + } + + if _, err := io.Copy(dstFile, f); err != nil { + dstFile.Close() + return err + } + dstFile.Close() + audioPath = dst + break } - defer os.RemoveAll(dir) - dst := filepath.Join(dir, path.Base(file.Filename)) - dstFile, err := os.Create(dst) - if err != nil { - return err + // Fall back to media_url form field + if audioPath == "" { + mediaURL := c.FormValue("media_url") + if mediaURL == "" { + mediaURL = c.FormValue("audio_url") + } + if mediaURL != "" { + audioPath = mediaURL 
+ } } - if _, err := io.Copy(dstFile, f); err != nil { - xlog.Debug("Audio file copying error", "filename", file.Filename, "dst", dst, "error", err) - return err + if audioPath == "" { + return c.JSON(http.StatusBadRequest, schema.CambAIErrorResponse{ + Detail: "Either a file upload or media_url is required.", + }) } - dstFile.Close() - xlog.Debug("CAMB AI transcription request", "file", dst, "language", language) + xlog.Debug("CAMB AI transcription request", "path", audioPath, "language", language) - tr, err := backend.ModelTranscription(dst, language, false, false, "", ml, *cfg, appConfig) + tr, err := backend.ModelTranscription(audioPath, language, false, false, "", ml, *cfg, appConfig) if err != nil { return err } taskID := uuid.New().String() + transcriptionTaskResults.Store(taskID, tr.Text) - return c.JSON(http.StatusOK, schema.CambAITaskStatusResponse{ + return c.JSON(http.StatusOK, schema.CambAITaskResponse{ + TaskID: taskID, Status: "SUCCESS", RunID: taskID, - Output: schema.CambAITranscriptionResponse{ - Text: tr.Text, - Language: language, - }, }) } } diff --git a/core/http/endpoints/cambai/translation.go b/core/http/endpoints/cambai/translation.go index d62b41fca078..22903b3513ca 100644 --- a/core/http/endpoints/cambai/translation.go +++ b/core/http/endpoints/cambai/translation.go @@ -95,31 +95,24 @@ func TranslationStreamEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoad targetLang := schema.CambAILanguageCodeFromID(input.TargetLanguageID) prompt := buildTranslationPrompt(input.Text, sourceLang, targetLang) - c.Response().Header().Set("Content-Type", "text/plain; charset=utf-8") - c.Response().Header().Set("Transfer-Encoding", "chunked") - c.Response().Header().Set("Cache-Control", "no-cache") - c.Response().Header().Set("Connection", "keep-alive") - fn, err := backend.ModelInference( context.Background(), prompt, nil, nil, nil, nil, - ml, cfg, cl, appConfig, - func(token string, _ backend.TokenUsage) bool { - _, writeErr := 
c.Response().Write([]byte(token)) - if writeErr != nil { - return true - } - c.Response().Flush() - return true - }, - "", "", nil, nil, nil, + ml, cfg, cl, appConfig, nil, "", "", nil, nil, nil, ) if err != nil { return err } - // Call fn to complete inference - _, err = fn() - return err + resp, err := fn() + if err != nil { + return err + } + + return c.JSON(http.StatusOK, map[string]any{ + "translation": strings.TrimSpace(resp.Response), + "source_language": input.SourceLanguageID, + "target_language": input.TargetLanguageID, + }) } } @@ -178,14 +171,12 @@ func TranslatedTTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, } taskID := uuid.New().String() + ttsTaskResults.Store(taskID, filePath) - return c.JSON(http.StatusOK, schema.CambAITaskStatusResponse{ + return c.JSON(http.StatusOK, schema.CambAITaskResponse{ + TaskID: taskID, Status: "SUCCESS", RunID: taskID, - Output: map[string]string{ - "translation": translatedText, - "audio_path": filePath, - }, }) } } diff --git a/core/http/endpoints/cambai/voice.go b/core/http/endpoints/cambai/voice.go index 577dbc2f73d4..73943c675272 100644 --- a/core/http/endpoints/cambai/voice.go +++ b/core/http/endpoints/cambai/voice.go @@ -22,8 +22,8 @@ func ListVoicesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, app voices := make([]schema.CambAIVoice, 0) for i, cfg := range ttsConfigs { voice := schema.CambAIVoice{ - VoiceID: i + 1, - Name: cfg.Name, + ID: i + 1, + Name: cfg.Name, } if cfg.Voice != "" { voice.Name = fmt.Sprintf("%s (%s)", cfg.Name, cfg.Voice) @@ -31,9 +31,7 @@ func ListVoicesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, app voices = append(voices, voice) } - return c.JSON(http.StatusOK, schema.CambAIListVoicesResponse{ - Voices: voices, - }) + return c.JSON(http.StatusOK, voices) } } @@ -85,9 +83,8 @@ func CreateCustomVoiceEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoad xlog.Info("Custom voice audio saved", "name", voiceName, "path", dstPath) - return 
c.JSON(http.StatusOK, schema.CambAIVoice{ + return c.JSON(http.StatusOK, schema.CambAICreateCustomVoiceResponse{ VoiceID: 0, - Name: voiceName, }) } } diff --git a/core/schema/cambai.go b/core/schema/cambai.go index 09380ba36455..8abc09387b06 100644 --- a/core/schema/cambai.go +++ b/core/schema/cambai.go @@ -147,16 +147,20 @@ type CambAITaskStatusResponse struct { } type CambAIVoice struct { - VoiceID int `json:"voice_id"` - Name string `json:"voice_name"` - Gender string `json:"gender,omitempty"` - Age string `json:"age,omitempty"` + ID int `json:"id"` + Name string `json:"voice_name"` + Gender string `json:"gender,omitempty"` + Age string `json:"age,omitempty"` } type CambAIListVoicesResponse struct { Voices []CambAIVoice `json:"voices"` } +type CambAICreateCustomVoiceResponse struct { + VoiceID int `json:"voice_id"` +} + type CambAIErrorResponse struct { Detail string `json:"detail"` } diff --git a/tests/e2e/cambai_test.go b/tests/e2e/cambai_test.go index 7bb30e5a703c..b8badfbd5d1e 100644 --- a/tests/e2e/cambai_test.go +++ b/tests/e2e/cambai_test.go @@ -160,7 +160,7 @@ var _ = Describe("CAMB AI API Compatibility Tests", Label("CambAI"), func() { Expect(result.Output).ToNot(BeNil()) }) - It("should stream translation via /apis/translation/stream", func() { + It("should translate via /apis/translation/stream", func() { body := `{ "text": "Hello world", "source_language": 1, @@ -176,9 +176,10 @@ var _ = Describe("CAMB AI API Compatibility Tests", Label("CambAI"), func() { Expect(resp.StatusCode).To(Equal(200)) - data, err := io.ReadAll(resp.Body) + var result map[string]any + err = json.NewDecoder(resp.Body).Decode(&result) Expect(err).ToNot(HaveOccurred()) - Expect(len(data)).To(BeNumerically(">", 0), "Stream should return some text") + Expect(result["translation"]).ToNot(BeEmpty()) }) }) @@ -196,11 +197,12 @@ var _ = Describe("CAMB AI API Compatibility Tests", Label("CambAI"), func() { defer resp.Body.Close() Expect(resp.StatusCode).To(Equal(200)) - 
Expect(resp.Header.Get("Content-Type")).To(HavePrefix("audio/")) - data, err := io.ReadAll(resp.Body) + var taskResp schema.CambAITaskResponse + err = json.NewDecoder(resp.Body).Decode(&taskResp) Expect(err).ToNot(HaveOccurred()) - Expect(len(data)).To(BeNumerically(">", 0)) + Expect(taskResp.TaskID).ToNot(BeEmpty()) + Expect(taskResp.Status).To(Equal("SUCCESS")) }) }) @@ -215,11 +217,10 @@ var _ = Describe("CAMB AI API Compatibility Tests", Label("CambAI"), func() { Expect(resp.StatusCode).To(Equal(200)) - var result schema.CambAIListVoicesResponse + var result []schema.CambAIVoice err = json.NewDecoder(resp.Body).Decode(&result) Expect(err).ToNot(HaveOccurred()) - // voices list may be empty if no TTS models are flagged, but the endpoint should work - Expect(result.Voices).ToNot(BeNil()) + Expect(result).ToNot(BeNil()) }) })