diff --git a/core/http/app.go b/core/http/app.go index 437d524135f0..e5a985e5789f 100644 --- a/core/http/app.go +++ b/core/http/app.go @@ -215,6 +215,7 @@ func API(application *application.Application) (*echo.Echo, error) { requestExtractor := httpMiddleware.NewRequestExtractor(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig()) routes.RegisterElevenLabsRoutes(e, requestExtractor, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig()) + routes.RegisterCambAIRoutes(e, requestExtractor, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig()) // Create opcache for tracking UI operations (used by both UI and LocalAI routes) var opcache *services.OpCache diff --git a/core/http/endpoints/cambai/audio_separation.go b/core/http/endpoints/cambai/audio_separation.go new file mode 100644 index 000000000000..b51bef607418 --- /dev/null +++ b/core/http/endpoints/cambai/audio_separation.go @@ -0,0 +1,17 @@ +package cambai + +import ( + "net/http" + + "github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/schema" +) + +// AudioSeparationEndpoint returns 501 Not Implemented for audio separation. +func AudioSeparationEndpoint() echo.HandlerFunc { + return func(c echo.Context) error { + return c.JSON(http.StatusNotImplemented, schema.CambAIErrorResponse{ + Detail: "Audio separation is not currently supported. 
No backend available.", + }) + } +} diff --git a/core/http/endpoints/cambai/sound_generation.go b/core/http/endpoints/cambai/sound_generation.go new file mode 100644 index 000000000000..124484c77db9 --- /dev/null +++ b/core/http/endpoints/cambai/sound_generation.go @@ -0,0 +1,50 @@ +package cambai + +import ( + "net/http" + + "github.com/google/uuid" + "github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/backend" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/middleware" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/xlog" +) + +// SoundGenerationEndpoint handles CAMB AI text-to-sound (POST /apis/text-to-sound). +func SoundGenerationEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { + return func(c echo.Context) error { + input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITextToSoundRequest) + if !ok { + return echo.ErrBadRequest + } + + cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig) + if !ok || cfg == nil { + return echo.ErrBadRequest + } + + xlog.Debug("CAMB AI text-to-sound request received", "model", input.Model) + + _, _, err := backend.SoundGeneration( + input.Prompt, input.Duration, nil, nil, + nil, nil, + nil, "", "", nil, "", + "", "", + nil, + ml, appConfig, *cfg) + if err != nil { + return err + } + + taskID := uuid.New().String() + + return c.JSON(http.StatusOK, schema.CambAITaskResponse{ + TaskID: taskID, + Status: "SUCCESS", + RunID: taskID, + }) + } +} diff --git a/core/http/endpoints/cambai/transcription.go b/core/http/endpoints/cambai/transcription.go new file mode 100644 index 000000000000..12c5c4e5d3f9 --- /dev/null +++ b/core/http/endpoints/cambai/transcription.go @@ -0,0 +1,114 @@ +package cambai + +import ( + "io" + "net/http" + "os" + "path" + "path/filepath" + "sync" + + "github.com/google/uuid" + 
"github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/backend" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/middleware" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/xlog" +) + +var transcriptionTaskResults = sync.Map{} + +// TranscriptionEndpoint handles CAMB AI transcription (POST /apis/transcribe). +// The SDK sends multipart form with optional file upload and/or media_url. +// Returns {"task_id": "..."} matching OrchestratorPipelineCallResult. +func TranscriptionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { + return func(c echo.Context) error { + cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig) + if !ok || cfg == nil { + return echo.ErrBadRequest + } + + input, _ := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITranscriptionRequest) + + language := "" + if input != nil && input.LanguageID > 0 { + language = schema.CambAILanguageCodeFromID(input.LanguageID) + } + // SDK sends language as multipart form field too + if language == "" { + if langField := c.FormValue("language"); langField != "" { + language = langField + } + } + + // Try file upload first (field "file" or "media_file") + var audioPath string + for _, fieldName := range []string{"file", "media_file"} { + file, err := c.FormFile(fieldName) + if err != nil { + continue + } + + f, err := file.Open() + if err != nil { + return err + } + defer f.Close() + + dir, err := os.MkdirTemp("", "cambai-transcribe") + if err != nil { + return err + } + defer os.RemoveAll(dir) + + dst := filepath.Join(dir, path.Base(file.Filename)) + dstFile, err := os.Create(dst) + if err != nil { + return err + } + + if _, err := io.Copy(dstFile, f); err != nil { + dstFile.Close() + return err + } + dstFile.Close() + audioPath = dst + break + } + + // Fall back to media_url form field + if audioPath == 
"" { + mediaURL := c.FormValue("media_url") + if mediaURL == "" { + mediaURL = c.FormValue("audio_url") + } + if mediaURL != "" { + audioPath = mediaURL + } + } + + if audioPath == "" { + return c.JSON(http.StatusBadRequest, schema.CambAIErrorResponse{ + Detail: "Either a file upload or media_url is required.", + }) + } + + xlog.Debug("CAMB AI transcription request", "path", audioPath, "language", language) + + tr, err := backend.ModelTranscription(audioPath, language, false, false, "", ml, *cfg, appConfig) + if err != nil { + return err + } + + taskID := uuid.New().String() + transcriptionTaskResults.Store(taskID, tr.Text) + + return c.JSON(http.StatusOK, schema.CambAITaskResponse{ + TaskID: taskID, + Status: "SUCCESS", + RunID: taskID, + }) + } +} diff --git a/core/http/endpoints/cambai/translation.go b/core/http/endpoints/cambai/translation.go new file mode 100644 index 000000000000..22903b3513ca --- /dev/null +++ b/core/http/endpoints/cambai/translation.go @@ -0,0 +1,182 @@ +package cambai + +import ( + "context" + "fmt" + "net/http" + "strings" + + "github.com/google/uuid" + "github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/backend" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/middleware" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/xlog" +) + +func buildTranslationPrompt(text, sourceLang, targetLang string) string { + return fmt.Sprintf( + "Translate the following text from %s to %s. Output ONLY the translation, nothing else.\n\n%s", + sourceLang, targetLang, text, + ) +} + +// TranslationEndpoint handles CAMB AI translation (POST /apis/translate). +// Uses an LLM chat backend to perform translation. 
+func TranslationEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { + return func(c echo.Context) error { + input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITranslationRequest) + if !ok { + return echo.ErrBadRequest + } + + cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig) + if !ok || cfg == nil { + return echo.ErrBadRequest + } + + xlog.Debug("CAMB AI translation request received", "model", input.Model) + + sourceLang := schema.CambAILanguageCodeFromID(input.SourceLanguageID) + targetLang := schema.CambAILanguageCodeFromID(input.TargetLanguageID) + + var translations []string + for _, text := range input.Texts { + prompt := buildTranslationPrompt(text, sourceLang, targetLang) + + fn, err := backend.ModelInference( + c.Request().Context(), prompt, nil, nil, nil, nil, + ml, cfg, cl, appConfig, nil, "", "", nil, nil, nil, + ) + if err != nil { + return err + } + + resp, err := fn() + if err != nil { + return err + } + + translations = append(translations, strings.TrimSpace(resp.Response)) + } + + taskID := uuid.New().String() + + return c.JSON(http.StatusOK, schema.CambAITaskStatusResponse{ + Status: "SUCCESS", + RunID: taskID, + Output: schema.CambAITranslationResponse{ + Translation: translations, + SourceLang: input.SourceLanguageID, + TargetLang: input.TargetLanguageID, + }, + }) + } +} + +// TranslationStreamEndpoint handles CAMB AI streaming translation (POST /apis/translation/stream). 
+func TranslationStreamEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { + return func(c echo.Context) error { + input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITranslationStreamRequest) + if !ok { + return echo.ErrBadRequest + } + + cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig) + if !ok || cfg == nil { + return echo.ErrBadRequest + } + + xlog.Debug("CAMB AI translation stream request received", "model", input.Model) + + sourceLang := schema.CambAILanguageCodeFromID(input.SourceLanguageID) + targetLang := schema.CambAILanguageCodeFromID(input.TargetLanguageID) + prompt := buildTranslationPrompt(input.Text, sourceLang, targetLang) + + fn, err := backend.ModelInference( + context.Background(), prompt, nil, nil, nil, nil, + ml, cfg, cl, appConfig, nil, "", "", nil, nil, nil, + ) + if err != nil { + return err + } + + resp, err := fn() + if err != nil { + return err + } + + return c.JSON(http.StatusOK, map[string]any{ + "translation": strings.TrimSpace(resp.Response), + "source_language": input.SourceLanguageID, + "target_language": input.TargetLanguageID, + }) + } +} + +// TranslatedTTSEndpoint handles CAMB AI translated TTS (POST /apis/translated-tts). +// First translates text via LLM, then synthesizes speech from the translation. 
+func TranslatedTTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { + return func(c echo.Context) error { + input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITranslatedTTSRequest) + if !ok { + return echo.ErrBadRequest + } + + cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig) + if !ok || cfg == nil { + return echo.ErrBadRequest + } + + xlog.Debug("CAMB AI translated TTS request received", "model", input.Model) + + sourceLang := schema.CambAILanguageCodeFromID(input.SourceLanguageID) + targetLang := schema.CambAILanguageCodeFromID(input.TargetLanguageID) + prompt := buildTranslationPrompt(input.Text, sourceLang, targetLang) + + // Step 1: Translate + fn, err := backend.ModelInference( + c.Request().Context(), prompt, nil, nil, nil, nil, + ml, cfg, cl, appConfig, nil, "", "", nil, nil, nil, + ) + if err != nil { + return err + } + + resp, err := fn() + if err != nil { + return err + } + + translatedText := strings.TrimSpace(resp.Response) + + // Step 2: TTS on translated text + // Find a TTS model from config + ttsConfigs := cl.GetModelConfigsByFilter(config.BuildUsecaseFilterFn(config.FLAG_TTS)) + if len(ttsConfigs) == 0 { + return c.JSON(http.StatusServiceUnavailable, schema.CambAIErrorResponse{ + Detail: "No TTS model configured. 
Configure a TTS model to use translated TTS.", + }) + } + ttsCfg := ttsConfigs[0] + + voice := fmt.Sprintf("%d", input.VoiceID) + language := targetLang + + filePath, _, err := backend.ModelTTS(translatedText, voice, language, ml, appConfig, ttsCfg) + if err != nil { + return err + } + + taskID := uuid.New().String() + ttsTaskResults.Store(taskID, filePath) + + return c.JSON(http.StatusOK, schema.CambAITaskResponse{ + TaskID: taskID, + Status: "SUCCESS", + RunID: taskID, + }) + } +} diff --git a/core/http/endpoints/cambai/tts.go b/core/http/endpoints/cambai/tts.go new file mode 100644 index 000000000000..bc4786ddf90d --- /dev/null +++ b/core/http/endpoints/cambai/tts.go @@ -0,0 +1,130 @@ +package cambai + +import ( + "fmt" + "net/http" + "path/filepath" + "sync" + + "github.com/google/uuid" + "github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/backend" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/middleware" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/audio" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/xlog" +) + +// ttsTaskResults stores results of async TTS tasks keyed by task ID. +var ttsTaskResults = sync.Map{} + +// TTSStreamEndpoint handles CAMB AI streaming TTS (POST /apis/tts-stream). 
+func TTSStreamEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { + return func(c echo.Context) error { + input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITTSStreamRequest) + if !ok || input.SpeechModel == "" || input.Text == "" { + return echo.ErrBadRequest + } + + cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig) + if !ok || cfg == nil { + return echo.ErrBadRequest + } + + xlog.Debug("CAMB AI TTS stream request received", "model", input.SpeechModel) + + voice := fmt.Sprintf("%d", input.VoiceID) + language := input.Language + + c.Response().Header().Set("Content-Type", "audio/wav") + c.Response().Header().Set("Transfer-Encoding", "chunked") + c.Response().Header().Set("Cache-Control", "no-cache") + c.Response().Header().Set("Connection", "keep-alive") + + err := backend.ModelTTSStream(input.Text, voice, language, ml, appConfig, *cfg, func(audioChunk []byte) error { + _, writeErr := c.Response().Write(audioChunk) + if writeErr != nil { + return writeErr + } + c.Response().Flush() + return nil + }) + if err != nil { + // Fallback to non-streaming TTS + xlog.Debug("Streaming TTS not supported, falling back to non-streaming", "error", err) + filePath, _, ttsErr := backend.ModelTTS(input.Text, voice, language, ml, appConfig, *cfg) + if ttsErr != nil { + return ttsErr + } + filePath, contentType := audio.NormalizeAudioFile(filePath) + if contentType != "" { + c.Response().Header().Set("Content-Type", contentType) + } + return c.Attachment(filePath, filepath.Base(filePath)) + } + + return nil + } +} + +// TTSEndpoint handles CAMB AI async TTS (POST /apis/tts). 
+func TTSEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { + return func(c echo.Context) error { + input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.CambAITTSRequest) + if !ok { + return echo.ErrBadRequest + } + + cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig) + if !ok || cfg == nil { + return echo.ErrBadRequest + } + + xlog.Debug("CAMB AI TTS request received", "model", input.Model) + + voice := fmt.Sprintf("%d", input.VoiceID) + language := schema.CambAILanguageCodeFromID(input.LanguageID) + + filePath, _, err := backend.ModelTTS(input.Text, voice, language, ml, appConfig, *cfg) + if err != nil { + return err + } + + taskID := uuid.New().String() + ttsTaskResults.Store(taskID, filePath) + + return c.JSON(http.StatusOK, schema.CambAITaskResponse{ + TaskID: taskID, + Status: "SUCCESS", + RunID: taskID, + }) + } +} + +// TTSTaskStatusEndpoint handles polling for async TTS results (GET /apis/tts/:task_id). 
+func TTSTaskStatusEndpoint() echo.HandlerFunc { + return func(c echo.Context) error { + taskID := c.Param("task_id") + result, ok := ttsTaskResults.Load(taskID) + if !ok { + return c.JSON(http.StatusNotFound, schema.CambAIErrorResponse{ + Detail: "Task not found", + }) + } + + filePath, ok := result.(string) + if !ok { + return c.JSON(http.StatusInternalServerError, schema.CambAIErrorResponse{ + Detail: "Invalid task result", + }) + } + + filePath, contentType := audio.NormalizeAudioFile(filePath) + if contentType != "" { + c.Response().Header().Set("Content-Type", contentType) + } + return c.Attachment(filePath, filepath.Base(filePath)) + } +} diff --git a/core/http/endpoints/cambai/voice.go b/core/http/endpoints/cambai/voice.go new file mode 100644 index 000000000000..73943c675272 --- /dev/null +++ b/core/http/endpoints/cambai/voice.go @@ -0,0 +1,90 @@ +package cambai + +import ( + "fmt" + "io" + "net/http" + "os" + "path/filepath" + + "github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/xlog" +) + +// ListVoicesEndpoint handles CAMB AI list voices (GET /apis/list-voices). +func ListVoicesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { + return func(c echo.Context) error { + ttsConfigs := cl.GetModelConfigsByFilter(config.BuildUsecaseFilterFn(config.FLAG_TTS)) + + voices := make([]schema.CambAIVoice, 0) + for i, cfg := range ttsConfigs { + voice := schema.CambAIVoice{ + ID: i + 1, + Name: cfg.Name, + } + if cfg.Voice != "" { + voice.Name = fmt.Sprintf("%s (%s)", cfg.Name, cfg.Voice) + } + voices = append(voices, voice) + } + + return c.JSON(http.StatusOK, voices) + } +} + +// CreateCustomVoiceEndpoint handles CAMB AI custom voice creation (POST /apis/create-custom-voice). +// Accepts an audio file upload and saves it for voice cloning. 
+func CreateCustomVoiceEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { + return func(c echo.Context) error { + voiceName := c.FormValue("voice_name") + if voiceName == "" { + return c.JSON(http.StatusBadRequest, schema.CambAIErrorResponse{ + Detail: "voice_name is required", + }) + } + + file, err := c.FormFile("file") + if err != nil { + return c.JSON(http.StatusBadRequest, schema.CambAIErrorResponse{ + Detail: "Audio file is required. Upload as multipart form field 'file'.", + }) + } + + f, err := file.Open() + if err != nil { + return err + } + defer f.Close() + + // Save audio file to models directory for voice cloning + voiceDir := filepath.Join(ml.ModelPath, "voices") + if err := os.MkdirAll(voiceDir, 0750); err != nil { + return err + } + + ext := filepath.Ext(file.Filename) + if ext == "" { + ext = ".wav" + } + dstPath := filepath.Join(voiceDir, voiceName+ext) + + dst, err := os.Create(dstPath) + if err != nil { + return err + } + defer dst.Close() + + if _, err := io.Copy(dst, f); err != nil { + return err + } + + xlog.Info("Custom voice audio saved", "name", voiceName, "path", dstPath) + + return c.JSON(http.StatusOK, schema.CambAICreateCustomVoiceResponse{ + VoiceID: 0, + }) + } +} diff --git a/core/http/routes/cambai.go b/core/http/routes/cambai.go new file mode 100644 index 000000000000..73246849f3b5 --- /dev/null +++ b/core/http/routes/cambai.go @@ -0,0 +1,72 @@ +package routes + +import ( + "github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/endpoints/cambai" + "github.com/mudler/LocalAI/core/http/middleware" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/model" +) + +func RegisterCambAIRoutes(app *echo.Echo, + re *middleware.RequestExtractor, + cl *config.ModelConfigLoader, + ml *model.ModelLoader, + appConfig *config.ApplicationConfig) { + + // TTS streaming (POST /apis/tts-stream) + 
app.POST("/apis/tts-stream", + cambai.TTSStreamEndpoint(cl, ml, appConfig), + re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TTS)), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.CambAITTSStreamRequest) })) + + // TTS async (POST /apis/tts) + app.POST("/apis/tts", + cambai.TTSEndpoint(cl, ml, appConfig), + re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TTS)), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.CambAITTSRequest) })) + + // TTS task status (GET /apis/tts/:task_id) + app.GET("/apis/tts/:task_id", cambai.TTSTaskStatusEndpoint()) + + // Translated TTS (POST /apis/translated-tts) + app.POST("/apis/translated-tts", + cambai.TranslatedTTSEndpoint(cl, ml, appConfig), + re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.CambAITranslatedTTSRequest) })) + + // Translation (POST /apis/translate) + app.POST("/apis/translate", + cambai.TranslationEndpoint(cl, ml, appConfig), + re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.CambAITranslationRequest) })) + + // Translation streaming (POST /apis/translation/stream) + app.POST("/apis/translation/stream", + cambai.TranslationStreamEndpoint(cl, ml, appConfig), + re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.CambAITranslationStreamRequest) })) + + // Transcription (POST /apis/transcribe) + app.POST("/apis/transcribe", + cambai.TranscriptionEndpoint(cl, ml, appConfig), + re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TRANSCRIPT)), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.CambAITranscriptionRequest) })) 
// CambAIOutputConfiguration carries the optional output settings of a TTS
// stream request. Both fields are omitted from JSON when they hold zero values.
type CambAIOutputConfiguration struct {
	Format     string `json:"format,omitempty"`
	SampleRate int    `json:"sample_rate,omitempty"`
}

// CambAITTSInferenceOptions carries the optional inference settings of a TTS
// stream request. Each field is a pointer and is omitted from JSON when nil.
type CambAITTSInferenceOptions struct {
	Speed       *float32 `json:"speed,omitempty"`
	Pitch       *float32 `json:"pitch,omitempty"`
	Temperature *float32 `json:"temperature,omitempty"`
}

// CambAITTSStreamRequest is the JSON body of POST /apis/tts-stream.
type CambAITTSStreamRequest struct {
	Text                string                     `json:"text"`
	VoiceID             int                        `json:"voice_id"`
	Language            string                     `json:"language"`
	SpeechModel         string                     `json:"speech_model"`
	OutputConfiguration *CambAIOutputConfiguration `json:"output_configuration,omitempty"`
	InferenceOptions    *CambAITTSInferenceOptions `json:"inference_options,omitempty"`
}

// ModelName returns the request's speech model. When s is non-nil the speech
// model is first overwritten with *s.
func (r *CambAITTSStreamRequest) ModelName(s *string) string {
	if s == nil {
		return r.SpeechModel
	}
	r.SpeechModel = *s
	return r.SpeechModel
}
// CambAITranslatedTTSRequest is the JSON body of POST /apis/translated-tts.
// Source and target languages are integer language IDs.
type CambAITranslatedTTSRequest struct {
	Text             string `json:"text"`
	VoiceID          int    `json:"voice_id"`
	SourceLanguageID int    `json:"source_language"`
	TargetLanguageID int    `json:"target_language"`
	Model            string `json:"model,omitempty"`
}

// ModelName returns the request's model. When s is non-nil the model is first
// overwritten with *s.
func (r *CambAITranslatedTTSRequest) ModelName(s *string) string {
	if s == nil {
		return r.Model
	}
	r.Model = *s
	return r.Model
}

// CambAITranslationRequest is the JSON body of POST /apis/translate. It carries
// a list of texts to translate.
type CambAITranslationRequest struct {
	Texts            []string `json:"texts"`
	SourceLanguageID int      `json:"source_language"`
	TargetLanguageID int      `json:"target_language"`
	Model            string   `json:"model,omitempty"`
}

// ModelName returns the request's model. When s is non-nil the model is first
// overwritten with *s.
func (r *CambAITranslationRequest) ModelName(s *string) string {
	if s == nil {
		return r.Model
	}
	r.Model = *s
	return r.Model
}

// CambAITranslationStreamRequest is the JSON body of POST /apis/translation/stream.
// It carries a single text to translate.
type CambAITranslationStreamRequest struct {
	Text             string `json:"text"`
	SourceLanguageID int    `json:"source_language"`
	TargetLanguageID int    `json:"target_language"`
	Model            string `json:"model,omitempty"`
}

// ModelName returns the request's model. When s is non-nil the model is first
// overwritten with *s.
func (r *CambAITranslationStreamRequest) ModelName(s *string) string {
	if s == nil {
		return r.Model
	}
	r.Model = *s
	return r.Model
}

// CambAITranscriptionRequest is the request body of POST /apis/transcribe. All
// fields are optional in JSON.
type CambAITranscriptionRequest struct {
	LanguageID int    `json:"language,omitempty"`
	MediaURL   string `json:"media_url,omitempty"`
	Model      string `json:"model,omitempty"`
}

// ModelName returns the request's model. When s is non-nil the model is first
// overwritten with *s.
func (r *CambAITranscriptionRequest) ModelName(s *string) string {
	if s == nil {
		return r.Model
	}
	r.Model = *s
	return r.Model
}
// CambAICreateCustomVoiceRequest describes a POST /apis/create-custom-voice
// request: the name for the new voice and an optional model.
type CambAICreateCustomVoiceRequest struct {
	VoiceName string `json:"voice_name"`
	Model     string `json:"model,omitempty"`
}

// ModelName returns the request's model. When s is non-nil the model is first
// overwritten with *s.
func (r *CambAICreateCustomVoiceRequest) ModelName(s *string) string {
	if s == nil {
		return r.Model
	}
	r.Model = *s
	return r.Model
}
// CambAILanguageIDToCode maps CAMB AI integer language IDs to language codes.
// It covers IDs 1 through 104.
//
// NOTE(review): IDs 75 and 94 both map to "ha" — confirm ID 94 against the
// CAMB AI language table.
var CambAILanguageIDToCode = map[int]string{
	1:   "en",
	2:   "ko",
	3:   "nl",
	4:   "tr",
	5:   "uk",
	6:   "pl",
	7:   "ta",
	8:   "vi",
	9:   "sv",
	10:  "id",
	11:  "ms",
	12:  "ja",
	13:  "zh",
	14:  "bn",
	15:  "th",
	16:  "tl",
	17:  "he",
	18:  "pt-br",
	19:  "pt",
	20:  "ru",
	21:  "ca",
	22:  "te",
	23:  "ml",
	24:  "kn",
	25:  "gu",
	26:  "mr",
	27:  "hi",
	28:  "da",
	29:  "fi",
	30:  "no",
	31:  "hu",
	32:  "sk",
	33:  "cs",
	34:  "el",
	35:  "ro",
	36:  "bg",
	37:  "sr",
	38:  "hr",
	39:  "sl",
	40:  "mk",
	41:  "et",
	42:  "lt",
	43:  "lv",
	44:  "sw",
	45:  "ar",
	46:  "ur",
	47:  "fa",
	48:  "af",
	49:  "my",
	50:  "bs",
	51:  "si",
	52:  "ne",
	53:  "km",
	54:  "es",
	55:  "cy",
	56:  "is",
	57:  "pa",
	58:  "as",
	59:  "ga",
	60:  "am",
	61:  "az",
	62:  "uz",
	63:  "ka",
	64:  "sq",
	65:  "mn",
	66:  "la",
	67:  "gl",
	68:  "eu",
	69:  "it",
	70:  "de",
	71:  "nn",
	72:  "lo",
	73:  "yo",
	74:  "ig",
	75:  "ha",
	76:  "fr",
	77:  "zu",
	78:  "xh",
	79:  "so",
	80:  "mt",
	81:  "eo",
	82:  "jw",
	83:  "su",
	84:  "ps",
	85:  "sd",
	86:  "mg",
	87:  "hy",
	88:  "lb",
	89:  "be",
	90:  "tt",
	91:  "tg",
	92:  "ky",
	93:  "tk",
	94:  "ha",
	95:  "sn",
	96:  "ln",
	97:  "rw",
	98:  "ny",
	99:  "ts",
	100: "tn",
	101: "st",
	102: "ss",
	103: "nd",
	104: "ve",
}

// CambAILanguageCodeFromID converts a CAMB AI language ID to its language code.
// IDs missing from CambAILanguageIDToCode yield the placeholder "lang-<id>".
func CambAILanguageCodeFromID(id int) string {
	code, known := CambAILanguageIDToCode[id]
	if !known {
		return fmt.Sprintf("lang-%d", id)
	}
	return code
}
+func cambaiURL() string { + return fmt.Sprintf("http://127.0.0.1:%d", apiPort) +} + +var _ = Describe("CAMB AI API Compatibility Tests", Label("CambAI"), func() { + var httpClient *http.Client + + BeforeEach(func() { + httpClient = &http.Client{Timeout: 30 * time.Second} + }) + + Describe("TTS Streaming API", func() { + It("should stream audio from /apis/tts-stream", func() { + body := `{ + "text": "Hello world from CAMB AI streaming", + "voice_id": 1, + "language": "en", + "speech_model": "mock-model" + }` + req, err := http.NewRequest("POST", cambaiURL()+"/apis/tts-stream", strings.NewReader(body)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + Expect(resp.StatusCode).To(Equal(200)) + Expect(resp.Header.Get("Content-Type")).To(HavePrefix("audio/")) + + data, err := io.ReadAll(resp.Body) + Expect(err).ToNot(HaveOccurred()) + Expect(len(data)).To(BeNumerically(">", 0), "TTS stream response body should be non-empty") + }) + + It("should return 400 for empty request", func() { + body := `{}` + req, err := http.NewRequest("POST", cambaiURL()+"/apis/tts-stream", strings.NewReader(body)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + // Should fail because text is empty + Expect(resp.StatusCode).To(BeNumerically(">=", 400)) + }) + }) + + Describe("TTS Async API", func() { + It("should return a task response from /apis/tts", func() { + body := `{ + "text": "Hello from async TTS", + "voice_id": 1, + "language": 1 + }` + req, err := http.NewRequest("POST", cambaiURL()+"/apis/tts", strings.NewReader(body)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() 
+ + Expect(resp.StatusCode).To(Equal(200)) + + var taskResp schema.CambAITaskResponse + err = json.NewDecoder(resp.Body).Decode(&taskResp) + Expect(err).ToNot(HaveOccurred()) + Expect(taskResp.TaskID).ToNot(BeEmpty()) + Expect(taskResp.Status).To(Equal("SUCCESS")) + }) + + It("should return audio when polling task status", func() { + // First create a TTS task + body := `{ + "text": "Task polling test", + "voice_id": 1, + "language": 1 + }` + req, err := http.NewRequest("POST", cambaiURL()+"/apis/tts", strings.NewReader(body)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + Expect(resp.StatusCode).To(Equal(200)) + + var taskResp schema.CambAITaskResponse + err = json.NewDecoder(resp.Body).Decode(&taskResp) + Expect(err).ToNot(HaveOccurred()) + + // Poll the task + pollReq, err := http.NewRequest("GET", cambaiURL()+"/apis/tts/"+taskResp.TaskID, nil) + Expect(err).ToNot(HaveOccurred()) + + pollResp, err := httpClient.Do(pollReq) + Expect(err).ToNot(HaveOccurred()) + defer pollResp.Body.Close() + + Expect(pollResp.StatusCode).To(Equal(200)) + Expect(pollResp.Header.Get("Content-Type")).To(HavePrefix("audio/")) + + data, err := io.ReadAll(pollResp.Body) + Expect(err).ToNot(HaveOccurred()) + Expect(len(data)).To(BeNumerically(">", 0)) + }) + + It("should return 404 for unknown task ID", func() { + req, err := http.NewRequest("GET", cambaiURL()+"/apis/tts/nonexistent-task-id", nil) + Expect(err).ToNot(HaveOccurred()) + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + Expect(resp.StatusCode).To(Equal(404)) + }) + }) + + Describe("Translation API", func() { + It("should translate text via /apis/translate", func() { + body := `{ + "texts": ["Hello"], + "source_language": 1, + "target_language": 54 + }` + req, err := http.NewRequest("POST", cambaiURL()+"/apis/translate", 
strings.NewReader(body)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + Expect(resp.StatusCode).To(Equal(200)) + + var result schema.CambAITaskStatusResponse + err = json.NewDecoder(resp.Body).Decode(&result) + Expect(err).ToNot(HaveOccurred()) + Expect(result.Status).To(Equal("SUCCESS")) + Expect(result.Output).ToNot(BeNil()) + }) + + It("should translate via /apis/translation/stream", func() { + body := `{ + "text": "Hello world", + "source_language": 1, + "target_language": 54 + }` + req, err := http.NewRequest("POST", cambaiURL()+"/apis/translation/stream", strings.NewReader(body)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + Expect(resp.StatusCode).To(Equal(200)) + + var result map[string]any + err = json.NewDecoder(resp.Body).Decode(&result) + Expect(err).ToNot(HaveOccurred()) + Expect(result["translation"]).ToNot(BeEmpty()) + }) + }) + + Describe("Sound Generation API", func() { + It("should generate sound via /apis/text-to-sound", func() { + body := `{ + "prompt": "rain falling on a tin roof" + }` + req, err := http.NewRequest("POST", cambaiURL()+"/apis/text-to-sound", strings.NewReader(body)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + Expect(resp.StatusCode).To(Equal(200)) + + var taskResp schema.CambAITaskResponse + err = json.NewDecoder(resp.Body).Decode(&taskResp) + Expect(err).ToNot(HaveOccurred()) + Expect(taskResp.TaskID).ToNot(BeEmpty()) + Expect(taskResp.Status).To(Equal("SUCCESS")) + }) + }) + + Describe("Voice Management API", func() { + It("should list voices via /apis/list-voices", func() { + req, err := 
http.NewRequest("GET", cambaiURL()+"/apis/list-voices", nil) + Expect(err).ToNot(HaveOccurred()) + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + Expect(resp.StatusCode).To(Equal(200)) + + var result []schema.CambAIVoice + err = json.NewDecoder(resp.Body).Decode(&result) + Expect(err).ToNot(HaveOccurred()) + Expect(result).ToNot(BeNil()) + }) + }) + + Describe("Audio Separation API (stub)", func() { + It("should return 501 Not Implemented", func() { + req, err := http.NewRequest("POST", cambaiURL()+"/apis/audio-separation", strings.NewReader(`{}`)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + Expect(resp.StatusCode).To(Equal(501)) + + var result schema.CambAIErrorResponse + err = json.NewDecoder(resp.Body).Decode(&result) + Expect(err).ToNot(HaveOccurred()) + Expect(result.Detail).To(ContainSubstring("not currently supported")) + }) + }) + + Describe("Transcription API", func() { + It("should reject request without audio file", func() { + req, err := http.NewRequest("POST", cambaiURL()+"/apis/transcribe", strings.NewReader(`{}`)) + Expect(err).ToNot(HaveOccurred()) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + Expect(err).ToNot(HaveOccurred()) + defer resp.Body.Close() + + // Should fail because no file was uploaded + Expect(resp.StatusCode).To(BeNumerically(">=", 400)) + }) + }) + + Describe("Language ID Mapping", func() { + It("should map known language IDs correctly", func() { + Expect(schema.CambAILanguageCodeFromID(1)).To(Equal("en")) + Expect(schema.CambAILanguageCodeFromID(54)).To(Equal("es")) + Expect(schema.CambAILanguageCodeFromID(76)).To(Equal("fr")) + Expect(schema.CambAILanguageCodeFromID(70)).To(Equal("de")) + Expect(schema.CambAILanguageCodeFromID(12)).To(Equal("ja")) + 
Expect(schema.CambAILanguageCodeFromID(13)).To(Equal("zh")) + }) + + It("should return fallback for unknown language IDs", func() { + result := schema.CambAILanguageCodeFromID(9999) + Expect(result).To(Equal("lang-9999")) + }) + }) +}) diff --git a/tests/e2e/e2e_suite_test.go b/tests/e2e/e2e_suite_test.go index 66d9d6cd7ffb..d2a93b8cb3b8 100644 --- a/tests/e2e/e2e_suite_test.go +++ b/tests/e2e/e2e_suite_test.go @@ -93,6 +93,13 @@ var _ = BeforeSuite(func() { "parameters": map[string]interface{}{ "model": "mock-model.bin", }, + "known_usecases": []string{ + "FLAG_CHAT", + "FLAG_COMPLETION", + "FLAG_TTS", + "FLAG_TRANSCRIPT", + "FLAG_SOUND_GENERATION", + }, } configPath = filepath.Join(modelsPath, "mock-model.yaml") configYAML, err := yaml.Marshal(modelConfig)