zzet · avfirsov · Jun 15, 2026 · Jun 16, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/cmd/gortex/daemon.go b/cmd/gortex/daemon.go
@@ -37,7 +37,13 @@ var (
 	// (the function has no *cobra.Command of its own) to decide whether
 	// the flag overrides the `embedding:` config block. Set once in
 	// runDaemonStart before buildDaemonState runs.
-	daemonEmbeddingsChanged     bool
+	daemonEmbeddingsChanged bool
+	// daemonEmbeddingsURL / daemonEmbeddingsModel mirror `gortex mcp`'s
+	// --embeddings-url / --embeddings-model so the daemon can be started with an
+	// explicit OpenAI-compatible (or Ollama) embedding API. A non-empty URL forces
+	// the api provider in ResolveEmbedder, overriding the embedding: config block.
+	daemonEmbeddingsURL         string
+	daemonEmbeddingsModel       string
 	daemonStatusWatch           bool
 	daemonStatusInterval        time.Duration
 	daemonHTTPAddr              string
@@ -103,6 +109,10 @@ func init() {
 		"fork to background after starting (logs to the daemon log file — see `gortex daemon logs`)")
 	daemonStartCmd.Flags().BoolVar(&daemonEmbeddings, "embeddings", false,
 		"load a semantic embedding provider (opt-in — adds ~87 MB model download on first use and ~60 ms/symbol warmup)")
+	daemonStartCmd.Flags().StringVar(&daemonEmbeddingsURL, "embeddings-url", "",
+		"OpenAI-compatible (or Ollama) embedding API base URL (e.g. https://api.openai.com/v1). A non-empty URL forces the api provider, overriding the embedding: config. Key via $GORTEX_EMBEDDINGS_API_KEY or $OPENAI_API_KEY (openai.com only).")
+	daemonStartCmd.Flags().StringVar(&daemonEmbeddingsModel, "embeddings-model", "",
+		"embedding model for --embeddings-url (default: auto-detect — text-embedding-3-small for OpenAI, nomic-embed-text for Ollama)")
 	daemonStartCmd.Flags().StringVar(&daemonHTTPAddr, "http-addr", "",
 		"also expose the MCP 2026 Streamable HTTP transport on this TCP address (e.g. 127.0.0.1:7411); empty disables")
 	daemonStartCmd.Flags().StringVar(&daemonHTTPAuthToken, "http-auth-token", "",

diff --git a/cmd/gortex/daemon_state.go b/cmd/gortex/daemon_state.go
@@ -138,6 +138,8 @@ func buildDaemonState(logger *zap.Logger) (*daemonState, error) {
 		Embedder: serverstack.EmbedderRequest{
 			FlagChanged: daemonEmbeddingsChanged,
 			FlagEnabled: daemonEmbeddings,
+			FlagURL:     daemonEmbeddingsURL,
+			FlagModel:   daemonEmbeddingsModel,
 		},
 		// Workspace-global side-store layout: notes/memories partition
 		// under the "daemon" key in the shared DataDir sidecar; the
@@ -641,8 +643,15 @@ func priorMtimesFromStore(g graph.Store, cm *config.ConfigManager, entry config.
 	// a worktree instance persists its mtimes under `<base>@<workspace>`,
 	// not the bare basename, so a plain ResolvePrefix would load the
 	// canonical checkout's mtimes and force a full re-index every restart.
-	prefix := strings.TrimPrefix(indexer.EffectiveRepoPrefix(cm, entry), "/")
-	if prefix == "" {
+	effective := strings.TrimPrefix(indexer.EffectiveRepoPrefix(cm, entry), "/")
+	repoCount := 1
+	if cm != nil {
+		if g := cm.Global(); g != nil {
+			repoCount = len(g.Repos)
+		}
+	}
+	prefix, ok := warmMtimePrefix(effective, repoCount)
+	if !ok {
 		if logger != nil {
 			logger.Info("daemon: priorMtimesFromStore: empty prefix",
 				zap.String("entry_path", entry.Path),
@@ -654,11 +663,41 @@ func priorMtimesFromStore(g graph.Store, cm *config.ConfigManager, entry config.
 	if logger != nil {
 		logger.Info("daemon: priorMtimesFromStore loaded",
 			zap.String("prefix", prefix),
+			zap.Bool("single_repo", repoCount < 2),
 			zap.Int("count", len(mtimes)))
 	}
 	return mtimes
 }
 
+// warmMtimePrefix picks the repo_prefix to look up persisted file mtimes
+// (and, by extension, to decide whether the warm-restart reconcile can run)
+// for a repo whose EffectiveRepoPrefix is `effective` in a daemon tracking
+// `repoCount` repos total.
+//
+// PURPOSE: single-repo daemons index WITHOUT a prefix — MultiIndexer.
+// indexSingleRepo / ReconcileRepoCtx only switch on a repo prefix once a
+// SECOND repo joins (the willBeMultiRepo gate). So a lone repo's nodes and
+// file_mtimes rows are persisted under "", while EffectiveRepoPrefix returns
+// the path basename (e.g. "drools"). Looking mtimes up under the basename
+// finds zero rows and forces a full cold re-index — and, with an API
+// embedder, a full (paid) re-embed — on every restart.
+//
+// RATIONALE: mirror the indexer's own single-vs-multi decision here so the
+// warm path keys mtimes exactly where they were written. In multi-repo mode
+// an empty effective prefix is untrustworthy (it would collide across repos),
+// so report ok=false and let the caller fall back to a cold index.
+//
+// KEYWORDS: warm-restart, repo-prefix, single-repo, file_mtimes, re-embed
+func warmMtimePrefix(effective string, repoCount int) (prefix string, ok bool) {
+	if repoCount < 2 {
+		return "", true
+	}
+	if effective == "" {
+		return "", false
+	}
+	return effective, true
+}
+
 // storeNeedsRebuild reports whether the backend signalled, via the optional
 // NeedsRebuild capability, that a schema migration crossed a rung an ALTER
 // could not satisfy — so its persisted rows are in an old shape and the

diff --git a/cmd/gortex/daemon_state_test.go b/cmd/gortex/daemon_state_test.go
@@ -73,3 +73,33 @@ func TestLSPDisabledSet_Empty(t *testing.T) {
 		t.Fatalf("expected empty map, got %v", got)
 	}
 }
+
+// TestWarmMtimePrefix covers the single- vs multi-repo prefix decision the
+// warm-restart mtime lookup hangs on. The bug it guards: a lone repo indexes
+// unprefixed (rows under ""), but EffectiveRepoPrefix returns the basename, so
+// looking up the basename finds nothing and forces a paid cold re-index every
+// restart.
+func TestWarmMtimePrefix(t *testing.T) {
+	cases := []struct {
+		name       string
+		effective  string
+		repoCount  int
+		wantPrefix string
+		wantOK     bool
+	}{
+		{"single repo uses empty prefix (the bug)", "drools", 1, "", true},
+		{"single repo, zero configured still unprefixed", "drools", 0, "", true},
+		{"multi-repo keeps its derived prefix", "drools", 2, "drools", true},
+		{"multi-repo with no prefix is untrustworthy", "", 3, "", false},
+		{"single repo already empty prefix", "", 1, "", true},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			gotPrefix, gotOK := warmMtimePrefix(tc.effective, tc.repoCount)
+			if gotPrefix != tc.wantPrefix || gotOK != tc.wantOK {
+				t.Fatalf("warmMtimePrefix(%q, %d) = (%q, %v), want (%q, %v)",
+					tc.effective, tc.repoCount, gotPrefix, gotOK, tc.wantPrefix, tc.wantOK)
+			}
+		})
+	}
+}
diff --git a/internal/embedding/api.go b/internal/embedding/api.go
@@ -7,8 +7,10 @@ import (
 	"fmt"
 	"io"
 	"net/http"
+	"os"
 	"strconv"
 	"strings"
+	"sync/atomic"
 	"time"
 )
 
@@ -18,15 +20,65 @@ import (
 // fails and the caller aborts to text-only search.
 const maxRetryAfterWait = 60 * time.Second
 
+// maxEmbedInputBytes caps each embedding input. OpenAI's embedding models
+// reject inputs over 8192 tokens with a 400 that aborts the WHOLE batch
+// (and the vector index). A BPE tokenizer never emits more tokens than
+// input characters, and for single-byte (ASCII) source — the overwhelming
+// majority of code — characters equal bytes, so capping the head at 8000
+// bytes guarantees ≤8000 tokens, safely under the 8192 limit, regardless
+// of how token-dense the snippet is. The truncated head still carries the
+// symbol's signature and leading body — enough signal for nearest-neighbour
+// search. Head-truncation beats dropping the whole index over a few giant
+// generated symbols.
+const maxEmbedInputBytes = 8000
+
+// truncateEmbedInputs head-truncates any input over the byte cap, on a
+// UTF-8 rune boundary so the JSON payload stays valid. Returns the same
+// slice when nothing needed trimming (the common case).
+func truncateEmbedInputs(texts []string) []string {
+	var out []string
+	for i, t := range texts {
+		if len(t) <= maxEmbedInputBytes {
+			continue
+		}
+		if out == nil {
+			out = make([]string, len(texts))
+			copy(out, texts)
+		}
+		b := []byte(t[:maxEmbedInputBytes])
+		for len(b) > 0 && b[len(b)-1]&0xC0 == 0x80 { // back off mid-rune
+			b = b[:len(b)-1]
+		}
+		out[i] = string(b)
+	}
+	if out == nil {
+		return texts
+	}
+	return out
+}
+
 // APIProvider calls an external embedding API (Ollama or OpenAI-compatible).
 type APIProvider struct {
 	url    string
 	model  string
+	apiKey string
 	client *http.Client
 	dims   int
 	format apiFormat
+
+	// tokensUsed accumulates the `usage.total_tokens` reported by the
+	// embedding backend across every request, so the indexer can log the
+	// actual token spend of a paid embedding pass (otherwise invisible).
+	// Touched from several goroutines under the concurrent embedding pool,
+	// hence atomic.
+	tokensUsed int64
 }
 
+// TokensUsed reports the total embedding tokens this provider has been
+// billed for so far, summed from each response's usage.total_tokens.
+// Returns 0 for backends that don't report usage (e.g. Ollama).
+func (p *APIProvider) TokensUsed() int64 { return atomic.LoadInt64(&p.tokensUsed) }
+
 type apiFormat int
 
 const (
@@ -50,9 +102,21 @@ func NewAPIProvider(url, model string) *APIProvider {
 		}
 	}
 
+	// API key for authenticated embedding backends (OpenAI, Azure, and
+	// OpenAI-compatible gateways). Ollama on localhost is keyless, so the
+	// key stays optional and an unset value just omits the header. Prefer
+	// an explicit GORTEX_EMBEDDINGS_API_KEY; fall back to OPENAI_API_KEY
+	// only when the endpoint is api.openai.com, so a stray OPENAI_API_KEY
+	// can never leak to an arbitrary third-party URL.
+	apiKey := os.Getenv("GORTEX_EMBEDDINGS_API_KEY")
+	if apiKey == "" && strings.Contains(url, "openai.com") {
+		apiKey = os.Getenv("OPENAI_API_KEY")
+	}
+
 	return &APIProvider{
 		url:    strings.TrimRight(url, "/"),
 		model:  model,
+		apiKey: apiKey,
 		client: &http.Client{Timeout: 30 * time.Second},
 		format: format,
 	}
@@ -79,6 +143,40 @@ func (p *APIProvider) EmbedBatch(ctx context.Context, texts []string) ([][]float
 func (p *APIProvider) Dimensions() int { return p.dims }
 func (p *APIProvider) Close() error    { return nil }
 
+// ProbeDimensions makes one tiny embedding call to discover and cache the
+// provider's vector width, so Dimensions() reports the true value *before*
+// the first indexing pass. An APIProvider learns its width only from the
+// first real embed (embedOpenAI / embedOllama set p.dims from the returned
+// vector); until then Dimensions() returns 0, which has two concrete
+// consequences at daemon startup: the "embeddings enabled" log mislabels
+// the width as dim:0, and the snapshot-vector reload gate
+// (daemon_state.go: vec.Dims == EmbedderDims) rejects a correctly-sized
+// persisted index, forcing a needless full re-embed on every restart.
+//
+// Idempotent: a no-op once the width is known. Best-effort: on any
+// transport/auth error it returns the error and leaves dims at 0 — the
+// caller logs a warning, the lazy path still sets the width from the first
+// real vector, and indexing degrades to BM25 if embeddings are truly
+// unreachable. The probe also doubles as an early connectivity/credential
+// check, surfacing a bad key or URL at startup instead of mid-index.
+func (p *APIProvider) ProbeDimensions(ctx context.Context) (int, error) {
+	if d := p.Dimensions(); d > 0 {
+		return d, nil
+	}
+	pctx, cancel := context.WithTimeout(ctx, 20*time.Second)
+	defer cancel()
+	vec, err := p.Embed(pctx, "gortex embedding dimension probe")
+	if err != nil {
+		return 0, err
+	}
+	// Embed -> EmbedBatch -> embed{OpenAI,Ollama} already cached p.dims from
+	// the response; fall back to the returned vector's length defensively.
+	if p.dims == 0 && len(vec) > 0 {
+		p.dims = len(vec)
+	}
+	return p.dims, nil
+}
+
 // Concurrent reports that this provider is safe — and worth — calling
 // from several goroutines at once. An external HTTP embedding endpoint
 // gains from overlapped round-trips; the indexer's embedding pool uses
@@ -155,7 +253,7 @@ type ollamaResponse struct {
 func (p *APIProvider) embedOllama(ctx context.Context, texts []string) ([][]float32, error) {
 	reqBody := ollamaRequest{
 		Model: p.model,
-		Input: texts,
+		Input: truncateEmbedInputs(texts),
 	}
 
 	body, err := json.Marshal(reqBody)
@@ -169,6 +267,9 @@ func (p *APIProvider) embedOllama(ctx context.Context, texts []string) ([][]floa
 		return nil, fmt.Errorf("create request: %w", err)
 	}
 	req.Header.Set("Content-Type", "application/json")
+	if p.apiKey != "" {
+		req.Header.Set("Authorization", "Bearer "+p.apiKey)
+	}
 
 	resp, err := p.doRequest(ctx, req, body)
 	if err != nil {
@@ -201,7 +302,15 @@ type openAIRequest struct {
 }
 
 type openAIResponse struct {
-	Data []openAIEmbedding `json:"data"`
+	Data  []openAIEmbedding `json:"data"`
+	Usage openAIUsage       `json:"usage"`
+}
+
+// openAIUsage carries the token accounting OpenAI returns alongside every
+// embeddings response. total_tokens is what the request is billed on.
+type openAIUsage struct {
+	PromptTokens int `json:"prompt_tokens"`
+	TotalTokens  int `json:"total_tokens"`
 }
 
 type openAIEmbedding struct {
@@ -212,20 +321,32 @@ type openAIEmbedding struct {
 func (p *APIProvider) embedOpenAI(ctx context.Context, texts []string) ([][]float32, error) {
 	reqBody := openAIRequest{
 		Model: p.model,
-		Input: texts,
+		Input: truncateEmbedInputs(texts),
 	}
 
 	body, err := json.Marshal(reqBody)
 	if err != nil {
 		return nil, fmt.Errorf("marshal request: %w", err)
 	}
 
-	url := p.url + "/v1/embeddings"
+	// OpenAI-compatible bases are conventionally given WITH the version
+	// segment (OpenAI "https://api.openai.com/v1", OpenRouter
+	// "https://openrouter.ai/api/v1"). Append "/v1" only when it is absent,
+	// so a "…/v1" base does not become "…/v1/v1/embeddings" (a 404 that
+	// silently degrades the whole vector index to BM25).
+	endpoint := "/v1/embeddings"
+	if strings.HasSuffix(p.url, "/v1") {
+		endpoint = "/embeddings"
+	}
+	url := p.url + endpoint
 	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
 	if err != nil {
 		return nil, fmt.Errorf("create request: %w", err)
 	}
 	req.Header.Set("Content-Type", "application/json")
+	if p.apiKey != "" {
+		req.Header.Set("Authorization", "Bearer "+p.apiKey)
+	}
 
 	resp, err := p.doRequest(ctx, req, body)
 	if err != nil {
@@ -243,6 +364,10 @@ func (p *APIProvider) embedOpenAI(ctx context.Context, texts []string) ([][]floa
 		return nil, fmt.Errorf("decode response: %w", err)
 	}
 
+	if result.Usage.TotalTokens > 0 {
+		atomic.AddInt64(&p.tokensUsed, int64(result.Usage.TotalTokens))
+	}
+
 	vecs := make([][]float32, len(result.Data))
 	for _, d := range result.Data {
 		vecs[d.Index] = d.Embedding