diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fb5524..f0b3600 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ Tag naming: `0.y.z` (no `v` prefix). Align `cmd/version.go` with the tag before ## [Unreleased] +### Fixed + +- **code_files RAG**: populate `rag_for_content` when `fields.content.rag` is enabled so the control plane indexes GitLab file text (schema `usable_in_rag` expects a `rag_for_`-prefixed field); omit raw `content` from ingest to keep payload size bounded. + ## [0.1.0] - 2026-05-23 ### Added diff --git a/config/schema.yml b/config/schema.yml index f146438..73a6064 100644 --- a/config/schema.yml +++ b/config/schema.yml @@ -170,7 +170,7 @@ entities: description: "Path relative to the repository root" content: type: "string" - description: "Text content (RAG when fields.content.rag is enabled on the code_files entity)" + description: "Raw file text in snapshots (RAG corpus uses rag_for_content when fields.content.rag is enabled)" usable_in_rag: true title: type: "string" diff --git a/internal/models/code_file.go b/internal/models/code_file.go index f108297..b6a8363 100644 --- a/internal/models/code_file.go +++ b/internal/models/code_file.go @@ -2,10 +2,12 @@ package models // CodeFile is a text file under a cloned repository, pushed for RAG indexing on the control plane. type CodeFile struct { - ID string `json:"id"` - RepoURL string `json:"repo_url"` - FilePath string `json:"file_path"` - Content string `json:"content"` - Title string `json:"title,omitempty"` - SourceURL string `json:"source_url,omitempty"` + ID string `json:"id"` + RepoURL string `json:"repo_url"` + FilePath string `json:"file_path"` + Content string `json:"content"` + // RagForContent is indexed for RAG (control plane: usable_in_rag on schema field content). 
+ RagForContent string `json:"rag_for_content,omitempty"` + Title string `json:"title,omitempty"` + SourceURL string `json:"source_url,omitempty"` } diff --git a/internal/probe/entities/code_files_entity.go b/internal/probe/entities/code_files_entity.go index d06c3bb..7096035 100644 --- a/internal/probe/entities/code_files_entity.go +++ b/internal/probe/entities/code_files_entity.go @@ -60,9 +60,14 @@ func (e *CodeFilesEntity) Refresh(client core.Client) (interface{}, error) { log.Printf("code_files: scan %s: %v", r.URL, err) continue } - if !ragEnabled { - for i := range files { + for i := range files { + if ragEnabled { + files[i].RagForContent = files[i].Content + // RAG text is sent once as rag_for_content; omit content to keep ingest payload bounded. files[i].Content = "" + } else { + files[i].Content = "" + files[i].RagForContent = "" } } all = append(all, files...) diff --git a/internal/probe/entities/code_files_entity_test.go b/internal/probe/entities/code_files_entity_test.go new file mode 100644 index 0000000..cb8b41f --- /dev/null +++ b/internal/probe/entities/code_files_entity_test.go @@ -0,0 +1,55 @@ +package entities + +import ( + "testing" + + "fluid/probes/core" + "fluid/probes/gitlab/internal/config" + "fluid/probes/gitlab/internal/models" +) + +type stubConfig struct { + entities []core.EntityConfig +} + +func (s *stubConfig) GetEntities() []core.EntityConfig { return s.entities } +func (s *stubConfig) GetProbeName() string { return "test" } +func (s *stubConfig) GetProbeVersion() string { return "0.0.0" } +func (s *stubConfig) GetStateDir() string { return "state" } +func (s *stubConfig) GetCleanupInterval() int { return 60 } + +func TestCodeFilesRAGPayload(t *testing.T) { + t.Parallel() + + cfg := &config.Config{ + Data: config.DataConfig{ + Entities: []core.EntityConfig{ + { + Name: "code_files", + Fields: map[string]core.EntityFieldConfig{ + "content": {RAG: true}, + }, + }, + }, + }, + } + + files := []models.CodeFile{{Content: "resource 
\"x\" {}"}} + ragEnabled := codeFilesRAGEnabled(cfg) + for i := range files { + if ragEnabled { + files[i].RagForContent = files[i].Content + files[i].Content = "" + } else { + files[i].Content = "" + files[i].RagForContent = "" + } + } + + if files[0].RagForContent != `resource "x" {}` { + t.Fatalf("expected rag_for_content payload, got %q", files[0].RagForContent) + } + if files[0].Content != "" { + t.Fatal("expected content omitted when RAG is enabled") + } +}