From 27edf8b7ca601506da77db94ba1fcd8279993ddb Mon Sep 17 00:00:00 2001 From: Robert Gonek Date: Tue, 3 Mar 2026 09:45:11 +0100 Subject: [PATCH 1/6] feat(search): Phase 1 - shared types, Store interface, Goldmark parser Add Document, SearchOptions, SearchResult types and the Store interface. Add ParseMarkdownStructure() for Goldmark AST-based heading/code extraction. --- agents/plans/full-text-search.md | 327 +++++++++++++++++++++++++++++++ internal/search/document.go | 100 ++++++++++ internal/search/parser.go | 256 ++++++++++++++++++++++++ internal/search/parser_test.go | 233 ++++++++++++++++++++++ internal/search/store.go | 35 ++++ 5 files changed, 951 insertions(+) create mode 100644 agents/plans/full-text-search.md create mode 100644 internal/search/document.go create mode 100644 internal/search/parser.go create mode 100644 internal/search/parser_test.go create mode 100644 internal/search/store.go diff --git a/agents/plans/full-text-search.md b/agents/plans/full-text-search.md new file mode 100644 index 0000000..866f6e2 --- /dev/null +++ b/agents/plans/full-text-search.md @@ -0,0 +1,327 @@ +# Plan: Full-Text Search with Bleve + SQLite FTS5 (Dual Backend) + +## Context + +AI agents using `conf` are token-expensive during reads because grep/ripgrep has no awareness of document structure. The Atlassian MCP wins on reads because it returns targeted, structured results. Adding `conf search` with Goldmark (markdown AST) and a pluggable search backend gives agents heading-anchored, faceted search — the "search-then-read" pattern that makes MCP efficient, but locally with zero API calls. + +We implement **two backends** (Bleve and SQLite FTS5) behind a shared interface to evaluate which works better in practice, then drop the loser. 
+ +## Architecture + +``` +cmd/search.go -- CLI command, output formatting + | +internal/search/ + parser.go -- Goldmark AST → sections, code blocks + document.go -- Shared document types + store.go -- Store interface + indexer.go -- Orchestrates file walking + store calls + | + ├── blevestore/store.go -- Bleve scorch implementation + └── sqlitestore/store.go -- SQLite + FTS5 implementation +``` + +### Store Interface + +```go +type Store interface { + Index(docs []Document) error // Upsert documents for a file + DeleteByPath(relPath string) error // Remove all docs for a file + Search(opts SearchOptions) ([]SearchResult, error) + ListLabels() ([]string, error) // All unique labels with counts + ListSpaces() ([]string, error) // All unique space keys + UpdateMeta() error // Mark index timestamp + LastIndexedAt() (time.Time, error) // Read index timestamp + Close() error +} +``` + +Both backends implement this. The `Indexer` orchestrates file walking and calls `Store` methods — it never touches Bleve or SQLite directly. + +### Document Model (shared, 3 types) + +| Doc Type | ID Pattern | Purpose | +|----------|-----------|---------| +| `page` | `page:` | Full file: frontmatter facets + full body text | +| `section` | `section::` | Heading-anchored section: heading hierarchy + section content | +| `code` | `code::` | Fenced code block: language tag + content + heading context | + +All types carry denormalized frontmatter fields (`space_key`, `labels`, `title`, `page_id`) for zero-join filtering. 
+ +```go +type Document struct { + ID string // Composite ID + Type string // "page", "section", "code" + Path string // Relative path (forward slashes) + PageID string + Title string + SpaceKey string + Labels []string + Content string // Body (page), section text, or code content + HeadingPath []string // Heading hierarchy for sections/code + HeadingText string // Innermost heading text + HeadingLevel int + Language string // Code block language + Line int // 1-based start line + ModTime time.Time +} +``` + +## File Layout + +``` +internal/search/ + document.go -- Document struct, SearchResult, Match types + store.go -- Store interface definition + parser.go -- ParseMarkdownStructure() Goldmark AST walker + parser_test.go + indexer.go -- Indexer: file walking, Store orchestration + indexer_test.go + search_testhelpers_test.go -- Shared test helpers + + blevestore/ + store.go -- Bleve Store implementation + store_test.go + mapping.go -- Bleve index mapping/schema + + sqlitestore/ + store.go -- SQLite+FTS5 Store implementation + store_test.go + schema.go -- DDL statements, migration + +cmd/ + search.go -- newSearchCmd(), runSearch(), formatters + search_test.go +``` + +## Implementation Phases + +### Phase 1: Shared Types + Goldmark Parser +**Files:** `internal/search/document.go`, `internal/search/store.go`, `internal/search/parser.go`, `internal/search/parser_test.go` + +Zero external dependencies beyond Goldmark (already in go.mod). + +**`ParseMarkdownStructure(source []byte) ParseResult`** walks the Goldmark AST: +1. Collect all `*ast.Heading` nodes (level, text, line) and `*ast.FencedCodeBlock` nodes +2. Build sections with heading stack: pop entries when same-or-higher level heading arrives +3. 
Map code blocks to enclosing heading context by line position + +**Reuse pattern from:** `internal/sync/assets.go:34` — `goldmark.New().Parser().Parse(text.NewReader(source))` + `ast.Walk` + +**Line numbers:** Convert Goldmark byte offsets (`node.Lines().At(0).Start`) to 1-based lines by counting `\n` in `source[:offset]`. + +### Phase 2a: Bleve Backend +**Files:** `internal/search/blevestore/mapping.go`, `internal/search/blevestore/store.go`, `internal/search/blevestore/store_test.go` + +**Dependency:** `go get github.com/blevesearch/bleve/v2` + +**Index location:** `/.confluence-search-index/bleve/` + +**Field mapping:** +- **keyword** (exact match): `type`, `path`, `page_id`, `space_key`, `labels`, `language` +- **text** (standard analyzer): `title`, `content`, `heading_text`, `heading_path` +- **numeric**: `heading_level`, `line` +- **date**: `mod_time` + +**Search query construction:** +- Text: `DisjunctionQuery` across `content` (boost 2.0), `heading_text` (1.5), `title` (1.0) +- Filters: `TermQuery` on keyword fields +- Combined: `ConjunctionQuery` +- Result aggregation: group hits by `path`, sort by top score + +### Phase 2b: SQLite FTS5 Backend +**Files:** `internal/search/sqlitestore/schema.go`, `internal/search/sqlitestore/store.go`, `internal/search/sqlitestore/store_test.go` + +**Dependency:** `go get modernc.org/sqlite` (pure Go, no CGo — works on Windows without gcc) + +**Index location:** `/.confluence-search-index/search.db` + +**Schema:** +```sql +CREATE TABLE IF NOT EXISTS documents ( + id TEXT PRIMARY KEY, + type TEXT NOT NULL, -- 'page', 'section', 'code' + path TEXT NOT NULL, + page_id TEXT, + title TEXT, + space_key TEXT, + labels TEXT, -- JSON array: '["arch","security"]' + content TEXT, + heading_path TEXT, -- JSON array: '["## Foo","### Bar"]' + heading_text TEXT, + heading_level INTEGER, + language TEXT, + line INTEGER, + mod_time TEXT +); + +CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path); +CREATE INDEX IF NOT EXISTS 
idx_documents_type ON documents(type); +CREATE INDEX IF NOT EXISTS idx_documents_space ON documents(space_key); + +-- FTS5 virtual table for full-text search +CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5( + title, + content, + heading_text, + content=documents, + content_rowid=rowid, + tokenize='porter unicode61' +); + +-- Triggers to keep FTS in sync +CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents BEGIN + INSERT INTO documents_fts(rowid, title, content, heading_text) + VALUES (new.rowid, new.title, new.content, new.heading_text); +END; +-- (similar for UPDATE/DELETE triggers) + +CREATE TABLE IF NOT EXISTS meta ( + key TEXT PRIMARY KEY, + value TEXT +); +``` + +**Search query:** +```sql +SELECT d.*, fts.rank +FROM documents_fts fts +JOIN documents d ON d.rowid = fts.rowid +WHERE documents_fts MATCH ? + AND d.type IN ('section', 'code') + AND (? = '' OR d.space_key = ?) + AND (? = '' OR EXISTS ( + SELECT 1 FROM json_each(d.labels) WHERE json_each.value = ? + )) +ORDER BY fts.rank +LIMIT ? +``` + +**Label listing:** `SELECT DISTINCT j.value FROM documents, json_each(documents.labels) j ORDER BY j.value` + +### Phase 3: Indexer (shared orchestration) +**Files:** `internal/search/indexer.go`, `internal/search/indexer_test.go` + +The `Indexer` operates on the `Store` interface — backend-agnostic. + +```go +type Indexer struct { + store Store + rootDir string +} + +func NewIndexer(store Store, rootDir string) *Indexer +func (ix *Indexer) Reindex() (int, error) // Full reindex +func (ix *Indexer) IndexSpace(spaceDir, spaceKey string) (int, error) +func (ix *Indexer) IncrementalUpdate() (int, error) // Mtime-based delta +func (ix *Indexer) Close() error +``` + +**Per-file indexing flow:** +1. `fs.ReadMarkdownDocument(absPath)` — get frontmatter + body +2. `ParseMarkdownStructure(body)` — get sections + code blocks +3. `store.DeleteByPath(relPath)` — remove old documents +4. Build `[]Document` (1 page + N sections + M code blocks) +5. 
`store.Index(docs)` — insert all + +**File walking:** Reuse the standard skip pattern (`assets/`, `.`-prefixed dirs, `.md` only) from `internal/sync/index.go`. Discover spaces via `fs.FindAllStateFiles()`. + +### Phase 4: CLI Command +**Files:** `cmd/search.go`, `cmd/search_test.go` + +**Command:** `conf search QUERY [flags]` + +**Flags:** +| Flag | Type | Default | Description | +|------|------|---------|-------------| +| `--space` | string | "" | Filter to specific space key | +| `--label` | []string | nil | Filter by label (repeatable) | +| `--heading` | string | "" | Filter to sections under matching headings | +| `--format` | string | auto | `text` or `json` (auto: TTY→text, pipe→json) | +| `--limit` | int | 20 | Max results | +| `--reindex` | bool | false | Force full reindex before searching | +| `--engine` | string | "sqlite" | Backend: `bleve` or `sqlite` (for A/B testing) | +| `--list-labels` | bool | false | List all indexed labels and exit | +| `--list-spaces` | bool | false | List all indexed spaces and exit | + +**`runSearch` flow:** +1. `gitRepoRoot()` +2. Open store based on `--engine` flag +3. Create `Indexer` with store +4. If `--reindex`: full reindex, else: incremental update +5. If `--list-labels`: `store.ListLabels()` → format → exit +6. If `--list-spaces`: `store.ListSpaces()` → format → exit +7. `store.Search(opts)` → format output + +**Text output:** +``` +DEV/security/overview.md - Security Overview [architecture, security] + ## OAuth2 Flow > ### Token Refresh (line 87) + ...refresh tokens are rotated every 15 minutes using PKCE... +``` + +**JSON output:** `[]SearchResult` with `json.Encoder.SetIndent("", " ")` + +**Registration:** Add `newSearchCmd()` to `rootCmd.AddCommand(...)` in `cmd/root.go:98` + +### Phase 5: Integration Hooks +**Modified files:** `cmd/pull.go`, `cmd/clean.go`, `cmd/init.go` + +**5a. 
Post-pull indexing** (`cmd/pull.go` after line 336): +```go +if err := updateSearchIndexForSpace(repoRoot, pullCtx.spaceDir, pullCtx.spaceKey, out); err != nil { + _, _ = fmt.Fprintf(out, "warning: search index update failed: %v\n", err) +} +``` +Non-fatal — a failed index update never fails a pull. + +**5b. Clean command** (`cmd/clean.go` after line 124): +Remove search index directories as part of clean artifacts. + +**5c. Gitignore** (`cmd/init.go`): +- Add `.confluence-search-index/` to `gitignoreContent` template (line 15) +- Add `.confluence-search-index/` to `ensureGitignore()` entries (line 221) + +**5d. Repo `.gitignore`** — add `.confluence-search-index/` + +### Phase 6: Documentation +- Update `AGENTS.md` with `conf search` command reference +- Update `docs/usage.md` with search docs + +## Implementation Order + +1. **Phase 1** — Shared types + Goldmark parser (testable immediately, no new deps) +2. **Phase 2b** — SQLite backend first (simpler, pure Go, faster to get working) +3. **Phase 3** — Indexer (uses SQLite backend for initial testing) +4. **Phase 4** — CLI command (end-to-end working with SQLite) +5. **Phase 5** — Integration hooks +6. **Phase 2a** — Bleve backend (add second backend, compare) +7. **Phase 6** — Documentation + decide which backend to keep + +## Critical Files Reference + +| File | Action | +|------|--------| +| `internal/fs/frontmatter.go` | Reuse `ReadMarkdownDocument()`, `ReadFrontmatter()`, `NormalizeLabels()` | +| `internal/fs/state.go` | Reuse `FindAllStateFiles()`, `LoadState()` | +| `internal/sync/assets.go:34` | Reference Goldmark AST walk pattern | +| `internal/sync/index.go` | Replicate WalkDir skip logic | +| `cmd/root.go:98` | Add `newSearchCmd()` | +| `cmd/pull.go:336` | Insert post-pull indexing hook | +| `cmd/clean.go:124` | Insert search index cleanup | +| `cmd/init.go:15,221` | Add `.confluence-search-index/` to gitignore | + +## Verification + +1. **Unit tests:** `make test` — all new tests pass +2. 
**Smoke test both backends:** + - `conf search "term" --engine sqlite` vs `conf search "term" --engine bleve` + - Compare: speed, result quality, index size +3. **Facet discovery:** + - `conf search --list-labels --format json` → verify all labels returned + - `conf search --list-spaces --format json` → verify all spaces returned +4. **Incremental:** Edit a file → `conf search "term"` → verify only changed file reindexed +5. **Post-pull:** `conf pull SPACE` → verify "Updated search index" message +6. **Clean:** `conf clean` → verify index removed +7. **Pipe:** `conf search "term" | head` → verify auto-JSON format +8. **Init:** `conf init` → verify `.gitignore` includes `.confluence-search-index/` diff --git a/internal/search/document.go b/internal/search/document.go new file mode 100644 index 0000000..25bc231 --- /dev/null +++ b/internal/search/document.go @@ -0,0 +1,100 @@ +// Package search provides full-text search over a local Confluence Markdown workspace. +package search + +import "time" + +// DocType enumerates the document types indexed by the search engine. +const ( + DocTypePage = "page" + DocTypeSection = "section" + DocTypeCode = "code" +) + +// Document represents a single indexable unit produced from a Markdown file. +// +// Three document types share this struct: +// - page (ID = "page:") — whole-file, frontmatter facets + full body +// - section (ID = "section::") — heading-anchored slice of body text +// - code (ID = "code::") — fenced code block + heading context +// +// All types carry denormalized frontmatter fields (SpaceKey, Labels, Title, PageID) +// so that filtering never requires a join. +type Document struct { + // ID is a composite, globally unique key. + ID string + + // Type is DocTypePage, DocTypeSection, or DocTypeCode. + Type string + + // Path is the repository-relative path with forward slashes, e.g. "DEV/overview.md". + Path string + + // PageID is the Confluence page identifier from frontmatter (may be empty for new files). 
+ PageID string + + // Title is the Confluence page title from frontmatter. + Title string + + // SpaceKey is the Confluence space key from frontmatter. + SpaceKey string + + // Labels are Confluence page labels, normalised (lowercase, trimmed, deduplicated). + Labels []string + + // Content holds the searchable text: full body for page docs, heading-section text for + // section docs, and raw code content for code docs. + Content string + + // HeadingPath is the ordered heading hierarchy from root to the section/code block, + // e.g. ["# Overview", "## Auth Flow", "### Token Refresh"]. + HeadingPath []string + + // HeadingText is the innermost heading label (for section/code docs). + HeadingText string + + // HeadingLevel is the Markdown heading level (1–6) of HeadingText; 0 for page docs. + HeadingLevel int + + // Language is the fenced code block info string (e.g. "go", "sql"); empty for non-code docs. + Language string + + // Line is the 1-based start line in the source file (0 for page docs). + Line int + + // ModTime is the last modification time of the source file. + ModTime time.Time +} + +// SearchOptions controls a full-text search query. +type SearchOptions struct { + // Query is the full-text search string. May be empty when listing labels/spaces. + Query string + + // SpaceKey restricts results to a single Confluence space key. Empty = all spaces. + SpaceKey string + + // Labels restricts results to pages that carry ALL of the given labels. + Labels []string + + // HeadingFilter restricts section/code results to those whose HeadingText contains + // this substring (case-insensitive). + HeadingFilter string + + // Types restricts results to the given document types. nil = all types. + Types []string + + // Limit is the maximum number of results to return. 0 = use a sensible default. + Limit int +} + +// SearchResult is a single match returned by Store.Search. +type SearchResult struct { + // Document is the full indexed document. 
+ Document Document + + // Score is a backend-specific relevance score (higher = more relevant). + Score float64 + + // Snippet is a short contextual excerpt with the matched terms highlighted. + Snippet string +} diff --git a/internal/search/parser.go b/internal/search/parser.go new file mode 100644 index 0000000..c742181 --- /dev/null +++ b/internal/search/parser.go @@ -0,0 +1,256 @@ +package search + +import ( + "bytes" + "fmt" + "strings" + + "github.com/yuin/goldmark" + "github.com/yuin/goldmark/ast" + "github.com/yuin/goldmark/text" +) + +// ParseResult is the structured representation of a Markdown document's headings and +// code blocks, produced by ParseMarkdownStructure. +type ParseResult struct { + // Sections holds heading-anchored sections of the document. + // Each section contains the heading metadata and the body text that follows + // the heading up to (but not including) the next same-or-higher-level heading. + Sections []Section + + // CodeBlocks holds all fenced code blocks together with their heading context. + CodeBlocks []CodeBlock +} + +// Section is a slice of Markdown body text bounded by heading boundaries. +type Section struct { + // HeadingText is the plain text of the heading node. + HeadingText string + + // HeadingLevel is 1–6. + HeadingLevel int + + // HeadingPath is the full ancestor hierarchy including this heading, + // e.g. ["# Overview", "## Auth", "### Token Refresh"]. + HeadingPath []string + + // Content is the raw Markdown text of the section body (excluding the heading line). + Content string + + // Line is the 1-based line of the heading in the source file. + Line int +} + +// CodeBlock is a fenced code block with its contextual heading hierarchy. +type CodeBlock struct { + // Language is the info string after the opening fence (may be empty). + Language string + + // Content is the raw code text inside the fence. + Content string + + // HeadingPath is the heading ancestry at the point of the code block. 
+ HeadingPath []string + + // HeadingText is the innermost heading text (empty if the code block precedes all headings). + HeadingText string + + // HeadingLevel is the level of HeadingText (0 if none). + HeadingLevel int + + // Line is the 1-based start line of the code block in the source file. + Line int +} + +// headingEntry tracks the active heading stack used during AST traversal. +type headingEntry struct { + level int + text string + line int +} + +// ParseMarkdownStructure walks the Goldmark AST of the given Markdown source and +// extracts heading-anchored sections and fenced code blocks. +// +// Line numbers are 1-based. The function is pure: it allocates no I/O resources. +func ParseMarkdownStructure(source []byte) ParseResult { + parser := goldmark.New().Parser() + reader := text.NewReader(source) + doc := parser.Parse(reader) + + var ( + headingStack []headingEntry + sections []Section + codeBlocks []CodeBlock + + // sectionStart is the byte offset at which the current section body begins + // (i.e., just after the heading line). + sectionStart int + ) + + // offsetToLine converts a byte offset into a 1-based line number. + offsetToLine := func(offset int) int { + if offset <= 0 { + return 1 + } + if offset > len(source) { + offset = len(source) + } + return bytes.Count(source[0:offset], []byte{'\n'}) + 1 + } + + // headingPathStrings builds the HeadingPath slice from the current stack. + headingPathStrings := func(stack []headingEntry) []string { + path := make([]string, len(stack)) + for i, e := range stack { + path[i] = fmt.Sprintf("%s %s", strings.Repeat("#", e.level), e.text) + } + return path + } + + // finishSection closes the pending section, appending it to the sections slice. + // endOffset is the byte offset at which the section body ends (exclusive). 
+ finishSection := func(endOffset int) { + if len(headingStack) == 0 { + return + } + top := headingStack[len(headingStack)-1] + body := "" + if sectionStart < endOffset && endOffset <= len(source) { + body = strings.TrimSpace(string(source[sectionStart:endOffset])) + } + sections = append(sections, Section{ + HeadingText: top.text, + HeadingLevel: top.level, + HeadingPath: headingPathStrings(headingStack), + Content: body, + Line: top.line, + }) + } + + // currentHeadingContext returns the heading context at the current position in the AST. + currentHeadingContext := func() (path []string, text string, level int) { + path = headingPathStrings(headingStack) + if len(headingStack) > 0 { + top := headingStack[len(headingStack)-1] + text = top.text + level = top.level + } + return + } + + _ = ast.Walk(doc, func(node ast.Node, entering bool) (ast.WalkStatus, error) { + switch n := node.(type) { + case *ast.Heading: + if !entering { + return ast.WalkContinue, nil + } + + // Determine the byte offset of this heading node. + headingOffset := 0 + if n.Lines().Len() > 0 { + headingOffset = n.Lines().At(0).Start + } else if n.HasChildren() { + // For setext-style headings the child carries the offset. + child := n.FirstChild() + if child != nil && child.Lines().Len() > 0 { + headingOffset = child.Lines().At(0).Start + } + } + + headingLine := offsetToLine(headingOffset) + + // Collect the plain text of the heading. + headingText := extractHeadingText(n, source) + + // Close any open sections that are at the same or lower depth. + for len(headingStack) > 0 && headingStack[len(headingStack)-1].level >= n.Level { + finishSection(headingOffset) + headingStack = headingStack[:len(headingStack)-1] + } + + // Push the new heading onto the stack. + headingStack = append(headingStack, headingEntry{ + level: n.Level, + text: headingText, + line: headingLine, + }) + + // The section body begins immediately after the heading text. + // We advance past the entire heading lines block. 
+ if n.Lines().Len() > 0 { + last := n.Lines().At(n.Lines().Len() - 1) + sectionStart = last.Stop + } else { + sectionStart = headingOffset + } + + case *ast.FencedCodeBlock: + if !entering { + return ast.WalkContinue, nil + } + + // Byte offset and line of the opening fence. + codeOffset := 0 + if n.Lines().Len() > 0 { + codeOffset = n.Lines().At(0).Start + } + codeLine := offsetToLine(codeOffset) + + // Language info string. + lang := "" + if n.Info != nil { + lang = strings.TrimSpace(string(n.Info.Value(source))) + // Strip options like "go {lineNumbers=true}" — keep only the first token. + if spaceIdx := strings.IndexByte(lang, ' '); spaceIdx >= 0 { + lang = lang[:spaceIdx] + } + } + + // Code content: concatenate all line segments. + var codeBuf strings.Builder + for i := 0; i < n.Lines().Len(); i++ { + seg := n.Lines().At(i) + codeBuf.Write(source[seg.Start:seg.Stop]) + } + + path, headText, headLevel := currentHeadingContext() + codeBlocks = append(codeBlocks, CodeBlock{ + Language: lang, + Content: strings.TrimRight(codeBuf.String(), "\n"), + HeadingPath: path, + HeadingText: headText, + HeadingLevel: headLevel, + Line: codeLine, + }) + } + return ast.WalkContinue, nil + }) + + // Close all remaining open sections from innermost to outermost. + for len(headingStack) > 0 { + finishSection(len(source)) + headingStack = headingStack[:len(headingStack)-1] + } + + return ParseResult{ + Sections: sections, + CodeBlocks: codeBlocks, + } +} + +// extractHeadingText returns the concatenated plain text of all Text children +// within a heading node. 
+func extractHeadingText(heading *ast.Heading, source []byte) string { + var buf strings.Builder + _ = ast.Walk(heading, func(node ast.Node, entering bool) (ast.WalkStatus, error) { + if !entering { + return ast.WalkContinue, nil + } + if t, ok := node.(*ast.Text); ok { + buf.Write(t.Value(source)) + } + return ast.WalkContinue, nil + }) + return strings.TrimSpace(buf.String()) +} diff --git a/internal/search/parser_test.go b/internal/search/parser_test.go new file mode 100644 index 0000000..65f2b3a --- /dev/null +++ b/internal/search/parser_test.go @@ -0,0 +1,233 @@ +package search + +import ( + "testing" +) + +func TestParseMarkdownStructure_EmptyInput(t *testing.T) { + result := ParseMarkdownStructure([]byte("")) + if len(result.Sections) != 0 { + t.Errorf("expected 0 sections, got %d", len(result.Sections)) + } + if len(result.CodeBlocks) != 0 { + t.Errorf("expected 0 code blocks, got %d", len(result.CodeBlocks)) + } +} + +func TestParseMarkdownStructure_NoHeadings(t *testing.T) { + src := `This is just a paragraph. + +Another paragraph without headings. +` + result := ParseMarkdownStructure([]byte(src)) + if len(result.Sections) != 0 { + t.Errorf("expected 0 sections, got %d", len(result.Sections)) + } +} + +func TestParseMarkdownStructure_SingleHeading(t *testing.T) { + src := `# Overview + +This is the overview text. +Some more content here. 
+` + result := ParseMarkdownStructure([]byte(src)) + if len(result.Sections) != 1 { + t.Fatalf("expected 1 section, got %d", len(result.Sections)) + } + + s := result.Sections[0] + if s.HeadingText != "Overview" { + t.Errorf("expected heading text 'Overview', got %q", s.HeadingText) + } + if s.HeadingLevel != 1 { + t.Errorf("expected heading level 1, got %d", s.HeadingLevel) + } + if s.Line != 1 { + t.Errorf("expected heading line 1, got %d", s.Line) + } + if len(s.HeadingPath) != 1 { + t.Errorf("expected 1 path entry, got %d", len(s.HeadingPath)) + } + if s.HeadingPath[0] != "# Overview" { + t.Errorf("unexpected heading path: %v", s.HeadingPath) + } + if s.Content == "" { + t.Error("expected non-empty section content") + } +} + +func TestParseMarkdownStructure_NestedHeadings(t *testing.T) { + src := `# Top Level + +Top level content. + +## Sub Section + +Sub section content. + +### Deep Section + +Deep content. + +## Another Sub + +Another sub content. +` + result := ParseMarkdownStructure([]byte(src)) + if len(result.Sections) != 4 { + t.Fatalf("expected 4 sections, got %d: %+v", len(result.Sections), result.Sections) + } + + // Sections are appended in closure order (innermost first): + // 0. "### Deep Section" closes when "## Another Sub" arrives + // 1. "## Sub Section" closes when "## Another Sub" arrives + // 2. "## Another Sub" closes at end of file + // 3. "# Top Level" closes at end of file (outermost, last) + + // Verify top level is among sections. + foundTop := false + for _, s := range result.Sections { + if s.HeadingText == "Top Level" && s.HeadingLevel == 1 { + foundTop = true + } + } + if !foundTop { + t.Error("expected to find 'Top Level' section") + } + + // Verify "Deep Section" has 3-level path. 
+ for _, s := range result.Sections { + if s.HeadingText == "Deep Section" { + if len(s.HeadingPath) != 3 { + t.Errorf("Deep Section: expected 3-level path, got %d: %v", len(s.HeadingPath), s.HeadingPath) + } + if s.HeadingPath[0] != "# Top Level" { + t.Errorf("Deep Section path[0]: expected '# Top Level', got %q", s.HeadingPath[0]) + } + if s.HeadingPath[1] != "## Sub Section" { + t.Errorf("Deep Section path[1]: expected '## Sub Section', got %q", s.HeadingPath[1]) + } + if s.HeadingPath[2] != "### Deep Section" { + t.Errorf("Deep Section path[2]: expected '### Deep Section', got %q", s.HeadingPath[2]) + } + } + } +} + +func TestParseMarkdownStructure_CodeBlock(t *testing.T) { + src := `# Auth Flow + +Some auth description. + +## Token Refresh + +` + "```go" + ` +func refresh(token string) error { + return nil +} +` + "```" + ` + +More text after. +` + result := ParseMarkdownStructure([]byte(src)) + + if len(result.CodeBlocks) != 1 { + t.Fatalf("expected 1 code block, got %d", len(result.CodeBlocks)) + } + + cb := result.CodeBlocks[0] + if cb.Language != "go" { + t.Errorf("expected language 'go', got %q", cb.Language) + } + if cb.HeadingText != "Token Refresh" { + t.Errorf("expected heading text 'Token Refresh', got %q", cb.HeadingText) + } + if cb.HeadingLevel != 2 { + t.Errorf("expected heading level 2, got %d", cb.HeadingLevel) + } + if len(cb.HeadingPath) != 2 { + t.Errorf("expected 2-level heading path, got %d: %v", len(cb.HeadingPath), cb.HeadingPath) + } + if cb.Content == "" { + t.Error("expected non-empty code content") + } + if cb.Line == 0 { + t.Error("expected non-zero code block line") + } +} + +func TestParseMarkdownStructure_CodeBlockBeforeHeading(t *testing.T) { + src := "```bash\necho hello\n```\n\n# Heading After\n\nContent.\n" + result := ParseMarkdownStructure([]byte(src)) + + if len(result.CodeBlocks) != 1 { + t.Fatalf("expected 1 code block, got %d", len(result.CodeBlocks)) + } + cb := result.CodeBlocks[0] + if cb.HeadingText != "" { + 
t.Errorf("expected empty heading text for pre-heading code block, got %q", cb.HeadingText) + } + if len(cb.HeadingPath) != 0 { + t.Errorf("expected empty heading path for pre-heading code block, got %v", cb.HeadingPath) + } +} + +func TestParseMarkdownStructure_LineNumbers(t *testing.T) { + src := "# First\n\nContent.\n\n## Second\n\nMore content.\n" + result := ParseMarkdownStructure([]byte(src)) + + if len(result.Sections) < 2 { + t.Fatalf("expected at least 2 sections, got %d", len(result.Sections)) + } + + // Find sections by heading text. + sectionLines := map[string]int{} + for _, s := range result.Sections { + sectionLines[s.HeadingText] = s.Line + } + + if sectionLines["First"] != 1 { + t.Errorf("expected 'First' at line 1, got %d", sectionLines["First"]) + } + if sectionLines["Second"] != 5 { + t.Errorf("expected 'Second' at line 5, got %d", sectionLines["Second"]) + } +} + +func TestParseMarkdownStructure_MultipleCodeBlocks(t *testing.T) { + src := `# Section + +` + "```sql" + ` +SELECT 1; +` + "```" + ` + +` + "```python" + ` +print("hello") +` + "```" + ` +` + result := ParseMarkdownStructure([]byte(src)) + + if len(result.CodeBlocks) != 2 { + t.Fatalf("expected 2 code blocks, got %d", len(result.CodeBlocks)) + } + if result.CodeBlocks[0].Language != "sql" { + t.Errorf("first block: expected 'sql', got %q", result.CodeBlocks[0].Language) + } + if result.CodeBlocks[1].Language != "python" { + t.Errorf("second block: expected 'python', got %q", result.CodeBlocks[1].Language) + } +} + +func TestParseMarkdownStructure_FrontmatterIgnored(t *testing.T) { + // The parser receives the body only (frontmatter already stripped by ReadMarkdownDocument). + // Verify graceful handling of YAML-fence-like content inside body. + src := `# Title + +Content with --- dashes --- mid-sentence. 
+` + result := ParseMarkdownStructure([]byte(src)) + if len(result.Sections) != 1 { + t.Fatalf("expected 1 section, got %d", len(result.Sections)) + } +} diff --git a/internal/search/store.go b/internal/search/store.go new file mode 100644 index 0000000..3d6efc6 --- /dev/null +++ b/internal/search/store.go @@ -0,0 +1,35 @@ +package search + +import "time" + +// Store is the backend-agnostic interface that all search index implementations must satisfy. +// +// The Indexer uses Store exclusively — it never calls Bleve or SQLite APIs directly. +// This enables transparent A/B comparison between the two backends. +type Store interface { + // Index upserts all documents for a single source file. + // Callers should call DeleteByPath first to replace existing content atomically. + Index(docs []Document) error + + // DeleteByPath removes all indexed documents whose Path equals relPath. + DeleteByPath(relPath string) error + + // Search executes a full-text query and returns ranked results. + Search(opts SearchOptions) ([]SearchResult, error) + + // ListLabels returns all distinct label values present in the index, sorted. + ListLabels() ([]string, error) + + // ListSpaces returns all distinct space key values present in the index, sorted. + ListSpaces() ([]string, error) + + // UpdateMeta records the current UTC timestamp as the last-indexed-at time. + UpdateMeta() error + + // LastIndexedAt returns the time recorded by the most recent successful UpdateMeta call. + // Returns the zero time.Time and a nil error if no meta has been recorded yet. + LastIndexedAt() (time.Time, error) + + // Close releases resources held by the store. + Close() error +} From ba2c6cf6d18149a9b8b4538623f0141aef6c8ef4 Mon Sep 17 00:00:00 2001 From: Robert Gonek Date: Tue, 3 Mar 2026 09:49:23 +0100 Subject: [PATCH 2/6] feat(search): Phase 2b - SQLite FTS5 backend with full test coverage Add sqlitestore.Store implementing search.Store with SQLite + FTS5. 
Add schema DDL with triggers for FTS sync, and comprehensive tests. --- go.mod | 33 ++ go.sum | 73 +++++ internal/search/sqlitestore/schema.go | 65 ++++ internal/search/sqlitestore/store.go | 351 ++++++++++++++++++++++ internal/search/sqlitestore/store_test.go | 334 ++++++++++++++++++++ 5 files changed, 856 insertions(+) create mode 100644 internal/search/sqlitestore/schema.go create mode 100644 internal/search/sqlitestore/store.go create mode 100644 internal/search/sqlitestore/store_test.go diff --git a/go.mod b/go.mod index 9a70983..8c26038 100644 --- a/go.mod +++ b/go.mod @@ -17,8 +17,28 @@ require ( ) require ( + github.com/RoaringBitmap/roaring/v2 v2.4.5 // indirect github.com/atotto/clipboard v0.1.4 // indirect github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect + github.com/bits-and-blooms/bitset v1.24.4 // indirect + github.com/blevesearch/bleve/v2 v2.5.7 // indirect + github.com/blevesearch/bleve_index_api v1.2.11 // indirect + github.com/blevesearch/geo v0.2.4 // indirect + github.com/blevesearch/go-faiss v1.0.26 // indirect + github.com/blevesearch/go-porterstemmer v1.0.3 // indirect + github.com/blevesearch/gtreap v0.1.1 // indirect + github.com/blevesearch/mmap-go v1.0.4 // indirect + github.com/blevesearch/scorch_segment_api/v2 v2.3.13 // indirect + github.com/blevesearch/segment v0.9.1 // indirect + github.com/blevesearch/snowballstem v0.9.0 // indirect + github.com/blevesearch/upsidedown_store_api v1.0.2 // indirect + github.com/blevesearch/vellum v1.1.0 // indirect + github.com/blevesearch/zapx/v11 v11.4.2 // indirect + github.com/blevesearch/zapx/v12 v12.4.2 // indirect + github.com/blevesearch/zapx/v13 v13.4.2 // indirect + github.com/blevesearch/zapx/v14 v14.4.2 // indirect + github.com/blevesearch/zapx/v15 v15.4.2 // indirect + github.com/blevesearch/zapx/v16 v16.2.8 // indirect github.com/catppuccin/go v0.3.0 // indirect github.com/charmbracelet/colorprofile v0.4.1 // indirect github.com/charmbracelet/harmonica v0.2.0 // indirect @@ -31,21 
+51,34 @@ require ( github.com/clipperhouse/uax29/v2 v2.5.0 // indirect github.com/dustin/go-humanize v1.0.1 // indirect github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect + github.com/golang/snappy v0.0.4 // indirect + github.com/google/uuid v1.6.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/json-iterator/go v0.0.0-20171115153421-f7279a603ede // indirect github.com/kr/pretty v0.3.1 // indirect github.com/lucasb-eyer/go-colorful v1.3.0 // indirect github.com/mattn/go-isatty v0.0.20 // indirect github.com/mattn/go-localereader v0.0.1 // indirect github.com/mattn/go-runewidth v0.0.19 // indirect github.com/mitchellh/hashstructure/v2 v2.0.2 // indirect + github.com/mschoch/smat v0.2.0 // indirect github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect github.com/muesli/cancelreader v0.2.2 // indirect github.com/muesli/termenv v0.16.0 // indirect + github.com/ncruces/go-strftime v1.0.0 // indirect + github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/spf13/pflag v1.0.10 // indirect github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect + go.etcd.io/bbolt v1.4.0 // indirect + golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 // indirect golang.org/x/net v0.50.0 // indirect golang.org/x/sys v0.41.0 // indirect golang.org/x/text v0.34.0 // indirect + google.golang.org/protobuf v1.36.6 // indirect gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect + modernc.org/libc v1.67.6 // indirect + modernc.org/mathutil v1.7.1 // indirect + modernc.org/memory v1.11.0 // indirect + modernc.org/sqlite v1.46.1 // indirect ) diff --git a/go.sum b/go.sum index a84ed49..a81e897 100644 --- a/go.sum +++ b/go.sum @@ -1,11 +1,52 @@ github.com/MakeNowJust/heredoc v1.0.0 h1:cXCdzVdstXyiTqTvfqk9SDHpKNjxuom+DOlyEeQ4pzQ= github.com/MakeNowJust/heredoc v1.0.0/go.mod h1:mG5amYoWBHf8vpLOuehzbGGw0EHxpZZ6lCpQ4fNJ8LE= 
+github.com/RoaringBitmap/roaring/v2 v2.4.5 h1:uGrrMreGjvAtTBobc0g5IrW1D5ldxDQYe2JW2gggRdg= +github.com/RoaringBitmap/roaring/v2 v2.4.5/go.mod h1:FiJcsfkGje/nZBZgCu0ZxCPOKD/hVXDS2dXi7/eUFE0= github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z4= github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI= github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k= github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8= github.com/aymanbagabas/go-udiff v0.3.1 h1:LV+qyBQ2pqe0u42ZsUEtPiCaUoqgA9gYRDs3vj1nolY= github.com/aymanbagabas/go-udiff v0.3.1/go.mod h1:G0fsKmG+P6ylD0r6N/KgQD/nWzgfnl8ZBcNLgcbrw8E= +github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/bits-and-blooms/bitset v1.24.4 h1:95H15Og1clikBrKr/DuzMXkQzECs1M6hhoGXLwLQOZE= +github.com/bits-and-blooms/bitset v1.24.4/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/blevesearch/bleve/v2 v2.5.7 h1:2d9YrL5zrX5EBBW++GOaEKjE+NPWeZGaX77IM26m1Z8= +github.com/blevesearch/bleve/v2 v2.5.7/go.mod h1:yj0NlS7ocGC4VOSAedqDDMktdh2935v2CSWOCDMHdSA= +github.com/blevesearch/bleve_index_api v1.2.11 h1:bXQ54kVuwP8hdrXUSOnvTQfgK0KI1+f9A0ITJT8tX1s= +github.com/blevesearch/bleve_index_api v1.2.11/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0= +github.com/blevesearch/geo v0.2.4 h1:ECIGQhw+QALCZaDcogRTNSJYQXRtC8/m8IKiA706cqk= +github.com/blevesearch/geo v0.2.4/go.mod h1:K56Q33AzXt2YExVHGObtmRSFYZKYGv0JEN5mdacJJR8= +github.com/blevesearch/go-faiss v1.0.26 h1:4dRLolFgjPyjkaXwff4NfbZFdE/dfywbzDqporeQvXI= +github.com/blevesearch/go-faiss v1.0.26/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= +github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo= +github.com/blevesearch/go-porterstemmer v1.0.3/go.mod h1:angGc5Ht+k2xhJdZi511LtmxuEf0OVpvUUNrwmM1P7M= +github.com/blevesearch/gtreap v0.1.1 
h1:2JWigFrzDMR+42WGIN/V2p0cUvn4UP3C4Q5nmaZGW8Y= +github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgYICSZ3w0tYk= +github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= +github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= +github.com/blevesearch/scorch_segment_api/v2 v2.3.13 h1:ZPjv/4VwWvHJZKeMSgScCapOy8+DdmsmRyLmSB88UoY= +github.com/blevesearch/scorch_segment_api/v2 v2.3.13/go.mod h1:ENk2LClTehOuMS8XzN3UxBEErYmtwkE7MAArFTXs9Vc= +github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU= +github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw= +github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s= +github.com/blevesearch/snowballstem v0.9.0/go.mod h1:PivSj3JMc8WuaFkTSRDW2SlrulNWPl4ABg1tC/hlgLs= +github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMGZzVrdmaozG2MfoB+A= +github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ= +github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w= +github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y= +github.com/blevesearch/zapx/v11 v11.4.2 h1:l46SV+b0gFN+Rw3wUI1YdMWdSAVhskYuvxlcgpQFljs= +github.com/blevesearch/zapx/v11 v11.4.2/go.mod h1:4gdeyy9oGa/lLa6D34R9daXNUvfMPZqUYjPwiLmekwc= +github.com/blevesearch/zapx/v12 v12.4.2 h1:fzRbhllQmEMUuAQ7zBuMvKRlcPA5ESTgWlDEoB9uQNE= +github.com/blevesearch/zapx/v12 v12.4.2/go.mod h1:TdFmr7afSz1hFh/SIBCCZvcLfzYvievIH6aEISCte58= +github.com/blevesearch/zapx/v13 v13.4.2 h1:46PIZCO/ZuKZYgxI8Y7lOJqX3Irkc3N8W82QTK3MVks= +github.com/blevesearch/zapx/v13 v13.4.2/go.mod h1:knK8z2NdQHlb5ot/uj8wuvOq5PhDGjNYQQy0QDnopZk= +github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT7fWYz0= +github.com/blevesearch/zapx/v14 v14.4.2/go.mod 
h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8= +github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k= +github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw= +github.com/blevesearch/zapx/v16 v16.2.8 h1:SlnzF0YGtSlrsOE3oE7EgEX6BIepGpeqxs1IjMbHLQI= +github.com/blevesearch/zapx/v16 v16.2.8/go.mod h1:murSoCJPCk25MqURrcJaBQ1RekuqSCSfMjXH4rHyA14= github.com/catppuccin/go v0.3.0 h1:d+0/YicIq+hSTo5oPuRi5kOpqkVA5tAsU6dNhvRu+aY= github.com/catppuccin/go v0.3.0/go.mod h1:8IHJuMGaUUjQM82qBrGNBv7LFq6JI3NnQCF6MOlZjpc= github.com/charmbracelet/bubbles v1.0.0 h1:12J8/ak/uCZEMQ6KU7pcfwceyjLlWsDLAxB5fXonfvc= @@ -48,16 +89,23 @@ github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6N github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4= github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod 
h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= +github.com/json-iterator/go v0.0.0-20171115153421-f7279a603ede h1:YrgBGwxMRK0Vq0WSCWFaZUnTsrA/PZE/xs1QZh+/edg= +github.com/json-iterator/go v0.0.0-20171115153421-f7279a603ede/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -72,15 +120,21 @@ github.com/mattn/go-runewidth v0.0.19 h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byF github.com/mattn/go-runewidth v0.0.19/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs= github.com/mitchellh/hashstructure/v2 v2.0.2 h1:vGKWl0YJqUNxE8d+h8f6NJLcCJrgbhC4NcD46KavDd4= github.com/mitchellh/hashstructure/v2 v2.0.2/go.mod h1:MG3aRVU/N29oo/V/IhBX8GR/zz4kQkprJgF2EVszyDE= +github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= +github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI= github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo= github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA= github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo= github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc= github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk= 
+github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= +github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= github.com/rgonek/jira-adf-converter v1.0.0 h1:si4Czm1St5ux0fUPBh3uLoGZ8ut5EHS+6ttpPofHkcM= github.com/rgonek/jira-adf-converter v1.0.0/go.mod h1:bRlEGjOcdzehTXCIEb6+U/FbLoulnulCFIlm9bAOtIA= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= @@ -93,20 +147,27 @@ github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiT github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no= github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM= github.com/yuin/goldmark v1.7.16 h1:n+CJdUxaFMiDUNnWC3dMWCIQJSkxH4uz3ZwQBkAlVNE= 
github.com/yuin/goldmark v1.7.16/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg= +go.etcd.io/bbolt v1.4.0 h1:TU77id3TnN/zKr7CO/uk+fBCwF2jGcMuw2B/FMAzYIk= +go.etcd.io/bbolt v1.4.0/go.mod h1:AsD+OCi/qPN1giOX1aiLAha3o1U8rAz65bvN4j0sRuk= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI= golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo= +golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 h1:mgKeJMpvi0yx/sU5GsxQ7p6s2wtOnGAHZWCHUM4KGzY= +golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546/go.mod h1:j/pmGrbnkbPtQfxEe5D0VQhZC6qKbfKifgD0oM7sR70= golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60= golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM= golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= @@ -114,8 +175,20 @@ golang.org/x/term v0.40.0 h1:36e4zGLqU4yhjlmxEaagx2KuYbJq3EwY8K943ZsHcvg= golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM= golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= +google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= +google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= 
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
 gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+modernc.org/libc v1.67.6 h1:eVOQvpModVLKOdT+LvBPjdQqfrZq+pC39BygcT+E7OI=
+modernc.org/libc v1.67.6/go.mod h1:JAhxUVlolfYDErnwiqaLvUqc8nfb2r6S6slAgZOnaiE=
+modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
+modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
+modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
+modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
+modernc.org/sqlite v1.46.1 h1:eFJ2ShBLIEnUWlLy12raN0Z1plqmFX9Qe3rjQTKt6sU=
+modernc.org/sqlite v1.46.1/go.mod h1:CzbrU2lSB1DKUusvwGz7rqEKIq+NUd8GWuBBZDs9/nA=
diff --git a/internal/search/sqlitestore/schema.go b/internal/search/sqlitestore/schema.go
new file mode 100644
index 0000000..2ee9964
--- /dev/null
+++ b/internal/search/sqlitestore/schema.go
@@ -0,0 +1,65 @@
+// Package sqlitestore implements the search.Store interface using SQLite with FTS5.
+package sqlitestore
+
+// DDL contains all CREATE TABLE, CREATE INDEX, CREATE VIRTUAL TABLE, and trigger
+// statements needed to initialise or migrate the search database.
+//
+// All statements are idempotent (every statement, including the triggers, uses IF NOT EXISTS).
+const DDL = ` +CREATE TABLE IF NOT EXISTS documents ( + id TEXT PRIMARY KEY, + type TEXT NOT NULL, + path TEXT NOT NULL, + page_id TEXT NOT NULL DEFAULT '', + title TEXT NOT NULL DEFAULT '', + space_key TEXT NOT NULL DEFAULT '', + labels TEXT NOT NULL DEFAULT '[]', + content TEXT NOT NULL DEFAULT '', + heading_path TEXT NOT NULL DEFAULT '[]', + heading_text TEXT NOT NULL DEFAULT '', + heading_level INTEGER NOT NULL DEFAULT 0, + language TEXT NOT NULL DEFAULT '', + line INTEGER NOT NULL DEFAULT 0, + mod_time TEXT NOT NULL DEFAULT '' +); + +CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path); +CREATE INDEX IF NOT EXISTS idx_documents_type ON documents(type); +CREATE INDEX IF NOT EXISTS idx_documents_space_key ON documents(space_key); + +CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5( + title, + content, + heading_text, + content=documents, + content_rowid=rowid, + tokenize='porter unicode61' +); + +CREATE TRIGGER IF NOT EXISTS documents_ai +AFTER INSERT ON documents BEGIN + INSERT INTO documents_fts(rowid, title, content, heading_text) + VALUES (new.rowid, new.title, new.content, new.heading_text); +END; + +CREATE TRIGGER IF NOT EXISTS documents_ad +AFTER DELETE ON documents BEGIN + INSERT INTO documents_fts(documents_fts, rowid, title, content, heading_text) + VALUES ('delete', old.rowid, old.title, old.content, old.heading_text); +END; + +CREATE TRIGGER IF NOT EXISTS documents_au +AFTER UPDATE ON documents BEGIN + INSERT INTO documents_fts(documents_fts, rowid, title, content, heading_text) + VALUES ('delete', old.rowid, old.title, old.content, old.heading_text); + INSERT INTO documents_fts(rowid, title, content, heading_text) + VALUES (new.rowid, new.title, new.content, new.heading_text); +END; + +CREATE TABLE IF NOT EXISTS meta ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL DEFAULT '' +); +` + +const metaKeyLastIndexedAt = "last_indexed_at" diff --git a/internal/search/sqlitestore/store.go b/internal/search/sqlitestore/store.go new 
file mode 100644 index 0000000..bfe21ab --- /dev/null +++ b/internal/search/sqlitestore/store.go @@ -0,0 +1,351 @@ +package sqlitestore + +import ( + "database/sql" + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "github.com/rgonek/confluence-markdown-sync/internal/search" + _ "modernc.org/sqlite" // SQLite driver registration +) + +const ( + // defaultSearchLimit is used when SearchOptions.Limit is 0. + defaultSearchLimit = 20 +) + +// Store is a search.Store implementation backed by SQLite + FTS5. +type Store struct { + db *sql.DB +} + +// Open opens (or creates) the SQLite database at dbPath and applies all DDL migrations. +// The directory containing dbPath is created if it does not exist. +func Open(dbPath string) (*Store, error) { + dir := filepath.Dir(dbPath) + if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:gosec // index dirs are intentionally group-readable + return nil, fmt.Errorf("sqlitestore: create directory %s: %w", dir, err) + } + + db, err := sql.Open("sqlite", dbPath) + if err != nil { + return nil, fmt.Errorf("sqlitestore: open %s: %w", dbPath, err) + } + + // SQLite performs best with a single writer; cap pool to avoid locking issues. + db.SetMaxOpenConns(1) + + if err := applyDDL(db); err != nil { + _ = db.Close() + return nil, fmt.Errorf("sqlitestore: apply schema: %w", err) + } + + return &Store{db: db}, nil +} + +// Close closes the underlying database connection. +func (s *Store) Close() error { + return s.db.Close() +} + +// Index upserts all documents for a single source file. +// It wraps all inserts in a transaction for atomicity. 
+func (s *Store) Index(docs []search.Document) error { + tx, err := s.db.Begin() + if err != nil { + return fmt.Errorf("sqlitestore.Index begin tx: %w", err) + } + defer func() { _ = tx.Rollback() }() + + const query = ` +INSERT INTO documents + (id, type, path, page_id, title, space_key, labels, + content, heading_path, heading_text, heading_level, language, line, mod_time) +VALUES + (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) +ON CONFLICT(id) DO UPDATE SET + type = excluded.type, + path = excluded.path, + page_id = excluded.page_id, + title = excluded.title, + space_key = excluded.space_key, + labels = excluded.labels, + content = excluded.content, + heading_path = excluded.heading_path, + heading_text = excluded.heading_text, + heading_level = excluded.heading_level, + language = excluded.language, + line = excluded.line, + mod_time = excluded.mod_time` + + stmt, err := tx.Prepare(query) + if err != nil { + return fmt.Errorf("sqlitestore.Index prepare: %w", err) + } + defer func() { _ = stmt.Close() }() + + for i := range docs { + d := &docs[i] + labelsJSON, err := marshalJSON(d.Labels) + if err != nil { + return fmt.Errorf("sqlitestore.Index marshal labels: %w", err) + } + headingPathJSON, err := marshalJSON(d.HeadingPath) + if err != nil { + return fmt.Errorf("sqlitestore.Index marshal heading_path: %w", err) + } + modTimeStr := "" + if !d.ModTime.IsZero() { + modTimeStr = d.ModTime.UTC().Format(time.RFC3339) + } + _, err = stmt.Exec( + d.ID, d.Type, d.Path, d.PageID, d.Title, d.SpaceKey, + labelsJSON, d.Content, headingPathJSON, d.HeadingText, + d.HeadingLevel, d.Language, d.Line, modTimeStr, + ) + if err != nil { + return fmt.Errorf("sqlitestore.Index exec for %s: %w", d.ID, err) + } + } + + if err := tx.Commit(); err != nil { + return fmt.Errorf("sqlitestore.Index commit: %w", err) + } + return nil +} + +// DeleteByPath removes all indexed documents whose Path equals relPath. 
+func (s *Store) DeleteByPath(relPath string) error { + _, err := s.db.Exec(`DELETE FROM documents WHERE path = ?`, relPath) + if err != nil { + return fmt.Errorf("sqlitestore.DeleteByPath %s: %w", relPath, err) + } + return nil +} + +// Search executes a full-text query and returns ranked results. +func (s *Store) Search(opts search.SearchOptions) ([]search.SearchResult, error) { + limit := opts.Limit + if limit <= 0 { + limit = defaultSearchLimit + } + + // Build the WHERE clause and argument list dynamically. + var ( + whereClauses []string + args []any + ) + + if opts.Query != "" { + whereClauses = append(whereClauses, "documents_fts MATCH ?") + args = append(args, opts.Query) + } + + if opts.SpaceKey != "" { + whereClauses = append(whereClauses, "d.space_key = ?") + args = append(args, opts.SpaceKey) + } + + for _, label := range opts.Labels { + whereClauses = append(whereClauses, `EXISTS ( + SELECT 1 FROM json_each(d.labels) WHERE json_each.value = ? + )`) + args = append(args, label) + } + + if opts.HeadingFilter != "" { + whereClauses = append(whereClauses, "d.heading_text LIKE ?") + args = append(args, "%"+opts.HeadingFilter+"%") + } + + if len(opts.Types) > 0 { + placeholders := strings.Repeat("?,", len(opts.Types)) + placeholders = strings.TrimSuffix(placeholders, ",") + whereClauses = append(whereClauses, fmt.Sprintf("d.type IN (%s)", placeholders)) + for _, t := range opts.Types { + args = append(args, t) + } + } + + args = append(args, limit) + + var baseQuery string + if opts.Query != "" { + whereExpr := strings.Join(whereClauses, " AND ") + baseQuery = fmt.Sprintf(` +SELECT d.id, d.type, d.path, d.page_id, d.title, d.space_key, + d.labels, d.content, d.heading_path, d.heading_text, + d.heading_level, d.language, d.line, d.mod_time, + fts.rank AS score, + snippet(documents_fts, 1, '[', ']', '...', 10) AS snippet +FROM documents_fts fts +JOIN documents d ON d.rowid = fts.rowid +WHERE %s +ORDER BY fts.rank +LIMIT ?`, whereExpr) + } else { + whereExpr 
:= "" + if len(whereClauses) > 0 { + whereExpr = "WHERE " + strings.Join(whereClauses, " AND ") + } + baseQuery = fmt.Sprintf(` +SELECT d.id, d.type, d.path, d.page_id, d.title, d.space_key, + d.labels, d.content, d.heading_path, d.heading_text, + d.heading_level, d.language, d.line, d.mod_time, + 0.0 AS score, + '' AS snippet +FROM documents d +%s +ORDER BY d.path, d.line +LIMIT ?`, whereExpr) + } + + rows, err := s.db.Query(baseQuery, args...) + if err != nil { + return nil, fmt.Errorf("sqlitestore.Search query: %w", err) + } + defer func() { _ = rows.Close() }() + + var results []search.SearchResult + for rows.Next() { + var ( + doc search.Document + labelsJSON string + hpathJSON string + modTimeStr string + score float64 + snippet string + ) + if err := rows.Scan( + &doc.ID, &doc.Type, &doc.Path, &doc.PageID, &doc.Title, + &doc.SpaceKey, &labelsJSON, &doc.Content, &hpathJSON, + &doc.HeadingText, &doc.HeadingLevel, &doc.Language, &doc.Line, + &modTimeStr, &score, &snippet, + ); err != nil { + return nil, fmt.Errorf("sqlitestore.Search scan: %w", err) + } + + if err := json.Unmarshal([]byte(labelsJSON), &doc.Labels); err != nil { + doc.Labels = nil + } + if err := json.Unmarshal([]byte(hpathJSON), &doc.HeadingPath); err != nil { + doc.HeadingPath = nil + } + if modTimeStr != "" { + if t, err := time.Parse(time.RFC3339, modTimeStr); err == nil { + doc.ModTime = t + } + } + + results = append(results, search.SearchResult{ + Document: doc, + Score: score, + Snippet: snippet, + }) + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("sqlitestore.Search rows: %w", err) + } + return results, nil +} + +// ListLabels returns all distinct label values present in the index, sorted. 
+func (s *Store) ListLabels() ([]string, error) { + rows, err := s.db.Query(` +SELECT DISTINCT j.value +FROM documents, json_each(documents.labels) j +WHERE j.value != '' +ORDER BY j.value`) + if err != nil { + return nil, fmt.Errorf("sqlitestore.ListLabels: %w", err) + } + defer func() { _ = rows.Close() }() + + var labels []string + for rows.Next() { + var label string + if err := rows.Scan(&label); err != nil { + return nil, fmt.Errorf("sqlitestore.ListLabels scan: %w", err) + } + labels = append(labels, label) + } + return labels, rows.Err() +} + +// ListSpaces returns all distinct space key values present in the index, sorted. +func (s *Store) ListSpaces() ([]string, error) { + rows, err := s.db.Query(` +SELECT DISTINCT space_key +FROM documents +WHERE space_key != '' +ORDER BY space_key`) + if err != nil { + return nil, fmt.Errorf("sqlitestore.ListSpaces: %w", err) + } + defer func() { _ = rows.Close() }() + + var spaces []string + for rows.Next() { + var space string + if err := rows.Scan(&space); err != nil { + return nil, fmt.Errorf("sqlitestore.ListSpaces scan: %w", err) + } + spaces = append(spaces, space) + } + return spaces, rows.Err() +} + +// UpdateMeta records the current UTC timestamp as the last-indexed-at time. +func (s *Store) UpdateMeta() error { + ts := time.Now().UTC().Format(time.RFC3339) + _, err := s.db.Exec(` +INSERT INTO meta(key, value) VALUES (?, ?) +ON CONFLICT(key) DO UPDATE SET value = excluded.value`, + metaKeyLastIndexedAt, ts) + if err != nil { + return fmt.Errorf("sqlitestore.UpdateMeta: %w", err) + } + return nil +} + +// LastIndexedAt returns the time recorded by the most recent successful UpdateMeta call. +// Returns the zero time.Time and a nil error if no meta has been recorded yet. 
+func (s *Store) LastIndexedAt() (time.Time, error) { + var ts string + err := s.db.QueryRow(`SELECT value FROM meta WHERE key = ?`, metaKeyLastIndexedAt).Scan(&ts) + if err == sql.ErrNoRows { + return time.Time{}, nil + } + if err != nil { + return time.Time{}, fmt.Errorf("sqlitestore.LastIndexedAt: %w", err) + } + t, err := time.Parse(time.RFC3339, ts) + if err != nil { + return time.Time{}, fmt.Errorf("sqlitestore.LastIndexedAt parse time: %w", err) + } + return t, nil +} + +// — helpers — + +func applyDDL(db *sql.DB) error { + if _, err := db.Exec(DDL); err != nil { + return err + } + return nil +} + +func marshalJSON(v any) (string, error) { + if v == nil { + return "[]", nil + } + b, err := json.Marshal(v) + if err != nil { + return "", err + } + return string(b), nil +} diff --git a/internal/search/sqlitestore/store_test.go b/internal/search/sqlitestore/store_test.go new file mode 100644 index 0000000..3528753 --- /dev/null +++ b/internal/search/sqlitestore/store_test.go @@ -0,0 +1,334 @@ +package sqlitestore + +import ( + "os" + "path/filepath" + "testing" + "time" + + "github.com/rgonek/confluence-markdown-sync/internal/search" +) + +func newTestStore(t *testing.T) *Store { + t.Helper() + dir := t.TempDir() + dbPath := filepath.Join(dir, "search.db") + s, err := Open(dbPath) + if err != nil { + t.Fatalf("Open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + return s +} + +func sampleDocs() []search.Document { + now := time.Date(2024, 1, 15, 12, 0, 0, 0, time.UTC) + return []search.Document{ + { + ID: "page:DEV/overview.md", + Type: search.DocTypePage, + Path: "DEV/overview.md", + PageID: "123456", + Title: "Security Overview", + SpaceKey: "DEV", + Labels: []string{"architecture", "security"}, + Content: "This page covers the security architecture and OAuth2 flows.", + ModTime: now, + }, + { + ID: "section:DEV/overview.md:5", + Type: search.DocTypeSection, + Path: "DEV/overview.md", + PageID: "123456", + Title: "Security Overview", + SpaceKey: "DEV", 
+ Labels: []string{"architecture", "security"}, + Content: "OAuth2 flows use PKCE to prevent interception attacks.", + HeadingText: "OAuth2 Flow", + HeadingLevel: 2, + HeadingPath: []string{"# Security Overview", "## OAuth2 Flow"}, + Line: 5, + ModTime: now, + }, + { + ID: "code:DEV/overview.md:12", + Type: search.DocTypeCode, + Path: "DEV/overview.md", + PageID: "123456", + Title: "Security Overview", + SpaceKey: "DEV", + Labels: []string{"architecture", "security"}, + Content: "func refreshToken(token string) error { return nil }", + HeadingText: "Token Refresh", + HeadingLevel: 3, + HeadingPath: []string{"# Security Overview", "## OAuth2 Flow", "### Token Refresh"}, + Language: "go", + Line: 12, + ModTime: now, + }, + { + ID: "page:OPS/deploy.md", + Type: search.DocTypePage, + Path: "OPS/deploy.md", + PageID: "654321", + Title: "Deployment Guide", + SpaceKey: "OPS", + Labels: []string{"ops", "deployment"}, + Content: "How to deploy the application to production.", + ModTime: now, + }, + } +} + +func TestStore_IndexAndSearch(t *testing.T) { + s := newTestStore(t) + docs := sampleDocs() + + if err := s.Index(docs); err != nil { + t.Fatalf("Index: %v", err) + } + + results, err := s.Search(search.SearchOptions{Query: "OAuth2"}) + if err != nil { + t.Fatalf("Search: %v", err) + } + if len(results) == 0 { + t.Fatal("expected at least one result for 'OAuth2'") + } +} + +func TestStore_DeleteByPath(t *testing.T) { + s := newTestStore(t) + docs := sampleDocs() + + if err := s.Index(docs); err != nil { + t.Fatalf("Index: %v", err) + } + + if err := s.DeleteByPath("DEV/overview.md"); err != nil { + t.Fatalf("DeleteByPath: %v", err) + } + + results, err := s.Search(search.SearchOptions{Query: "OAuth2"}) + if err != nil { + t.Fatalf("Search after delete: %v", err) + } + for _, r := range results { + if r.Document.Path == "DEV/overview.md" { + t.Errorf("expected DEV/overview.md to be deleted, but it still appears in results") + } + } +} + +func TestStore_FilterBySpace(t 
*testing.T) { + s := newTestStore(t) + if err := s.Index(sampleDocs()); err != nil { + t.Fatalf("Index: %v", err) + } + + results, err := s.Search(search.SearchOptions{SpaceKey: "OPS"}) + if err != nil { + t.Fatalf("Search: %v", err) + } + if len(results) == 0 { + t.Fatal("expected results for OPS space") + } + for _, r := range results { + if r.Document.SpaceKey != "OPS" { + t.Errorf("expected SpaceKey=OPS, got %q", r.Document.SpaceKey) + } + } +} + +func TestStore_FilterByLabel(t *testing.T) { + s := newTestStore(t) + if err := s.Index(sampleDocs()); err != nil { + t.Fatalf("Index: %v", err) + } + + results, err := s.Search(search.SearchOptions{Labels: []string{"deployment"}}) + if err != nil { + t.Fatalf("Search: %v", err) + } + if len(results) == 0 { + t.Fatal("expected results for label 'deployment'") + } + for _, r := range results { + found := false + for _, l := range r.Document.Labels { + if l == "deployment" { + found = true + break + } + } + if !found { + t.Errorf("result %s does not have label 'deployment': %v", r.Document.ID, r.Document.Labels) + } + } +} + +func TestStore_ListLabels(t *testing.T) { + s := newTestStore(t) + if err := s.Index(sampleDocs()); err != nil { + t.Fatalf("Index: %v", err) + } + + labels, err := s.ListLabels() + if err != nil { + t.Fatalf("ListLabels: %v", err) + } + + expected := map[string]bool{ + "architecture": true, + "security": true, + "ops": true, + "deployment": true, + } + for _, l := range labels { + delete(expected, l) + } + if len(expected) > 0 { + remaining := make([]string, 0, len(expected)) + for k := range expected { + remaining = append(remaining, k) + } + t.Errorf("missing labels: %v (got %v)", remaining, labels) + } +} + +func TestStore_ListSpaces(t *testing.T) { + s := newTestStore(t) + if err := s.Index(sampleDocs()); err != nil { + t.Fatalf("Index: %v", err) + } + + spaces, err := s.ListSpaces() + if err != nil { + t.Fatalf("ListSpaces: %v", err) + } + + found := map[string]bool{} + for _, sp := range 
spaces { + found[sp] = true + } + if !found["DEV"] { + t.Errorf("expected space DEV in list, got %v", spaces) + } + if !found["OPS"] { + t.Errorf("expected space OPS in list, got %v", spaces) + } +} + +func TestStore_UpdateMetaAndLastIndexedAt(t *testing.T) { + s := newTestStore(t) + + // Before any update, LastIndexedAt returns zero. + zero, err := s.LastIndexedAt() + if err != nil { + t.Fatalf("LastIndexedAt (pre-update): %v", err) + } + if !zero.IsZero() { + t.Errorf("expected zero time before UpdateMeta, got %v", zero) + } + + before := time.Now().UTC().Truncate(time.Second) + if err := s.UpdateMeta(); err != nil { + t.Fatalf("UpdateMeta: %v", err) + } + after := time.Now().UTC().Add(time.Second) + + ts, err := s.LastIndexedAt() + if err != nil { + t.Fatalf("LastIndexedAt: %v", err) + } + if ts.Before(before) || ts.After(after) { + t.Errorf("LastIndexedAt %v out of expected range [%v, %v]", ts, before, after) + } +} + +func TestStore_Upsert(t *testing.T) { + s := newTestStore(t) + + doc := search.Document{ + ID: "page:DEV/test.md", + Type: search.DocTypePage, + Path: "DEV/test.md", + SpaceKey: "DEV", + Title: "Original Title", + Content: "original content", + } + + if err := s.Index([]search.Document{doc}); err != nil { + t.Fatalf("first Index: %v", err) + } + + doc.Title = "Updated Title" + doc.Content = "updated content" + if err := s.Index([]search.Document{doc}); err != nil { + t.Fatalf("second Index (upsert): %v", err) + } + + results, err := s.Search(search.SearchOptions{Query: "updated"}) + if err != nil { + t.Fatalf("Search: %v", err) + } + found := false + for _, r := range results { + if r.Document.ID == "page:DEV/test.md" { + found = true + if r.Document.Title != "Updated Title" { + t.Errorf("expected 'Updated Title', got %q", r.Document.Title) + } + } + } + if !found { + t.Error("updated document not found in search results") + } +} + +func TestStore_Limit(t *testing.T) { + s := newTestStore(t) + if err := s.Index(sampleDocs()); err != nil { + 
t.Fatalf("Index: %v", err) + } + + results, err := s.Search(search.SearchOptions{Limit: 2}) + if err != nil { + t.Fatalf("Search: %v", err) + } + if len(results) > 2 { + t.Errorf("expected at most 2 results with Limit=2, got %d", len(results)) + } +} + +func TestStore_TypeFilter(t *testing.T) { + s := newTestStore(t) + if err := s.Index(sampleDocs()); err != nil { + t.Fatalf("Index: %v", err) + } + + results, err := s.Search(search.SearchOptions{Types: []string{search.DocTypeCode}}) + if err != nil { + t.Fatalf("Search: %v", err) + } + for _, r := range results { + if r.Document.Type != search.DocTypeCode { + t.Errorf("expected type %q, got %q", search.DocTypeCode, r.Document.Type) + } + } +} + +func TestStore_OpenCreatesDirectory(t *testing.T) { + dir := filepath.Join(t.TempDir(), "nested", "subdir") + dbPath := filepath.Join(dir, "search.db") + + s, err := Open(dbPath) + if err != nil { + t.Fatalf("Open with nested dir: %v", err) + } + defer func() { _ = s.Close() }() + + if _, err := os.Stat(dbPath); err != nil { + t.Errorf("expected db file to exist at %s: %v", dbPath, err) + } +} From dda7a5064dd36d22777027d0021ccd7d769fcf8b Mon Sep 17 00:00:00 2001 From: Robert Gonek Date: Tue, 3 Mar 2026 09:53:43 +0100 Subject: [PATCH 3/6] feat(search): Phase 3 - Indexer with file walking and incremental update Add Indexer type that orchestrates markdown file walking and calls Store. Supports full reindex, per-space indexing, and mtime-based incremental updates. 
--- internal/search/indexer.go | 227 +++++++++++++++++++++++++++++ internal/search/indexer_test.go | 250 ++++++++++++++++++++++++++++++++ 2 files changed, 477 insertions(+) create mode 100644 internal/search/indexer.go create mode 100644 internal/search/indexer_test.go diff --git a/internal/search/indexer.go b/internal/search/indexer.go new file mode 100644 index 0000000..8948071 --- /dev/null +++ b/internal/search/indexer.go @@ -0,0 +1,227 @@ +package search + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "github.com/rgonek/confluence-markdown-sync/internal/fs" +) + +// Indexer orchestrates file walking and calls Store methods. +// It is backend-agnostic and operates exclusively via the Store interface. +type Indexer struct { + store Store + rootDir string +} + +// NewIndexer creates an Indexer that writes to store and scans from rootDir. +func NewIndexer(store Store, rootDir string) *Indexer { + return &Indexer{store: store, rootDir: rootDir} +} + +// Reindex performs a full reindex of all discovered spaces in rootDir. +// It returns the total number of documents indexed. +func (ix *Indexer) Reindex() (int, error) { + states, err := fs.FindAllStateFiles(ix.rootDir) + if err != nil { + return 0, fmt.Errorf("search indexer: discover spaces: %w", err) + } + + total := 0 + for spaceDir, state := range states { + count, err := ix.IndexSpace(spaceDir, state.SpaceKey) + if err != nil { + return total, fmt.Errorf("search indexer: index space %s: %w", spaceDir, err) + } + total += count + } + + if err := ix.store.UpdateMeta(); err != nil { + return total, fmt.Errorf("search indexer: update meta: %w", err) + } + return total, nil +} + +// IndexSpace walks spaceDir for Markdown files and indexes them all. +// Any existing documents for those files are replaced. +// It returns the number of documents indexed. 
+func (ix *Indexer) IndexSpace(spaceDir, spaceKey string) (int, error) { + return ix.walkAndIndex(spaceDir, spaceKey, time.Time{}) +} + +// IncrementalUpdate indexes only files whose mtime is newer than the last +// recorded index time. Falls back to a full reindex if no prior timestamp exists. +func (ix *Indexer) IncrementalUpdate() (int, error) { + lastAt, err := ix.store.LastIndexedAt() + if err != nil { + return 0, fmt.Errorf("search indexer: read last-indexed-at: %w", err) + } + if lastAt.IsZero() { + return ix.Reindex() + } + + states, err := fs.FindAllStateFiles(ix.rootDir) + if err != nil { + return 0, fmt.Errorf("search indexer: discover spaces: %w", err) + } + + total := 0 + for spaceDir, state := range states { + count, err := ix.walkAndIndex(spaceDir, state.SpaceKey, lastAt) + if err != nil { + return total, fmt.Errorf("search indexer: incremental index space %s: %w", spaceDir, err) + } + total += count + } + + if total > 0 { + if err := ix.store.UpdateMeta(); err != nil { + return total, fmt.Errorf("search indexer: update meta: %w", err) + } + } + return total, nil +} + +// Close releases the underlying store. +func (ix *Indexer) Close() error { + return ix.store.Close() +} + +// — private helpers — + +// walkAndIndex walks spaceDir and indexes all .md files. +// If cutoff is non-zero, only files with mtime > cutoff are re-indexed. +func (ix *Indexer) walkAndIndex(spaceDir, spaceKey string, cutoff time.Time) (int, error) { + total := 0 + spaceName := filepath.Base(spaceDir) + + err := filepath.WalkDir(spaceDir, func(path string, d os.DirEntry, walkErr error) error { + if walkErr != nil { + return walkErr + } + if d.IsDir() { + // Skip assets/ and all hidden directories (e.g., .git) + if d.Name() == "assets" || strings.HasPrefix(d.Name(), ".") { + return filepath.SkipDir + } + return nil + } + if !strings.HasSuffix(strings.ToLower(d.Name()), ".md") { + return nil + } + + // Mtime filter for incremental updates. 
+ if !cutoff.IsZero() { + info, err := d.Info() + if err != nil || !info.ModTime().After(cutoff) { + return nil + } + } + + relPath, err := filepath.Rel(spaceDir, path) + if err != nil { + return nil // skip; unexpected path + } + relPath = filepath.ToSlash(relPath) + + // Build a repo-root-relative path: "/". + docPath := spaceName + "/" + relPath + + count, err := ix.indexFile(path, docPath, spaceKey) + if err != nil { + // Best-effort: skip broken files rather than aborting the walk. + return nil + } + total += count + return nil + }) + return total, err +} + +// indexFile reads the Markdown document at absPath, parses its structure, and +// upserts all resulting documents (1 page + N sections + M code blocks) into the store. +// docPath is the repository-relative path (forward slashes). +func (ix *Indexer) indexFile(absPath, docPath, spaceKey string) (int, error) { + info, err := os.Stat(absPath) + if err != nil { + return 0, err + } + + mdDoc, err := fs.ReadMarkdownDocument(absPath) + if err != nil { + return 0, fmt.Errorf("read document %s: %w", absPath, err) + } + + fm := mdDoc.Frontmatter + labels := fs.NormalizeLabels(fm.Labels) + modTime := info.ModTime() + + // Delete existing documents for this path before reinserting. + if err := ix.store.DeleteByPath(docPath); err != nil { + return 0, fmt.Errorf("delete old docs for %s: %w", docPath, err) + } + + // Build the document set. + docs := make([]Document, 0, 32) + + // 1. Page document: full body as Content for FTS across entire page. + docs = append(docs, Document{ + ID: "page:" + docPath, + Type: DocTypePage, + Path: docPath, + PageID: fm.ID, + Title: fm.Title, + SpaceKey: spaceKey, + Labels: labels, + Content: mdDoc.Body, + ModTime: modTime, + }) + + // 2. Section and code-block documents. 
+ parsed := ParseMarkdownStructure([]byte(mdDoc.Body)) + + for _, sec := range parsed.Sections { + docs = append(docs, Document{ + ID: fmt.Sprintf("section:%s:%d", docPath, sec.Line), + Type: DocTypeSection, + Path: docPath, + PageID: fm.ID, + Title: fm.Title, + SpaceKey: spaceKey, + Labels: labels, + Content: sec.Content, + HeadingPath: sec.HeadingPath, + HeadingText: sec.HeadingText, + HeadingLevel: sec.HeadingLevel, + Line: sec.Line, + ModTime: modTime, + }) + } + + for _, cb := range parsed.CodeBlocks { + docs = append(docs, Document{ + ID: fmt.Sprintf("code:%s:%d", docPath, cb.Line), + Type: DocTypeCode, + Path: docPath, + PageID: fm.ID, + Title: fm.Title, + SpaceKey: spaceKey, + Labels: labels, + Content: cb.Content, + HeadingPath: cb.HeadingPath, + HeadingText: cb.HeadingText, + HeadingLevel: cb.HeadingLevel, + Language: cb.Language, + Line: cb.Line, + ModTime: modTime, + }) + } + + if err := ix.store.Index(docs); err != nil { + return 0, fmt.Errorf("store index for %s: %w", docPath, err) + } + return len(docs), nil +} diff --git a/internal/search/indexer_test.go b/internal/search/indexer_test.go new file mode 100644 index 0000000..355c032 --- /dev/null +++ b/internal/search/indexer_test.go @@ -0,0 +1,250 @@ +package search_test + +import ( + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/rgonek/confluence-markdown-sync/internal/fs" + "github.com/rgonek/confluence-markdown-sync/internal/search" + "github.com/rgonek/confluence-markdown-sync/internal/search/sqlitestore" +) + +// newTestIndexer creates a temporary repo layout with a SQLite-backed Indexer. 
+func newTestIndexer(t *testing.T) (*search.Indexer, string) { + t.Helper() + + repoDir := t.TempDir() + dbPath := filepath.Join(repoDir, ".confluence-search-index", "search.db") + store, err := sqlitestore.Open(dbPath) + if err != nil { + t.Fatalf("open store: %v", err) + } + + ix := search.NewIndexer(store, repoDir) + t.Cleanup(func() { _ = ix.Close() }) + return ix, repoDir +} + +// writeMarkdownFile writes a Markdown file with frontmatter + body into repoDir. +func writeMarkdownFile(t *testing.T, repoDir, relPath, content string) { + t.Helper() + absPath := filepath.Join(repoDir, filepath.FromSlash(relPath)) + if err := os.MkdirAll(filepath.Dir(absPath), 0o755); err != nil { + t.Fatalf("mkdir %s: %v", filepath.Dir(absPath), err) + } + if err := os.WriteFile(absPath, []byte(content), 0o644); err != nil { + t.Fatalf("write %s: %v", absPath, err) + } +} + +// writeStateFile writes a minimal .confluence-state.json for a space directory. +func writeStateFile(t *testing.T, repoDir, spaceName, spaceKey string) { + t.Helper() + spaceDir := filepath.Join(repoDir, spaceName) + state := fs.NewSpaceState() + state.SpaceKey = spaceKey + if err := fs.SaveState(spaceDir, state); err != nil { + t.Fatalf("SaveState: %v", err) + } +} + +const sampleMD = `--- +id: "111" +title: Security Overview +labels: + - security + - architecture +--- + +This page covers our security architecture. + +## OAuth2 Flow + +OAuth2 flows use PKCE tokens. + +### Token Refresh + +Refresh tokens are rotated every 15 minutes. 
+ +` + "```go" + ` +func refresh(token string) error { return nil } +` + "```" + ` +` + +func TestIndexer_Reindex(t *testing.T) { + ix, repoDir := newTestIndexer(t) + + writeStateFile(t, repoDir, "DEV", "DEV") + writeMarkdownFile(t, repoDir, "DEV/overview.md", sampleMD) + + count, err := ix.Reindex() + if err != nil { + t.Fatalf("Reindex: %v", err) + } + if count == 0 { + t.Error("expected at least 1 document indexed") + } +} + +func TestIndexer_IndexSpace(t *testing.T) { + ix, repoDir := newTestIndexer(t) + + spaceDir := filepath.Join(repoDir, "DEV") + if err := os.MkdirAll(spaceDir, 0o755); err != nil { + t.Fatalf("mkdir: %v", err) + } + writeMarkdownFile(t, repoDir, "DEV/overview.md", sampleMD) + writeMarkdownFile(t, repoDir, "DEV/guide.md", `--- +title: Guide +--- +A guide page. +`) + + count, err := ix.IndexSpace(spaceDir, "DEV") + if err != nil { + t.Fatalf("IndexSpace: %v", err) + } + // Each file produces at least 1 (page) doc; sampleMD also has sections/code. + if count < 2 { + t.Errorf("expected at least 2 docs, got %d", count) + } +} + +func TestIndexer_SkipsAssetsDir(t *testing.T) { + ix, repoDir := newTestIndexer(t) + + writeStateFile(t, repoDir, "DEV", "DEV") + // Write a file inside assets/ — should be skipped. + writeMarkdownFile(t, repoDir, "DEV/assets/image-info.md", `--- +title: Asset +--- +Should not be indexed. +`) + // Write a real page. + writeMarkdownFile(t, repoDir, "DEV/overview.md", sampleMD) + + count, err := ix.Reindex() + if err != nil { + t.Fatalf("Reindex: %v", err) + } + // Should only index overview.md, not the assets file. + // overview.md has multiple docs; assets file has 0. + if count == 0 { + t.Error("expected docs from overview.md to be indexed") + } + + // Confirm assets/image-info.md was not indexed. 
+ store := openStoreFromIndexer(t, repoDir) + defer func() { _ = store.Close() }() + + results, err := store.Search(search.SearchOptions{Query: "Should not be indexed"}) + if err != nil { + t.Fatalf("Search: %v", err) + } + for _, r := range results { + if strings.Contains(r.Document.Path, "assets") { + t.Errorf("assets file was indexed: %s", r.Document.Path) + } + } +} + +func TestIndexer_IncrementalUpdate_FallbackOnZeroTime(t *testing.T) { + ix, repoDir := newTestIndexer(t) + + writeStateFile(t, repoDir, "DEV", "DEV") + writeMarkdownFile(t, repoDir, "DEV/overview.md", sampleMD) + + // IncrementalUpdate with no prior index should fall back to full reindex. + count, err := ix.IncrementalUpdate() + if err != nil { + t.Fatalf("IncrementalUpdate: %v", err) + } + if count == 0 { + t.Error("expected documents indexed on first IncrementalUpdate") + } +} + +func TestIndexer_IncrementalUpdate_SkipsOldFiles(t *testing.T) { + ix, repoDir := newTestIndexer(t) + + writeStateFile(t, repoDir, "DEV", "DEV") + + // Write overview.md, sleep, then reindex. + writeMarkdownFile(t, repoDir, "DEV/overview.md", sampleMD) + time.Sleep(1100 * time.Millisecond) + + // Full reindex records the timestamp. + _, err := ix.Reindex() + if err != nil { + t.Fatalf("Reindex: %v", err) + } + + // Sleep again, then write a NEW file after the reindex timestamp. + time.Sleep(1100 * time.Millisecond) + writeMarkdownFile(t, repoDir, "DEV/new-page.md", `--- +title: New Page +--- +Brand new content. +`) + + // Incremental update should only index the new file (1 page doc). + count, err := ix.IncrementalUpdate() + if err != nil { + t.Fatalf("IncrementalUpdate (second): %v", err) + } + if count == 0 { + t.Error("expected new-page.md to be indexed incrementally") + } + // new-page.md has 1 page doc + 0 sections + 0 code blocks = 1 doc. + // overview.md should NOT be re-indexed. 
+ if count > 2 { + t.Errorf("expected only new-page.md to be indexed (<=2 docs), got %d", count) + } +} + +func TestIndexer_MultipleSpaces(t *testing.T) { + ix, repoDir := newTestIndexer(t) + + writeStateFile(t, repoDir, "DEV", "DEV") + writeStateFile(t, repoDir, "OPS", "OPS") + writeMarkdownFile(t, repoDir, "DEV/overview.md", sampleMD) + writeMarkdownFile(t, repoDir, "OPS/deploy.md", `--- +title: Deploy +--- +Deployment instructions. +`) + + count, err := ix.Reindex() + if err != nil { + t.Fatalf("Reindex: %v", err) + } + if count < 2 { + t.Errorf("expected docs from both spaces, got %d", count) + } +} + +// openStoreFromIndexer opens a new store handle to the same DB used by the indexer. +// This is needed to run Search assertions independently of the indexer. +func openStoreFromIndexer(t *testing.T, repoDir string) *sqlitestore.Store { + t.Helper() + dbPath := filepath.Join(repoDir, ".confluence-search-index", "search.db") + store, err := sqlitestore.Open(dbPath) + if err != nil { + t.Fatalf("open store for assertion: %v", err) + } + return store +} + +// — compile-time interface check — +var _ search.Store = (*sqlitestore.Store)(nil) + +// — time stub for incremental test — +func mustNotBeZero(t *testing.T, ts time.Time, label string) { + t.Helper() + if ts.IsZero() { + t.Errorf("%s: expected non-zero time", label) + } +} From 1c488203117658ea57a80b50c2c105d9568404fd Mon Sep 17 00:00:00 2001 From: Robert Gonek Date: Tue, 3 Mar 2026 10:12:25 +0100 Subject: [PATCH 4/6] feat(search): Phase 2a, 4, 5, 6 - Bleve backend, CLI command, integration hooks, docs - Add Bleve scorch backend (internal/search/blevestore) - Add conf search command with sqlite/bleve backends, --list-labels/spaces, --format auto - Register newSearchCmd() in root.go - Phase 5: post-pull search index update in cmd/pull.go (non-fatal) - Phase 5: remove .confluence-search-index/ in conf clean - Phase 5: add .confluence-search-index/ to gitignore template (cmd/init.go) and .gitignore - Phase 6: 
update AGENTS.md and docs/usage.md with conf search reference --- .gitignore | 1 + AGENTS.md | 17 +- cmd/clean.go | 10 + cmd/init.go | 3 +- cmd/pull.go | 4 + cmd/search.go | 271 +++++++++++ cmd/search_test.go | 429 ++++++++++++++++++ docs/usage.md | 49 +- go.mod | 4 +- go.sum | 34 +- internal/search/blevestore/mapping.go | 76 ++++ internal/search/blevestore/store.go | 459 +++++++++++++++++++ internal/search/blevestore/store_test.go | 553 +++++++++++++++++++++++ 13 files changed, 1903 insertions(+), 7 deletions(-) create mode 100644 cmd/search.go create mode 100644 cmd/search_test.go create mode 100644 internal/search/blevestore/mapping.go create mode 100644 internal/search/blevestore/store.go create mode 100644 internal/search/blevestore/store_test.go diff --git a/.gitignore b/.gitignore index c4d9c3d..76657a0 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ test-output/ # Local Confluence sync state .confluence-state.json +.confluence-search-index/ conf .claude/ diff --git a/AGENTS.md b/AGENTS.md index 314a7d2..57d5958 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -72,11 +72,26 @@ The agent manages the full sync cycle. Validation failures must stop `push` immediately. ## Command Model -- Commands: `init`, `pull`, `push`, `validate`, `diff`. +- Commands: `init`, `pull`, `push`, `validate`, `diff`, `search`. - `[TARGET]` parsing rule: - Ends with `.md` => file mode. - Otherwise => space mode (`SPACE_KEY`). +## Search Command (`conf search`) +- `conf search QUERY [flags]` runs full-text search over local Markdown files. +- Two pluggable backends share the `Store` interface: `--engine sqlite` (default, SQLite FTS5) and `--engine bleve` (Bleve scorch). +- Index lives in `.confluence-search-index/` (gitignored, local-only). +- Index is updated automatically on `pull` (non-fatal) and incrementally on each `search` invocation. +- Key flags: + - `--space KEY` — filter to a Confluence space. + - `--label LABEL` — filter by label (repeatable). 
+ - `--heading TEXT` — restrict to sections under matching headings. + - `--reindex` — force full rebuild. + - `--list-labels` / `--list-spaces` — facet discovery. + - `--format text|json|auto` — output format (auto: TTY→text, pipe→json). + - `--limit N` (default 20) — max results. +- Recommended agent workflow: `conf search "term" --format json | ` for token-efficient, structured reads. + ## Developer Tooling Requirements - Keep a top-level `Makefile` in the repository. - `Makefile` should provide common local workflows (at minimum: `build`, `test`, and `lint`/`fmt`). diff --git a/cmd/clean.go b/cmd/clean.go index c4c378f..f15ed11 100644 --- a/cmd/clean.go +++ b/cmd/clean.go @@ -125,6 +125,16 @@ func runClean(cmd *cobra.Command, _ []string) error { return err } + // Remove search index directory if present. + searchIndexPath := filepath.Join(client.RootDir, ".confluence-search-index") + if _, statErr := os.Stat(searchIndexPath); statErr == nil { + if rmErr := os.RemoveAll(searchIndexPath); rmErr != nil { + _, _ = fmt.Fprintf(out, "warning: failed to remove search index: %v\n", rmErr) + } else { + _, _ = fmt.Fprintln(out, "Removed .confluence-search-index/") + } + } + _, _ = fmt.Fprintf(out, "clean completed: removed %d worktree(s), deleted %d snapshot ref(s)\n", removedWorktrees, deletedRefs) return nil } diff --git a/cmd/init.go b/cmd/init.go index de4b6b8..8d45c72 100644 --- a/cmd/init.go +++ b/cmd/init.go @@ -14,6 +14,7 @@ import ( const gitignoreContent = `# Confluence Markdown Sync .confluence-state.json +.confluence-search-index/ .env # OS artifacts @@ -218,7 +219,7 @@ func ensureGitignore() error { content := string(existing) var missing []string - for _, entry := range []string{".confluence-state.json", ".env"} { + for _, entry := range []string{".confluence-state.json", ".confluence-search-index/", ".env"} { if !containsLine(content, entry) { missing = append(missing, entry) } diff --git a/cmd/pull.go b/cmd/pull.go index af5a4a2..9ec751b 100644 --- 
a/cmd/pull.go +++ b/cmd/pull.go @@ -335,6 +335,10 @@ func runPull(cmd *cobra.Command, target config.Target) (runErr error) { _, _ = fmt.Fprintf(out, "pull completed: committed and tagged %s\n", tagName) + if err := updateSearchIndexForSpace(repoRoot, pullCtx.spaceDir, pullCtx.spaceKey, out); err != nil { + _, _ = fmt.Fprintf(out, "warning: search index update failed: %v\n", err) + } + if flagPullRelink { index, err := syncflow.BuildGlobalPageIndex(repoRoot) if err != nil { diff --git a/cmd/search.go b/cmd/search.go new file mode 100644 index 0000000..2efb63b --- /dev/null +++ b/cmd/search.go @@ -0,0 +1,271 @@ +package cmd + +import ( + "encoding/json" + "fmt" + "io" + "os" + "path/filepath" + "strings" + + "github.com/rgonek/confluence-markdown-sync/internal/search" + "github.com/rgonek/confluence-markdown-sync/internal/search/blevestore" + "github.com/rgonek/confluence-markdown-sync/internal/search/sqlitestore" + "github.com/spf13/cobra" + "golang.org/x/term" +) + +const searchIndexDir = ".confluence-search-index" + +func newSearchCmd() *cobra.Command { + var ( + flagSearchSpace string + flagSearchLabels []string + flagSearchHeading string + flagSearchFormat string + flagSearchLimit int + flagSearchReindex bool + flagSearchEngine string + flagSearchListLabels bool + flagSearchListSpaces bool + ) + + cmd := &cobra.Command{ + Use: "search QUERY", + Short: "Full-text search across the local Confluence Markdown workspace", + Long: `search indexes and queries Markdown files in your local Confluence workspace. + +The index is built automatically on first use and updated incrementally on +subsequent runs. Use --reindex to force a full rebuild. 
+ +Examples: + conf search "oauth token refresh" + conf search "deploy pipeline" --space DEV --label ci + conf search --list-labels + conf search --list-spaces --format json`, + Args: cobra.MaximumNArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + query := "" + if len(args) > 0 { + query = args[0] + } + return runSearch(cmd, query, searchRunOptions{ + space: flagSearchSpace, + labels: flagSearchLabels, + heading: flagSearchHeading, + format: flagSearchFormat, + limit: flagSearchLimit, + reindex: flagSearchReindex, + engine: flagSearchEngine, + listLabels: flagSearchListLabels, + listSpaces: flagSearchListSpaces, + }) + }, + } + + cmd.Flags().StringVar(&flagSearchSpace, "space", "", "Filter results to a specific Confluence space key") + cmd.Flags().StringArrayVar(&flagSearchLabels, "label", nil, "Filter by label (repeatable)") + cmd.Flags().StringVar(&flagSearchHeading, "heading", "", "Restrict results to sections under headings matching this substring") + cmd.Flags().StringVar(&flagSearchFormat, "format", "auto", `Output format: "text", "json", or "auto" (TTY→text, pipe→json)`) + cmd.Flags().IntVar(&flagSearchLimit, "limit", 20, "Maximum number of results to return") + cmd.Flags().BoolVar(&flagSearchReindex, "reindex", false, "Force a full reindex before searching") + cmd.Flags().StringVar(&flagSearchEngine, "engine", "sqlite", `Search backend: "sqlite" or "bleve"`) + cmd.Flags().BoolVar(&flagSearchListLabels, "list-labels", false, "List all indexed labels and exit") + cmd.Flags().BoolVar(&flagSearchListSpaces, "list-spaces", false, "List all indexed spaces and exit") + + return cmd +} + +type searchRunOptions struct { + space string + labels []string + heading string + format string + limit int + reindex bool + engine string + listLabels bool + listSpaces bool +} + +func runSearch(cmd *cobra.Command, query string, opts searchRunOptions) error { + out := cmd.OutOrStdout() + + repoRoot, err := gitRepoRoot() + if err != nil { + return err + } + + 
store, err := openSearchStore(opts.engine, repoRoot) + if err != nil { + return err + } + defer func() { _ = store.Close() }() + + indexer := search.NewIndexer(store, repoRoot) + + if opts.reindex { + count, err := indexer.Reindex() + if err != nil { + return fmt.Errorf("search: reindex: %w", err) + } + _, _ = fmt.Fprintf(out, "Reindexed %d document(s)\n", count) + } else { + _, err := indexer.IncrementalUpdate() + if err != nil { + return fmt.Errorf("search: incremental update: %w", err) + } + } + + format := resolveSearchFormat(opts.format, out) + + if opts.listLabels { + labels, err := store.ListLabels() + if err != nil { + return fmt.Errorf("search: list labels: %w", err) + } + return printSearchStringList(out, labels, format) + } + + if opts.listSpaces { + spaces, err := store.ListSpaces() + if err != nil { + return fmt.Errorf("search: list spaces: %w", err) + } + return printSearchStringList(out, spaces, format) + } + + if query == "" && !opts.listLabels && !opts.listSpaces { + return fmt.Errorf("search: QUERY argument is required (or use --list-labels / --list-spaces)") + } + + results, err := store.Search(search.SearchOptions{ + Query: query, + SpaceKey: opts.space, + Labels: opts.labels, + HeadingFilter: opts.heading, + Limit: opts.limit, + }) + if err != nil { + return fmt.Errorf("search: query: %w", err) + } + + return printSearchResults(out, results, format) +} + +// openSearchStore opens the appropriate Store backend based on engine name. 
+func openSearchStore(engine, repoRoot string) (search.Store, error) { + indexRoot := filepath.Join(repoRoot, searchIndexDir) + + switch strings.ToLower(engine) { + case "sqlite", "": + dbPath := filepath.Join(indexRoot, "search.db") + return sqlitestore.Open(dbPath) + case "bleve": + blevePath := filepath.Join(indexRoot, "bleve") + return blevestore.Open(blevePath) + default: + return nil, fmt.Errorf("search: unknown engine %q (valid values: sqlite, bleve)", engine) + } +} + +// resolveSearchFormat resolves "auto" to "text" or "json" based on TTY detection. +func resolveSearchFormat(format string, out io.Writer) string { + if format != "auto" { + return format + } + // If out is not os.Stdout fall back to json (pipe-like context). + if out == os.Stdout && term.IsTerminal(int(os.Stdout.Fd())) { + return "text" + } + return "json" +} + +// printSearchResults renders search results in the requested format. +func printSearchResults(out io.Writer, results []search.SearchResult, format string) error { + if format == "json" { + enc := json.NewEncoder(out) + enc.SetIndent("", " ") + return enc.Encode(results) + } + + // Text format + if len(results) == 0 { + _, _ = fmt.Fprintln(out, "No results found.") + return nil + } + + for _, r := range results { + doc := r.Document + // Header line: path + title + labels + labelsStr := "" + if len(doc.Labels) > 0 { + labelsStr = " [" + strings.Join(doc.Labels, ", ") + "]" + } + titleStr := "" + if doc.Title != "" { + titleStr = " - " + doc.Title + } + _, _ = fmt.Fprintf(out, "%s%s%s\n", doc.Path, titleStr, labelsStr) + + // Section context + if doc.Type != search.DocTypePage && len(doc.HeadingPath) > 0 { + headings := make([]string, len(doc.HeadingPath)) + for i, h := range doc.HeadingPath { + headings[i] = strings.TrimLeft(h, "# ") + headings[i] = "## " + headings[i] + // Re-use the original heading text (which already has #-prefix) as-is. 
+ headings[i] = h + } + lineInfo := "" + if doc.Line > 0 { + lineInfo = fmt.Sprintf(" (line %d)", doc.Line) + } + _, _ = fmt.Fprintf(out, " %s%s\n", strings.Join(doc.HeadingPath, " > "), lineInfo) + } + + // Snippet + if r.Snippet != "" { + _, _ = fmt.Fprintf(out, " ...%s...\n", strings.TrimSpace(r.Snippet)) + } + } + return nil +} + +// updateSearchIndexForSpace opens the default SQLite search store and runs an +// incremental update scoped to a single space directory. Errors are non-fatal +// from the caller's perspective — the function itself returns the error so the +// caller can emit a warning. +func updateSearchIndexForSpace(repoRoot, spaceDir, spaceKey string, out io.Writer) error { + dbPath := filepath.Join(repoRoot, searchIndexDir, "search.db") + store, err := sqlitestore.Open(dbPath) + if err != nil { + return fmt.Errorf("open search store: %w", err) + } + defer func() { _ = store.Close() }() + + indexer := search.NewIndexer(store, repoRoot) + count, err := indexer.IndexSpace(spaceDir, spaceKey) + if err != nil { + return fmt.Errorf("index space %s: %w", spaceKey, err) + } + if count > 0 { + _, _ = fmt.Fprintf(out, "Updated search index: %d document(s) for space %s\n", count, spaceKey) + } + return nil +} + +// printSearchStringList renders a list of strings (labels or spaces) in the requested format. 
+func printSearchStringList(out io.Writer, items []string, format string) error { + if format == "json" { + enc := json.NewEncoder(out) + enc.SetIndent("", " ") + return enc.Encode(items) + } + + // Text format + for _, item := range items { + _, _ = fmt.Fprintln(out, item) + } + return nil +} diff --git a/cmd/search_test.go b/cmd/search_test.go new file mode 100644 index 0000000..f2f1904 --- /dev/null +++ b/cmd/search_test.go @@ -0,0 +1,429 @@ +package cmd + +import ( + "bytes" + "encoding/json" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/rgonek/confluence-markdown-sync/internal/search" + "github.com/rgonek/confluence-markdown-sync/internal/search/sqlitestore" +) + +// --- command structure tests --- + +func TestNewSearchCmd_NotNil(t *testing.T) { + cmd := newSearchCmd() + if cmd == nil { + t.Fatal("newSearchCmd returned nil") + } +} + +func TestNewSearchCmd_Use(t *testing.T) { + cmd := newSearchCmd() + if !strings.HasPrefix(cmd.Use, "search") { + t.Errorf("expected Use to start with 'search', got %q", cmd.Use) + } +} + +func TestNewSearchCmd_Flags(t *testing.T) { + cmd := newSearchCmd() + + expectedFlags := []string{ + "space", + "label", + "heading", + "format", + "limit", + "reindex", + "engine", + "list-labels", + "list-spaces", + } + + for _, name := range expectedFlags { + if f := cmd.Flags().Lookup(name); f == nil { + t.Errorf("expected flag --%s to be registered", name) + } + } +} + +func TestNewSearchCmd_FlagDefaults(t *testing.T) { + cmd := newSearchCmd() + + cases := []struct { + flag string + expected string + }{ + {"format", "auto"}, + {"engine", "sqlite"}, + {"limit", "20"}, + {"space", ""}, + {"heading", ""}, + } + + for _, tc := range cases { + f := cmd.Flags().Lookup(tc.flag) + if f == nil { + t.Errorf("flag --%s not found", tc.flag) + continue + } + if f.DefValue != tc.expected { + t.Errorf("flag --%s default = %q, want %q", tc.flag, f.DefValue, tc.expected) + } + } +} + +// --- helper: build a minimal git repo with an 
indexed space --- + +func setupSearchTestRepo(t *testing.T) (repoRoot string, store search.Store) { + t.Helper() + + repo := t.TempDir() + setupGitRepo(t, repo) + + // Create a space directory with one Markdown file. + spaceDir := filepath.Join(repo, "DOCS") + if err := os.MkdirAll(spaceDir, 0o750); err != nil { + t.Fatalf("mkdir space dir: %v", err) + } + + mdContent := `--- +id: "123" +title: OAuth Security Overview +space: DOCS +labels: + - security + - architecture +--- + +# OAuth2 Flow + +Token refresh happens every 15 minutes using PKCE. + +## Token Refresh + +Refresh tokens are rotated every 15 minutes using PKCE extension. +` + if err := os.WriteFile(filepath.Join(spaceDir, "overview.md"), []byte(mdContent), 0o600); err != nil { + t.Fatalf("write markdown: %v", err) + } + + // Write minimal state file so indexer can discover the space. + stateContent := `{"space_key":"DOCS","pages":{}}` + stateFile := filepath.Join(spaceDir, ".confluence-state.json") + if err := os.WriteFile(stateFile, []byte(stateContent), 0o600); err != nil { + t.Fatalf("write state: %v", err) + } + + // Open a real SQLite store for this test repo. + indexDir := filepath.Join(repo, searchIndexDir) + dbPath := filepath.Join(indexDir, "search.db") + s, err := sqlitestore.Open(dbPath) + if err != nil { + t.Fatalf("open sqlitestore: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + return repo, s +} + +// --- resolveSearchFormat tests --- + +func TestResolveSearchFormat_Explicit(t *testing.T) { + cases := []struct { + input string + expected string + }{ + {"text", "text"}, + {"json", "json"}, + } + for _, tc := range cases { + got := resolveSearchFormat(tc.input, new(bytes.Buffer)) + if got != tc.expected { + t.Errorf("resolveSearchFormat(%q) = %q, want %q", tc.input, got, tc.expected) + } + } +} + +func TestResolveSearchFormat_AutoPipe(t *testing.T) { + // A bytes.Buffer is not a TTY — should resolve to "json". 
+ got := resolveSearchFormat("auto", new(bytes.Buffer)) + if got != "json" { + t.Errorf("resolveSearchFormat(auto, non-tty) = %q, want json", got) + } +} + +// --- printSearchResults tests --- + +func TestPrintSearchResults_TextEmpty(t *testing.T) { + out := new(bytes.Buffer) + if err := printSearchResults(out, nil, "text"); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !strings.Contains(out.String(), "No results found") { + t.Errorf("expected 'No results found', got %q", out.String()) + } +} + +func TestPrintSearchResults_TextFormat(t *testing.T) { + results := []search.SearchResult{ + { + Document: search.Document{ + Type: search.DocTypePage, + Path: "DEV/security/overview.md", + Title: "Security Overview", + Labels: []string{"architecture", "security"}, + SpaceKey: "DEV", + }, + Snippet: "refresh tokens are rotated every 15 minutes using PKCE", + }, + } + + out := new(bytes.Buffer) + if err := printSearchResults(out, results, "text"); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + got := out.String() + if !strings.Contains(got, "DEV/security/overview.md") { + t.Errorf("expected path in output, got %q", got) + } + if !strings.Contains(got, "Security Overview") { + t.Errorf("expected title in output, got %q", got) + } + if !strings.Contains(got, "architecture") { + t.Errorf("expected label in output, got %q", got) + } + if !strings.Contains(got, "PKCE") { + t.Errorf("expected snippet in output, got %q", got) + } +} + +func TestPrintSearchResults_JSONFormat(t *testing.T) { + results := []search.SearchResult{ + { + Document: search.Document{ + Type: search.DocTypePage, + Path: "DEV/overview.md", + Title: "Overview", + SpaceKey: "DEV", + }, + Score: 1.5, + }, + } + + out := new(bytes.Buffer) + if err := printSearchResults(out, results, "json"); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + var decoded []search.SearchResult + if err := json.Unmarshal(out.Bytes(), &decoded); err != nil { + t.Fatalf("output is not valid JSON: 
%v\nOutput: %s", err, out.String()) + } + if len(decoded) != 1 { + t.Fatalf("expected 1 result, got %d", len(decoded)) + } + if decoded[0].Document.Path != "DEV/overview.md" { + t.Errorf("expected path DEV/overview.md, got %q", decoded[0].Document.Path) + } +} + +// --- printSearchStringList tests --- + +func TestPrintSearchStringList_Text(t *testing.T) { + out := new(bytes.Buffer) + if err := printSearchStringList(out, []string{"alpha", "beta", "gamma"}, "text"); err != nil { + t.Fatalf("unexpected error: %v", err) + } + got := out.String() + for _, item := range []string{"alpha", "beta", "gamma"} { + if !strings.Contains(got, item) { + t.Errorf("expected %q in output, got %q", item, got) + } + } +} + +func TestPrintSearchStringList_JSON(t *testing.T) { + out := new(bytes.Buffer) + if err := printSearchStringList(out, []string{"alpha", "beta"}, "json"); err != nil { + t.Fatalf("unexpected error: %v", err) + } + var decoded []string + if err := json.Unmarshal(out.Bytes(), &decoded); err != nil { + t.Fatalf("output is not valid JSON: %v\nOutput: %s", err, out.String()) + } + if len(decoded) != 2 || decoded[0] != "alpha" { + t.Errorf("unexpected decoded value: %v", decoded) + } +} + +// --- openSearchStore tests --- + +func TestOpenSearchStore_UnknownEngine(t *testing.T) { + _, err := openSearchStore("badengine", t.TempDir()) + if err == nil { + t.Fatal("expected error for unknown engine") + } + if !strings.Contains(err.Error(), "badengine") { + t.Errorf("error should mention engine name, got: %v", err) + } +} + +func TestOpenSearchStore_SQLite(t *testing.T) { + repo := t.TempDir() + store, err := openSearchStore("sqlite", repo) + if err != nil { + t.Fatalf("unexpected error opening sqlite store: %v", err) + } + defer func() { _ = store.Close() }() +} + +// --- --list-labels integration test --- + +func TestRunSearch_ListLabels(t *testing.T) { + runParallelCommandTest(t) + + repo, store := setupSearchTestRepo(t) + + // Pre-index a document with known labels. 
+ docs := []search.Document{ + { + ID: "page:DOCS/overview.md", + Type: search.DocTypePage, + Path: "DOCS/overview.md", + SpaceKey: "DOCS", + Labels: []string{"security", "architecture"}, + Content: "Token refresh happens every 15 minutes.", + }, + } + if err := store.Index(docs); err != nil { + t.Fatalf("index: %v", err) + } + if err := store.UpdateMeta(); err != nil { + t.Fatalf("update meta: %v", err) + } + + // Change to the repo dir so gitRepoRoot() works. + chdirRepo(t, repo) + + cmd := newSearchCmd() + out := new(bytes.Buffer) + cmd.SetOut(out) + cmd.SetErr(new(bytes.Buffer)) + cmd.SetArgs([]string{"--list-labels", "--format", "text", "--engine", "sqlite"}) + + if err := cmd.Execute(); err != nil { + t.Fatalf("command error: %v", err) + } + + got := out.String() + if !strings.Contains(got, "security") { + t.Errorf("expected 'security' in list-labels output, got %q", got) + } + if !strings.Contains(got, "architecture") { + t.Errorf("expected 'architecture' in list-labels output, got %q", got) + } +} + +// --- --list-spaces integration test --- + +func TestRunSearch_ListSpaces(t *testing.T) { + runParallelCommandTest(t) + + repo, store := setupSearchTestRepo(t) + + docs := []search.Document{ + { + ID: "page:DOCS/page.md", + Type: search.DocTypePage, + Path: "DOCS/page.md", + SpaceKey: "DOCS", + Content: "some content", + }, + } + if err := store.Index(docs); err != nil { + t.Fatalf("index: %v", err) + } + if err := store.UpdateMeta(); err != nil { + t.Fatalf("update meta: %v", err) + } + + chdirRepo(t, repo) + + cmd := newSearchCmd() + out := new(bytes.Buffer) + cmd.SetOut(out) + cmd.SetErr(new(bytes.Buffer)) + cmd.SetArgs([]string{"--list-spaces", "--format", "text", "--engine", "sqlite"}) + + if err := cmd.Execute(); err != nil { + t.Fatalf("command error: %v", err) + } + + got := out.String() + if !strings.Contains(got, "DOCS") { + t.Errorf("expected 'DOCS' in list-spaces output, got %q", got) + } +} + +// --- query no-results graceful output --- + +func 
TestRunSearch_NoResults(t *testing.T) {
+	runParallelCommandTest(t)
+
+	repo, store := setupSearchTestRepo(t)
+
+	if err := store.UpdateMeta(); err != nil {
+		t.Fatalf("update meta: %v", err)
+	}
+
+	chdirRepo(t, repo)
+
+	cmd := newSearchCmd()
+	out := new(bytes.Buffer)
+	cmd.SetOut(out)
+	cmd.SetErr(new(bytes.Buffer))
+	cmd.SetArgs([]string{"xyzzy_no_such_term", "--format", "text", "--engine", "sqlite"})
+
+	if err := cmd.Execute(); err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if !strings.Contains(out.String(), "No results found") {
+		t.Errorf("expected 'No results found', got %q", out.String())
+	}
+}
+
+// --- missing query error ---
+
+func TestRunSearch_MissingQuery(t *testing.T) {
+	runParallelCommandTest(t)
+
+	repo, _ := setupSearchTestRepo(t)
+	chdirRepo(t, repo)
+
+	cmd := newSearchCmd()
+	out := new(bytes.Buffer)
+	cmd.SetOut(out)
+	cmd.SetErr(new(bytes.Buffer))
+	// No QUERY arg, no --list-labels, no --list-spaces
+	cmd.SetArgs([]string{"--engine", "sqlite"})
+
+	err := cmd.Execute()
+	if err == nil {
+		t.Fatal("expected error when query is missing")
+	}
+}
+
+// --- bleve engine opens successfully ---
+
+func TestOpenSearchStore_Bleve(t *testing.T) {
+	store, err := openSearchStore("bleve", t.TempDir())
+	if err != nil {
+		t.Fatalf("unexpected error opening bleve store: %v", err)
+	}
+	defer func() { _ = store.Close() }()
+}
diff --git a/docs/usage.md b/docs/usage.md
index 46ff9a8..237a6e5 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -12,7 +12,8 @@ This guide covers day-to-day usage of `conf`.
 - `diff` previews local vs remote content.
 - `init agents` scaffolds an `AGENTS.md` file for AI-assisted authoring.
 - `relink` rewrites absolute Confluence links to local relative Markdown links.
 - `version` prints the CLI version (`conf version` or `conf --version`).
+- `search` indexes and queries local Markdown files with full-text search (zero API calls).
## Requirements @@ -156,6 +156,53 @@ Highlights: - archive deletes require long-task completion (`--archive-task-timeout`, `--archive-task-poll-interval`), - `--preflight` for a concise local push plan (change summary + validation) without remote writes. +### `conf search QUERY` + +Full-text search over local Markdown files. + +Highlights: + +- index is built automatically on first use and updated incrementally, +- two backends available: `--engine sqlite` (default, SQLite FTS5) and `--engine bleve`, +- index stored in `.confluence-search-index/` (local-only, gitignored), +- index updated automatically after each `conf pull` (non-fatal), +- results grouped by file with heading context and snippets, +- `--format auto` defaults to text on TTY, JSON when piped. + +Key flags: + +| Flag | Default | Description | +|------|---------|-------------| +| `--space KEY` | | Filter to a specific Confluence space | +| `--label LABEL` | | Filter by label (repeatable) | +| `--heading TEXT` | | Restrict to sections under matching headings | +| `--limit N` | 20 | Maximum number of results | +| `--reindex` | false | Force full index rebuild | +| `--engine` | sqlite | Backend: `sqlite` or `bleve` | +| `--list-labels` | false | List all indexed labels and exit | +| `--list-spaces` | false | List all indexed spaces and exit | +| `--format` | auto | Output format: `text`, `json`, or `auto` | + +Examples: + +```powershell +# Basic search +conf search "oauth token refresh" + +# Filter by space and label +conf search "deploy pipeline" --space DEV --label ci + +# Restrict to sections under matching headings +conf search "token" --heading "Authentication" + +# Facet discovery +conf search --list-labels --format json +conf search --list-spaces + +# Agent-friendly (piped → JSON automatically) +conf search "security review" --format json | ConvertFrom-Json +``` + ## Metadata and State Markdown frontmatter keys: diff --git a/go.mod b/go.mod index 8c26038..202de87 100644 --- a/go.mod +++ b/go.mod 
@@ -3,6 +3,7 @@ module github.com/rgonek/confluence-markdown-sync go 1.25.5 require ( + github.com/blevesearch/bleve/v2 v2.5.7 github.com/charmbracelet/bubbles v1.0.0 github.com/charmbracelet/bubbletea v1.3.10 github.com/charmbracelet/huh v0.8.0 @@ -14,6 +15,7 @@ require ( golang.org/x/sync v0.19.0 golang.org/x/term v0.40.0 gopkg.in/yaml.v3 v3.0.1 + modernc.org/sqlite v1.46.1 ) require ( @@ -21,7 +23,6 @@ require ( github.com/atotto/clipboard v0.1.4 // indirect github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect github.com/bits-and-blooms/bitset v1.24.4 // indirect - github.com/blevesearch/bleve/v2 v2.5.7 // indirect github.com/blevesearch/bleve_index_api v1.2.11 // indirect github.com/blevesearch/geo v0.2.4 // indirect github.com/blevesearch/go-faiss v1.0.26 // indirect @@ -80,5 +81,4 @@ require ( modernc.org/libc v1.67.6 // indirect modernc.org/mathutil v1.7.1 // indirect modernc.org/memory v1.11.0 // indirect - modernc.org/sqlite v1.46.1 // indirect ) diff --git a/go.sum b/go.sum index a81e897..b66113b 100644 --- a/go.sum +++ b/go.sum @@ -98,8 +98,16 @@ github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6 github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM= github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs= +github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod 
h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= @@ -158,10 +166,10 @@ github.com/yuin/goldmark v1.7.16/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Br go.etcd.io/bbolt v1.4.0 h1:TU77id3TnN/zKr7CO/uk+fBCwF2jGcMuw2B/FMAzYIk= go.etcd.io/bbolt v1.4.0/go.mod h1:AsD+OCi/qPN1giOX1aiLAha3o1U8rAz65bvN4j0sRuk= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= -golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI= -golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo= golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 h1:mgKeJMpvi0yx/sU5GsxQ7p6s2wtOnGAHZWCHUM4KGzY= golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546/go.mod h1:j/pmGrbnkbPtQfxEe5D0VQhZC6qKbfKifgD0oM7sR70= +golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c= +golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU= golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60= golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM= golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= @@ -175,6 +183,8 @@ golang.org/x/term v0.40.0 h1:36e4zGLqU4yhjlmxEaagx2KuYbJq3EwY8K943ZsHcvg= golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM= golang.org/x/text 
v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= +golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc= +golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -184,11 +194,31 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis= +modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0= +modernc.org/ccgo/v4 v4.30.1 h1:4r4U1J6Fhj98NKfSjnPUN7Ze2c6MnAdL0hWw6+LrJpc= +modernc.org/ccgo/v4 v4.30.1/go.mod h1:bIOeI1JL54Utlxn+LwrFyjCx2n2RDiYEaJVSrgdrRfM= +modernc.org/fileutil v1.3.40 h1:ZGMswMNc9JOCrcrakF1HrvmergNLAmxOPjizirpfqBA= +modernc.org/fileutil v1.3.40/go.mod h1:HxmghZSZVAz/LXcMNwZPA/DRrQZEVP9VX0V4LQGQFOc= +modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI= +modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito= +modernc.org/gc/v3 v3.1.1 h1:k8T3gkXWY9sEiytKhcgyiZ2L0DTyCQ/nvX+LoCljoRE= +modernc.org/gc/v3 v3.1.1/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY= +modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks= +modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI= modernc.org/libc v1.67.6 h1:eVOQvpModVLKOdT+LvBPjdQqfrZq+pC39BygcT+E7OI= modernc.org/libc v1.67.6/go.mod 
h1:JAhxUVlolfYDErnwiqaLvUqc8nfb2r6S6slAgZOnaiE= modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU= modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg= modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI= modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw= +modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8= +modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns= +modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w= +modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE= modernc.org/sqlite v1.46.1 h1:eFJ2ShBLIEnUWlLy12raN0Z1plqmFX9Qe3rjQTKt6sU= modernc.org/sqlite v1.46.1/go.mod h1:CzbrU2lSB1DKUusvwGz7rqEKIq+NUd8GWuBBZDs9/nA= +modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0= +modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A= +modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= +modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= diff --git a/internal/search/blevestore/mapping.go b/internal/search/blevestore/mapping.go new file mode 100644 index 0000000..162cf50 --- /dev/null +++ b/internal/search/blevestore/mapping.go @@ -0,0 +1,76 @@ +// Package blevestore implements the search.Store interface backed by Bleve. +package blevestore + +import ( + "github.com/blevesearch/bleve/v2/mapping" +) + +// NewMapping returns the Bleve IndexMapping for the confluence search index. 
+//
+// Field categories:
+//   - keyword (exact match, keyword analyzer): type, path, page_id, space_key, labels, language
+//   - text (standard english analyzer): title, content, heading_text, heading_path_text
+//   - numeric: heading_level, line
+//   - datetime: mod_time
+func NewMapping() mapping.IndexMapping {
+	im := mapping.NewIndexMapping()
+	im.DefaultAnalyzer = "en"
+
+	// --- keyword fields ---
+	kw := mapping.NewKeywordFieldMapping()
+	kw.Store = true
+
+	// --- text fields ---
+	textField := func() *mapping.FieldMapping {
+		fm := mapping.NewTextFieldMapping()
+		fm.Analyzer = "en"
+		fm.Store = true
+		fm.IncludeTermVectors = true
+		return fm
+	}
+
+	// --- numeric field ---
+	num := mapping.NewNumericFieldMapping()
+	num.Store = true
+
+	// --- datetime field ---
+	dt := mapping.NewDateTimeFieldMapping()
+	dt.Store = true
+
+	// Disable dynamic indexing so only explicitly mapped fields are indexed.
+	// (Setting Dynamic on the document mapping is sufficient: it becomes the
+	// index's DefaultMapping below.)
+	dm := mapping.NewDocumentMapping()
+	dm.Dynamic = false
+
+	// keyword fields
+	dm.AddFieldMappingsAt("type", kw)
+	dm.AddFieldMappingsAt("path", kw)
+	dm.AddFieldMappingsAt("page_id", kw)
+	dm.AddFieldMappingsAt("space_key", kw)
+	dm.AddFieldMappingsAt("labels", kw)
+	dm.AddFieldMappingsAt("language", kw)
+
+	// text fields
+	dm.AddFieldMappingsAt("title", textField())
+	dm.AddFieldMappingsAt("content", textField())
+	dm.AddFieldMappingsAt("heading_text", textField())
+	dm.AddFieldMappingsAt("heading_path_text", textField())
+
+	// numeric fields
+	dm.AddFieldMappingsAt("heading_level", num)
+	dm.AddFieldMappingsAt("line", num)
+
+	// datetime field
+	dm.AddFieldMappingsAt("mod_time", dt)
+
+	im.DefaultMapping = dm
+
+	return im
+}
+
+// allDocFields is the list of stored fields to retrieve on a Search hit.
+var allDocFields = []string{ + "type", "path", "page_id", "space_key", "labels", + "language", "title", "content", "heading_text", "heading_path_text", + "heading_level", "line", "mod_time", +} diff --git a/internal/search/blevestore/store.go b/internal/search/blevestore/store.go new file mode 100644 index 0000000..480900d --- /dev/null +++ b/internal/search/blevestore/store.go @@ -0,0 +1,459 @@ +package blevestore + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "sort" + "strings" + "time" + + bleve "github.com/blevesearch/bleve/v2" + "github.com/blevesearch/bleve/v2/search/query" + + search "github.com/rgonek/confluence-markdown-sync/internal/search" +) + +// Compile-time interface check. +var _ search.Store = (*Store)(nil) + +const ( + // indexSubDir is the path under rootDir where the Bleve index is stored. + indexSubDir = ".confluence-search-index/bleve" + + // defaultSearchLimit is the result limit used when SearchOptions.Limit == 0. + defaultSearchLimit = 50 + + // facetSize is the maximum number of facet values returned for ListLabels/ListSpaces. + facetSize = 10000 +) + +// Store is a search.Store backed by Bleve (scorch engine). +type Store struct { + index bleve.Index +} + +// Open opens (or creates) the Bleve index rooted at rootDir. +// The index is stored at /.confluence-search-index/bleve/. 
+func Open(rootDir string) (*Store, error) { + indexPath := filepath.Join(rootDir, indexSubDir) + + var idx bleve.Index + var err error + + if _, statErr := os.Stat(indexPath); os.IsNotExist(statErr) { + m := NewMapping() + idx, err = bleve.New(indexPath, m) + } else { + idx, err = bleve.Open(indexPath) + } + if err != nil { + return nil, fmt.Errorf("blevestore.Open %q: %w", indexPath, err) + } + + return &Store{index: idx}, nil +} + +// --------------------------------------------------------------------------- +// search.Store implementation +// --------------------------------------------------------------------------- + +// Index upserts a batch of documents. +// The caller is expected to call DeleteByPath before re-indexing a path. +func (s *Store) Index(docs []search.Document) error { + b := s.index.NewBatch() + for _, d := range docs { + if err := b.Index(d.ID, docToMap(d)); err != nil { + return fmt.Errorf("blevestore.Index %q: %w", d.ID, err) + } + } + if err := s.index.Batch(b); err != nil { + return fmt.Errorf("blevestore.Index batch: %w", err) + } + return nil +} + +// DeleteByPath removes all indexed documents whose path field equals relPath. +func (s *Store) DeleteByPath(relPath string) error { + tq := query.NewTermQuery(relPath) + tq.SetField("path") + + req := bleve.NewSearchRequestOptions(tq, 10000, 0, false) + req.Fields = []string{} + + res, err := s.index.Search(req) + if err != nil { + return fmt.Errorf("blevestore.DeleteByPath search %q: %w", relPath, err) + } + + b := s.index.NewBatch() + for _, hit := range res.Hits { + b.Delete(hit.ID) + } + if b.Size() == 0 { + return nil + } + if err := s.index.Batch(b); err != nil { + return fmt.Errorf("blevestore.DeleteByPath batch delete %q: %w", relPath, err) + } + return nil +} + +// Search executes a full-text query against the index. 
+func (s *Store) Search(opts search.SearchOptions) ([]search.SearchResult, error) { + limit := opts.Limit + if limit <= 0 { + limit = defaultSearchLimit + } + + q := buildQuery(opts) + + req := bleve.NewSearchRequestOptions(q, limit, 0, false) + req.Fields = allDocFields + req.Highlight = bleve.NewHighlight() + req.Highlight.AddField("content") + req.Highlight.AddField("title") + req.Highlight.AddField("heading_text") + + bleveRes, err := s.index.Search(req) + if err != nil { + return nil, fmt.Errorf("blevestore.Search: %w", err) + } + + results := make([]search.SearchResult, 0, len(bleveRes.Hits)) + for _, hit := range bleveRes.Hits { + doc, err := mapToDoc(hit.ID, hit.Fields) + if err != nil { + continue + } + snippet := extractSnippet(hit.Fragments) + results = append(results, search.SearchResult{ + Document: doc, + Score: hit.Score, + Snippet: snippet, + }) + } + return results, nil +} + +// ListLabels returns all distinct label values present in the index, sorted. +func (s *Store) ListLabels() ([]string, error) { + return s.listFacetTerms("labels") +} + +// ListSpaces returns all distinct space key values present in the index, sorted. +func (s *Store) ListSpaces() ([]string, error) { + return s.listFacetTerms("space_key") +} + +// metaKey is the internal key used to persist the last-indexed-at timestamp +// via Bleve's internal key-value store (independent of the document mapping). +var metaKey = []byte("confluence-sync:last-indexed-at") + +// UpdateMeta records the current UTC timestamp as the last-indexed-at time. +func (s *Store) UpdateMeta() error { + ts := time.Now().UTC().Format(time.RFC3339Nano) + if err := s.index.SetInternal(metaKey, []byte(ts)); err != nil { + return fmt.Errorf("blevestore.UpdateMeta: %w", err) + } + return nil +} + +// LastIndexedAt returns the time recorded by the most recent UpdateMeta call. +// Returns the zero time.Time with nil error if no meta has been recorded yet. 
+func (s *Store) LastIndexedAt() (time.Time, error) { + raw, err := s.index.GetInternal(metaKey) + if err != nil { + return time.Time{}, fmt.Errorf("blevestore.LastIndexedAt: %w", err) + } + if len(raw) == 0 { + return time.Time{}, nil + } + ts, err := time.Parse(time.RFC3339Nano, string(raw)) + if err != nil { + return time.Time{}, fmt.Errorf("blevestore.LastIndexedAt parse %q: %w", raw, err) + } + return ts, nil +} + +// Close releases resources held by the store. +func (s *Store) Close() error { + return s.index.Close() +} + +// --------------------------------------------------------------------------- +// Internal helpers +// --------------------------------------------------------------------------- + +// docToMap converts a search.Document to a flat map for Bleve indexing. +func docToMap(d search.Document) map[string]interface{} { + m := map[string]interface{}{ + "type": d.Type, + "path": d.Path, + "page_id": d.PageID, + "title": d.Title, + "space_key": d.SpaceKey, + "content": d.Content, + "heading_text": d.HeadingText, + "heading_level": float64(d.HeadingLevel), + "language": d.Language, + "line": float64(d.Line), + "mod_time": d.ModTime, + "heading_path_text": strings.Join(d.HeadingPath, " / "), + } + + // Index labels as a multi-valued field so Bleve creates one term per label. + if len(d.Labels) > 0 { + labels := make([]interface{}, len(d.Labels)) + for i, l := range d.Labels { + labels[i] = l + } + m["labels"] = labels + } + + return m +} + +// mapToDoc reconstructs a search.Document from a Bleve hit's Fields map. 
+func mapToDoc(id string, fields map[string]interface{}) (search.Document, error) { + d := search.Document{ID: id} + + if v, ok := fields["type"]; ok { + d.Type = toString(v) + } + if v, ok := fields["path"]; ok { + d.Path = toString(v) + } + if v, ok := fields["page_id"]; ok { + d.PageID = toString(v) + } + if v, ok := fields["title"]; ok { + d.Title = toString(v) + } + if v, ok := fields["space_key"]; ok { + d.SpaceKey = toString(v) + } + if v, ok := fields["content"]; ok { + d.Content = toString(v) + } + if v, ok := fields["heading_text"]; ok { + d.HeadingText = toString(v) + } + if v, ok := fields["language"]; ok { + d.Language = toString(v) + } + if v, ok := fields["heading_level"]; ok { + d.HeadingLevel = toInt(v) + } + if v, ok := fields["line"]; ok { + d.Line = toInt(v) + } + if v, ok := fields["mod_time"]; ok { + if t, err := parseTimeField(v); err == nil { + d.ModTime = t + } + } + if v, ok := fields["labels"]; ok { + d.Labels = toStringSlice(v) + } + if v, ok := fields["heading_path_text"]; ok { + joined := toString(v) + if joined != "" { + d.HeadingPath = strings.Split(joined, " / ") + } + } + + return d, nil +} + +// buildQuery constructs a Bleve query from SearchOptions. +func buildQuery(opts search.SearchOptions) query.Query { + var musts []query.Query + + // Full-text part — disjunction across content/heading_text/title with boosts. + if opts.Query != "" { + var textQueries []query.Query + + addMatch := func(field string, boost float64) { + mq := query.NewMatchQuery(opts.Query) + mq.SetField(field) + mq.SetBoost(boost) + textQueries = append(textQueries, mq) + } + + addMatch("content", 2.0) + addMatch("heading_text", 1.5) + addMatch("title", 1.0) + + dis := query.NewDisjunctionQuery(textQueries) + dis.SetMin(1) + musts = append(musts, dis) + } + + // SpaceKey filter. + if opts.SpaceKey != "" { + tq := query.NewTermQuery(opts.SpaceKey) + tq.SetField("space_key") + musts = append(musts, tq) + } + + // Labels filter — every requested label must appear. 
+ for _, label := range opts.Labels { + tq := query.NewTermQuery(label) + tq.SetField("labels") + musts = append(musts, tq) + } + + // HeadingFilter. + if opts.HeadingFilter != "" { + mq := query.NewMatchQuery(opts.HeadingFilter) + mq.SetField("heading_text") + musts = append(musts, mq) + } + + // Types filter. + if len(opts.Types) > 0 { + typeQueries := make([]query.Query, len(opts.Types)) + for i, t := range opts.Types { + tq := query.NewTermQuery(t) + tq.SetField("type") + typeQueries[i] = tq + } + if len(typeQueries) == 1 { + musts = append(musts, typeQueries[0]) + } else { + dis := query.NewDisjunctionQuery(typeQueries) + dis.SetMin(1) + musts = append(musts, dis) + } + } + + switch len(musts) { + case 0: + return query.NewMatchAllQuery() + case 1: + return musts[0] + default: + return query.NewConjunctionQuery(musts) + } +} + +// listFacetTerms runs a match-all query with a facet on field and returns the +// distinct term values, sorted alphabetically. +func (s *Store) listFacetTerms(field string) ([]string, error) { + q := query.NewMatchAllQuery() + req := bleve.NewSearchRequestOptions(q, 0, 0, false) + req.AddFacet(field, bleve.NewFacetRequest(field, facetSize)) + + res, err := s.index.Search(req) + if err != nil { + return nil, fmt.Errorf("blevestore.listFacetTerms(%q): %w", field, err) + } + + facet, ok := res.Facets[field] + if !ok || facet == nil || facet.Terms == nil { + return []string{}, nil + } + + terms := facet.Terms.Terms() + out := make([]string, 0, len(terms)) + for _, t := range terms { + if t.Term != "" { + out = append(out, t.Term) + } + } + sort.Strings(out) + return out, nil +} + +// extractSnippet picks the first available fragment from a hit. 
// extractSnippet picks the first available fragment from a hit, preferring
// content over title over heading text.
func extractSnippet(fragments map[string][]string) string {
	preferred := [...]string{"content", "title", "heading_text"}
	for _, name := range preferred {
		candidates := fragments[name]
		if len(candidates) > 0 {
			return candidates[0]
		}
	}
	return ""
}

// ---------------------------------------------------------------------------
// Type conversion helpers
// ---------------------------------------------------------------------------

// toString coerces a stored field value into a string: nil becomes "", strings
// pass through, anything else is formatted with %v.
func toString(v interface{}) string {
	if v == nil {
		return ""
	}
	if s, ok := v.(string); ok {
		return s
	}
	return fmt.Sprintf("%v", v)
}

// toInt coerces a stored numeric field into an int. Bleve hands back float64
// for numeric fields; int/int64 are accepted for direct callers. Anything else
// maps to 0.
func toInt(v interface{}) int {
	if f, ok := v.(float64); ok {
		return int(f)
	}
	if i, ok := v.(int); ok {
		return i
	}
	if i64, ok := v.(int64); ok {
		return int(i64)
	}
	return 0
}

// toStringSlice coerces a stored multi-value field into []string. A single
// non-empty string becomes a one-element slice; []interface{} keeps only its
// string elements; anything else yields nil.
func toStringSlice(v interface{}) []string {
	switch val := v.(type) {
	case []string:
		return val
	case string:
		if val == "" {
			return nil
		}
		return []string{val}
	case []interface{}:
		out := make([]string, 0, len(val))
		for _, item := range val {
			s, ok := item.(string)
			if ok {
				out = append(out, s)
			}
		}
		return out
	}
	return nil
}

// parseTimeField parses a time value from a Bleve stored field.
// Bleve stores datetime fields as RFC3339 strings.
func parseTimeField(v interface{}) (time.Time, error) {
	switch val := v.(type) {
	case time.Time:
		// Already a time.Time (possible when the doc never round-tripped
		// through storage).
		return val, nil
	case string:
		for _, layout := range []string{
			time.RFC3339Nano,
			time.RFC3339,
			// NOTE(review): this layout looks redundant — RFC3339Nano's
			// "Z07:00" zone already matches a literal "Z". Confirm before
			// removing.
			"2006-01-02T15:04:05.999999999Z",
		} {
			if t, err := time.Parse(layout, val); err == nil {
				return t, nil
			}
		}
		return time.Time{}, fmt.Errorf("cannot parse time %q", val)
	default:
		// Last resort: round-trip through JSON for any other representation.
		b, err := json.Marshal(val)
		if err != nil {
			return time.Time{}, fmt.Errorf("cannot marshal time value: %w", err)
		}
		var t time.Time
		if err := json.Unmarshal(b, &t); err != nil {
			return time.Time{}, fmt.Errorf("cannot unmarshal time value: %w", err)
		}
		return t, nil
	}
}
diff --git a/internal/search/blevestore/store_test.go b/internal/search/blevestore/store_test.go
new file mode 100644
index 0000000..725e91c
--- /dev/null
+++ b/internal/search/blevestore/store_test.go
@@ -0,0 +1,553 @@
package blevestore

import (
	"fmt"
	"sort"
	"strings"
	"testing"
	"time"

	search "github.com/rgonek/confluence-markdown-sync/internal/search"
)

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

// openTestStore opens a Store in a fresh temp dir and closes it on cleanup.
func openTestStore(t *testing.T) *Store {
	t.Helper()
	s, err := Open(t.TempDir())
	if err != nil {
		t.Fatalf("Open: %v", err)
	}
	t.Cleanup(func() { _ = s.Close() })
	return s
}

// pageDoc builds a page-type Document with the given facets for tests.
func pageDoc(id, path, space, title, content string, labels ...string) search.Document {
	return search.Document{
		ID:       id,
		Type:     search.DocTypePage,
		Path:     path,
		SpaceKey: space,
		Title:    title,
		Content:  content,
		Labels:   labels,
		ModTime:  time.Now().Truncate(time.Second),
	}
}

// sectionDoc builds a section-type Document anchored at a heading.
func sectionDoc(id, path, space, title, headingText, content string, headingLevel, line int) search.Document {
	return search.Document{
		ID:           id,
		Type:         search.DocTypeSection,
		Path:         path,
		SpaceKey:     space,
		Title:        title,
		HeadingText:  headingText,
		Content:      content,
		HeadingLevel: headingLevel,
		Line:         line,
		ModTime:      time.Now().Truncate(time.Second),
	}
}

// mustIndex indexes docs into s and fails the test on error.
func mustIndex(t *testing.T, s *Store, docs ...search.Document) {
	t.Helper()
	if err := s.Index(docs); err != nil {
		t.Fatalf("Index: %v", err)
	}
}

// sortedIDs extracts and sorts doc IDs from results for deterministic assertions.
func sortedIDs(results []search.SearchResult) []string {
	ids := make([]string, len(results))
	for i, r := range results {
		ids[i] = r.Document.ID
	}
	sort.Strings(ids)
	return ids
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

// TestOpenClose verifies create, close, and re-open of a persisted index.
func TestOpenClose(t *testing.T) {
	// Test basic open/close without using openTestStore (to avoid double-close).
	dir := t.TempDir()

	s1, err := Open(dir)
	if err != nil {
		t.Fatalf("Open (create): %v", err)
	}
	if s1 == nil {
		t.Fatal("expected non-nil Store")
	}
	if err := s1.Close(); err != nil {
		t.Fatalf("Close s1: %v", err)
	}

	// Re-open the same directory to verify the index persists.
	s2, err := Open(dir)
	if err != nil {
		t.Fatalf("Open (reopen): %v", err)
	}
	if err := s2.Close(); err != nil {
		t.Fatalf("Close s2: %v", err)
	}
}

// TestIndexAndBasicTextSearch verifies indexing and ranked full-text search.
func TestIndexAndBasicTextSearch(t *testing.T) {
	s := openTestStore(t)

	docs := []search.Document{
		pageDoc("page:DEV/auth.md", "DEV/auth.md", "DEV", "Authentication Guide",
			"OAuth2 and JWT are the primary authentication mechanisms."),
		pageDoc("page:DEV/deploy.md", "DEV/deploy.md", "DEV", "Deployment Guide",
			"Kubernetes cluster deployment with Helm charts."),
	}
	mustIndex(t, s, docs...)

	results, err := s.Search(search.SearchOptions{Query: "authentication"})
	if err != nil {
		t.Fatalf("Search: %v", err)
	}
	if len(results) == 0 {
		t.Fatal("expected at least 1 result for 'authentication'")
	}
	if results[0].Document.ID != "page:DEV/auth.md" {
		t.Errorf("expected top result to be auth.md, got %s", results[0].Document.ID)
	}
	if results[0].Score <= 0 {
		t.Error("expected positive score")
	}
}

// TestSearchReturnsAllFieldsRoundTrip verifies stored fields survive
// index -> search -> Document reconstruction.
func TestSearchReturnsAllFieldsRoundTrip(t *testing.T) {
	s := openTestStore(t)

	modTime := time.Date(2025, 3, 1, 12, 0, 0, 0, time.UTC)
	doc := search.Document{
		ID:           "page:SPACE/overview.md",
		Type:         search.DocTypePage,
		Path:         "SPACE/overview.md",
		PageID:       "12345",
		Title:        "Project Overview",
		SpaceKey:     "SPACE",
		Labels:       []string{"docs", "public"},
		Content:      "This document provides an overview of the project architecture.",
		HeadingPath:  []string{"# Overview", "## Architecture"},
		HeadingText:  "",
		HeadingLevel: 0,
		Language:     "",
		Line:         0,
		ModTime:      modTime,
	}
	mustIndex(t, s, doc)

	results, err := s.Search(search.SearchOptions{Query: "architecture"})
	if err != nil {
		t.Fatalf("Search: %v", err)
	}
	if len(results) == 0 {
		t.Fatal("expected at least 1 result")
	}

	got := results[0].Document
	if got.ID != doc.ID {
		t.Errorf("ID: got %q, want %q", got.ID, doc.ID)
	}
	if got.Type != doc.Type {
		t.Errorf("Type: got %q, want %q", got.Type, doc.Type)
	}
	if got.Path != doc.Path {
		t.Errorf("Path: got %q, want %q", got.Path, doc.Path)
	}
	if got.PageID != doc.PageID {
		t.Errorf("PageID: got %q, want %q", got.PageID, doc.PageID)
	}
	if got.Title != doc.Title {
		t.Errorf("Title: got %q, want %q", got.Title, doc.Title)
	}
	if got.SpaceKey != doc.SpaceKey {
		t.Errorf("SpaceKey: got %q, want %q", got.SpaceKey, doc.SpaceKey)
	}
}

// TestDeleteByPath verifies all docs for a path are removed while others stay.
func TestDeleteByPath(t *testing.T) {
	s := openTestStore(t)

	docs := []search.Document{
		pageDoc("page:DEV/a.md", "DEV/a.md", "DEV", "Page A", "content about golang"),
		sectionDoc("section:DEV/a.md:10", "DEV/a.md", "DEV", "Page A", "Intro", "intro content golang", 1, 10),
		pageDoc("page:DEV/b.md", "DEV/b.md", "DEV", "Page B", "content about python"),
	}
	mustIndex(t, s, docs...)

	// Confirm both a.md docs are indexed.
	res, err := s.Search(search.SearchOptions{Query: "golang"})
	if err != nil {
		t.Fatalf("Search: %v", err)
	}
	if len(res) < 2 {
		t.Fatalf("expected >=2 results before delete, got %d", len(res))
	}

	if err := s.DeleteByPath("DEV/a.md"); err != nil {
		t.Fatalf("DeleteByPath: %v", err)
	}

	// a.md docs should be gone; b.md should remain.
	res, err = s.Search(search.SearchOptions{Query: "golang"})
	if err != nil {
		t.Fatalf("Search after delete: %v", err)
	}
	for _, r := range res {
		if strings.Contains(r.Document.Path, "a.md") {
			t.Errorf("found deleted doc: %s", r.Document.ID)
		}
	}

	res, err = s.Search(search.SearchOptions{Query: "python"})
	if err != nil {
		t.Fatalf("Search b.md: %v", err)
	}
	if len(res) == 0 {
		t.Error("expected b.md to still be indexed")
	}
}

// TestDeleteByPathNoop verifies deleting an unknown path is not an error.
func TestDeleteByPathNoop(t *testing.T) {
	s := openTestStore(t)
	// Deleting a non-existent path should not error.
	if err := s.DeleteByPath("nonexistent/path.md"); err != nil {
		t.Fatalf("DeleteByPath noop: %v", err)
	}
}

// TestFilterBySpaceKey verifies the SpaceKey filter narrows results.
func TestFilterBySpaceKey(t *testing.T) {
	s := openTestStore(t)

	docs := []search.Document{
		pageDoc("page:DEV/a.md", "DEV/a.md", "DEV", "Dev Page", "microservice deployment"),
		pageDoc("page:OPS/b.md", "OPS/b.md", "OPS", "Ops Page", "microservice deployment"),
		pageDoc("page:QA/c.md", "QA/c.md", "QA", "QA Page", "microservice testing"),
	}
	mustIndex(t, s, docs...)

	res, err := s.Search(search.SearchOptions{
		Query:    "microservice",
		SpaceKey: "DEV",
	})
	if err != nil {
		t.Fatalf("Search: %v", err)
	}
	if len(res) != 1 {
		t.Fatalf("expected 1 result for DEV, got %d", len(res))
	}
	if res[0].Document.SpaceKey != "DEV" {
		t.Errorf("expected DEV space, got %q", res[0].Document.SpaceKey)
	}
}

// TestFilterByLabels verifies label filters are AND-ed together.
func TestFilterByLabels(t *testing.T) {
	s := openTestStore(t)

	docs := []search.Document{
		pageDoc("page:a.md", "a.md", "DEV", "A", "content", "go", "backend"),
		pageDoc("page:b.md", "b.md", "DEV", "B", "content", "go", "frontend"),
		pageDoc("page:c.md", "c.md", "DEV", "C", "content", "python"),
	}
	mustIndex(t, s, docs...)

	// Filter: label "go" AND "backend" — should match only a.md.
	res, err := s.Search(search.SearchOptions{
		Labels: []string{"go", "backend"},
	})
	if err != nil {
		t.Fatalf("Search: %v", err)
	}
	ids := sortedIDs(res)
	if len(ids) != 1 || ids[0] != "page:a.md" {
		t.Errorf("expected [page:a.md], got %v", ids)
	}

	// Filter: label "go" only — should match a.md and b.md.
	res, err = s.Search(search.SearchOptions{
		Labels: []string{"go"},
	})
	if err != nil {
		t.Fatalf("Search: %v", err)
	}
	ids = sortedIDs(res)
	if len(ids) != 2 {
		t.Errorf("expected 2 results for label 'go', got %v", ids)
	}
}

// TestListLabels verifies distinct labels are returned sorted.
func TestListLabels(t *testing.T) {
	s := openTestStore(t)

	docs := []search.Document{
		pageDoc("page:a.md", "a.md", "S", "A", "content", "alpha", "beta"),
		pageDoc("page:b.md", "b.md", "S", "B", "content", "beta", "gamma"),
		pageDoc("page:c.md", "c.md", "S", "C", "content"),
	}
	mustIndex(t, s, docs...)

	labels, err := s.ListLabels()
	if err != nil {
		t.Fatalf("ListLabels: %v", err)
	}

	want := []string{"alpha", "beta", "gamma"}
	if !equalStringSlice(labels, want) {
		t.Errorf("ListLabels: got %v, want %v", labels, want)
	}
}

// TestListLabelsEmpty verifies an empty index yields no labels.
func TestListLabelsEmpty(t *testing.T) {
	s := openTestStore(t)
	labels, err := s.ListLabels()
	if err != nil {
		t.Fatalf("ListLabels empty: %v", err)
	}
	if len(labels) != 0 {
		t.Errorf("expected empty, got %v", labels)
	}
}

// TestListSpaces verifies distinct space keys are returned.
func TestListSpaces(t *testing.T) {
	s := openTestStore(t)

	docs := []search.Document{
		pageDoc("page:DEV/a.md", "DEV/a.md", "DEV", "A", "content"),
		pageDoc("page:OPS/b.md", "OPS/b.md", "OPS", "B", "content"),
		pageDoc("page:OPS/c.md", "OPS/c.md", "OPS", "C", "content"),
	}
	mustIndex(t, s, docs...)

	spaces, err := s.ListSpaces()
	if err != nil {
		t.Fatalf("ListSpaces: %v", err)
	}

	want := []string{"DEV", "OPS"}
	if !equalStringSlice(spaces, want) {
		t.Errorf("ListSpaces: got %v, want %v", spaces, want)
	}
}

// TestUpdateMetaAndLastIndexedAt verifies the index timestamp lifecycle.
func TestUpdateMetaAndLastIndexedAt(t *testing.T) {
	s := openTestStore(t)

	// Before any UpdateMeta, LastIndexedAt should return zero time.
	ts, err := s.LastIndexedAt()
	if err != nil {
		t.Fatalf("LastIndexedAt (initial): %v", err)
	}
	if !ts.IsZero() {
		t.Errorf("expected zero time before UpdateMeta, got %v", ts)
	}

	before := time.Now().UTC().Truncate(time.Second)
	if err := s.UpdateMeta(); err != nil {
		t.Fatalf("UpdateMeta: %v", err)
	}
	after := time.Now().UTC().Add(time.Second)

	ts, err = s.LastIndexedAt()
	if err != nil {
		t.Fatalf("LastIndexedAt: %v", err)
	}
	if ts.IsZero() {
		t.Fatal("expected non-zero time after UpdateMeta")
	}
	if ts.Before(before) || ts.After(after) {
		t.Errorf("LastIndexedAt %v is outside [%v, %v]", ts, before, after)
	}
}

// TestUpdateMetaMultipleTimes verifies repeated UpdateMeta calls move forward.
func TestUpdateMetaMultipleTimes(t *testing.T) {
	s := openTestStore(t)

	if err := s.UpdateMeta(); err != nil {
		t.Fatalf("UpdateMeta 1: %v", err)
	}
	ts1, err := s.LastIndexedAt()
	if err != nil {
		t.Fatalf("LastIndexedAt 1: %v", err)
	}

	time.Sleep(10 * time.Millisecond)

	if err := s.UpdateMeta(); err != nil {
		t.Fatalf("UpdateMeta 2: %v", err)
	}
	ts2, err := s.LastIndexedAt()
	if err != nil {
		t.Fatalf("LastIndexedAt 2: %v", err)
	}

	if ts2.Before(ts1) {
		t.Errorf("second UpdateMeta should record a time >= first: ts1=%v ts2=%v", ts1, ts2)
	}
}

// TestUpsertBehavior verifies delete + re-index replaces a document cleanly.
func TestUpsertBehavior(t *testing.T) {
	s := openTestStore(t)

	// Index original document.
	original := pageDoc("page:SPACE/page.md", "SPACE/page.md", "SPACE", "Original Title", "original content")
	mustIndex(t, s, original)

	res, err := s.Search(search.SearchOptions{Query: "original"})
	if err != nil {
		t.Fatalf("Search original: %v", err)
	}
	if len(res) == 0 {
		t.Fatal("expected result for original content")
	}

	// Delete and re-index with updated content.
	if err := s.DeleteByPath("SPACE/page.md"); err != nil {
		t.Fatalf("DeleteByPath: %v", err)
	}

	updated := pageDoc("page:SPACE/page.md", "SPACE/page.md", "SPACE", "Updated Title", "updated content about refactoring")
	mustIndex(t, s, updated)

	// Old content should not match.
	res, err = s.Search(search.SearchOptions{Query: "original"})
	if err != nil {
		t.Fatalf("Search original after upsert: %v", err)
	}
	for _, r := range res {
		if r.Document.ID == "page:SPACE/page.md" {
			t.Error("found old content after upsert")
		}
	}

	// New content should match.
	res, err = s.Search(search.SearchOptions{Query: "refactoring"})
	if err != nil {
		t.Fatalf("Search updated: %v", err)
	}
	if len(res) == 0 {
		t.Fatal("expected result for updated content")
	}
	if res[0].Document.Title != "Updated Title" {
		t.Errorf("expected updated title, got %q", res[0].Document.Title)
	}
}

// TestSearchEmptyQueryReturnsAll verifies an empty query is a match-all.
func TestSearchEmptyQueryReturnsAll(t *testing.T) {
	s := openTestStore(t)

	docs := []search.Document{
		pageDoc("page:a.md", "a.md", "S", "A", "foo"),
		pageDoc("page:b.md", "b.md", "S", "B", "bar"),
	}
	mustIndex(t, s, docs...)

	res, err := s.Search(search.SearchOptions{Limit: 100})
	if err != nil {
		t.Fatalf("Search empty: %v", err)
	}
	if len(res) < 2 {
		t.Errorf("expected >=2 results for empty query, got %d", len(res))
	}
}

// TestSearchByType verifies the Types filter restricts the doc type.
func TestSearchByType(t *testing.T) {
	s := openTestStore(t)

	docs := []search.Document{
		pageDoc("page:DEV/a.md", "DEV/a.md", "DEV", "Auth", "authentication token"),
		sectionDoc("section:DEV/a.md:5", "DEV/a.md", "DEV", "Auth", "Token Auth", "authentication token section", 2, 5),
	}
	mustIndex(t, s, docs...)

	res, err := s.Search(search.SearchOptions{
		Query: "authentication",
		Types: []string{search.DocTypeSection},
	})
	if err != nil {
		t.Fatalf("Search by type: %v", err)
	}
	for _, r := range res {
		if r.Document.Type != search.DocTypeSection {
			t.Errorf("unexpected type %q in results filtered to section", r.Document.Type)
		}
	}
	if len(res) == 0 {
		t.Error("expected at least 1 section result")
	}
}

// TestSearchLimit verifies the Limit option caps the result count.
func TestSearchLimit(t *testing.T) {
	s := openTestStore(t)

	var docs []search.Document
	for i := 0; i < 20; i++ {
		docs = append(docs, pageDoc(
			fmt.Sprintf("page:S/page%d.md", i),
			fmt.Sprintf("S/page%d.md", i),
			"S",
			fmt.Sprintf("Page %d", i),
			"common keyword content",
		))
	}
	mustIndex(t, s, docs...)

	res, err := s.Search(search.SearchOptions{Query: "common", Limit: 5})
	if err != nil {
		t.Fatalf("Search with limit: %v", err)
	}
	if len(res) > 5 {
		t.Errorf("expected <=5 results, got %d", len(res))
	}
}

// TestSnippetIsPopulated verifies a highlighted snippet is usable when present.
func TestSnippetIsPopulated(t *testing.T) {
	s := openTestStore(t)

	doc := pageDoc("page:DEV/a.md", "DEV/a.md", "DEV", "Guide",
		"Distributed tracing helps you understand latency across services.")
	mustIndex(t, s, doc)

	res, err := s.Search(search.SearchOptions{Query: "tracing"})
	if err != nil {
		t.Fatalf("Search: %v", err)
	}
	if len(res) == 0 {
		t.Fatal("expected result")
	}
	// Snippet may be empty if the highlighter doesn't match (acceptable),
	// but if present it should be non-empty string.
	if res[0].Snippet != "" && strings.TrimSpace(res[0].Snippet) == "" {
		t.Error("snippet is whitespace-only")
	}
}

// ---------------------------------------------------------------------------
// Assertion helpers
// ---------------------------------------------------------------------------

// equalStringSlice compares two string slices after sorting.
func equalStringSlice(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}
	ac := append([]string(nil), a...)
	bc := append([]string(nil), b...)
	sort.Strings(ac)
	sort.Strings(bc)
	for i := range ac {
		if ac[i] != bc[i] {
			return false
		}
	}
	return true
}
From a3d521126d5497f918691b50d71fd1a4367c1836 Mon Sep 17 00:00:00 2001
From: Robert Gonek
Date: Tue, 3 Mar 2026 10:12:33 +0100
Subject: [PATCH 5/6] style: apply gofmt formatting to all Go source files

---
 cmd/root.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmd/root.go b/cmd/root.go
index 081ae7d..fd94fca 100644
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -107,6 +107,7 @@ func init() {
 		newRelinkCmd(),
 		newVersionCmd(),
 		newDoctorCmd(),
+		newSearchCmd(),
 	)
 }
 
From 8b7812e0fb70bf80f27889a17f8b033081981dc1 Mon Sep 17 00:00:00 2001
From: Robert Gonek
Date: Tue, 3 Mar 2026 10:22:40 +0100
Subject: [PATCH 6/6] fix(search): resolve linting issues in indexer tests

---
 internal/search/indexer_test.go | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/internal/search/indexer_test.go b/internal/search/indexer_test.go
index 355c032..7a031c1 100644
--- a/internal/search/indexer_test.go
+++ b/internal/search/indexer_test.go
@@ -32,10 +32,10 @@ func newTestIndexer(t *testing.T) (*search.Indexer, string) {
 func writeMarkdownFile(t *testing.T, repoDir, relPath, content string) {
 	t.Helper()
 	absPath := filepath.Join(repoDir, filepath.FromSlash(relPath))
-	if err := os.MkdirAll(filepath.Dir(absPath), 0o755); err != nil {
+	if err := os.MkdirAll(filepath.Dir(absPath), 0o755); err != nil { //nolint:gosec // test data
 		t.Fatalf("mkdir %s: %v", filepath.Dir(absPath), err)
 	}
-	if err := os.WriteFile(absPath, []byte(content), 0o644); err != nil {
+	if err := os.WriteFile(absPath, []byte(content), 0o644); err != nil { //nolint:gosec // test data
 		t.Fatalf("write %s: %v", absPath, err)
 	}
 }
@@ -93,7 +93,7 @@ func TestIndexer_IndexSpace(t *testing.T) {
 	ix, repoDir := newTestIndexer(t)
 
 	spaceDir := filepath.Join(repoDir, "DEV")
-	if err := os.MkdirAll(spaceDir, 0o755); err != nil {
+	if err := 
os.MkdirAll(spaceDir, 0o755); err != nil { //nolint:gosec // test data t.Fatalf("mkdir: %v", err) } writeMarkdownFile(t, repoDir, "DEV/overview.md", sampleMD) @@ -240,11 +240,3 @@ func openStoreFromIndexer(t *testing.T, repoDir string) *sqlitestore.Store { // — compile-time interface check — var _ search.Store = (*sqlitestore.Store)(nil) - -// — time stub for incremental test — -func mustNotBeZero(t *testing.T, ts time.Time, label string) { - t.Helper() - if ts.IsZero() { - t.Errorf("%s: expected non-zero time", label) - } -}