From 919ddc39a21e52bdd570f1f360d28100367e746e Mon Sep 17 00:00:00 2001 From: nludwig Date: Thu, 12 Feb 2026 14:39:20 -0800 Subject: [PATCH 1/6] add (provisional) indexing and querying of prs --- .env.sample | 4 + .gitignore | 3 +- .simili.yaml | 1 + README.md | 40 +- cmd/simili/commands/index.go | 127 +++++-- cmd/simili/commands/pr_duplicate.go | 446 +++++++++++++++++++++++ cmd/simili/commands/pr_duplicate_test.go | 70 ++++ cmd/simili/commands/pr_support.go | 128 +++++++ internal/core/config/config.go | 17 +- internal/integrations/gemini/llm.go | 53 +++ internal/integrations/gemini/prompts.go | 57 +++ internal/integrations/github/client.go | 40 ++ 12 files changed, 954 insertions(+), 32 deletions(-) create mode 100644 cmd/simili/commands/pr_duplicate.go create mode 100644 cmd/simili/commands/pr_duplicate_test.go create mode 100644 cmd/simili/commands/pr_support.go diff --git a/.env.sample b/.env.sample index a12817c..889f1ff 100644 --- a/.env.sample +++ b/.env.sample @@ -6,6 +6,10 @@ # ============================================================================= QDRANT_URL=https://your-cluster.qdrant.io:6333 QDRANT_API_KEY=your-qdrant-api-key +# Main issue collection +QDRANT_COLLECTION=your-issues-collection +# Optional dedicated pull-request collection +# QDRANT_PR_COLLECTION=your-pr-collection # ============================================================================= # REQUIRED: Embedding Provider (at least one) diff --git a/.gitignore b/.gitignore index 3bf30fb..8e8ec78 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,5 @@ go.work.sum .env.local # Local binaries and configs -simili - +/simili diff --git a/.simili.yaml b/.simili.yaml index e77e4ca..d86c8d1 100644 --- a/.simili.yaml +++ b/.simili.yaml @@ -2,6 +2,7 @@ qdrant: url: ${QDRANT_URL} api_key: ${QDRANT_API_KEY} collection: ${QDRANT_COLLECTION} + pr_collection: ${QDRANT_PR_COLLECTION} embedding: provider: gemini api_key: ${GEMINI_API_KEY} diff --git a/README.md b/README.md index 3c3351b..616a8e8 100644 --- a/README.md +++ b/README.md @@ -104,12 +104,36 @@ Bulk index issues from a GitHub repository into the vector database. simili index --repo owner/repo --workers 5 --limit 100 ``` +Optionally index pull requests (metadata-only) into a separate PR collection. + +```bash +simili index --repo owner/repo --workers 5 --include-prs +``` + **Flags:** - `--repo` (required): Target repository (owner/name) - `--workers`: Number of concurrent workers (default: 5) -- `--since`: Start from issue number or timestamp -- `--limit`: Maximum issues to index +- `--since`: RFC3339 timestamp filter (uses GitHub `updated_at`) - `--dry-run`: Simulate without writing to database +- `--include-prs`: Also index pull requests (metadata-only) +- `--pr-collection`: Override PR collection name (default: `qdrant.pr_collection` or `QDRANT_PR_COLLECTION`) + +### `simili pr-duplicate` + +Check whether a pull request appears to be a duplicate of existing issues or pull requests. +This command searches both the issue collection and PR collection, then runs an LLM duplicate decision. + +```bash +simili pr-duplicate --repo owner/repo --number 123 --top-k 8 +``` + +**Flags:** +- `--repo` (required): Target repository (owner/name) +- `--number` (required): Pull request number +- `--top-k`: Maximum combined candidates to evaluate (default: 8) +- `--threshold`: Similarity threshold override +- `--pr-collection`: Override PR collection name +- `--json`: Emit JSON output only ### `simili process` @@ -181,11 +205,17 @@ Create a JSON file with an array of issues: # 1. Index repository issues simili index --repo ballerina-platform/ballerina-library --workers 10 -# 2. Prepare test issues in batch.json -# 3. Run batch analysis +# 2. Index PRs into separate collection +simili index --repo ballerina-platform/ballerina-library --workers 10 --include-prs + +# 3. Check if a PR duplicates prior issues/PRs +simili pr-duplicate --repo ballerina-platform/ballerina-library --number 123 --top-k 10 + +# 4. Prepare test issues in batch.json +# 5. Run batch analysis simili batch --file batch.json --format csv --out-file analysis.csv --workers 5 -# 4. Review results +# 6. Review results cat analysis.csv ``` diff --git a/cmd/simili/commands/index.go b/cmd/simili/commands/index.go index 974f632..64a1600 100644 --- a/cmd/simili/commands/index.go +++ b/cmd/simili/commands/index.go @@ -25,12 +25,13 @@ import ( ) var ( - indexRepo string - indexSince string // Can be a timestamp (ISO8601) or issue number (int) - indexWorkers int - indexToken string - indexDryRun bool - indexIncludePRs bool + indexRepo string + indexSince string // Timestamp (RFC3339), mapped to GitHub's "updated_at" filter. + indexWorkers int + indexToken string + indexDryRun bool + indexIncludePRs bool + indexPRCollection string ) type Checkpoint struct { @@ -41,11 +42,14 @@ type Checkpoint struct { // indexCmd represents the index command var indexCmd = &cobra.Command{ Use: "index", - Short: "Bulk index issues into the vector database", + Short: "Bulk index issues (and optionally PRs) into the vector database", Long: `Index existing issues from a GitHub repository into the Qdrant vector database. It fetches issues, comments, chunks the text, generates embeddings using the active AI provider, and stores them for semantic search. +Optionally, pull requests can also be indexed into a dedicated PR collection +using metadata (title, description, changed file paths, and linked issues). + Supports resuming via a local checkpoint file or --since flag.`, Run: runIndex, } @@ -54,11 +58,12 @@ func init() { rootCmd.AddCommand(indexCmd) indexCmd.Flags().StringVar(&indexRepo, "repo", "", "Target repository (owner/name)") - indexCmd.Flags().StringVar(&indexSince, "since", "", "Start indexing from this issue number or timestamp") + indexCmd.Flags().StringVar(&indexSince, "since", "", "Start indexing from this RFC3339 timestamp (filters by updated_at)") indexCmd.Flags().IntVar(&indexWorkers, "workers", 5, "Number of concurrent workers") indexCmd.Flags().StringVar(&indexToken, "token", "", "GitHub token (optional, defaults to GITHUB_TOKEN env var)") indexCmd.Flags().BoolVar(&indexDryRun, "dry-run", false, "Simulate indexing without writing to DB") - indexCmd.Flags().BoolVar(&indexIncludePRs, "include-prs", true, "Include pull requests in indexing") + indexCmd.Flags().BoolVar(&indexIncludePRs, "include-prs", false, "Also index pull requests (metadata only) into PR collection") + indexCmd.Flags().StringVar(&indexPRCollection, "pr-collection", "", "Override PR collection name (default: qdrant.pr_collection or QDRANT_PR_COLLECTION)") if err := indexCmd.MarkFlagRequired("repo"); err != nil { log.Fatalf("Failed to mark repo flag as required: %v", err) @@ -77,6 +82,7 @@ func runIndex(cmd *cobra.Command, args []string) { if err != nil { log.Fatalf("Failed to load config: %v", err) } + prCollection := resolvePRCollection(cfg, indexPRCollection) // 2. Auth & Clients token := indexToken @@ -112,6 +118,13 @@ func runIndex(cmd *cobra.Command, args []string) { if err != nil { log.Fatalf("Failed to create/verify collection: %v", err) } + + if indexIncludePRs && prCollection != cfg.Qdrant.Collection { + err = qdrantClient.CreateCollection(ctx, prCollection, cfg.Embedding.Dimensions) + if err != nil { + log.Fatalf("Failed to create/verify PR collection: %v", err) + } + } } // 3. Parse Repo @@ -125,7 +138,7 @@ func runIndex(cmd *cobra.Command, args []string) { // Checkpoint logic omitted for simplicity in v0.1.0 as standard pagination handles most cases. // Users can rely on --since for updates. - log.Printf("Starting indexing for %s/%s with %d workers...", org, repoName, indexWorkers) + log.Printf("Starting indexing for %s/%s with %d workers (include PRs: %t)...", org, repoName, indexWorkers, indexIncludePRs) // Fetch loop page := 1 @@ -133,7 +146,8 @@ func runIndex(cmd *cobra.Command, args []string) { // Job channel type Job struct { - Issue *github.Issue + Issue *github.Issue + IsPullRequest bool } jobs := make(chan Job, indexWorkers) var wg sync.WaitGroup @@ -144,7 +158,11 @@ func runIndex(cmd *cobra.Command, args []string) { go func(id int) { defer wg.Done() for job := range jobs { - processIssue(ctx, id, job.Issue, ghClient, geminiClient, qdrantClient, splitter, cfg.Qdrant.Collection, org, repoName, indexDryRun) + if job.IsPullRequest { + processPullRequest(ctx, id, job.Issue, ghClient, geminiClient, qdrantClient, splitter, prCollection, org, repoName, indexDryRun) + } else { + processIssue(ctx, id, job.Issue, ghClient, geminiClient, qdrantClient, splitter, cfg.Qdrant.Collection, org, repoName, indexDryRun) + } } }(i) } @@ -152,7 +170,7 @@ func runIndex(cmd *cobra.Command, args []string) { // Issue Producer opts := &github.IssueListByRepoOptions{ State: "all", - Sort: "created", + Sort: "updated", Direction: "asc", ListOptions: github.ListOptions{PerPage: 100}, } @@ -181,10 +199,13 @@ func runIndex(cmd *cobra.Command, args []string) { log.Printf("Fetched page %d (%d issues)", page, len(issues)) for _, issue := range issues { - if !indexIncludePRs && issue.IsPullRequest() { + if issue.IsPullRequest() { + if indexIncludePRs { + jobs <- Job{Issue: issue, IsPullRequest: true} + } continue } - jobs <- Job{Issue: issue} + jobs <- Job{Issue: issue, IsPullRequest: false} } if resp.NextPage == 0 { @@ -248,11 +269,6 @@ func processIssue(ctx context.Context, workerID int, issue *github.Issue, gh *si return } - itemType := "issue" - if issue.IsPullRequest() { - itemType = "pull_request" - } - points := make([]*qdrant.Point, len(chunks)) for i, chunk := range chunks { chunkID := uuid.NewMD5(uuid.NameSpaceURL, fmt.Appendf(nil, "%s/%s#%d-chunk-%d", org, repo, issue.GetNumber(), i)).String() @@ -263,11 +279,11 @@ func processIssue(ctx context.Context, workerID int, issue *github.Issue, gh *si "org": org, "repo": repo, "issue_number": issue.GetNumber(), + "title": issue.GetTitle(), "text": chunk, "url": issue.GetHTMLURL(), - "type": itemType, "state": issue.GetState(), - "title": issue.GetTitle(), + "type": "issue", }, } } @@ -279,3 +295,70 @@ func processIssue(ctx context.Context, workerID int, issue *github.Issue, gh *si log.Printf("[Worker %d] Indexed #%d", workerID, issue.GetNumber()) } } + +func processPullRequest(ctx context.Context, workerID int, issue *github.Issue, gh *similiGithub.Client, em *gemini.Embedder, qd *qdrant.Client, splitter *text.RecursiveCharacterSplitter, collection, org, repo string, dryRun bool) { + prNumber := issue.GetNumber() + + pr, err := gh.GetPullRequest(ctx, org, repo, prNumber) + if err != nil { + log.Printf("[Worker %d] Error fetching PR #%d: %v", workerID, prNumber, err) + return + } + + filePaths, err := listAllPullRequestFilePaths(ctx, gh, org, repo, prNumber) + if err != nil { + log.Printf("[Worker %d] Error fetching files for PR #%d: %v", workerID, prNumber, err) + return + } + + fullText := buildPRMetadataText(pr, filePaths) + if strings.TrimSpace(fullText) == "" { + log.Printf("[Worker %d] PR #%d has no indexable content, skipping", workerID, prNumber) + return + } + + chunks := splitter.SplitText(fullText) + if len(chunks) == 0 { + chunks = []string{fullText} + } + + embeddings, err := em.EmbedBatch(ctx, chunks) + if err != nil { + log.Printf("[Worker %d] Error embedding PR #%d: %v", workerID, prNumber, err) + return + } + + if dryRun { + log.Printf("[DryRun] Would upsert PR #%d (%d chunks) into %s", prNumber, len(chunks), collection) + return + } + + points := make([]*qdrant.Point, len(chunks)) + for i, chunk := range chunks { + pointID := uuid.NewMD5(uuid.NameSpaceURL, []byte(fmt.Sprintf("%s/%s/pr/%d/chunk/%d", org, repo, prNumber, i))).String() + points[i] = &qdrant.Point{ + ID: pointID, + Vector: embeddings[i], + Payload: map[string]interface{}{ + "org": org, + "repo": repo, + "pr_number": prNumber, + "title": pr.GetTitle(), + "description": strings.TrimSpace(pr.GetBody()), + "text": chunk, + "url": pr.GetHTMLURL(), + "state": pr.GetState(), + "merged": pr.GetMerged(), + "changed_files": strings.Join(filePaths, "\n"), + "type": "pull_request", + }, + } + } + + if err := qd.Upsert(ctx, collection, points); err != nil { + log.Printf("[Worker %d] Error upserting PR #%d: %v", workerID, prNumber, err) + return + } + + log.Printf("[Worker %d] Indexed PR #%d", workerID, prNumber) +} diff --git a/cmd/simili/commands/pr_duplicate.go b/cmd/simili/commands/pr_duplicate.go new file mode 100644 index 0000000..f37cbd5 --- /dev/null +++ b/cmd/simili/commands/pr_duplicate.go @@ -0,0 +1,446 @@ +package commands + +import ( + "context" + "encoding/json" + "fmt" + "log" + "os" + "sort" + "strings" + + "github.com/similigh/simili-bot/internal/core/config" + "github.com/similigh/simili-bot/internal/integrations/gemini" + similiGithub "github.com/similigh/simili-bot/internal/integrations/github" + "github.com/similigh/simili-bot/internal/integrations/qdrant" + "github.com/spf13/cobra" +) + +var ( + prDuplicateRepo string + prDuplicateNumber int + prDuplicateToken string + prDuplicatePRCollection string + prDuplicateTopK int + prDuplicateThreshold float64 + prDuplicateJSON bool +) + +type prDuplicateCandidate struct { + ID string `json:"id"` + EntityType string `json:"entity_type"` + Org string `json:"org"` + Repo string `json:"repo"` + Number int `json:"number"` + Title string `json:"title"` + URL string `json:"url"` + State string `json:"state"` + Similarity float64 `json:"similarity"` + Body string `json:"-"` +} + +type prDuplicateOutput struct { + PullRequest struct { + Org string `json:"org"` + Repo string `json:"repo"` + Number int `json:"number"` + Title string `json:"title"` + URL string `json:"url"` + } `json:"pull_request"` + Candidates []prDuplicateCandidate `json:"candidates"` + Duplicate *gemini.PRDuplicateResult `json:"duplicate,omitempty"` + Matched *prDuplicateCandidate `json:"matched,omitempty"` +} + +var prDuplicateCmd = &cobra.Command{ + Use: "pr-duplicate", + Short: "Check whether a PR is a duplicate of existing issues/PRs", + Long: `Analyze a pull request for duplicate intent by searching both the issue +collection and PR collection, then using Gemini to make a duplicate decision.`, + Run: runPRDuplicate, +} + +func init() { + rootCmd.AddCommand(prDuplicateCmd) + + prDuplicateCmd.Flags().StringVar(&prDuplicateRepo, "repo", "", "Target repository (owner/name)") + prDuplicateCmd.Flags().IntVar(&prDuplicateNumber, "number", 0, "Pull request number") + prDuplicateCmd.Flags().StringVar(&prDuplicateToken, "token", "", "GitHub token (optional, defaults to GITHUB_TOKEN env var)") + prDuplicateCmd.Flags().StringVar(&prDuplicatePRCollection, "pr-collection", "", "Override PR collection name") + prDuplicateCmd.Flags().IntVar(&prDuplicateTopK, "top-k", 8, "Maximum combined candidates to evaluate") + prDuplicateCmd.Flags().Float64Var(&prDuplicateThreshold, "threshold", 0, "Similarity threshold override (default: config)") + prDuplicateCmd.Flags().BoolVar(&prDuplicateJSON, "json", false, "Output JSON only") + + if err := prDuplicateCmd.MarkFlagRequired("repo"); err != nil { + log.Fatalf("Failed to mark repo flag as required: %v", err) + } + if err := prDuplicateCmd.MarkFlagRequired("number"); err != nil { + log.Fatalf("Failed to mark number flag as required: %v", err) + } +} + +func runPRDuplicate(cmd *cobra.Command, args []string) { + ctx := context.Background() + + cfgPath := config.FindConfigPath(cfgFile) + if cfgPath == "" { + log.Fatalf("Config file not found. Please verify your setup.") + } + cfg, err := config.Load(cfgPath) + if err != nil { + log.Fatalf("Failed to load config: %v", err) + } + + token := prDuplicateToken + if token == "" { + token = os.Getenv("GITHUB_TOKEN") + } + if token == "" { + log.Fatal("GitHub token is required (use --token or GITHUB_TOKEN env var)") + } + + parts := strings.Split(prDuplicateRepo, "/") + if len(parts) != 2 { + log.Fatalf("Invalid repo format: %s (expected owner/name)", prDuplicateRepo) + } + org, repo := parts[0], parts[1] + + threshold := prDuplicateThreshold + if threshold <= 0 { + threshold = cfg.Defaults.SimilarityThreshold + } + if threshold <= 0 { + threshold = 0.65 + } + + topK := prDuplicateTopK + if topK <= 0 { + topK = cfg.Defaults.MaxSimilarToShow + } + if topK <= 0 { + topK = 8 + } + + prCollection := resolvePRCollection(cfg, prDuplicatePRCollection) + + ghClient := similiGithub.NewClient(ctx, token) + pr, err := ghClient.GetPullRequest(ctx, org, repo, prDuplicateNumber) + if err != nil { + log.Fatalf("Failed to fetch pull request: %v", err) + } + + filePaths, err := listAllPullRequestFilePaths(ctx, ghClient, org, repo, prDuplicateNumber) + if err != nil { + log.Fatalf("Failed to fetch pull request files: %v", err) + } + + prText := buildPRMetadataText(pr, filePaths) + + geminiKey := cfg.Embedding.APIKey + if geminiKey == "" { + geminiKey = os.Getenv("GEMINI_API_KEY") + } + if geminiKey == "" { + log.Fatal("Gemini API key is required (set embedding.api_key or GEMINI_API_KEY)") + } + + embedder, err := gemini.NewEmbedder(geminiKey, cfg.Embedding.Model) + if err != nil { + log.Fatalf("Failed to initialize Gemini embedder: %v", err) + } + defer embedder.Close() + + embedding, err := embedder.Embed(ctx, prText) + if err != nil { + log.Fatalf("Failed to embed pull request content: %v", err) + } + + qdrantClient, err := qdrant.NewClient(cfg.Qdrant.URL, cfg.Qdrant.APIKey) + if err != nil { + log.Fatalf("Failed to initialize Qdrant client: %v", err) + } + defer qdrantClient.Close() + + searchLimit := topK * 3 + if searchLimit < topK { + searchLimit = topK + } + + issueCollectionExists, err := qdrantClient.CollectionExists(ctx, cfg.Qdrant.Collection) + if err != nil { + log.Fatalf("Failed to verify issue collection '%s': %v", cfg.Qdrant.Collection, err) + } + if !issueCollectionExists { + log.Fatalf("Issue collection '%s' does not exist", cfg.Qdrant.Collection) + } + + issueResults, err := qdrantClient.Search(ctx, cfg.Qdrant.Collection, embedding, searchLimit, threshold) + if err != nil { + log.Fatalf("Failed searching issue collection '%s': %v", cfg.Qdrant.Collection, err) + } + + prResults := make([]*qdrant.SearchResult, 0) + if prCollection != cfg.Qdrant.Collection { + prCollectionExists, err := qdrantClient.CollectionExists(ctx, prCollection) + if err != nil { + log.Fatalf("Failed to verify PR collection '%s': %v", prCollection, err) + } + if prCollectionExists { + prResults, err = qdrantClient.Search(ctx, prCollection, embedding, searchLimit, threshold) + if err != nil { + log.Fatalf("Failed searching PR collection '%s': %v", prCollection, err) + } + } else if !prDuplicateJSON { + fmt.Printf("Warning: PR collection '%s' does not exist; searching issues only.\n\n", prCollection) + } + } + + candidates := mergeDuplicateCandidates(issueResults, prResults, org, repo, prDuplicateNumber) + if len(candidates) > topK { + candidates = candidates[:topK] + } + + llmClient, err := gemini.NewLLMClient(geminiKey) + if err != nil { + log.Fatalf("Failed to initialize Gemini LLM client: %v", err) + } + defer llmClient.Close() + + var duplicateResult *gemini.PRDuplicateResult + var matched *prDuplicateCandidate + if len(candidates) > 0 { + llmCandidates := make([]gemini.PRDuplicateCandidateInput, len(candidates)) + for i, c := range candidates { + llmCandidates[i] = gemini.PRDuplicateCandidateInput{ + ID: c.ID, + EntityType: c.EntityType, + Org: c.Org, + Repo: c.Repo, + Number: c.Number, + Title: c.Title, + Body: c.Body, + URL: c.URL, + Similarity: c.Similarity, + State: c.State, + } + } + + duplicateResult, err = llmClient.DetectPRDuplicate(ctx, &gemini.PRDuplicateCheckInput{ + PullRequest: &gemini.IssueInput{ + Title: pr.GetTitle(), + Body: pr.GetBody(), + Author: pr.GetUser().GetLogin(), + }, + Candidates: llmCandidates, + }) + if err != nil { + log.Fatalf("Failed to run duplicate analysis: %v", err) + } + + if duplicateResult.IsDuplicate && duplicateResult.DuplicateID != "" { + for i := range candidates { + if candidates[i].ID == duplicateResult.DuplicateID { + c := candidates[i] + matched = &c + break + } + } + } + } + + out := prDuplicateOutput{ + Candidates: candidates, + Duplicate: duplicateResult, + Matched: matched, + } + out.PullRequest.Org = org + out.PullRequest.Repo = repo + out.PullRequest.Number = prDuplicateNumber + out.PullRequest.Title = pr.GetTitle() + out.PullRequest.URL = pr.GetHTMLURL() + + if prDuplicateJSON { + printJSONOutput(out) + return + } + + fmt.Printf("PR: %s/%s#%d\n", org, repo, prDuplicateNumber) + fmt.Printf("Title: %s\n", pr.GetTitle()) + fmt.Printf("Issue Collection: %s\n", cfg.Qdrant.Collection) + fmt.Printf("PR Collection: %s\n", prCollection) + fmt.Printf("Threshold: %.2f\n\n", threshold) + + if len(candidates) == 0 { + fmt.Println("No similar issues or pull requests found.") + return + } + + fmt.Println("Top Candidates:") + for i, c := range candidates { + label := "Issue" + if c.EntityType == "pull_request" { + label = "PR" + } + fmt.Printf("%d. [%s] %s/%s#%d (%.0f%%)\n", i+1, label, c.Org, c.Repo, c.Number, c.Similarity*100) + fmt.Printf(" %s\n", c.Title) + fmt.Printf(" %s\n", c.URL) + } + + if duplicateResult == nil { + return + } + + fmt.Println() + if duplicateResult.IsDuplicate { + fmt.Printf("Duplicate: YES (confidence %.2f)\n", duplicateResult.Confidence) + if matched != nil { + label := "Issue" + if matched.EntityType == "pull_request" { + label = "PR" + } + fmt.Printf("Matched: [%s] %s/%s#%d\n", label, matched.Org, matched.Repo, matched.Number) + } else if duplicateResult.DuplicateID != "" { + fmt.Printf("Matched ID: %s\n", duplicateResult.DuplicateID) + } + } else { + fmt.Printf("Duplicate: NO (confidence %.2f)\n", duplicateResult.Confidence) + } + if duplicateResult.Reasoning != "" { + fmt.Printf("Reasoning: %s\n", duplicateResult.Reasoning) + } +} + +func printJSONOutput(v interface{}) { + data, err := json.MarshalIndent(v, "", " ") + if err != nil { + log.Fatalf("Failed to marshal JSON output: %v", err) + } + fmt.Println(string(data)) +} + +func mergeDuplicateCandidates(issueResults, prResults []*qdrant.SearchResult, currentOrg, currentRepo string, currentPR int) []prDuplicateCandidate { + byID := make(map[string]prDuplicateCandidate) + + add := func(res *qdrant.SearchResult) { + candidate, ok := buildCandidateFromSearchResult(res) + if !ok { + return + } + + if candidate.EntityType == "pull_request" && + candidate.Org == currentOrg && + candidate.Repo == currentRepo && + candidate.Number == currentPR { + return + } + + existing, found := byID[candidate.ID] + if !found || candidate.Similarity > existing.Similarity { + byID[candidate.ID] = candidate + } + } + + for _, res := range issueResults { + add(res) + } + for _, res := range prResults { + add(res) + } + + merged := make([]prDuplicateCandidate, 0, len(byID)) + for _, candidate := range byID { + merged = append(merged, candidate) + } + + sort.Slice(merged, func(i, j int) bool { + return merged[i].Similarity > merged[j].Similarity + }) + + return merged +} + +func buildCandidateFromSearchResult(res *qdrant.SearchResult) (prDuplicateCandidate, bool) { + var candidate prDuplicateCandidate + + org, _ := res.Payload["org"].(string) + repo, _ := res.Payload["repo"].(string) + entityType, _ := res.Payload["type"].(string) + + if entityType == "" { + if _, ok := res.Payload["pr_number"]; ok { + entityType = "pull_request" + } else { + entityType = "issue" + } + } + + var number int + var ok bool + if entityType == "pull_request" { + number, ok = toInt(res.Payload["pr_number"]) + } else { + number, ok = toInt(res.Payload["issue_number"]) + if !ok { + number, ok = toInt(res.Payload["number"]) + } + } + if !ok { + return candidate, false + } + + title, _ := res.Payload["title"].(string) + body, _ := res.Payload["text"].(string) + if body == "" { + body, _ = res.Payload["description"].(string) + } + if title == "" { + title = titleFromTextFallback(body) + } + if title == "" { + title = "Untitled" + } + + url, _ := res.Payload["url"].(string) + state, _ := res.Payload["state"].(string) + if state == "" { + state = "open" + } + + id := fmt.Sprintf("%s:%s/%s#%d", entityType, org, repo, number) + candidate = prDuplicateCandidate{ + ID: id, + EntityType: entityType, + Org: org, + Repo: repo, + Number: number, + Title: title, + URL: url, + State: state, + Similarity: float64(res.Score), + Body: body, + } + return candidate, true +} + +func titleFromTextFallback(text string) string { + if strings.HasPrefix(text, "Title: ") { + lines := strings.SplitN(text, "\n", 2) + if len(lines) > 0 { + return strings.TrimSpace(strings.TrimPrefix(lines[0], "Title: ")) + } + } + return "" +} + +func toInt(v interface{}) (int, bool) { + switch val := v.(type) { + case int: + return val, true + case int64: + return int(val), true + case float64: + return int(val), true + default: + return 0, false + } +} diff --git a/cmd/simili/commands/pr_duplicate_test.go b/cmd/simili/commands/pr_duplicate_test.go new file mode 100644 index 0000000..c6e81c6 --- /dev/null +++ b/cmd/simili/commands/pr_duplicate_test.go @@ -0,0 +1,70 @@ +package commands + +import ( + "reflect" + "testing" + + "github.com/similigh/simili-bot/internal/integrations/qdrant" +) + +func TestExtractLinkedIssueRefs(t *testing.T) { + body := "This change fixes #12 and closes #34. It also resolves #12 again." + got := extractLinkedIssueRefs(body) + want := []int{12, 34} + + if !reflect.DeepEqual(got, want) { + t.Fatalf("extractLinkedIssueRefs() = %v, want %v", got, want) + } +} + +func TestBuildCandidateFromSearchResult_Issue(t *testing.T) { + res := &qdrant.SearchResult{ + Score: 0.91, + Payload: map[string]interface{}{ + "org": "acme", + "repo": "core", + "issue_number": int64(42), + "title": "Crash on startup", + "text": "Title: Crash on startup\n\nBody: ...", + "url": "https://example.test/issues/42", + "state": "open", + "type": "issue", + }, + } + + candidate, ok := buildCandidateFromSearchResult(res) + if !ok { + t.Fatal("expected candidate to parse") + } + if candidate.ID != "issue:acme/core#42" { + t.Fatalf("candidate.ID = %q", candidate.ID) + } + if candidate.EntityType != "issue" { + t.Fatalf("candidate.EntityType = %q", candidate.EntityType) + } +} + +func TestBuildCandidateFromSearchResult_PR(t *testing.T) { + res := &qdrant.SearchResult{ + Score: 0.88, + Payload: map[string]interface{}{ + "org": "acme", + "repo": "core", + "pr_number": int64(77), + "text": "Title: Improve retry behavior\n\nDescription: ...", + "url": "https://example.test/pull/77", + "type": "pull_request", + }, + } + + candidate, ok := buildCandidateFromSearchResult(res) + if !ok { + t.Fatal("expected candidate to parse") + } + if candidate.ID != "pull_request:acme/core#77" { + t.Fatalf("candidate.ID = %q", candidate.ID) + } + if candidate.Title != "Improve retry behavior" { + t.Fatalf("candidate.Title = %q", candidate.Title) + } +} diff --git a/cmd/simili/commands/pr_support.go b/cmd/simili/commands/pr_support.go new file mode 100644 index 0000000..7e339ea --- /dev/null +++ b/cmd/simili/commands/pr_support.go @@ -0,0 +1,128 @@ +package commands + +import ( + "context" + "fmt" + "os" + "regexp" + "sort" + "strconv" + "strings" + + "github.com/google/go-github/v60/github" + similiConfig "github.com/similigh/simili-bot/internal/core/config" + similiGithub "github.com/similigh/simili-bot/internal/integrations/github" +) + +var linkedIssuePattern = regexp.MustCompile(`(?i)\b(?:close[sd]?|fix(?:e[sd])?|resolve[sd]?)\s+#(\d+)\b`) + +func resolvePRCollection(cfg *similiConfig.Config, override string) string { + if strings.TrimSpace(override) != "" { + return strings.TrimSpace(override) + } + if env := strings.TrimSpace(os.Getenv("QDRANT_PR_COLLECTION")); env != "" { + return env + } + if strings.TrimSpace(cfg.Qdrant.PRCollection) != "" { + return strings.TrimSpace(cfg.Qdrant.PRCollection) + } + if strings.TrimSpace(cfg.Qdrant.Collection) != "" { + return strings.TrimSpace(cfg.Qdrant.Collection) + "_prs" + } + return "simili_prs" +} + +func listAllPullRequestFilePaths(ctx context.Context, gh *similiGithub.Client, org, repo string, number int) ([]string, error) { + paths := make([]string, 0, 32) + page := 1 + + for { + files, resp, err := gh.ListPullRequestFiles(ctx, org, repo, number, &github.ListOptions{ + PerPage: 100, + Page: page, + }) + if err != nil { + return nil, err + } + + for _, f := range files { + name := strings.TrimSpace(f.GetFilename()) + if name != "" { + paths = append(paths, name) + } + } + + if resp == nil || resp.NextPage == 0 { + break + } + page = resp.NextPage + } + + sort.Strings(paths) + return paths, nil +} + +func buildPRMetadataText(pr *github.PullRequest, changedFiles []string) string { + var sb strings.Builder + sb.WriteString(fmt.Sprintf("Title: %s\n\n", pr.GetTitle())) + + body := strings.TrimSpace(pr.GetBody()) + if body != "" { + sb.WriteString(fmt.Sprintf("Description: %s\n\n", body)) + } + + sb.WriteString(fmt.Sprintf("State: %s\n", pr.GetState())) + sb.WriteString(fmt.Sprintf("Merged: %t\n", pr.GetMerged())) + sb.WriteString(fmt.Sprintf("Author: %s\n", pr.GetUser().GetLogin())) + sb.WriteString(fmt.Sprintf("Base Branch: %s\n", pr.GetBase().GetRef())) + sb.WriteString(fmt.Sprintf("Head Branch: %s\n\n", pr.GetHead().GetRef())) + + if len(changedFiles) > 0 { + sb.WriteString("Changed Files:\n") + for _, path := range changedFiles { + sb.WriteString("- ") + sb.WriteString(path) + sb.WriteString("\n") + } + sb.WriteString("\n") + } + + linked := extractLinkedIssueRefs(pr.GetBody()) + if len(linked) > 0 { + sb.WriteString("Linked Issues:\n") + for _, issueNum := range linked { + sb.WriteString("- #") + sb.WriteString(strconv.Itoa(issueNum)) + sb.WriteString("\n") + } + } + + return strings.TrimSpace(sb.String()) +} + +func extractLinkedIssueRefs(body string) []int { + matches := linkedIssuePattern.FindAllStringSubmatch(body, -1) + if len(matches) == 0 { + return nil + } + + seen := make(map[int]struct{}, len(matches)) + result := make([]int, 0, len(matches)) + for _, match := range matches { + if len(match) < 2 { + continue + } + number, err := strconv.Atoi(match[1]) + if err != nil { + continue + } + if _, ok := seen[number]; ok { + continue + } + seen[number] = struct{}{} + result = append(result, number) + } + + sort.Ints(result) + return result +} diff --git a/internal/core/config/config.go b/internal/core/config/config.go index 4a73f90..5cccad3 100644 --- a/internal/core/config/config.go +++ b/internal/core/config/config.go @@ -52,9 +52,10 @@ type Config struct { // QdrantConfig holds Qdrant connection settings. type QdrantConfig struct { - URL string `yaml:"url"` - APIKey string `yaml:"api_key"` - Collection string `yaml:"collection"` + URL string `yaml:"url"` + APIKey string `yaml:"api_key"` + Collection string `yaml:"collection"` + PRCollection string `yaml:"pr_collection,omitempty"` } // EmbeddingConfig holds embedding provider settings. @@ -250,6 +251,13 @@ func (c *Config) applyDefaults() { if c.Transfer.RepoCollection == "" { c.Transfer.RepoCollection = "simili_repos" } + if c.Qdrant.PRCollection == "" { + if c.Qdrant.Collection != "" { + c.Qdrant.PRCollection = c.Qdrant.Collection + "_prs" + } else { + c.Qdrant.PRCollection = "simili_prs" + } + } } // mergeConfigs merges a child config onto a parent config. @@ -275,6 +283,9 @@ func mergeConfigs(parent, child *Config) *Config { if child.Qdrant.Collection != "" { result.Qdrant.Collection = child.Qdrant.Collection } + if child.Qdrant.PRCollection != "" { + result.Qdrant.PRCollection = child.Qdrant.PRCollection + } // Embedding: override if any field is set if child.Embedding.Provider != "" { diff --git a/internal/integrations/gemini/llm.go b/internal/integrations/gemini/llm.go index b9525c2..c5951e7 100644 --- a/internal/integrations/gemini/llm.go +++ b/internal/integrations/gemini/llm.go @@ -107,6 +107,34 @@ type DuplicateResult struct { SimilarIssues json.RawMessage `json:"similar_issues"` // Flexible: can be []int or []object } +// PRDuplicateCandidateInput represents an issue/PR candidate for PR duplicate checking. +type PRDuplicateCandidateInput struct { + ID string + EntityType string // "issue" or "pull_request" + Org string + Repo string + Number int + Title string + Body string + URL string + Similarity float64 + State string +} + +// PRDuplicateCheckInput represents input for PR duplicate detection. +type PRDuplicateCheckInput struct { + PullRequest *IssueInput + Candidates []PRDuplicateCandidateInput +} + +// PRDuplicateResult holds duplicate detection result for pull requests. +type PRDuplicateResult struct { + IsDuplicate bool `json:"is_duplicate"` + DuplicateID string `json:"duplicate_id"` // Candidate ID, e.g. issue:org/repo#123 + Confidence float64 `json:"confidence"` // 0.0-1.0 + Reasoning string `json:"reasoning"` +} + // NewLLMClient creates a new LLM client. func NewLLMClient(apiKey string, model ...string) (*LLMClient, error) { provider, resolvedKey, err := ResolveProvider(apiKey) @@ -305,6 +333,31 @@ func (l *LLMClient) DetectDuplicate(ctx context.Context, input *DuplicateCheckIn return &result, nil } +// DetectPRDuplicate analyzes whether a pull request is a duplicate of existing issues/PRs. +func (l *LLMClient) DetectPRDuplicate(ctx context.Context, input *PRDuplicateCheckInput) (*PRDuplicateResult, error) { + if len(input.Candidates) == 0 { + return &PRDuplicateResult{IsDuplicate: false}, nil + } + + prompt := buildPRDuplicateDetectionPrompt(input) + + responseText, err := l.generateText(ctx, prompt, 0.2, true) + if err != nil { + return nil, fmt.Errorf("failed to detect PR duplicate: %w", err) + } + + var result PRDuplicateResult + if err := unmarshalJSONResponse(responseText, &result); err != nil { + return nil, fmt.Errorf("failed to parse PR duplicate response: %w", err) + } + + if !result.IsDuplicate { + result.DuplicateID = "" + } + + return &result, nil +} + func (l *LLMClient) generateText(ctx context.Context, prompt string, temperature float32, jsonMode bool) (string, error) { switch l.provider { case ProviderGemini: diff --git a/internal/integrations/gemini/prompts.go b/internal/integrations/gemini/prompts.go index cdfa9f1..4f7a09e 100644 --- a/internal/integrations/gemini/prompts.go +++ b/internal/integrations/gemini/prompts.go @@ -312,3 +312,60 @@ ONLY set is_duplicate to true if confidence >= 0.85. When in doubt, set is_dupli similarList.String(), ) } + +// buildPRDuplicateDetectionPrompt creates a prompt for PR duplicate detection across issues and PRs. +func buildPRDuplicateDetectionPrompt(input *PRDuplicateCheckInput) string { + var candidates strings.Builder + for i, c := range input.Candidates { + fmt.Fprintf(&candidates, "--- Candidate %d ---\n", i+1) + fmt.Fprintf(&candidates, "ID: %s\n", c.ID) + fmt.Fprintf(&candidates, "Type: %s\n", c.EntityType) + fmt.Fprintf(&candidates, "Repository: %s/%s\n", c.Org, c.Repo) + fmt.Fprintf(&candidates, "Number: %d\n", c.Number) + fmt.Fprintf(&candidates, "State: %s\n", c.State) + fmt.Fprintf(&candidates, "Vector similarity: %.0f%%\n", c.Similarity*100) + fmt.Fprintf(&candidates, "Title: %s\n", c.Title) + if c.Body != "" { + fmt.Fprintf(&candidates, "Content:\n%s\n", truncate(c.Body, 600)) + } + fmt.Fprintf(&candidates, "URL: %s\n\n", c.URL) + } + + return fmt.Sprintf(`You are a strict duplicate detector for GitHub pull requests. + +Task: +Determine whether the CURRENT pull request is a true duplicate of any candidate issue or pull request. + +Definition of duplicate: +- A duplicate means the PR addresses the same underlying problem/request and intended outcome. +- Related context is not enough. Same component but different fix scope is NOT a duplicate. + +Current Pull Request: +- Title: %s +- Description: %s + +Candidates: +%s + +Return valid JSON in this exact format: +{ + "is_duplicate": false, + "duplicate_id": "", + "confidence": 0.0, + "reasoning": "brief explanation" +} + +Rules: +- If is_duplicate is true, duplicate_id MUST be exactly one ID from the candidate list. +- If is_duplicate is false, duplicate_id MUST be "". +- Be conservative: only mark duplicate when confidence >= 0.85. +- Confidence guidance: + - 0.95+ certain duplicate + - 0.85-0.95 very likely duplicate + - 0.70-0.85 related but likely distinct + - <0.70 different`, + input.PullRequest.Title, + truncate(input.PullRequest.Body, 1200), + candidates.String(), + ) +} diff --git a/internal/integrations/github/client.go b/internal/integrations/github/client.go index 498277c..5879747 100644 --- a/internal/integrations/github/client.go +++ b/internal/integrations/github/client.go @@ -29,6 +29,16 @@ func (c *Client) GetIssue(ctx context.Context, org, repo string, number int) (*g return issue, nil } +// GetPullRequest fetches pull request details. +func (c *Client) GetPullRequest(ctx context.Context, org, repo string, number int) (*github.PullRequest, error) { + pr, _, err := c.client.PullRequests.Get(ctx, org, repo, number) + if err != nil { + return nil, fmt.Errorf("failed to fetch pull request: %w", err) + } + + return pr, nil +} + // CreateComment posts a comment on an issue. func (c *Client) CreateComment(ctx context.Context, org, repo string, number int, body string) error { if strings.TrimSpace(body) == "" { @@ -119,6 +129,22 @@ func (c *Client) ListIssues(ctx context.Context, org, repo string, opts *github. return issues, resp, nil } +// ListPullRequests fetches a list of pull requests from the repository. +func (c *Client) ListPullRequests(ctx context.Context, org, repo string, opts *github.PullRequestListOptions) ([]*github.PullRequest, *github.Response, error) { + if opts == nil { + opts = &github.PullRequestListOptions{ + State: "all", + } + } + + prs, resp, err := c.client.PullRequests.List(ctx, org, repo, opts) + if err != nil { + return nil, resp, fmt.Errorf("failed to list pull requests for %s/%s: %w", org, repo, err) + } + + return prs, resp, nil +} + // ListComments fetches comments for a specific issue. func (c *Client) ListComments(ctx context.Context, org, repo string, number int, opts *github.IssueListCommentsOptions) ([]*github.IssueComment, *github.Response, error) { comments, resp, err := c.client.Issues.ListComments(ctx, org, repo, number, opts) @@ -128,6 +154,20 @@ func (c *Client) ListComments(ctx context.Context, org, repo string, number int, return comments, resp, nil } +// ListPullRequestFiles fetches files changed in a pull request. +func (c *Client) ListPullRequestFiles(ctx context.Context, org, repo string, number int, opts *github.ListOptions) ([]*github.CommitFile, *github.Response, error) { + if opts == nil { + opts = &github.ListOptions{PerPage: 100} + } + + files, resp, err := c.client.PullRequests.ListFiles(ctx, org, repo, number, opts) + if err != nil { + return nil, resp, fmt.Errorf("failed to list files for pull request #%d in %s/%s: %w", number, org, repo, err) + } + + return files, resp, nil +} + // GetFileContent fetches the raw content of a file from a repository. // ref can be a branch, tag, or commit SHA. If empty, the default branch is used. func (c *Client) GetFileContent(ctx context.Context, org, repo, path, ref string) ([]byte, error) { From 1e25a7af1988b353253ca735b884dd1a06c5f8d0 Mon Sep 17 00:00:00 2001 From: nludwig Date: Thu, 12 Feb 2026 14:45:16 -0800 Subject: [PATCH 2/6] split up runPRDuplicate --- cmd/simili/commands/pr_duplicate.go | 286 ++++++++++++++++++---------- 1 file changed, 184 insertions(+), 102 deletions(-) diff --git a/cmd/simili/commands/pr_duplicate.go b/cmd/simili/commands/pr_duplicate.go index f37cbd5..d60b5f5 100644 --- a/cmd/simili/commands/pr_duplicate.go +++ b/cmd/simili/commands/pr_duplicate.go @@ -9,6 +9,7 @@ import ( "sort" "strings" + "github.com/google/go-github/v60/github" "github.com/similigh/simili-bot/internal/core/config" "github.com/similigh/simili-bot/internal/integrations/gemini" similiGithub "github.com/similigh/simili-bot/internal/integrations/github" @@ -52,6 +53,17 @@ type prDuplicateOutput struct { Matched *prDuplicateCandidate `json:"matched,omitempty"` } +type prDuplicateRunOptions struct { + Token string + Org string + Repo string + Number int + TopK int + Threshold float64 + PRCollection string + GeminiKey string +} + var prDuplicateCmd = &cobra.Command{ Use: "pr-duplicate", Short: "Check whether a PR is a duplicate of existing issues/PRs", @@ -82,28 +94,64 @@ func init() { func runPRDuplicate(cmd *cobra.Command, args []string) { ctx := context.Background() - cfgPath := config.FindConfigPath(cfgFile) - if cfgPath == "" { - log.Fatalf("Config file not found. Please verify your setup.") - } - cfg, err := config.Load(cfgPath) + cfg, err := loadPRDuplicateConfig() if err != nil { log.Fatalf("Failed to load config: %v", err) } - token := prDuplicateToken + opts, err := resolvePRDuplicateRunOptions(cfg) + if err != nil { + log.Fatalf("%v", err) + } + + pr, prText, err := fetchPullRequestMetadataText(ctx, opts) + if err != nil { + log.Fatalf("Failed to fetch pull request metadata: %v", err) + } + + embedding, err := generateEmbeddingForPRText(ctx, cfg, opts.GeminiKey, prText) + if err != nil { + log.Fatalf("Failed to embed pull request content: %v", err) + } + + candidates, prCollectionMissing, err := findPRDuplicateCandidates(ctx, cfg, opts, embedding) + if err != nil { + log.Fatalf("Failed to search duplicate candidates: %v", err) + } + if prCollectionMissing && !prDuplicateJSON { + fmt.Printf("Warning: PR collection '%s' does not exist; searching issues only.\n\n", opts.PRCollection) + } + + duplicateResult, matched, err := detectPRDuplicate(ctx, opts.GeminiKey, pr, candidates) + if err != nil { + log.Fatalf("Failed to run duplicate analysis: %v", err) + } + + out := buildPRDuplicateOutput(pr, opts, candidates, duplicateResult, matched) + renderPRDuplicateOutput(out, cfg.Qdrant.Collection, opts.PRCollection, opts.Threshold) +} + +func loadPRDuplicateConfig() (*config.Config, error) { + cfgPath := config.FindConfigPath(cfgFile) + if cfgPath == "" { + return nil, fmt.Errorf("config file not found. Please verify your setup") + } + return config.Load(cfgPath) +} + +func resolvePRDuplicateRunOptions(cfg *config.Config) (*prDuplicateRunOptions, error) { + token := strings.TrimSpace(prDuplicateToken) if token == "" { - token = os.Getenv("GITHUB_TOKEN") + token = strings.TrimSpace(os.Getenv("GITHUB_TOKEN")) } if token == "" { - log.Fatal("GitHub token is required (use --token or GITHUB_TOKEN env var)") + return nil, fmt.Errorf("GitHub token is required (use --token or GITHUB_TOKEN env var)") } parts := strings.Split(prDuplicateRepo, "/") - if len(parts) != 2 { - log.Fatalf("Invalid repo format: %s (expected owner/name)", prDuplicateRepo) + if len(parts) != 2 || strings.TrimSpace(parts[0]) == "" || strings.TrimSpace(parts[1]) == "" { + return nil, fmt.Errorf("invalid repo format: %s (expected owner/name)", prDuplicateRepo) } - org, repo := parts[0], parts[1] threshold := prDuplicateThreshold if threshold <= 0 { @@ -121,162 +169,196 @@ func runPRDuplicate(cmd *cobra.Command, args []string) { topK = 8 } - prCollection := resolvePRCollection(cfg, prDuplicatePRCollection) + geminiKey := strings.TrimSpace(cfg.Embedding.APIKey) + if geminiKey == "" { + geminiKey = strings.TrimSpace(os.Getenv("GEMINI_API_KEY")) + } + if geminiKey == "" { + return nil, fmt.Errorf("Gemini API key is required (set embedding.api_key or GEMINI_API_KEY)") + } + + return &prDuplicateRunOptions{ + Token: token, + Org: strings.TrimSpace(parts[0]), + Repo: strings.TrimSpace(parts[1]), + Number: prDuplicateNumber, + TopK: topK, + Threshold: threshold, + PRCollection: resolvePRCollection(cfg, prDuplicatePRCollection), + GeminiKey: geminiKey, + }, nil +} - ghClient := similiGithub.NewClient(ctx, token) - pr, err := ghClient.GetPullRequest(ctx, org, repo, prDuplicateNumber) +func fetchPullRequestMetadataText(ctx context.Context, opts *prDuplicateRunOptions) (*github.PullRequest, string, error) { + ghClient := similiGithub.NewClient(ctx, opts.Token) + + pr, err := ghClient.GetPullRequest(ctx, opts.Org, opts.Repo, opts.Number) if err != nil { - log.Fatalf("Failed to fetch pull request: %v", err) + return nil, "", fmt.Errorf("fetch pull request: %w", err) } - filePaths, err := listAllPullRequestFilePaths(ctx, ghClient, org, repo, prDuplicateNumber) + filePaths, err := listAllPullRequestFilePaths(ctx, ghClient, opts.Org, opts.Repo, opts.Number) if err != nil { - log.Fatalf("Failed to fetch pull request files: %v", err) + return nil, "", fmt.Errorf("fetch pull request files: %w", err) } - prText := buildPRMetadataText(pr, filePaths) - - geminiKey := cfg.Embedding.APIKey - if geminiKey == "" { - geminiKey = os.Getenv("GEMINI_API_KEY") - } - if geminiKey == "" { - log.Fatal("Gemini API key is required (set embedding.api_key or GEMINI_API_KEY)") - } + return pr, buildPRMetadataText(pr, filePaths), nil +} +func generateEmbeddingForPRText(ctx context.Context, cfg *config.Config, geminiKey, prText string) ([]float32, error) { embedder, err := gemini.NewEmbedder(geminiKey, cfg.Embedding.Model) if err != nil { - log.Fatalf("Failed to initialize Gemini embedder: %v", err) + return nil, fmt.Errorf("initialize Gemini embedder: %w", err) } defer embedder.Close() embedding, err := embedder.Embed(ctx, prText) if err != nil { - log.Fatalf("Failed to embed pull request content: %v", err) + return nil, err } + return embedding, nil +} +func findPRDuplicateCandidates(ctx context.Context, cfg *config.Config, opts *prDuplicateRunOptions, embedding []float32) ([]prDuplicateCandidate, bool, error) { qdrantClient, err := qdrant.NewClient(cfg.Qdrant.URL, cfg.Qdrant.APIKey) if err != nil { - log.Fatalf("Failed to initialize Qdrant client: %v", err) + return nil, false, fmt.Errorf("initialize Qdrant client: %w", err) } defer qdrantClient.Close() - searchLimit := topK * 3 - if searchLimit < topK { - searchLimit = topK + searchLimit := opts.TopK * 3 + if searchLimit < opts.TopK { + searchLimit = opts.TopK } issueCollectionExists, err := qdrantClient.CollectionExists(ctx, cfg.Qdrant.Collection) if err != nil { - log.Fatalf("Failed to verify issue collection '%s': %v", cfg.Qdrant.Collection, err) + return nil, false, fmt.Errorf("verify issue collection '%s': %w", cfg.Qdrant.Collection, err) } if !issueCollectionExists { - log.Fatalf("Issue collection '%s' does not exist", cfg.Qdrant.Collection) + return nil, false, fmt.Errorf("issue collection '%s' does not exist", cfg.Qdrant.Collection) } - issueResults, err := qdrantClient.Search(ctx, cfg.Qdrant.Collection, embedding, searchLimit, threshold) + issueResults, err := qdrantClient.Search(ctx, cfg.Qdrant.Collection, embedding, searchLimit, opts.Threshold) if err != nil { - log.Fatalf("Failed searching issue collection '%s': %v", cfg.Qdrant.Collection, err) + return nil, false, fmt.Errorf("search issue collection '%s': %w", cfg.Qdrant.Collection, err) } prResults := make([]*qdrant.SearchResult, 0) - if prCollection != cfg.Qdrant.Collection { - prCollectionExists, err := qdrantClient.CollectionExists(ctx, prCollection) + prCollectionMissing := false + if opts.PRCollection != cfg.Qdrant.Collection { + prCollectionExists, err := qdrantClient.CollectionExists(ctx, opts.PRCollection) if err != nil { - log.Fatalf("Failed to verify PR collection '%s': %v", prCollection, err) + return nil, false, fmt.Errorf("verify PR collection '%s': %w", opts.PRCollection, err) } if prCollectionExists { - prResults, err = qdrantClient.Search(ctx, prCollection, embedding, searchLimit, threshold) + prResults, err = qdrantClient.Search(ctx, opts.PRCollection, embedding, searchLimit, opts.Threshold) if err != nil { - log.Fatalf("Failed searching PR collection '%s': %v", prCollection, err) + return nil, false, fmt.Errorf("search PR collection '%s': %w", opts.PRCollection, err) } - } else if !prDuplicateJSON { - fmt.Printf("Warning: PR collection '%s' does not exist; searching issues only.\n\n", prCollection) + } else { + prCollectionMissing = true } } - candidates := mergeDuplicateCandidates(issueResults, prResults, org, repo, prDuplicateNumber) - if len(candidates) > topK { - candidates = candidates[:topK] + candidates := mergeDuplicateCandidates(issueResults, prResults, opts.Org, opts.Repo, opts.Number) + if len(candidates) > opts.TopK { + candidates = candidates[:opts.TopK] + } + + return candidates, prCollectionMissing, nil +} + +func detectPRDuplicate(ctx context.Context, geminiKey string, pr *github.PullRequest, candidates []prDuplicateCandidate) (*gemini.PRDuplicateResult, *prDuplicateCandidate, error) { + if len(candidates) == 0 { + return nil, nil, nil } llmClient, err := gemini.NewLLMClient(geminiKey) if err != nil { - log.Fatalf("Failed to initialize Gemini LLM client: %v", err) + return nil, nil, fmt.Errorf("initialize Gemini LLM client: %w", err) } defer llmClient.Close() - var duplicateResult *gemini.PRDuplicateResult - var matched *prDuplicateCandidate - if len(candidates) > 0 { - llmCandidates := make([]gemini.PRDuplicateCandidateInput, len(candidates)) - for i, c := range candidates { - llmCandidates[i] = gemini.PRDuplicateCandidateInput{ - ID: c.ID, - EntityType: c.EntityType, - Org: c.Org, - Repo: c.Repo, - Number: c.Number, - Title: c.Title, - Body: c.Body, - URL: c.URL, - Similarity: c.Similarity, - State: c.State, - } + llmCandidates := make([]gemini.PRDuplicateCandidateInput, len(candidates)) + for i, c := range candidates { + llmCandidates[i] = gemini.PRDuplicateCandidateInput{ + ID: c.ID, + EntityType: c.EntityType, + Org: c.Org, + Repo: c.Repo, + Number: c.Number, + Title: c.Title, + Body: c.Body, + URL: c.URL, + Similarity: c.Similarity, + State: c.State, } + } - duplicateResult, err = llmClient.DetectPRDuplicate(ctx, &gemini.PRDuplicateCheckInput{ - PullRequest: &gemini.IssueInput{ - Title: pr.GetTitle(), - Body: pr.GetBody(), - Author: pr.GetUser().GetLogin(), - }, - Candidates: llmCandidates, - }) - if err != nil { - log.Fatalf("Failed to run duplicate analysis: %v", err) - } + duplicateResult, err := llmClient.DetectPRDuplicate(ctx, &gemini.PRDuplicateCheckInput{ + PullRequest: &gemini.IssueInput{ + Title: pr.GetTitle(), + Body: pr.GetBody(), + Author: pr.GetUser().GetLogin(), + }, + Candidates: llmCandidates, + }) + if err != nil { + return nil, nil, err + } - if duplicateResult.IsDuplicate && duplicateResult.DuplicateID != "" { - for i := range candidates { - if candidates[i].ID == duplicateResult.DuplicateID { - c := candidates[i] - matched = &c - break - } - } + matched := findMatchedDuplicateCandidate(candidates, duplicateResult) + return duplicateResult, matched, nil +} + +func findMatchedDuplicateCandidate(candidates []prDuplicateCandidate, duplicateResult *gemini.PRDuplicateResult) *prDuplicateCandidate { + if duplicateResult == nil || !duplicateResult.IsDuplicate || duplicateResult.DuplicateID == "" { + return nil + } + + for i := range candidates { + if candidates[i].ID == duplicateResult.DuplicateID { + c := candidates[i] + return &c } } + return nil +} +func buildPRDuplicateOutput(pr *github.PullRequest, opts *prDuplicateRunOptions, candidates []prDuplicateCandidate, duplicateResult *gemini.PRDuplicateResult, matched *prDuplicateCandidate) prDuplicateOutput { out := prDuplicateOutput{ Candidates: candidates, Duplicate: duplicateResult, Matched: matched, } - out.PullRequest.Org = org - out.PullRequest.Repo = repo - out.PullRequest.Number = prDuplicateNumber + out.PullRequest.Org = opts.Org + out.PullRequest.Repo = opts.Repo + out.PullRequest.Number = opts.Number out.PullRequest.Title = pr.GetTitle() out.PullRequest.URL = pr.GetHTMLURL() + return out +} +func renderPRDuplicateOutput(out prDuplicateOutput, issueCollection, prCollection string, threshold float64) { if prDuplicateJSON { printJSONOutput(out) return } - fmt.Printf("PR: %s/%s#%d\n", org, repo, prDuplicateNumber) - fmt.Printf("Title: %s\n", pr.GetTitle()) - fmt.Printf("Issue Collection: %s\n", cfg.Qdrant.Collection) + fmt.Printf("PR: %s/%s#%d\n", out.PullRequest.Org, out.PullRequest.Repo, out.PullRequest.Number) + fmt.Printf("Title: %s\n", out.PullRequest.Title) + fmt.Printf("Issue Collection: %s\n", issueCollection) fmt.Printf("PR Collection: %s\n", prCollection) fmt.Printf("Threshold: %.2f\n\n", threshold) - if len(candidates) == 0 { + if len(out.Candidates) == 0 { fmt.Println("No similar issues or pull requests found.") return } fmt.Println("Top Candidates:") - for i, c := range candidates { + for i, c := range out.Candidates { label := "Issue" if c.EntityType == "pull_request" { label = "PR" @@ -286,27 +368,27 @@ func runPRDuplicate(cmd *cobra.Command, args []string) { fmt.Printf(" %s\n", c.URL) } - if duplicateResult == nil { + if out.Duplicate == nil { return } fmt.Println() - if duplicateResult.IsDuplicate { - fmt.Printf("Duplicate: YES (confidence %.2f)\n", duplicateResult.Confidence) - if matched != nil { + if out.Duplicate.IsDuplicate { + fmt.Printf("Duplicate: YES (confidence %.2f)\n", out.Duplicate.Confidence) + if out.Matched != nil { label := "Issue" - if matched.EntityType == "pull_request" { + if out.Matched.EntityType == "pull_request" { label = "PR" } - fmt.Printf("Matched: [%s] %s/%s#%d\n", label, matched.Org, matched.Repo, matched.Number) - } else if duplicateResult.DuplicateID != "" { - fmt.Printf("Matched ID: %s\n", duplicateResult.DuplicateID) + fmt.Printf("Matched: [%s] %s/%s#%d\n", label, out.Matched.Org, out.Matched.Repo, out.Matched.Number) + } else if out.Duplicate.DuplicateID != "" { + fmt.Printf("Matched ID: %s\n", out.Duplicate.DuplicateID) } } else { - fmt.Printf("Duplicate: NO (confidence %.2f)\n", duplicateResult.Confidence) + fmt.Printf("Duplicate: NO (confidence %.2f)\n", out.Duplicate.Confidence) } - if duplicateResult.Reasoning != "" { - fmt.Printf("Reasoning: %s\n", duplicateResult.Reasoning) + if out.Duplicate.Reasoning != "" { + fmt.Printf("Reasoning: %s\n", out.Duplicate.Reasoning) } } From 98d2d8568bddbb521d24d91315feaa86e3417b5a Mon Sep 17 00:00:00 2001 From: nludwig Date: Thu, 12 Feb 2026 16:46:07 -0800 Subject: [PATCH 3/6] chore: reorder qdrant env vars in sample --- .env.sample | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.env.sample b/.env.sample index 889f1ff..054c10e 100644 --- a/.env.sample +++ b/.env.sample @@ -4,10 +4,10 @@ # ============================================================================= # REQUIRED: Vector Database (Qdrant) # ============================================================================= -QDRANT_URL=https://your-cluster.qdrant.io:6333 -QDRANT_API_KEY=your-qdrant-api-key # Main issue collection QDRANT_COLLECTION=your-issues-collection +QDRANT_URL=https://your-cluster.qdrant.io:6333 +QDRANT_API_KEY=your-qdrant-api-key # Optional dedicated pull-request collection # QDRANT_PR_COLLECTION=your-pr-collection From bb72bc6a73413d6ef28df568bc2e52c9a32c02d6 Mon Sep 17 00:00:00 2001 From: nludwig Date: Thu, 12 Feb 2026 16:47:22 -0800 Subject: [PATCH 4/6] fix: make batch LLM initialization non-fatal --- cmd/simili/commands/batch.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cmd/simili/commands/batch.go b/cmd/simili/commands/batch.go index cd445fe..96fe2cc 100644 --- a/cmd/simili/commands/batch.go +++ b/cmd/simili/commands/batch.go @@ -339,12 +339,13 @@ func initializeDependencies(cfg *config.Config) (*pipeline.Dependencies, error) llmModel = envModel } llm, err := gemini.NewLLMClient(llmKey, llmModel) - if err != nil { - return nil, fmt.Errorf("failed to initialize LLM client: %w", err) - } - deps.LLMClient = llm - if verbose { - fmt.Printf("✓ Initialized LLM client (%s) with model: %s\n", llm.Provider(), llm.Model()) + if err == nil { + deps.LLMClient = llm + if verbose { + fmt.Printf("✓ Initialized LLM client (%s) with model: %s\n", llm.Provider(), llm.Model()) + } + } else if verbose { + fmt.Printf("ℹ LLM client unavailable, continuing without LLM steps: %v\n", err) } return deps, nil From 52a442c828e881421315e0c93488d361d7a64d9d Mon Sep 17 00:00:00 2001 From: nludwig Date: Thu, 12 Feb 2026 16:48:49 -0800 Subject: [PATCH 5/6] fix: paginate pull request file listing in github client --- cmd/simili/commands/pr_support.go | 32 +++++++++----------------- internal/integrations/github/client.go | 31 ++++++++++++++++++++----- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/cmd/simili/commands/pr_support.go b/cmd/simili/commands/pr_support.go index 7e339ea..f01bd87 100644 --- a/cmd/simili/commands/pr_support.go +++ b/cmd/simili/commands/pr_support.go @@ -33,29 +33,19 @@ func resolvePRCollection(cfg *similiConfig.Config, override string) string { } func listAllPullRequestFilePaths(ctx context.Context, gh *similiGithub.Client, org, repo string, number int) ([]string, error) { - paths := make([]string, 0, 32) - page := 1 - - for { - files, resp, err := gh.ListPullRequestFiles(ctx, org, repo, number, &github.ListOptions{ - PerPage: 100, - Page: page, - }) - if err != nil { - return nil, err - } - - for _, f := range files { - name := strings.TrimSpace(f.GetFilename()) - if name != "" { - paths = append(paths, name) - } - } + files, _, err := gh.ListPullRequestFiles(ctx, org, repo, number, &github.ListOptions{ + PerPage: 100, + }) + if err != nil { + return nil, err + } - if resp == nil || resp.NextPage == 0 { - break + paths := make([]string, 0, len(files)) + for _, f := range files { + name := strings.TrimSpace(f.GetFilename()) + if name != "" { + paths = append(paths, name) } - page = resp.NextPage } sort.Strings(paths) diff --git a/internal/integrations/github/client.go b/internal/integrations/github/client.go index 5879747..28fe370 100644 --- a/internal/integrations/github/client.go +++ b/internal/integrations/github/client.go @@ -154,18 +154,37 @@ func (c *Client) ListComments(ctx context.Context, org, repo string, number int, return comments, resp, nil } -// ListPullRequestFiles fetches files changed in a pull request. +// ListPullRequestFiles fetches files changed in a pull request across all pages. func (c *Client) ListPullRequestFiles(ctx context.Context, org, repo string, number int, opts *github.ListOptions) ([]*github.CommitFile, *github.Response, error) { + var listOpts github.ListOptions if opts == nil { - opts = &github.ListOptions{PerPage: 100} + listOpts = github.ListOptions{PerPage: 100} + } else { + listOpts = *opts + if listOpts.PerPage == 0 { + listOpts.PerPage = 100 + } } - files, resp, err := c.client.PullRequests.ListFiles(ctx, org, repo, number, opts) - if err != nil { - return nil, resp, fmt.Errorf("failed to list files for pull request #%d in %s/%s: %w", number, org, repo, err) + allFiles := make([]*github.CommitFile, 0, listOpts.PerPage) + var lastResp *github.Response + + for { + files, resp, err := c.client.PullRequests.ListFiles(ctx, org, repo, number, &listOpts) + if err != nil { + return nil, resp, fmt.Errorf("failed to list files for pull request #%d in %s/%s: %w", number, org, repo, err) + } + + allFiles = append(allFiles, files...) + lastResp = resp + + if resp == nil || resp.NextPage == 0 { + break + } + listOpts.Page = resp.NextPage } - return files, resp, nil + return allFiles, lastResp, nil } // GetFileContent fetches the raw content of a file from a repository. From ed8f93acd5e1e7d78c9554064fe99b55e35cf0c3 Mon Sep 17 00:00:00 2001 From: nludwig Date: Wed, 18 Feb 2026 11:23:26 -0800 Subject: [PATCH 6/6] fix: use resolved embedding dimensions for PR collection --- cmd/simili/commands/index.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/simili/commands/index.go b/cmd/simili/commands/index.go index 64a1600..a7556cf 100644 --- a/cmd/simili/commands/index.go +++ b/cmd/simili/commands/index.go @@ -120,7 +120,7 @@ func runIndex(cmd *cobra.Command, args []string) { } if indexIncludePRs && prCollection != cfg.Qdrant.Collection { - err = qdrantClient.CreateCollection(ctx, prCollection, cfg.Embedding.Dimensions) + err = qdrantClient.CreateCollection(ctx, prCollection, embeddingDimensions) if err != nil { log.Fatalf("Failed to create/verify PR collection: %v", err) }