Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@
# =============================================================================
# REQUIRED: Vector Database (Qdrant)
# =============================================================================
# Main issue collection
QDRANT_COLLECTION=your-issues-collection
QDRANT_URL=https://your-cluster.qdrant.io:6333
QDRANT_API_KEY=your-qdrant-api-key
# Optional dedicated pull-request collection
# QDRANT_PR_COLLECTION=your-pr-collection

# =============================================================================
# REQUIRED: Embedding Provider (at least one)
Expand Down
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,5 @@ go.work.sum
.env.local

# Local binaries and configs
simili

/simili

1 change: 1 addition & 0 deletions .simili.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ qdrant:
url: ${QDRANT_URL}
api_key: ${QDRANT_API_KEY}
collection: ${QDRANT_COLLECTION}
pr_collection: ${QDRANT_PR_COLLECTION}
embedding:
provider: gemini
api_key: ${GEMINI_API_KEY}
Expand Down
40 changes: 35 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,36 @@ Bulk index issues from a GitHub repository into the vector database.
simili index --repo owner/repo --workers 5 --limit 100
```

Optionally index pull requests (metadata-only) into a separate PR collection.

```bash
simili index --repo owner/repo --workers 5 --include-prs
```

**Flags:**
- `--repo` (required): Target repository (owner/name)
- `--workers`: Number of concurrent workers (default: 5)
- `--since`: Start from issue number or timestamp
- `--limit`: Maximum issues to index
- `--since`: RFC3339 timestamp filter (uses GitHub `updated_at`)
- `--dry-run`: Simulate without writing to database
- `--include-prs`: Also index pull requests (metadata-only)
- `--pr-collection`: Override PR collection name (default: `qdrant.pr_collection` or `QDRANT_PR_COLLECTION`)

### `simili pr-duplicate`

Check whether a pull request appears to be a duplicate of existing issues or pull requests.
This command searches both the issue collection and the PR collection, then runs an LLM-based duplicate decision on the combined candidates.

```bash
simili pr-duplicate --repo owner/repo --number 123 --top-k 8
```

**Flags:**
- `--repo` (required): Target repository (owner/name)
- `--number` (required): Pull request number
- `--top-k`: Maximum combined candidates to evaluate (default: 8)
- `--threshold`: Similarity threshold override
- `--pr-collection`: Override PR collection name
- `--json`: Emit JSON output only

### `simili process`

Expand Down Expand Up @@ -181,11 +205,17 @@ Create a JSON file with an array of issues:
# 1. Index repository issues
simili index --repo ballerina-platform/ballerina-library --workers 10

# 2. Prepare test issues in batch.json
# 3. Run batch analysis
# 2. Index PRs into separate collection
simili index --repo ballerina-platform/ballerina-library --workers 10 --include-prs

# 3. Check if a PR duplicates prior issues/PRs
simili pr-duplicate --repo ballerina-platform/ballerina-library --number 123 --top-k 10

# 4. Prepare test issues in batch.json
# 5. Run batch analysis
simili batch --file batch.json --format csv --out-file analysis.csv --workers 5

# 4. Review results
# 6. Review results
cat analysis.csv
```

Expand Down
13 changes: 7 additions & 6 deletions cmd/simili/commands/batch.go
Original file line number Diff line number Diff line change
Expand Up @@ -339,12 +339,13 @@ func initializeDependencies(cfg *config.Config) (*pipeline.Dependencies, error)
llmModel = envModel
}
llm, err := gemini.NewLLMClient(llmKey, llmModel)
if err != nil {
return nil, fmt.Errorf("failed to initialize LLM client: %w", err)
}
deps.LLMClient = llm
if verbose {
fmt.Printf("βœ“ Initialized LLM client (%s) with model: %s\n", llm.Provider(), llm.Model())
if err == nil {
deps.LLMClient = llm
if verbose {
fmt.Printf("βœ“ Initialized LLM client (%s) with model: %s\n", llm.Provider(), llm.Model())
}
} else if verbose {
fmt.Printf("β„Ή LLM client unavailable, continuing without LLM steps: %v\n", err)
}

return deps, nil
Expand Down
127 changes: 105 additions & 22 deletions cmd/simili/commands/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,13 @@ import (
)

var (
indexRepo string
indexSince string // Can be a timestamp (ISO8601) or issue number (int)
indexWorkers int
indexToken string
indexDryRun bool
indexIncludePRs bool
indexRepo string
indexSince string // Timestamp (RFC3339), mapped to GitHub's "updated_at" filter.
indexWorkers int
indexToken string
indexDryRun bool
indexIncludePRs bool
indexPRCollection string
)

type Checkpoint struct {
Expand All @@ -41,11 +42,14 @@ type Checkpoint struct {
// indexCmd represents the index command
var indexCmd = &cobra.Command{
Use: "index",
Short: "Bulk index issues into the vector database",
Short: "Bulk index issues (and optionally PRs) into the vector database",
Long: `Index existing issues from a GitHub repository into the Qdrant vector database.
It fetches issues, comments, chunks the text, generates embeddings using the active AI provider,
and stores them for semantic search.

Optionally, pull requests can also be indexed into a dedicated PR collection
using metadata (title, description, changed file paths, and linked issues).

Supports resuming via a local checkpoint file or --since flag.`,
Run: runIndex,
}
Expand All @@ -54,11 +58,12 @@ func init() {
rootCmd.AddCommand(indexCmd)

indexCmd.Flags().StringVar(&indexRepo, "repo", "", "Target repository (owner/name)")
indexCmd.Flags().StringVar(&indexSince, "since", "", "Start indexing from this issue number or timestamp")
indexCmd.Flags().StringVar(&indexSince, "since", "", "Start indexing from this RFC3339 timestamp (filters by updated_at)")
indexCmd.Flags().IntVar(&indexWorkers, "workers", 5, "Number of concurrent workers")
indexCmd.Flags().StringVar(&indexToken, "token", "", "GitHub token (optional, defaults to GITHUB_TOKEN env var)")
indexCmd.Flags().BoolVar(&indexDryRun, "dry-run", false, "Simulate indexing without writing to DB")
indexCmd.Flags().BoolVar(&indexIncludePRs, "include-prs", true, "Include pull requests in indexing")
indexCmd.Flags().BoolVar(&indexIncludePRs, "include-prs", false, "Also index pull requests (metadata only) into PR collection")
indexCmd.Flags().StringVar(&indexPRCollection, "pr-collection", "", "Override PR collection name (default: qdrant.pr_collection or QDRANT_PR_COLLECTION)")

if err := indexCmd.MarkFlagRequired("repo"); err != nil {
log.Fatalf("Failed to mark repo flag as required: %v", err)
Expand All @@ -77,6 +82,7 @@ func runIndex(cmd *cobra.Command, args []string) {
if err != nil {
log.Fatalf("Failed to load config: %v", err)
}
prCollection := resolvePRCollection(cfg, indexPRCollection)

// 2. Auth & Clients
token := indexToken
Expand Down Expand Up @@ -112,6 +118,13 @@ func runIndex(cmd *cobra.Command, args []string) {
if err != nil {
log.Fatalf("Failed to create/verify collection: %v", err)
}

if indexIncludePRs && prCollection != cfg.Qdrant.Collection {
err = qdrantClient.CreateCollection(ctx, prCollection, embeddingDimensions)
if err != nil {
log.Fatalf("Failed to create/verify PR collection: %v", err)
}
}
}

// 3. Parse Repo
Expand All @@ -125,15 +138,16 @@ func runIndex(cmd *cobra.Command, args []string) {
// Checkpoint logic omitted for simplicity in v0.1.0 as standard pagination handles most cases.
// Users can rely on --since for updates.

log.Printf("Starting indexing for %s/%s with %d workers...", org, repoName, indexWorkers)
log.Printf("Starting indexing for %s/%s with %d workers (include PRs: %t)...", org, repoName, indexWorkers, indexIncludePRs)

// Fetch loop
page := 1
splitter := text.NewRecursiveCharacterSplitter()

// Job channel
type Job struct {
Issue *github.Issue
Issue *github.Issue
IsPullRequest bool
}
jobs := make(chan Job, indexWorkers)
var wg sync.WaitGroup
Expand All @@ -144,15 +158,19 @@ func runIndex(cmd *cobra.Command, args []string) {
go func(id int) {
defer wg.Done()
for job := range jobs {
processIssue(ctx, id, job.Issue, ghClient, geminiClient, qdrantClient, splitter, cfg.Qdrant.Collection, org, repoName, indexDryRun)
if job.IsPullRequest {
processPullRequest(ctx, id, job.Issue, ghClient, geminiClient, qdrantClient, splitter, prCollection, org, repoName, indexDryRun)
} else {
processIssue(ctx, id, job.Issue, ghClient, geminiClient, qdrantClient, splitter, cfg.Qdrant.Collection, org, repoName, indexDryRun)
}
}
}(i)
}

// Issue Producer
opts := &github.IssueListByRepoOptions{
State: "all",
Sort: "created",
Sort: "updated",
Direction: "asc",
ListOptions: github.ListOptions{PerPage: 100},
}
Expand Down Expand Up @@ -181,10 +199,13 @@ func runIndex(cmd *cobra.Command, args []string) {
log.Printf("Fetched page %d (%d issues)", page, len(issues))

for _, issue := range issues {
if !indexIncludePRs && issue.IsPullRequest() {
if issue.IsPullRequest() {
if indexIncludePRs {
jobs <- Job{Issue: issue, IsPullRequest: true}
}
continue
}
jobs <- Job{Issue: issue}
jobs <- Job{Issue: issue, IsPullRequest: false}
}

if resp.NextPage == 0 {
Expand Down Expand Up @@ -248,11 +269,6 @@ func processIssue(ctx context.Context, workerID int, issue *github.Issue, gh *si
return
}

itemType := "issue"
if issue.IsPullRequest() {
itemType = "pull_request"
}

points := make([]*qdrant.Point, len(chunks))
for i, chunk := range chunks {
chunkID := uuid.NewMD5(uuid.NameSpaceURL, fmt.Appendf(nil, "%s/%s#%d-chunk-%d", org, repo, issue.GetNumber(), i)).String()
Expand All @@ -263,11 +279,11 @@ func processIssue(ctx context.Context, workerID int, issue *github.Issue, gh *si
"org": org,
"repo": repo,
"issue_number": issue.GetNumber(),
"title": issue.GetTitle(),
"text": chunk,
"url": issue.GetHTMLURL(),
"type": itemType,
"state": issue.GetState(),
"title": issue.GetTitle(),
"type": "issue",
},
}
}
Expand All @@ -279,3 +295,70 @@ func processIssue(ctx context.Context, workerID int, issue *github.Issue, gh *si
log.Printf("[Worker %d] Indexed #%d", workerID, issue.GetNumber())
}
}

// processPullRequest indexes a single pull request into the PR collection
// using metadata only: title, description, and the list of changed file
// paths. The combined metadata text is chunked, embedded, and upserted as
// Qdrant points with deterministic IDs (derived from org/repo/PR/chunk
// index) so re-runs update in place instead of duplicating points.
// Errors are logged and the PR is skipped; they do not abort the worker.
func processPullRequest(ctx context.Context, workerID int, issue *github.Issue, gh *similiGithub.Client, em *gemini.Embedder, qd *qdrant.Client, splitter *text.RecursiveCharacterSplitter, collection, org, repo string, dryRun bool) {
	prNumber := issue.GetNumber()

	pr, err := gh.GetPullRequest(ctx, org, repo, prNumber)
	if err != nil {
		log.Printf("[Worker %d] Error fetching PR #%d: %v", workerID, prNumber, err)
		return
	}

	filePaths, err := listAllPullRequestFilePaths(ctx, gh, org, repo, prNumber)
	if err != nil {
		log.Printf("[Worker %d] Error fetching files for PR #%d: %v", workerID, prNumber, err)
		return
	}

	fullText := buildPRMetadataText(pr, filePaths)
	if strings.TrimSpace(fullText) == "" {
		log.Printf("[Worker %d] PR #%d has no indexable content, skipping", workerID, prNumber)
		return
	}

	chunks := splitter.SplitText(fullText)
	if len(chunks) == 0 {
		// The splitter can return nothing for very short text; index it as one chunk.
		chunks = []string{fullText}
	}

	// Check dry-run before embedding: a simulation should not spend
	// embedding-API quota. The chunk count is already known here, so the
	// log line is identical to what a post-embedding check would print.
	if dryRun {
		log.Printf("[DryRun] Would upsert PR #%d (%d chunks) into %s", prNumber, len(chunks), collection)
		return
	}

	embeddings, err := em.EmbedBatch(ctx, chunks)
	if err != nil {
		log.Printf("[Worker %d] Error embedding PR #%d: %v", workerID, prNumber, err)
		return
	}

	points := make([]*qdrant.Point, len(chunks))
	for i, chunk := range chunks {
		// Deterministic MD5-based UUID keeps upserts idempotent across runs.
		// fmt.Appendf matches the seed construction used in processIssue.
		pointID := uuid.NewMD5(uuid.NameSpaceURL, fmt.Appendf(nil, "%s/%s/pr/%d/chunk/%d", org, repo, prNumber, i)).String()
		points[i] = &qdrant.Point{
			ID:     pointID,
			Vector: embeddings[i],
			Payload: map[string]interface{}{
				"org":           org,
				"repo":          repo,
				"pr_number":     prNumber,
				"title":         pr.GetTitle(),
				"description":   strings.TrimSpace(pr.GetBody()),
				"text":          chunk,
				"url":           pr.GetHTMLURL(),
				"state":         pr.GetState(),
				"merged":        pr.GetMerged(),
				"changed_files": strings.Join(filePaths, "\n"),
				"type":          "pull_request",
			},
		}
	}

	if err := qd.Upsert(ctx, collection, points); err != nil {
		log.Printf("[Worker %d] Error upserting PR #%d: %v", workerID, prNumber, err)
		return
	}

	log.Printf("[Worker %d] Indexed PR #%d", workerID, prNumber)
}
Loading
Loading