Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@
# =============================================================================
# REQUIRED: Vector Database (Qdrant)
# =============================================================================
# Main issue collection
QDRANT_COLLECTION=your-issues-collection
QDRANT_URL=https://your-cluster.qdrant.io:6333
QDRANT_API_KEY=your-qdrant-api-key
# Optional dedicated pull-request collection
# QDRANT_PR_COLLECTION=your-pr-collection

# =============================================================================
# REQUIRED: Embedding Provider (at least one)
Expand Down
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,5 @@ go.work.sum
.env.local

# Local binaries and configs
simili

/simili

1 change: 1 addition & 0 deletions .simili.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ qdrant:
url: ${QDRANT_URL}
api_key: ${QDRANT_API_KEY}
collection: ${QDRANT_COLLECTION}
pr_collection: ${QDRANT_PR_COLLECTION}
embedding:
provider: gemini
api_key: ${GEMINI_API_KEY}
Expand Down
40 changes: 35 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,36 @@ Bulk index issues from a GitHub repository into the vector database.
simili index --repo owner/repo --workers 5 --limit 100
```

Optionally index pull requests (metadata-only) into a separate PR collection.

```bash
simili index --repo owner/repo --workers 5 --include-prs
```

**Flags:**
- `--repo` (required): Target repository (owner/name)
- `--workers`: Number of concurrent workers (default: 5)
- `--since`: Start from issue number or timestamp
- `--limit`: Maximum issues to index
- `--since`: RFC3339 timestamp filter (uses GitHub `updated_at`)
- `--dry-run`: Simulate without writing to database
- `--include-prs`: Also index pull requests (metadata-only)
- `--pr-collection`: Override PR collection name (default: `qdrant.pr_collection` or `QDRANT_PR_COLLECTION`)

### `simili pr-duplicate`

Check whether a pull request appears to be a duplicate of existing issues or pull requests.
This command searches both the issue collection and the PR collection, then runs an LLM-based duplicate decision on the combined candidates.

```bash
simili pr-duplicate --repo owner/repo --number 123 --top-k 8
```

**Flags:**
- `--repo` (required): Target repository (owner/name)
- `--number` (required): Pull request number
- `--top-k`: Maximum combined candidates to evaluate (default: 8)
- `--threshold`: Similarity threshold override
- `--pr-collection`: Override PR collection name
- `--json`: Emit JSON output only

### `simili process`

Expand Down Expand Up @@ -181,11 +205,17 @@ Create a JSON file with an array of issues:
# 1. Index repository issues
simili index --repo ballerina-platform/ballerina-library --workers 10

# 2. Prepare test issues in batch.json
# 3. Run batch analysis
# 2. Index PRs into separate collection
simili index --repo ballerina-platform/ballerina-library --workers 10 --include-prs

# 3. Check if a PR duplicates prior issues/PRs
simili pr-duplicate --repo ballerina-platform/ballerina-library --number 123 --top-k 10

# 4. Prepare test issues in batch.json
# 5. Run batch analysis
simili batch --file batch.json --format csv --out-file analysis.csv --workers 5

# 4. Review results
# 6. Review results
cat analysis.csv
```

Expand Down
13 changes: 7 additions & 6 deletions cmd/simili/commands/batch.go
Original file line number Diff line number Diff line change
Expand Up @@ -339,12 +339,13 @@ func initializeDependencies(cfg *config.Config) (*pipeline.Dependencies, error)
llmModel = envModel
}
llm, err := gemini.NewLLMClient(llmKey, llmModel)
if err != nil {
return nil, fmt.Errorf("failed to initialize LLM client: %w", err)
}
deps.LLMClient = llm
if verbose {
fmt.Printf("βœ“ Initialized LLM client (%s) with model: %s\n", llm.Provider(), llm.Model())
if err == nil {
deps.LLMClient = llm
if verbose {
fmt.Printf("βœ“ Initialized LLM client (%s) with model: %s\n", llm.Provider(), llm.Model())
}
} else if verbose {
fmt.Printf("β„Ή LLM client unavailable, continuing without LLM steps: %v\n", err)
}

return deps, nil
Expand Down
127 changes: 105 additions & 22 deletions cmd/simili/commands/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,13 @@ import (
)

var (
indexRepo string
indexSince string // Can be a timestamp (ISO8601) or issue number (int)
indexWorkers int
indexToken string
indexDryRun bool
indexIncludePRs bool
indexRepo string
indexSince string // Timestamp (RFC3339), mapped to GitHub's "updated_at" filter.
indexWorkers int
indexToken string
indexDryRun bool
indexIncludePRs bool
indexPRCollection string
)

type Checkpoint struct {
Expand All @@ -41,11 +42,14 @@ type Checkpoint struct {
// indexCmd represents the index command
var indexCmd = &cobra.Command{
Use: "index",
Short: "Bulk index issues into the vector database",
Short: "Bulk index issues (and optionally PRs) into the vector database",
Long: `Index existing issues from a GitHub repository into the Qdrant vector database.
It fetches issues, comments, chunks the text, generates embeddings using the active AI provider,
and stores them for semantic search.

Optionally, pull requests can also be indexed into a dedicated PR collection
using metadata (title, description, changed file paths, and linked issues).

Supports resuming via a local checkpoint file or --since flag.`,
Run: runIndex,
}
Expand All @@ -54,11 +58,12 @@ func init() {
rootCmd.AddCommand(indexCmd)

indexCmd.Flags().StringVar(&indexRepo, "repo", "", "Target repository (owner/name)")
indexCmd.Flags().StringVar(&indexSince, "since", "", "Start indexing from this issue number or timestamp")
indexCmd.Flags().StringVar(&indexSince, "since", "", "Start indexing from this RFC3339 timestamp (filters by updated_at)")
indexCmd.Flags().IntVar(&indexWorkers, "workers", 5, "Number of concurrent workers")
indexCmd.Flags().StringVar(&indexToken, "token", "", "GitHub token (optional, defaults to GITHUB_TOKEN env var)")
indexCmd.Flags().BoolVar(&indexDryRun, "dry-run", false, "Simulate indexing without writing to DB")
indexCmd.Flags().BoolVar(&indexIncludePRs, "include-prs", true, "Include pull requests in indexing")
indexCmd.Flags().BoolVar(&indexIncludePRs, "include-prs", false, "Also index pull requests (metadata only) into PR collection")
indexCmd.Flags().StringVar(&indexPRCollection, "pr-collection", "", "Override PR collection name (default: qdrant.pr_collection or QDRANT_PR_COLLECTION)")

if err := indexCmd.MarkFlagRequired("repo"); err != nil {
log.Fatalf("Failed to mark repo flag as required: %v", err)
Expand All @@ -77,6 +82,7 @@ func runIndex(cmd *cobra.Command, args []string) {
if err != nil {
log.Fatalf("Failed to load config: %v", err)
}
prCollection := resolvePRCollection(cfg, indexPRCollection)

// 2. Auth & Clients
token := indexToken
Expand Down Expand Up @@ -112,6 +118,13 @@ func runIndex(cmd *cobra.Command, args []string) {
if err != nil {
log.Fatalf("Failed to create/verify collection: %v", err)
}

if indexIncludePRs && prCollection != cfg.Qdrant.Collection {
err = qdrantClient.CreateCollection(ctx, prCollection, embeddingDimensions)
if err != nil {
log.Fatalf("Failed to create/verify PR collection: %v", err)
}
}
}

// 3. Parse Repo
Expand All @@ -125,15 +138,16 @@ func runIndex(cmd *cobra.Command, args []string) {
// Checkpoint logic omitted for simplicity in v0.1.0 as standard pagination handles most cases.
// Users can rely on --since for updates.

log.Printf("Starting indexing for %s/%s with %d workers...", org, repoName, indexWorkers)
log.Printf("Starting indexing for %s/%s with %d workers (include PRs: %t)...", org, repoName, indexWorkers, indexIncludePRs)

// Fetch loop
page := 1
splitter := text.NewRecursiveCharacterSplitter()

// Job channel
type Job struct {
Issue *github.Issue
Issue *github.Issue
IsPullRequest bool
}
jobs := make(chan Job, indexWorkers)
var wg sync.WaitGroup
Expand All @@ -144,15 +158,19 @@ func runIndex(cmd *cobra.Command, args []string) {
go func(id int) {
defer wg.Done()
for job := range jobs {
processIssue(ctx, id, job.Issue, ghClient, geminiClient, qdrantClient, splitter, cfg.Qdrant.Collection, org, repoName, indexDryRun)
if job.IsPullRequest {
processPullRequest(ctx, id, job.Issue, ghClient, geminiClient, qdrantClient, splitter, prCollection, org, repoName, indexDryRun)
} else {
processIssue(ctx, id, job.Issue, ghClient, geminiClient, qdrantClient, splitter, cfg.Qdrant.Collection, org, repoName, indexDryRun)
}
}
}(i)
}

// Issue Producer
opts := &github.IssueListByRepoOptions{
State: "all",
Sort: "created",
Sort: "updated",
Direction: "asc",
ListOptions: github.ListOptions{PerPage: 100},
}
Expand Down Expand Up @@ -181,10 +199,13 @@ func runIndex(cmd *cobra.Command, args []string) {
log.Printf("Fetched page %d (%d issues)", page, len(issues))

for _, issue := range issues {
if !indexIncludePRs && issue.IsPullRequest() {
if issue.IsPullRequest() {
if indexIncludePRs {
jobs <- Job{Issue: issue, IsPullRequest: true}
}
continue
}
jobs <- Job{Issue: issue}
jobs <- Job{Issue: issue, IsPullRequest: false}
}

if resp.NextPage == 0 {
Expand Down Expand Up @@ -248,11 +269,6 @@ func processIssue(ctx context.Context, workerID int, issue *github.Issue, gh *si
return
}

itemType := "issue"
if issue.IsPullRequest() {
itemType = "pull_request"
}

points := make([]*qdrant.Point, len(chunks))
for i, chunk := range chunks {
chunkID := uuid.NewMD5(uuid.NameSpaceURL, fmt.Appendf(nil, "%s/%s#%d-chunk-%d", org, repo, issue.GetNumber(), i)).String()
Expand All @@ -263,11 +279,11 @@ func processIssue(ctx context.Context, workerID int, issue *github.Issue, gh *si
"org": org,
"repo": repo,
"issue_number": issue.GetNumber(),
"title": issue.GetTitle(),
"text": chunk,
"url": issue.GetHTMLURL(),
"type": itemType,
"state": issue.GetState(),
"title": issue.GetTitle(),
"type": "issue",
},
}
}
Expand All @@ -279,3 +295,70 @@ func processIssue(ctx context.Context, workerID int, issue *github.Issue, gh *si
log.Printf("[Worker %d] Indexed #%d", workerID, issue.GetNumber())
}
}

// processPullRequest indexes a single pull request into the PR collection
// using metadata only: title, description, and the list of changed file
// paths. The combined metadata text is chunked, embedded, and upserted as
// Qdrant points with deterministic IDs (derived from org/repo/PR/chunk
// index) so re-runs update in place instead of duplicating points.
// Errors are logged and the PR is skipped; they do not abort the worker.
func processPullRequest(ctx context.Context, workerID int, issue *github.Issue, gh *similiGithub.Client, em *gemini.Embedder, qd *qdrant.Client, splitter *text.RecursiveCharacterSplitter, collection, org, repo string, dryRun bool) {
	prNumber := issue.GetNumber()

	pr, err := gh.GetPullRequest(ctx, org, repo, prNumber)
	if err != nil {
		log.Printf("[Worker %d] Error fetching PR #%d: %v", workerID, prNumber, err)
		return
	}

	filePaths, err := listAllPullRequestFilePaths(ctx, gh, org, repo, prNumber)
	if err != nil {
		log.Printf("[Worker %d] Error fetching files for PR #%d: %v", workerID, prNumber, err)
		return
	}

	fullText := buildPRMetadataText(pr, filePaths)
	if strings.TrimSpace(fullText) == "" {
		log.Printf("[Worker %d] PR #%d has no indexable content, skipping", workerID, prNumber)
		return
	}

	chunks := splitter.SplitText(fullText)
	if len(chunks) == 0 {
		// The splitter can return nothing for very short text; index it as one chunk.
		chunks = []string{fullText}
	}

	// Check dry-run before embedding: a simulation should not spend
	// embedding-API quota. The chunk count is already known here, so the
	// log line is identical to what a post-embedding check would print.
	if dryRun {
		log.Printf("[DryRun] Would upsert PR #%d (%d chunks) into %s", prNumber, len(chunks), collection)
		return
	}

	embeddings, err := em.EmbedBatch(ctx, chunks)
	if err != nil {
		log.Printf("[Worker %d] Error embedding PR #%d: %v", workerID, prNumber, err)
		return
	}

	points := make([]*qdrant.Point, len(chunks))
	for i, chunk := range chunks {
		// Deterministic MD5-based UUID keeps upserts idempotent across runs.
		// fmt.Appendf matches the seed construction used in processIssue.
		pointID := uuid.NewMD5(uuid.NameSpaceURL, fmt.Appendf(nil, "%s/%s/pr/%d/chunk/%d", org, repo, prNumber, i)).String()
		points[i] = &qdrant.Point{
			ID:     pointID,
			Vector: embeddings[i],
			Payload: map[string]interface{}{
				"org":           org,
				"repo":          repo,
				"pr_number":     prNumber,
				"title":         pr.GetTitle(),
				"description":   strings.TrimSpace(pr.GetBody()),
				"text":          chunk,
				"url":           pr.GetHTMLURL(),
				"state":         pr.GetState(),
				"merged":        pr.GetMerged(),
				"changed_files": strings.Join(filePaths, "\n"),
				"type":          "pull_request",
			},
		}
	}

	if err := qd.Upsert(ctx, collection, points); err != nil {
		log.Printf("[Worker %d] Error upserting PR #%d: %v", workerID, prNumber, err)
		return
	}

	log.Printf("[Worker %d] Indexed PR #%d", workerID, prNumber)
}
Loading
Loading