diff --git a/.codecov.yml b/.codecov.yml index 9a7ce59..8278ef4 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -7,3 +7,4 @@ coverage: patch: default: target: 80% + threshold: 0% diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..7575371 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,41 @@ +--- +name: Bug Report +about: Report a bug in chisel +title: '' +labels: bug +assignees: '' +--- + +## Description + + + +## Steps to Reproduce + +1. +2. +3. + +## Expected Behaviour + + + +## Actual Behaviour + + + +## Environment + +- Go version: +- chisel version: +- OS: + +## Minimal Example + +```go +// Code that reproduces the issue +``` + +## Additional Context + + diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md new file mode 100644 index 0000000..533a327 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation.md @@ -0,0 +1,28 @@ +--- +name: Documentation +about: Report documentation issues or request improvements +title: '' +labels: documentation +assignees: '' +--- + +## Type + + + +- [ ] Missing documentation +- [ ] Incorrect documentation +- [ ] Unclear documentation +- [ ] Documentation improvement + +## Location + + + +## Description + + + +## Suggested Change + + diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..bcb11a8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,29 @@ +--- +name: Feature Request +about: Suggest a new feature for chisel +title: '' +labels: enhancement +assignees: '' +--- + +## Problem + + + +## Proposed Solution + + + +## Example Usage + +```go +// How would this feature be used? 
+``` + +## Alternatives Considered + + + +## Additional Context + + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..4cfb4da --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,31 @@ +## Summary + + + +## Changes + + + +- + +## Type + + + +- [ ] Bug fix +- [ ] New feature +- [ ] Enhancement +- [ ] Refactoring +- [ ] Documentation +- [ ] Testing + +## Checklist + +- [ ] Tests pass (`make test`) +- [ ] Linting passes (`make lint`) +- [ ] New code has tests +- [ ] Documentation updated (if applicable) + +## Related Issues + + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..a211374 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,110 @@ +name: CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test: + name: Test + runs-on: ubuntu-latest + strategy: + matrix: + go-version: ['1.24', '1.25'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go-version }} + + - name: Test all modules + run: | + go test -v -race ./... ./golang/... ./markdown/... ./typescript/... ./python/... ./rust/... ./testing/... + + lint: + name: Lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.25' + + - name: golangci-lint + uses: golangci/golangci-lint-action@v7 + with: + version: v2.7.2 + args: --config=.golangci.yml --timeout=5m ./... ./golang/... ./markdown/... ./typescript/... ./python/... ./rust/... ./testing/... + skip-cache: false + skip-save-cache: false + + - name: Security Report + if: always() + run: | + golangci-lint run --config=.golangci.yml --out-format=json ./... ./golang/... ./markdown/... ./typescript/... ./python/... ./rust/... ./testing/... 
> lint-report.json || true + echo "### Security Scan Summary" >> $GITHUB_STEP_SUMMARY + echo "Linters with findings:" >> $GITHUB_STEP_SUMMARY + jq -r '.Issues[] | .FromLinter' lint-report.json 2>/dev/null | sort | uniq -c | sort -nr >> $GITHUB_STEP_SUMMARY || echo "No issues found βœ…" >> $GITHUB_STEP_SUMMARY + + security: + name: Security + runs-on: ubuntu-latest + permissions: + contents: read + security-events: write + steps: + - uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.25' + + - name: Run Gosec Security Scanner + uses: securego/gosec@v2.22.11 + with: + args: '-fmt sarif -out gosec-results.sarif --no-fail ./...' + + - name: Upload SARIF file + uses: github/codeql-action/upload-sarif@v3 + if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository + continue-on-error: true + with: + sarif_file: gosec-results.sarif + wait-for-processing: true + + benchmark: + name: Benchmark + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.25' + + - name: Run benchmarks + run: | + echo "### Provider Benchmarks" | tee benchmark_results.txt + go test -bench=. -benchmem -benchtime=1s ./testing/benchmarks/... 
| tee -a benchmark_results.txt + + - name: Upload benchmark results + uses: actions/upload-artifact@v4 + with: + name: benchmark-results + path: benchmark_results.txt + + ci-complete: + name: CI Complete + needs: [test, lint, security, benchmark] + runs-on: ubuntu-latest + steps: + - run: echo "CI complete" diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..bc100b2 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,49 @@ +name: CodeQL + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + schedule: + - cron: '0 6 * * 1' + +permissions: + contents: read + security-events: write + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.25' + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: go + queries: security-and-quality + + - name: Build + run: | + go build ./... ./golang/... ./markdown/... ./typescript/... ./python/... ./rust/... ./testing/... + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:go" + + - name: Security summary + if: always() + run: | + echo "### 🔒 CodeQL Security Analysis" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "Security analysis completed. Check the Security tab for detailed findings." 
>> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml new file mode 100644 index 0000000..9b2c03b --- /dev/null +++ b/.github/workflows/coverage.yml @@ -0,0 +1,145 @@ +name: Coverage + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +permissions: + contents: read + checks: write + pull-requests: write + +jobs: + coverage: + name: Test Coverage Analysis + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.25' + cache: true + + - name: Run tests with coverage + run: | + # Test all modules and collect coverage + echo "=== Testing all modules ===" + go test -v -race -coverprofile=coverage.out -covermode=atomic \ + ./... ./golang/... ./markdown/... ./typescript/... ./python/... ./rust/... ./testing/... + + # Generate coverage report + go tool cover -func=coverage.out > coverage-summary.txt + echo "Coverage Summary:" + tail -1 coverage-summary.txt + + # Generate HTML report + go tool cover -html=coverage.out -o coverage.html + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} + files: ./coverage.out + flags: unit + name: chisel-coverage + fail_ci_if_error: false + verbose: true + + - name: Generate coverage badge + run: | + COVERAGE=$(go tool cover -func=coverage.out | tail -1 | grep -oE '[0-9]+\.[0-9]+' | tail -1) + echo "Coverage: $COVERAGE%" + echo "COVERAGE=$COVERAGE" >> $GITHUB_ENV + + if awk "BEGIN {exit !($COVERAGE >= 80)}"; then + COLOR="green" + elif awk "BEGIN {exit !($COVERAGE >= 60)}"; then + COLOR="yellow" + else + COLOR="red" + fi + echo "COVERAGE_COLOR=$COLOR" >> $GITHUB_ENV + + - name: Create coverage comment (PR only) + if: github.event_name == 'pull_request' + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + + const summary = 
fs.readFileSync('coverage-summary.txt', 'utf8'); + const lines = summary.split('\n').filter(line => line.trim()); + + const totalLine = lines[lines.length - 1]; + const coverage = totalLine.match(/(\d+\.\d+)%/)?.[1] || 'N/A'; + + const body = `## πŸ“Š Coverage Report + + **Total Coverage:** ${coverage}% + + ### Coverage by Package + \`\`\` + ${lines.slice(0, -1).join('\n')} + \`\`\` + + --- + *Coverage report generated by [Codecov](https://codecov.io)*`; + + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + }); + + const botComment = comments.find(comment => + comment.user.type === 'Bot' && comment.body.includes('πŸ“Š Coverage Report') + ); + + if (botComment) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: botComment.id, + body: body + }); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: body + }); + } + continue-on-error: true + + - name: Upload coverage artifacts + uses: actions/upload-artifact@v4 + with: + name: coverage-reports + path: | + coverage.out + coverage.html + coverage-summary.txt + retention-days: 30 + + - name: Coverage summary + run: | + echo "### πŸ“Š Test Coverage Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Total Coverage:** ${{ env.COVERAGE }}%" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "#### Coverage by Package" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + head -n -1 coverage-summary.txt >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "#### Coverage Standards" >> $GITHUB_STEP_SUMMARY + echo "- 🎯 Target: 70% overall, 80% for new code" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new 
file mode 100644 index 0000000..56eb681 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,105 @@ +name: Release + +on: + push: + tags: + - 'v*' + +permissions: + contents: write + +jobs: + release: + name: Release + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.25' + + - name: Validate release + run: | + # Ensure tests pass before release + go test -race ./... ./golang/... ./markdown/... ./typescript/... ./python/... ./rust/... ./testing/... + + - name: Extract version + id: version + run: | + VERSION=${GITHUB_REF#refs/tags/} + echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "Releasing version: $VERSION" + + - name: Create submodule tags + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + VERSION=${{ steps.version.outputs.version }} + + # Tag each submodule with prefixed version + MODULES="golang markdown typescript python rust testing" + + for module in $MODULES; do + TAG="${module}/${VERSION}" + echo "Creating tag: $TAG" + git tag "$TAG" || echo "Tag $TAG already exists" + done + + # Push all tags + git push origin --tags + + - name: Generate release notes + id: notes + run: | + VERSION=${{ steps.version.outputs.version }} + + # Get commits since last tag + PREV_TAG=$(git describe --tags --abbrev=0 HEAD^ 2>/dev/null || echo "") + + if [ -n "$PREV_TAG" ]; then + NOTES=$(git log --pretty=format:"- %s (%h)" "$PREV_TAG"..HEAD --no-merges) + else + NOTES=$(git log --pretty=format:"- %s (%h)" --no-merges -20) + fi + + # Write to file for multiline output + echo "$NOTES" > release_notes.txt + + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + name: ${{ steps.version.outputs.version }} + body_path: release_notes.txt + draft: false + prerelease: ${{ contains(steps.version.outputs.version, '-') }} + generate_release_notes: true + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - 
name: Release summary + run: | + VERSION=${{ steps.version.outputs.version }} + echo "### πŸš€ Release $VERSION" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "#### Submodule Tags Created" >> $GITHUB_STEP_SUMMARY + echo "- \`golang/$VERSION\`" >> $GITHUB_STEP_SUMMARY + echo "- \`markdown/$VERSION\`" >> $GITHUB_STEP_SUMMARY + echo "- \`typescript/$VERSION\`" >> $GITHUB_STEP_SUMMARY + echo "- \`python/$VERSION\`" >> $GITHUB_STEP_SUMMARY + echo "- \`rust/$VERSION\`" >> $GITHUB_STEP_SUMMARY + echo "- \`testing/$VERSION\`" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "#### Installation" >> $GITHUB_STEP_SUMMARY + echo '```bash' >> $GITHUB_STEP_SUMMARY + echo "# Main module" >> $GITHUB_STEP_SUMMARY + echo "go get github.com/zoobzio/chisel@$VERSION" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "# Language providers" >> $GITHUB_STEP_SUMMARY + echo "go get github.com/zoobzio/chisel/golang@$VERSION" >> $GITHUB_STEP_SUMMARY + echo "go get github.com/zoobzio/chisel/typescript@$VERSION" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY diff --git a/.golangci.yml b/.golangci.yml index e6e206c..b4814a5 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -16,9 +16,12 @@ linters: # Security - gosec - noctx + - bodyclose + - sqlclosecheck # Error handling - errorlint + - errchkjson - wastedassign # Best practices @@ -31,6 +34,23 @@ linters: - prealloc - copyloopvar + settings: + errcheck: + check-type-assertions: true + check-blank: true + + govet: + enable-all: true + disable: + - fieldalignment + + dupl: + threshold: 150 + + goconst: + min-len: 3 + min-occurrences: 3 + exclusions: rules: - path: _test\.go @@ -38,18 +58,3 @@ linters: - dupl - goconst - govet - -linters-settings: - errcheck: - check-type-assertions: true - check-blank: true - - govet: - enable-all: true - - dupl: - threshold: 150 - - goconst: - min-len: 3 - min-occurrences: 3 diff --git a/Makefile b/Makefile index ac968f3..9f7be59 100644 --- 
a/Makefile +++ b/Makefile @@ -1,25 +1,58 @@ -.PHONY: help test lint check build clean install-tools +.PHONY: test test-unit test-integration test-bench lint lint-fix coverage clean help check ci install-tools install-hooks build -help: ## Show this help - @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m %s\n", $$1, $$2}' +.DEFAULT_GOAL := help -test: ## Run all tests - go test -race -cover ./... +help: ## Display available commands + @echo "chisel Development Commands" + @echo "===========================" + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-18s\033[0m %s\n", $$1, $$2}' + +test: ## Run all tests with race detector + @go test -v -race ./... ./golang/... ./markdown/... ./typescript/... ./python/... ./rust/... ./testing/... + +test-unit: ## Run unit tests only (short mode) + @go test -v -race -short ./... ./golang/... ./markdown/... ./typescript/... ./python/... ./rust/... ./testing/... + +test-integration: ## Run integration tests + @go test -v -race ./testing/integration/... + +test-bench: ## Run benchmarks + @go test -bench=. -benchmem -benchtime=1s ./testing/benchmarks/... lint: ## Run linters - golangci-lint run ./... + @golangci-lint run --config=.golangci.yml --timeout=5m ./... ./golang/... ./markdown/... ./typescript/... ./python/... ./rust/... ./testing/... -check: lint test ## Run linters and tests +lint-fix: ## Run linters with auto-fix + @golangci-lint run --config=.golangci.yml --fix ./... ./golang/... ./markdown/... ./typescript/... ./python/... ./rust/... ./testing/... + +coverage: ## Generate coverage report (HTML) + @go test -coverprofile=coverage.out ./... ./golang/... ./markdown/... ./typescript/... ./python/... ./rust/... ./testing/... 
+ @go tool cover -html=coverage.out -o coverage.html + @go tool cover -func=coverage.out | tail -1 + @echo "Coverage report: coverage.html" build: ## Build all packages - go build ./... + @go build ./... ./golang/... ./markdown/... ./typescript/... ./python/... ./rust/... ./testing/... -clean: ## Clean build artifacts - rm -rf dist/ coverage.out coverage.html +clean: ## Remove generated files + @rm -f coverage.out coverage.html coverage.txt + @rm -rf dist/ + @find . -name "*.test" -delete + @find . -name "*.prof" -delete + @find . -name "*.out" -delete install-tools: ## Install development tools - go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest + @go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.8.0 + +install-hooks: ## Install git pre-commit hook + @mkdir -p .git/hooks + @echo '#!/bin/sh' > .git/hooks/pre-commit + @echo 'make check' >> .git/hooks/pre-commit + @chmod +x .git/hooks/pre-commit + @echo "Pre-commit hook installed" + +check: test lint ## Run tests and lint (quick validation) + @echo "All checks passed!" -coverage: ## Generate coverage report - go test -coverprofile=coverage.out ./... - go tool cover -html=coverage.out -o coverage.html +ci: clean lint test coverage test-bench ## Full CI simulation + @echo "CI simulation complete!" 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..0e8a260 --- /dev/null +++ b/README.md @@ -0,0 +1,200 @@ +# chisel + +[![CI Status](https://github.com/zoobzio/chisel/workflows/CI/badge.svg)](https://github.com/zoobzio/chisel/actions/workflows/ci.yml) +[![codecov](https://codecov.io/gh/zoobzio/chisel/graph/badge.svg?branch=main)](https://codecov.io/gh/zoobzio/chisel) +[![Go Report Card](https://goreportcard.com/badge/github.com/zoobzio/chisel)](https://goreportcard.com/report/github.com/zoobzio/chisel) +[![CodeQL](https://github.com/zoobzio/chisel/workflows/CodeQL/badge.svg)](https://github.com/zoobzio/chisel/security/code-scanning) +[![Go Reference](https://pkg.go.dev/badge/github.com/zoobzio/chisel.svg)](https://pkg.go.dev/github.com/zoobzio/chisel) +[![License](https://img.shields.io/github/license/zoobzio/chisel)](LICENSE) +[![Go Version](https://img.shields.io/github/go-mod/go-version/zoobzio/chisel)](go.mod) +[![Release](https://img.shields.io/github/v/release/zoobzio/chisel)](https://github.com/zoobzio/chisel/releases) + +AST-aware code chunking for semantic search and embeddings. Chisel parses source code into meaningful unitsβ€”functions, classes, methodsβ€”preserving the context that makes code searchable. + +## From Syntax to Semantics + +```go +source := []byte(` +func New(cfg Config) *Handler { ... } + +func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { ... } + +type Config struct { + Timeout time.Duration + Logger *slog.Logger +} +`) + +chunks, _ := c.Chunk(ctx, chisel.Go, "api.go", source) + +for _, chunk := range chunks { + fmt.Printf("[%s] %s (lines %d-%d)\n", chunk.Kind, chunk.Symbol, chunk.StartLine, chunk.EndLine) +} +// [function] New (lines 2-2) +// [method] Handler.ServeHTTP (lines 4-4) +// [class] Config (lines 6-9) +``` + +Every chunk carries its symbol name, kind, line range, and parent context. Methods know their receiver. Nested types know their enclosing scope. 
+ +```go +chunk := chunks[1] +// chunk.Symbol → "Handler.ServeHTTP" +// chunk.Kind → "method" +// chunk.Context → ["Handler"] +// chunk.Content → the full method source +// chunk.StartLine → 4 +// chunk.EndLine → 4 +``` + +Feed chunks to an embedding model, store in a vector database, and search code by meaning rather than text. + +## Install + +```bash +go get github.com/zoobzio/chisel +``` + +**Language providers** (install only what you need): + +```bash +go get github.com/zoobzio/chisel/golang # Go (stdlib, no deps) +go get github.com/zoobzio/chisel/markdown # Markdown (no deps) +go get github.com/zoobzio/chisel/typescript # TypeScript/JavaScript (tree-sitter) +go get github.com/zoobzio/chisel/python # Python (tree-sitter) +go get github.com/zoobzio/chisel/rust # Rust (tree-sitter) +``` + +Requires Go 1.24+. + +## Quick Start + +```go +package main + +import ( + "context" + "fmt" + + "github.com/zoobzio/chisel" + "github.com/zoobzio/chisel/golang" + "github.com/zoobzio/chisel/typescript" +) + +func main() { + // Create a chunker with language providers + c := chisel.New( + golang.New(), + typescript.New(), + typescript.NewJavaScript(), + ) + + source := []byte(` +package auth + +// Authenticate validates user credentials. +func Authenticate(username, password string) (*User, error) { + // ... +} + +// User represents an authenticated user. 
+type User struct { + ID string + Email string +} +`) + + chunks, err := c.Chunk(context.Background(), chisel.Go, "auth.go", source) + if err != nil { + panic(err) + } + + for _, chunk := range chunks { + fmt.Printf("[%s] %s\n", chunk.Kind, chunk.Symbol) + fmt.Printf(" Lines: %d-%d\n", chunk.StartLine, chunk.EndLine) + if len(chunk.Context) > 0 { + fmt.Printf(" Context: %v\n", chunk.Context) + } + } +} +``` + +Output: + +```text +[function] Authenticate + Lines: 4-6 +[class] User + Lines: 8-12 +``` + +## Capabilities + +| Feature | Description | Docs | +|---------|-------------|------| +| **Multi-language** | Go, TypeScript, JavaScript, Python, Rust, Markdown | [Providers](docs/2.guides/1.providers.md) | +| **Semantic extraction** | Functions, methods, classes, interfaces, types, enums | [Concepts](docs/1.learn/3.concepts.md) | +| **Context preservation** | Parent chain for nested definitions | [Architecture](docs/1.learn/4.architecture.md) | +| **Line mapping** | Precise source locations for each chunk | [Types](docs/4.reference/2.types.md) | +| **Zero-dependency providers** | Go and Markdown use stdlib only | [Architecture](docs/1.learn/4.architecture.md) | + +## Why Chisel? + +- **Semantic boundaries** — Chunks split at function/class boundaries, not arbitrary line counts +- **Embedding-ready** — Output designed for vector databases and semantic search +- **Isolated dependencies** — Tree-sitter only where needed; Go/Markdown have zero external deps +- **Context-aware** — Methods know their parent class; nested functions know their scope +- **Consistent interface** — Same `Provider` contract across all languages + +## Code Intelligence Pipelines + +Chisel enables a pattern: **parse once, search by meaning**. + +Your codebase becomes a corpus of semantic units. Each function, method, and type gets embedded with its full context — symbol name, parent scope, documentation. Queries match intent, not just text. 
+ +```go +// Chunk your codebase +chunks, _ := c.Chunk(ctx, chisel.Go, path, source) + +// Embed each chunk (using your embedding provider) +for _, chunk := range chunks { + embedding := embedder.Embed(chunk.Content) + vectorDB.Store(embedding, chunk.Symbol, chunk.Kind, path) +} + +// Search by meaning +results := vectorDB.Query("authentication middleware") +// Returns: AuthMiddleware, ValidateToken, SessionHandler +// Not just files containing the word "authentication" +``` + +Symbol names and kinds become metadata. Line ranges enable source navigation. Context chains power hierarchical search. + +## Ecosystem + +Chisel provides the chunking layer for code intelligence pipelines: + +- **[vicky](https://github.com/zoobzio/vicky)** — Code search and retrieval service + +## Documentation + +- **Learn** + - [Overview](docs/1.learn/1.overview.md) — What chisel is and why + - [Quickstart](docs/1.learn/2.quickstart.md) — Get productive in minutes + - [Concepts](docs/1.learn/3.concepts.md) — Core abstractions + - [Architecture](docs/1.learn/4.architecture.md) — How it works internally +- **Guides** + - [Providers](docs/2.guides/1.providers.md) — Language-specific details + - [Testing](docs/2.guides/2.testing.md) — Testing code that uses chisel + - [Troubleshooting](docs/2.guides/3.troubleshooting.md) — Common issues +- **Reference** + - [API](docs/4.reference/1.api.md) — Function signatures + - [Types](docs/4.reference/2.types.md) — Type definitions + +## Contributing + +Contributions welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. + +## License + +MIT — see [LICENSE](LICENSE) for details. 
diff --git a/docs/1.learn/1.overview.md b/docs/1.learn/1.overview.md new file mode 100644 index 0000000..8ed8161 --- /dev/null +++ b/docs/1.learn/1.overview.md @@ -0,0 +1,45 @@ +--- +title: Overview +description: AST-aware code chunking for semantic search and embeddings +author: zoobzio +published: 2026-01-19 +updated: 2026-01-19 +tags: + - Overview + - Introduction +--- + +# Overview + +Chisel parses source code into semantic chunks—functions, classes, methods, types—preserving the structure and context that makes code meaningful. Feed these chunks to an embedder, store them in a vector database, and search code by what it does rather than what it says. + +## The Idea + +Code search should understand code. Splitting files at arbitrary line boundaries destroys the very structure that gives code meaning. A function split in half is two meaningless fragments. A method without its class context is an orphan. + +Chisel asks: what if we chunked code the way developers think about it? + +## The Implementation + +Chisel provides: + +- **Language providers** — Parsers for Go, TypeScript, JavaScript, Python, Rust, and Markdown +- **Semantic extraction** — Functions, methods, classes, interfaces, types, enums, modules +- **Context preservation** — Parent chain for nested definitions (method → class → module) +- **Line mapping** — Precise source locations for navigation and display +- **Uniform interface** — Same `Provider` contract across all languages + +## What It Enables + +Chisel is the chunking layer for code intelligence: + +- **Semantic search** — Find code by meaning, not keywords +- **Code retrieval** — Fetch relevant context for LLM prompts +- **Documentation generation** — Extract structure for API docs +- **Codebase understanding** — Map relationships between components + +## Next Steps + +- [Quickstart](2.quickstart.md) — Get productive in minutes +- [Concepts](3.concepts.md) — Understand the core abstractions +- 
[Architecture](4.architecture.md) — Learn how chisel works internally diff --git a/docs/1.learn/2.quickstart.md b/docs/1.learn/2.quickstart.md new file mode 100644 index 0000000..642001c --- /dev/null +++ b/docs/1.learn/2.quickstart.md @@ -0,0 +1,180 @@ +--- +title: Quickstart +description: Get productive with chisel in minutes +author: zoobzio +published: 2026-01-19 +updated: 2026-01-19 +tags: + - Quickstart + - Getting Started +--- + +# Quickstart + +## Installation + +Install the core package: + +```bash +go get github.com/zoobzio/chisel +``` + +Install providers for the languages you need: + +```bash +# Zero-dependency providers +go get github.com/zoobzio/chisel/golang +go get github.com/zoobzio/chisel/markdown + +# Tree-sitter providers +go get github.com/zoobzio/chisel/typescript +go get github.com/zoobzio/chisel/python +go get github.com/zoobzio/chisel/rust +``` + +Requires Go 1.24+. + +## Basic Usage + +```go +package main + +import ( + "context" + "fmt" + + "github.com/zoobzio/chisel" + "github.com/zoobzio/chisel/golang" +) + +func main() { + // Create a provider + provider := golang.New() + + source := []byte(`package math + +// Add returns the sum of two integers. +func Add(a, b int) int { + return a + b +} + +// Calculator performs arithmetic operations. +type Calculator struct { + value int +} + +// Add adds n to the calculator's value. 
+func (c *Calculator) Add(n int) { + c.value += n +} +`) + + // Chunk the source + chunks, err := provider.Chunk(context.Background(), "math.go", source) + if err != nil { + panic(err) + } + + // Inspect the results + for _, chunk := range chunks { + fmt.Printf("[%s] %s (lines %d-%d)\n", + chunk.Kind, chunk.Symbol, chunk.StartLine, chunk.EndLine) + } +} +``` + +Output: + +```text +[function] Add (lines 3-6) +[class] Calculator (lines 8-11) +[method] Calculator.Add (lines 13-16) +``` + +## Multiple Languages + +Use the `Chunker` to route files to the appropriate provider: + +```go +import ( + "github.com/zoobzio/chisel" + "github.com/zoobzio/chisel/golang" + "github.com/zoobzio/chisel/typescript" + "github.com/zoobzio/chisel/python" +) + +// Create a chunker with multiple providers +c := chisel.New( + golang.New(), + typescript.New(), + typescript.NewJavaScript(), + python.New(), +) + +// Chunk by language +goChunks, _ := c.Chunk(ctx, chisel.Go, "main.go", goSource) +tsChunks, _ := c.Chunk(ctx, chisel.TypeScript, "app.ts", tsSource) +pyChunks, _ := c.Chunk(ctx, chisel.Python, "utils.py", pySource) +``` + +See [Providers Guide](../2.guides/1.providers.md) for language-specific details. + +## Context Chains + +Methods and nested definitions include their parent context: + +```go +source := []byte(`class UserService { + async getUser(id: string): Promise { + return this.db.find(id); + } +}`) + +chunks, _ := typescript.New().Chunk(ctx, "service.ts", source) + +for _, c := range chunks { + if c.Kind == chisel.KindMethod { + fmt.Printf("%s in %v\n", c.Symbol, c.Context) + // getUser in [class UserService] + } +} +``` + +See [Concepts](3.concepts.md) for more on context preservation. + +## Embedding Pipeline + +A typical pipeline: chunk β†’ embed β†’ store β†’ search. + +```go +// 1. Chunk the code +chunks, _ := provider.Chunk(ctx, filename, source) + +// 2. 
Embed each chunk (using your embedder of choice) +for _, chunk := range chunks { + embedding := embedder.Embed(chunk.Content) + + // 3. Store with metadata + store.Insert(Document{ + Content: chunk.Content, + Embedding: embedding, + Metadata: map[string]any{ + "symbol": chunk.Symbol, + "kind": chunk.Kind, + "file": filename, + "startLine": chunk.StartLine, + "endLine": chunk.EndLine, + "context": chunk.Context, + }, + }) +} + +// 4. Search by meaning +results := store.Search(embedder.Embed("authentication logic")) +``` + +## Next Steps + +- [Concepts](3.concepts.md) β€” Understand chunks, kinds, and context +- [Architecture](4.architecture.md) β€” How chisel parses code +- [API Reference](../4.reference/1.api.md) β€” Complete function signatures diff --git a/docs/1.learn/3.concepts.md b/docs/1.learn/3.concepts.md new file mode 100644 index 0000000..7f61b19 --- /dev/null +++ b/docs/1.learn/3.concepts.md @@ -0,0 +1,132 @@ +--- +title: Concepts +description: Core abstractions in chisel +author: zoobzio +published: 2026-01-19 +updated: 2026-01-19 +tags: + - Concepts + - Architecture +--- + +# Concepts + +Chisel is built around a few core abstractions: chunks, kinds, providers, and context. Understanding these helps you reason about what chisel extracts and why. + +## Chunks + +A **Chunk** is a semantic unit of code. Unlike line-based splitting, chunks follow the natural boundaries of code: where functions start and end, where classes are defined, where documentation lives. + +```go +type Chunk struct { + Content string // The actual source code + Symbol string // Name: "Add", "UserService", "Config" + Kind Kind // Category: function, method, class, etc. + StartLine int // Where it begins (1-indexed) + EndLine int // Where it ends (1-indexed) + Context []string // Parent chain: ["class UserService"] +} +``` + +Each chunk is self-contained. The `Content` field holds the complete sourceβ€”including comments and documentationβ€”so embeddings capture the full meaning. 
+ +See [Types Reference](../4.reference/2.types.md#chunk) for the complete definition. + +## Kinds + +**Kind** categorizes what a chunk represents. This lets you filter, group, or weight chunks differently in your pipeline. + +| Kind | Description | Example | +|------|-------------|---------| +| `function` | Standalone function | `func Add(a, b int) int` | +| `method` | Function with receiver/self | `func (c *Calc) Add(n int)` | +| `class` | Class or struct definition | `class UserService {}` | +| `interface` | Interface or trait | `interface Reader {}` | +| `type` | Type alias or other type | `type ID = string` | +| `enum` | Enumeration | `enum Status { Active }` | +| `constant` | Constant declaration | `const MaxSize = 100` | +| `variable` | Variable declaration | `var cache = map{}` | +| `section` | Markdown header | `## Installation` | +| `module` | Package/file level | Package documentation | + +Not every language uses every kind. Go has no enums; Python has no interfaces. Chisel maps language constructs to the closest semantic equivalent. + +See [Types Reference](../4.reference/2.types.md#kind) for the complete list. + +## Providers + +A **Provider** parses a specific language into chunks. Each provider understands its language's AST and extracts meaningful units. + +```go +type Provider interface { + Chunk(ctx context.Context, filename string, content []byte) ([]Chunk, error) + Language() Language +} +``` + +Chisel ships with providers for: + +- **Go** β€” Uses stdlib `go/parser`, zero external dependencies +- **Markdown** β€” Header-based splitting, zero dependencies +- **TypeScript/JavaScript** β€” Tree-sitter parser +- **Python** β€” Tree-sitter parser +- **Rust** β€” Tree-sitter parser + +The provider isolation is intentional. If you only need Go support, you don't pay for tree-sitter. Import only what you use. + +See [Providers Guide](../2.guides/1.providers.md) for language-specific behavior. 
+ +## Context + +**Context** captures the parent chain for nested definitions. When you chunk a method, the context tells you which class it belongs to. + +```typescript +class UserService { + private db: Database; + + async getUser(id: string): Promise<User> { + return this.db.find(id); + } +} +``` + +The `getUser` chunk will have: + +```go +Chunk{ + Symbol: "getUser", + Kind: KindMethod, + Context: []string{"class UserService"}, +} +``` + +Context flows downward. A method inside a class inside a module might have: + +```go +Context: []string{"module api", "class UserService"} +``` + +This enables queries like "find all methods in UserService" or "show me everything in the api module." + +## Languages + +**Language** identifies which provider handles a file. Use it with the `Chunker` to route files automatically. + +```go +const ( + Go Language = "go" + TypeScript Language = "typescript" + JavaScript Language = "javascript" + Python Language = "python" + Rust Language = "rust" + Markdown Language = "markdown" +) +``` + +The `Chunker` maps languages to providers. If you're processing a single language, you can use the provider directly without the chunker. + +## Next Steps + +- [Architecture](4.architecture.md) — How parsing works internally +- [Providers Guide](../2.guides/1.providers.md) — Language-specific details +- [Types Reference](../4.reference/2.types.md) — Complete type definitions diff --git a/docs/1.learn/4.architecture.md b/docs/1.learn/4.architecture.md new file mode 100644 index 0000000..78b0124 --- /dev/null +++ b/docs/1.learn/4.architecture.md @@ -0,0 +1,168 @@ +--- +title: Architecture +description: How chisel works internally +author: zoobzio +published: 2026-01-19 +updated: 2026-01-19 +tags: + - Architecture + - Internals +--- + +# Architecture + +This document explains how chisel parses code internally. It's intended for contributors and users who want to understand the implementation or extend chisel with new providers.
+ +## Component Overview + +```text +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Chunker β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ providers map[Language]Provider β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β–Ό β–Ό β–Ό β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Go β”‚ β”‚TypeScriptβ”‚ β”‚ Rust β”‚ β”‚ +β”‚ β”‚Providerβ”‚ β”‚ Provider β”‚ β”‚Provider β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”¬β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β–Ό β–Ό β–Ό β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚go/parserβ”‚ β”‚tree-sitterβ”‚ β”‚tree-sitterβ”‚ β”‚ +β”‚ β”‚ (stdlib)β”‚ β”‚ (cgo) β”‚ β”‚ (cgo) β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +The `Chunker` routes requests to the appropriate `Provider` based on language. Each provider uses language-specific parsingβ€”stdlib for Go, tree-sitter for others. + +## Parsing Strategies + +### Go Provider + +The Go provider uses the standard library's `go/parser` package. 
This gives us: + +- Zero external dependencies +- Mature, well-tested parser +- Access to Go's AST types + +The extraction walks the AST looking for: + +1. **Package documentation** (`*ast.File.Doc`) +2. **Function declarations** (`*ast.FuncDecl`) +3. **Type declarations** (`*ast.GenDecl` with `token.TYPE`) + +For methods, we extract the receiver type to build the context chain. + +```go +// Simplified extraction +for _, decl := range file.Decls { + switch d := decl.(type) { + case *ast.FuncDecl: + // Extract function/method + case *ast.GenDecl: + if d.Tok == token.TYPE { + // Extract type (struct, interface, alias) + } + } +} +``` + +### Tree-sitter Providers + +TypeScript, Python, and Rust use [tree-sitter](https://tree-sitter.github.io/tree-sitter/) via the `go-tree-sitter` bindings. Tree-sitter provides: + +- Incremental parsing (not currently used, but available) +- Error recovery for malformed code +- Consistent node types across languages + +Each provider walks the syntax tree recursively: + +```go +func walkNode(node *sitter.Node, content []byte, ctx []string, chunks *[]Chunk) { + switch node.Type() { + case "function_declaration": + *chunks = append(*chunks, extractFunction(node, content, ctx)) + case "class_declaration": + // Extract class, then walk children with updated context + newCtx := append(ctx, "class "+className) + for i := 0; i < node.ChildCount(); i++ { + walkNode(node.Child(i), content, newCtx, chunks) + } + } + // Continue walking... +} +``` + +### Markdown Provider + +The Markdown provider uses simple string scanningβ€”no AST, no dependencies. It splits on headers: + +```go +for i, line := range lines { + if strings.HasPrefix(line, "#") { + // Start new section + } +} +``` + +Each header starts a new section chunk. Content accumulates until the next header of equal or higher level. + +## Context Propagation + +Context flows from parent to child during tree walking. When we enter a class, we push its name onto the context stack. 
Methods inside inherit that context. + +```text +class UserService { // ctx = [] + getUser() {} // ctx = ["class UserService"] + + class Inner { // ctx = ["class UserService"] + helper() {} // ctx = ["class UserService", "class Inner"] + } +} +``` + +The context is copied (not shared) for each chunk to avoid mutation issues. + +## Design Q&A + +**Why separate providers instead of one unified parser?** + +Dependencies. Tree-sitter requires cgo and pulls in C libraries. Users who only need Go support shouldn't pay that cost. The workspace structure (`go.work`) isolates dependencies at the module level. + +**Why not use tree-sitter for Go?** + +The stdlib parser is battle-tested, has zero dependencies, and runs ~10x faster (see benchmarks). Tree-sitter would add complexity without benefit for Go. + +**Why preserve the full source in `Content`?** + +Embeddings need context. A function signature alone ("func Add(a, b int) int") is less meaningful than the full function with its documentation and body. We let the embedding model decide what matters. + +**Why use `Kind` instead of language-specific types?** + +Consistency. A Python `class` and a Go `struct` serve similar purposes. Normalizing to `KindClass` lets downstream tools treat them uniformly. Language-specific details can be recovered from the source. + +## Performance + +Benchmarks on a Ryzen 5 3600X (representative ~50-line files): + +| Provider | Time | Memory | Allocations | +|----------|------|--------|-------------| +| Go | 32µs | 17KB | 402 | +| TypeScript | 313µs | 63KB | 579 | +| Python | 328µs | 63KB | 569 | +| Rust | 293µs | 61KB | 566 | +| Markdown | 4µs | 7KB | 45 | + +The Go provider is ~10x faster than tree-sitter providers due to stdlib optimization. Markdown is fastest as it does no AST construction. + +For large files, tree-sitter's incremental parsing could be leveraged (not currently implemented).
+ +## Next Steps + +- [Providers Guide](../2.guides/1.providers.md) — Language-specific extraction details +- [API Reference](../4.reference/1.api.md) — Function signatures +- [Testing Guide](../2.guides/2.testing.md) — Testing code that uses chisel diff --git a/docs/2.guides/1.providers.md b/docs/2.guides/1.providers.md new file mode 100644 index 0000000..0020521 --- /dev/null +++ b/docs/2.guides/1.providers.md @@ -0,0 +1,309 @@ +--- +title: Providers Guide +description: Language-specific chunking behavior +author: zoobzio +published: 2026-01-19 +updated: 2026-01-19 +tags: + - Guide + - Providers +--- + +# Providers Guide + +Each language provider extracts semantic chunks according to that language's idioms. This guide covers language-specific behavior and what to expect from each provider. + +## Go + +**Package:** `github.com/zoobzio/chisel/golang` + +**Dependencies:** None (uses stdlib `go/parser`) + +**Extracts:** + +| Construct | Kind | Symbol Format | +|-----------|------|---------------| +| Package doc | `module` | Package name | +| Function | `function` | Function name | +| Method | `method` | `Receiver.Method` | +| Struct | `class` | Type name | +| Interface | `interface` | Type name | +| Type alias | `type` | Type name | + +**Example:** + +```go +// Package auth provides authentication utilities. +package auth + +// User represents an authenticated user. +type User struct { + ID string +} + +// Authenticate validates credentials. +func Authenticate(u, p string) (*User, error) { ... } + +// Validate checks if a user session is valid. +func (u *User) Validate() bool { ...
} +``` + +Produces: + +```text +[module] auth (lines 1-2) +[class] User (lines 4-7) +[function] Authenticate (lines 9-10) +[method] User.Validate (lines 12-13) +``` + +**Notes:** + +- Package documentation is extracted as a `module` chunk +- Methods include receiver type in symbol: `User.Validate` +- Documentation comments are included in chunk content +- Requires syntactically valid Go (parse errors fail the entire file) + +## TypeScript + +**Package:** `github.com/zoobzio/chisel/typescript` + +**Dependencies:** `go-tree-sitter` (cgo) + +**Extracts:** + +| Construct | Kind | Symbol Format | +|-----------|------|---------------| +| Function declaration | `function` | Function name | +| Arrow function | `function` | Variable name or `` | +| Class | `class` | Class name | +| Method | `method` | Method name | +| Interface | `interface` | Interface name | +| Type alias | `type` | Type name | + +**Example:** + +```typescript +interface User { + id: string; + email: string; +} + +class UserService { + async getUser(id: string): Promise<User> { + return this.db.find(id); + } +} + +function createService(): UserService { + return new UserService(); +} + +const helper = (x: number) => x * 2; +``` + +Produces: + +```text +[interface] User (lines 1-4) +[class] UserService (lines 6-10) +[method] getUser (lines 7-9) context: [class UserService] +[function] createService (lines 12-14) +[function] helper (lines 16-16) +``` + +**Notes:** + +- Methods have their parent class in `Context` +- Arrow functions assigned to `const` use the variable name +- Anonymous functions use `` as symbol + +## JavaScript + +**Package:** `github.com/zoobzio/chisel/typescript` (use `NewJavaScript()`) + +**Dependencies:** `go-tree-sitter` (cgo) + +Uses the same parser as TypeScript but configured for JavaScript.
Behavior is identical except: + +- No type annotations +- No interfaces or type aliases + +```go +provider := typescript.NewJavaScript() +chunks, err := provider.Chunk(ctx, "app.js", source) +``` + +## Python + +**Package:** `github.com/zoobzio/chisel/python` + +**Dependencies:** `go-tree-sitter` (cgo) + +**Extracts:** + +| Construct | Kind | Symbol Format | +|-----------|------|---------------| +| Function | `function` | Function name | +| Method | `method` | Method name | +| Class | `class` | Class name | +| Decorated function | `function` | Function name | + +**Example:** + +```python +class UserService: + """Manages user operations.""" + + def __init__(self, db): + self.db = db + + def get_user(self, user_id: str) -> User: + """Fetch a user by ID.""" + return self.db.find(user_id) + +@dataclass +class User: + id: str + email: str + +def create_service() -> UserService: + return UserService(Database()) +``` + +Produces: + +```text +[class] UserService (lines 1-9) +[method] __init__ (lines 4-5) context: [class UserService] +[method] get_user (lines 7-9) context: [class UserService] +[class] User (lines 11-14) +[function] create_service (lines 16-17) +``` + +**Notes:** + +- Decorators are included in the chunk content +- Docstrings are part of the chunk +- `__init__` and other dunder methods are extracted as methods + +## Rust + +**Package:** `github.com/zoobzio/chisel/rust` + +**Dependencies:** `go-tree-sitter` (cgo) + +**Extracts:** + +| Construct | Kind | Symbol Format | +|-----------|------|---------------| +| Function | `function` | Function name | +| Method (in impl) | `method` | Method name | +| Struct | `type` | Struct name | +| Enum | `enum` | Enum name | +| Trait | `interface` | Trait name | +| Impl block | `class` | Type name | +| Module | `module` | Module name | + +**Example:** + +```rust +pub struct User { + pub id: String, + pub email: String, +} + +impl User { + pub fn new(id: String, email: String) -> Self { + User { id, email } + } + + pub fn 
validate(&self) -> bool { + !self.id.is_empty() + } +} + +trait Authenticatable { + fn authenticate(&self) -> bool; +} + +enum Status { + Active, + Inactive, +} +``` + +Produces: + +```text +[type] User (lines 1-4) +[class] User (lines 6-14) # impl block +[method] new (lines 7-9) context: [impl User] +[method] validate (lines 11-13) context: [impl User] +[interface] Authenticatable (lines 16-18) +[enum] Status (lines 20-23) +``` + +**Notes:** + +- Structs are `type`, impl blocks are `class` +- Methods know their impl block via `Context` +- Traits map to `interface` +- Enums have their own kind + +## Markdown + +**Package:** `github.com/zoobzio/chisel/markdown` + +**Dependencies:** None + +**Extracts:** + +| Construct | Kind | Symbol Format | +|-----------|------|---------------| +| Header | `section` | Header text (without `#`) | + +**Example:** + +````markdown +# Installation + +Install the package: + +```bash +go get github.com/example/pkg +``` + +## Configuration + +Set the following environment variables: + +### Required Variables + +- `API_KEY`: Your API key + +### Optional Variables + +- `DEBUG`: Enable debug mode +```` + +Produces: + +```text +[section] Installation (lines 1-7) +[section] Configuration (lines 9-11) +[section] Required Variables (lines 13-15) +[section] Optional Variables (lines 17-19) +``` + +**Notes:** + +- Each header starts a new section +- Section content continues until the next header of equal or higher level +- Nested headers create nested sections (no context chain currently) + +## Next Steps + +- [Testing Guide](2.testing.md) — Test code that uses chisel +- [Troubleshooting](3.troubleshooting.md) — Common issues +- [API Reference](../4.reference/1.api.md) — Function signatures diff --git a/docs/2.guides/2.testing.md b/docs/2.guides/2.testing.md new file mode 100644 index 0000000..1fee8b6 --- /dev/null +++ b/docs/2.guides/2.testing.md @@ -0,0 +1,237 @@ +--- +title: Testing Guide +description: Testing code that uses chisel
+author: zoobzio +published: 2026-01-19 +updated: 2026-01-19 +tags: + - Guide + - Testing +--- + +# Testing Guide + +This guide covers how to test code that uses chisel, including test helpers, fixtures, and mocking strategies. + +## Test Helpers + +Chisel provides test utilities in `github.com/zoobzio/chisel/testing`: + +```go +import ( + "testing" + + "github.com/zoobzio/chisel" + chitesting "github.com/zoobzio/chisel/testing" +) + +func TestChunking(t *testing.T) { + chunks := getChunks() // your code + + // Assert chunk count + chitesting.AssertChunkCount(t, chunks, 3) + + // Assert symbol exists + chitesting.AssertHasSymbol(t, chunks, "UserService") + + // Assert kind exists + chitesting.AssertHasKind(t, chunks, chisel.KindClass) +} +``` + +### Available Helpers + +| Function | Description | +|----------|-------------| +| `AssertChunkCount(t, chunks, n)` | Fails if `len(chunks) != n` | +| `AssertHasSymbol(t, chunks, sym)` | Fails if no chunk has symbol `sym` | +| `AssertHasKind(t, chunks, kind)` | Fails if no chunk has kind `kind` | +| `FindBySymbol(chunks, sym)` | Returns first chunk with symbol, or nil | +| `FindByKind(chunks, kind)` | Returns first chunk with kind, or nil | +| `CountByKind(chunks, kind)` | Returns count of chunks with kind | + +## Testing Chunk Output + +For detailed assertions, use `FindBySymbol` or `FindByKind`: + +```go +func TestAuthenticateFunction(t *testing.T) { + source := []byte(`package auth +func Authenticate(user, pass string) (*User, error) { + // implementation +} +`) + provider := golang.New() + chunks, err := provider.Chunk(context.Background(), "auth.go", source) + if err != nil { + t.Fatal(err) + } + + chunk := chitesting.FindBySymbol(chunks, "Authenticate") + if chunk == nil { + t.Fatal("Authenticate not found") + } + + if chunk.Kind != chisel.KindFunction { + t.Errorf("Kind = %v, want function", chunk.Kind) + } + + if chunk.StartLine != 2 { + t.Errorf("StartLine = %d, want 2", chunk.StartLine) + } +} +``` + +## Table-Driven Tests +
+For comprehensive coverage, use table-driven tests: + +```go +func TestProviderChunking(t *testing.T) { + tests := []struct { + name string + source string + wantSyms []string + wantKind chisel.Kind + }{ + { + name: "function", + source: "func Add(a, b int) int { return a + b }", + wantSyms: []string{"Add"}, + wantKind: chisel.KindFunction, + }, + { + name: "method", + source: "func (c *Calc) Add(n int) { c.v += n }", + wantSyms: []string{"Calc.Add"}, + wantKind: chisel.KindMethod, + }, + } + + provider := golang.New() + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + src := "package main\n" + tt.source + chunks, err := provider.Chunk(context.Background(), "test.go", []byte(src)) + if err != nil { + t.Fatal(err) + } + + for _, sym := range tt.wantSyms { + chitesting.AssertHasSymbol(t, chunks, sym) + } + chitesting.AssertHasKind(t, chunks, tt.wantKind) + }) + } +} +``` + +## Mocking Providers + +For unit testing code that consumes chunks, mock the provider: + +```go +type mockProvider struct { + chunks []chisel.Chunk + err error +} + +func (m *mockProvider) Chunk(ctx context.Context, filename string, content []byte) ([]chisel.Chunk, error) { + return m.chunks, m.err +} + +func (m *mockProvider) Language() chisel.Language { + return chisel.Go +} + +func TestChunkProcessor(t *testing.T) { + mock := &mockProvider{ + chunks: []chisel.Chunk{ + {Symbol: "Test", Kind: chisel.KindFunction, StartLine: 1, EndLine: 5}, + }, + } + + processor := NewProcessor(mock) + result := processor.Process(context.Background(), "test.go", []byte("...")) + + // Assert on result +} +``` + +## Testing Error Handling + +Test that your code handles parsing errors gracefully: + +```go +func TestInvalidSource(t *testing.T) { + provider := golang.New() + + // Invalid Go syntax + source := []byte("func broken( {") + _, err := provider.Chunk(context.Background(), "bad.go", source) + + if err == nil { + t.Error("expected error for invalid syntax") + } +} +``` + +## Fixture Files + 
+For complex test cases, use fixture files: + +```text +testdata/ +β”œβ”€β”€ simple.go +β”œβ”€β”€ complex.go +└── expected/ + β”œβ”€β”€ simple.json + └── complex.json +``` + +```go +func TestFixtures(t *testing.T) { + files, _ := filepath.Glob("testdata/*.go") + for _, file := range files { + t.Run(filepath.Base(file), func(t *testing.T) { + source, _ := os.ReadFile(file) + chunks, err := provider.Chunk(ctx, file, source) + if err != nil { + t.Fatal(err) + } + + // Compare against expected output + expected := loadExpected(t, file) + assertChunksEqual(t, chunks, expected) + }) + } +} +``` + +## Benchmarking + +Chisel includes benchmarks in `testing/benchmarks/`. Run them with: + +```bash +go test -bench=. github.com/zoobzio/chisel/testing/benchmarks -benchmem +``` + +To add benchmarks for your own code: + +```go +func BenchmarkMyProcessor(b *testing.B) { + provider := golang.New() + source := loadLargeFile() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + chunks, _ := provider.Chunk(context.Background(), "large.go", source) + processChunks(chunks) + } +} +``` + +## Next Steps + +- [Troubleshooting](3.troubleshooting.md) β€” Common issues and solutions +- [API Reference](../4.reference/1.api.md) β€” Function signatures diff --git a/docs/2.guides/3.troubleshooting.md b/docs/2.guides/3.troubleshooting.md new file mode 100644 index 0000000..fd9f5ea --- /dev/null +++ b/docs/2.guides/3.troubleshooting.md @@ -0,0 +1,207 @@ +--- +title: Troubleshooting +description: Common issues and solutions +author: zoobzio +published: 2026-01-19 +updated: 2026-01-19 +tags: + - Guide + - Troubleshooting +--- + +# Troubleshooting + +Common issues when using chisel and how to resolve them. + +## Parse Errors + +### "parse: expected..." + +**Symptom:** Go provider returns parse error. + +**Cause:** Source code has syntax errors. + +**Solution:** The Go provider requires valid Go syntax. Unlike tree-sitter providers, it cannot recover from errors. 
+ +```go +// This fails +source := []byte("func broken( {") +chunks, err := provider.Chunk(ctx, "test.go", source) +// err: parse: expected ')', found '{' + +// Fix the syntax first +source := []byte("func working() {}") +``` + +### Empty chunks from tree-sitter + +**Symptom:** TypeScript/Python/Rust provider returns empty slice. + +**Cause:** File contains only syntax errors or unsupported constructs. + +**Solution:** Tree-sitter can parse partially valid code, but completely malformed files may yield no extractable chunks. Check that your source is valid for the target language. + +## Import Issues + +### "cannot find package" + +**Symptom:** Import fails for a provider. + +**Cause:** Provider not installed. + +**Solution:** Install the specific provider: + +```bash +go get github.com/zoobzio/chisel/golang +go get github.com/zoobzio/chisel/typescript +``` + +### CGO errors with tree-sitter + +**Symptom:** Build fails with CGO errors when importing TypeScript/Python/Rust providers. + +**Cause:** Tree-sitter requires CGO. + +**Solution:** + +1. Ensure CGO is enabled: `CGO_ENABLED=1` +2. Install a C compiler (gcc, clang) +3. On macOS: `xcode-select --install` +4. On Linux: `apt install build-essential` or equivalent + +If you can't use CGO, stick to the Go and Markdown providers which have zero external dependencies. + +## Chunker Issues + +### "no provider for language" + +**Symptom:** `Chunker.Chunk()` returns error. + +**Cause:** No provider registered for the requested language. + +**Solution:** Register a provider when creating the chunker: + +```go +// Wrong: no providers +c := chisel.New() +chunks, err := c.Chunk(ctx, chisel.Go, "main.go", source) +// err: no provider for language: go + +// Right: register providers +c := chisel.New( + golang.New(), + typescript.New(), +) +``` + +### Wrong language specified + +**Symptom:** Chunks don't match expected structure. + +**Cause:** Language doesn't match file content. 
+ +**Solution:** Ensure the language matches the source: + +```go +// Wrong: TypeScript source with Go language +c.Chunk(ctx, chisel.Go, "app.ts", tsSource) + +// Right: match language to content +c.Chunk(ctx, chisel.TypeScript, "app.ts", tsSource) +``` + +## Unexpected Chunk Output + +### Missing chunks + +**Symptom:** Expected function/class not in output. + +**Possible causes:** + +1. **Nested in unexpected structure** β€” Check if it's inside a block we don't walk +2. **Anonymous construct** β€” Arrow functions may use `` +3. **Language-specific behavior** β€” Check [Providers Guide](1.providers.md) + +**Debug approach:** + +```go +chunks, _ := provider.Chunk(ctx, filename, source) +for _, c := range chunks { + fmt.Printf("%s %s (lines %d-%d) ctx=%v\n", + c.Kind, c.Symbol, c.StartLine, c.EndLine, c.Context) +} +``` + +### Unexpected kind + +**Symptom:** Struct shows as `class`, trait shows as `interface`. + +**Cause:** Chisel normalizes language constructs to semantic kinds. + +**Solution:** This is intentional. See [Concepts: Kinds](../1.learn/3.concepts.md#kinds) for the mapping rationale. + +### Missing context + +**Symptom:** Method has empty `Context` slice. + +**Cause:** Method is at top level, not inside a class/impl block. + +**Solution:** Context only applies to nested constructs: + +```go +// No context (top-level) +func TopLevel() {} + +// Has context +type Service struct{} +func (s *Service) Method() {} // Context: ["type Service"] +``` + +## Performance Issues + +### Slow parsing + +**Symptom:** Chunking takes too long. + +**Possible causes:** + +1. **Very large files** β€” Tree-sitter is O(n) but constant factors matter +2. **Many small files** β€” Provider creation overhead + +**Solutions:** + +1. Reuse providers across files (they're stateless and thread-safe) +2. For very large files, consider pre-splitting +3. 
Use Go/Markdown providers where possible (faster than tree-sitter) + +### High memory usage + +**Symptom:** Memory grows when processing many files. + +**Solution:** Chunks hold copies of source content. If you're processing many files: + +```go +for _, file := range files { + chunks, _ := provider.Chunk(ctx, file.Name, file.Content) + process(chunks) + // chunks go out of scope here, eligible for GC +} +``` + +Don't accumulate all chunks in memory if you can process them incrementally. + +## Getting Help + +If you encounter an issue not covered here: + +1. Check [GitHub Issues](https://github.com/zoobzio/chisel/issues) +2. Open a new issue with: + - Go version + - Chisel version + - Minimal reproduction + - Expected vs actual output + +## Next Steps + +- [Providers Guide](1.providers.md) β€” Language-specific behavior +- [Architecture](../1.learn/4.architecture.md) β€” How parsing works diff --git a/docs/3.integrations/1.vicky.md b/docs/3.integrations/1.vicky.md new file mode 100644 index 0000000..84c73d6 --- /dev/null +++ b/docs/3.integrations/1.vicky.md @@ -0,0 +1,82 @@ +--- +title: Vicky Integration +description: Using chisel with vicky for code search +author: zoobzio +published: 2026-01-19 +updated: 2026-01-19 +tags: + - Integration + - Vicky +--- + +# Vicky + +Vicky is a code search and retrieval service that uses chisel for semantic code chunking. + +## The Pipeline + +```text +Repository β†’ Enumerate Files β†’ Chunk (chisel) β†’ Embed (vex) β†’ Store (pgvector) β†’ Search +``` + +1. **Enumerate** β€” Vicky clones/fetches the repository +2. **Chunk** β€” Chisel parses files into semantic units +3. **Embed** β€” Vex generates embeddings for each chunk +4. **Store** β€” Chunks and embeddings go into PostgreSQL with pgvector +5. 
**Search** β€” Queries are embedded and matched against stored chunks + +## What Chisel Provides + +| Vicky needs | Chisel provides | +|-------------|-----------------| +| Semantic boundaries | Chunks split at function/class boundaries | +| Symbol names | `chunk.Symbol` for display and filtering | +| Kind classification | `chunk.Kind` for faceted search | +| Source locations | `chunk.StartLine`, `chunk.EndLine` for navigation | +| Parent context | `chunk.Context` for hierarchical browsing | +| Embeddable content | `chunk.Content` with docs and implementation | + +## Configuration + +Vicky selects chisel providers based on file extension: + +| Extension | Provider | +|-----------|----------| +| `.go` | `golang.New()` | +| `.ts`, `.tsx` | `typescript.New()` | +| `.js`, `.jsx` | `typescript.NewJavaScript()` | +| `.py` | `python.New()` | +| `.rs` | `rust.New()` | +| `.md` | `markdown.New()` | + +Files with unrecognized extensions are skipped. + +## Metadata Storage + +Vicky stores chunk metadata alongside embeddings: + +```sql +CREATE TABLE chunks ( + id UUID PRIMARY KEY, + repo_id UUID REFERENCES repos(id), + file_path TEXT NOT NULL, + symbol TEXT, + kind TEXT, + start_line INTEGER, + end_line INTEGER, + context TEXT[], + content TEXT NOT NULL, + embedding vector(1024) +); +``` + +This enables queries like: + +- "Find functions named `Authenticate`" +- "Show all classes in `pkg/auth`" +- "Methods in the `UserService` class" + +## Learn More + +- [Vicky Repository](https://github.com/zoobzio/vicky) +- [Vicky Documentation](https://github.com/zoobzio/vicky/tree/main/docs) diff --git a/docs/4.reference/1.api.md b/docs/4.reference/1.api.md new file mode 100644 index 0000000..cdae847 --- /dev/null +++ b/docs/4.reference/1.api.md @@ -0,0 +1,349 @@ +--- +title: API Reference +description: Function signatures and behavior +author: zoobzio +published: 2026-01-19 +updated: 2026-01-19 +tags: + - Reference + - API +--- + +# API Reference + +## Core Package + +### New + +```go 
+func New(providers ...Provider) *Chunker +``` + +Creates a new Chunker with the given providers. Each provider handles one language. + +**Panics:** Never. + +```go +c := chisel.New( + golang.New(), + typescript.New(), + python.New(), +) +``` + +### Chunker.Chunk + +```go +func (c *Chunker) Chunk(ctx context.Context, lang Language, filename string, content []byte) ([]Chunk, error) +``` + +Routes the request to the appropriate provider and returns chunks. + +**Errors:** +- Returns error if no provider is registered for `lang` +- Returns error if the provider fails to parse + +```go +chunks, err := c.Chunk(ctx, chisel.Go, "main.go", source) +if err != nil { + // Handle parse error or missing provider +} +``` + +### Chunker.Register + +```go +func (c *Chunker) Register(p Provider) +``` + +Adds a provider to the chunker. If a provider for the same language exists, it is replaced. + +**Panics:** Never. + +```go +c := chisel.New() +c.Register(golang.New()) +c.Register(typescript.New()) +``` + +### Chunker.Languages + +```go +func (c *Chunker) Languages() []Language +``` + +Returns all registered languages. Order is not guaranteed. + +**Panics:** Never. + +```go +c := chisel.New(golang.New(), typescript.New()) +langs := c.Languages() +// langs contains chisel.Go and chisel.TypeScript +``` + +### Chunker.HasProvider + +```go +func (c *Chunker) HasProvider(lang Language) bool +``` + +Returns true if a provider is registered for the language. + +**Panics:** Never. + +```go +c := chisel.New(golang.New()) +c.HasProvider(chisel.Go) // true +c.HasProvider(chisel.TypeScript) // false +``` + +--- + +## Go Provider + +**Package:** `github.com/zoobzio/chisel/golang` + +### New + +```go +func New() *Provider +``` + +Creates a new Go provider using stdlib `go/parser`. + +**Panics:** Never. 
+ +```go +provider := golang.New() +``` + +### Provider.Chunk + +```go +func (p *Provider) Chunk(ctx context.Context, filename string, content []byte) ([]Chunk, error) +``` + +Parses Go source and extracts semantic chunks. + +**Errors:** +- Returns parse error if source is invalid Go + +```go +chunks, err := provider.Chunk(ctx, "main.go", source) +``` + +### Provider.Language + +```go +func (p *Provider) Language() Language +``` + +Returns `chisel.Go`. + +--- + +## TypeScript Provider + +**Package:** `github.com/zoobzio/chisel/typescript` + +### New + +```go +func New() *Provider +``` + +Creates a TypeScript provider using tree-sitter. + +```go +provider := typescript.New() +``` + +### NewJavaScript + +```go +func NewJavaScript() *Provider +``` + +Creates a JavaScript provider (same parser, different language identifier). + +```go +provider := typescript.NewJavaScript() +``` + +### Provider.Chunk + +```go +func (p *Provider) Chunk(ctx context.Context, filename string, content []byte) ([]Chunk, error) +``` + +Parses TypeScript/JavaScript source and extracts chunks. + +**Errors:** +- Returns error on tree-sitter failure (rare) + +### Provider.Language + +```go +func (p *Provider) Language() Language +``` + +Returns `chisel.TypeScript` or `chisel.JavaScript` depending on constructor. + +--- + +## Python Provider + +**Package:** `github.com/zoobzio/chisel/python` + +### New + +```go +func New() *Provider +``` + +Creates a Python provider using tree-sitter. + +```go +provider := python.New() +``` + +### Provider.Chunk + +```go +func (p *Provider) Chunk(ctx context.Context, filename string, content []byte) ([]Chunk, error) +``` + +Parses Python source and extracts chunks. + +### Provider.Language + +```go +func (p *Provider) Language() Language +``` + +Returns `chisel.Python`. + +--- + +## Rust Provider + +**Package:** `github.com/zoobzio/chisel/rust` + +### New + +```go +func New() *Provider +``` + +Creates a Rust provider using tree-sitter. 
+ +```go +provider := rust.New() +``` + +### Provider.Chunk + +```go +func (p *Provider) Chunk(ctx context.Context, filename string, content []byte) ([]Chunk, error) +``` + +Parses Rust source and extracts chunks. + +### Provider.Language + +```go +func (p *Provider) Language() Language +``` + +Returns `chisel.Rust`. + +--- + +## Markdown Provider + +**Package:** `github.com/zoobzio/chisel/markdown` + +### New + +```go +func New() *Provider +``` + +Creates a Markdown provider that splits on headers. + +```go +provider := markdown.New() +``` + +### Provider.Chunk + +```go +func (p *Provider) Chunk(ctx context.Context, filename string, content []byte) ([]Chunk, error) +``` + +Splits Markdown into sections based on headers. + +**Errors:** Never returns error (simple string parsing). + +### Provider.Language + +```go +func (p *Provider) Language() Language +``` + +Returns `chisel.Markdown`. + +--- + +## Test Helpers + +**Package:** `github.com/zoobzio/chisel/testing` + +### AssertChunkCount + +```go +func AssertChunkCount(t *testing.T, chunks []Chunk, want int) +``` + +Fails the test if chunk count doesn't match. + +### AssertHasSymbol + +```go +func AssertHasSymbol(t *testing.T, chunks []Chunk, symbol string) +``` + +Fails the test if no chunk has the given symbol. + +### AssertHasKind + +```go +func AssertHasKind(t *testing.T, chunks []Chunk, kind Kind) +``` + +Fails the test if no chunk has the given kind. + +### FindBySymbol + +```go +func FindBySymbol(chunks []Chunk, symbol string) *Chunk +``` + +Returns first chunk with symbol, or nil if not found. + +### FindByKind + +```go +func FindByKind(chunks []Chunk, kind Kind) *Chunk +``` + +Returns first chunk with kind, or nil if not found. + +### CountByKind + +```go +func CountByKind(chunks []Chunk, kind Kind) int +``` + +Returns number of chunks with the given kind. 
diff --git a/docs/4.reference/2.types.md b/docs/4.reference/2.types.md new file mode 100644 index 0000000..30782d1 --- /dev/null +++ b/docs/4.reference/2.types.md @@ -0,0 +1,174 @@ +--- +title: Types Reference +description: Type definitions and constants +author: zoobzio +published: 2026-01-19 +updated: 2026-01-19 +tags: + - Reference + - Types +--- + +# Types Reference + +## Chunk + +A semantic unit of code or documentation. + +```go +type Chunk struct { + Content string + Symbol string + Kind Kind + StartLine int + EndLine int + Context []string +} +``` + +| Field | Type | Description | +|-------|------|-------------| +| `Content` | `string` | The actual source code or text, including comments | +| `Symbol` | `string` | Name of the function, class, type, or section | +| `Kind` | `Kind` | Category of this chunk (function, method, class, etc.) | +| `StartLine` | `int` | 1-indexed starting line number | +| `EndLine` | `int` | 1-indexed ending line number | +| `Context` | `[]string` | Parent chain, e.g. `["class UserService"]` | + +**Notes:** +- `Content` includes documentation comments attached to the construct +- `Symbol` for methods may include receiver: `"User.Validate"` +- `Context` is empty for top-level constructs +- Line numbers are 1-indexed (first line is 1, not 0) + +## Kind + +Categorizes what a chunk represents. 
+ +```go +type Kind string + +const ( + KindFunction Kind = "function" + KindMethod Kind = "method" + KindClass Kind = "class" + KindInterface Kind = "interface" + KindType Kind = "type" + KindEnum Kind = "enum" + KindConstant Kind = "constant" + KindVariable Kind = "variable" + KindSection Kind = "section" + KindModule Kind = "module" +) +``` + +| Constant | Value | Description | +|----------|-------|-------------| +| `KindFunction` | `"function"` | Standalone function | +| `KindMethod` | `"method"` | Function with receiver/self | +| `KindClass` | `"class"` | Class, struct, or impl block | +| `KindInterface` | `"interface"` | Interface, trait, or protocol | +| `KindType` | `"type"` | Type alias or other type definition | +| `KindEnum` | `"enum"` | Enumeration | +| `KindConstant` | `"constant"` | Constant declaration | +| `KindVariable` | `"variable"` | Variable declaration | +| `KindSection` | `"section"` | Markdown header/section | +| `KindModule` | `"module"` | Package or file-level construct | + +**Language mappings:** + +| Language | Construct | Kind | +|----------|-----------|------| +| Go | `func` | `function` | +| Go | `func (r *T)` | `method` | +| Go | `type T struct` | `class` | +| Go | `type T interface` | `interface` | +| Go | `type T = U` | `type` | +| TypeScript | `function` | `function` | +| TypeScript | `class` method | `method` | +| TypeScript | `class` | `class` | +| TypeScript | `interface` | `interface` | +| TypeScript | `type` | `type` | +| Python | `def` (top-level) | `function` | +| Python | `def` (in class) | `method` | +| Python | `class` | `class` | +| Rust | `fn` (top-level) | `function` | +| Rust | `fn` (in impl) | `method` | +| Rust | `struct` | `type` | +| Rust | `impl` | `class` | +| Rust | `trait` | `interface` | +| Rust | `enum` | `enum` | +| Markdown | `#` header | `section` | + +## Language + +Identifies a programming language. 
+ +```go +type Language string + +const ( + Go Language = "go" + TypeScript Language = "typescript" + JavaScript Language = "javascript" + Python Language = "python" + Rust Language = "rust" + Markdown Language = "markdown" +) +``` + +| Constant | Value | Provider Package | +|----------|-------|------------------| +| `Go` | `"go"` | `chisel/golang` | +| `TypeScript` | `"typescript"` | `chisel/typescript` | +| `JavaScript` | `"javascript"` | `chisel/typescript` | +| `Python` | `"python"` | `chisel/python` | +| `Rust` | `"rust"` | `chisel/rust` | +| `Markdown` | `"markdown"` | `chisel/markdown` | + +**Notes:** +- TypeScript and JavaScript use the same provider package +- Use `typescript.NewJavaScript()` for JavaScript files + +## Provider + +Interface for language-specific parsers. + +```go +type Provider interface { + Chunk(ctx context.Context, filename string, content []byte) ([]Chunk, error) + Language() Language +} +``` + +| Method | Description | +|--------|-------------| +| `Chunk` | Parses content and returns semantic chunks | +| `Language` | Returns the language this provider handles | + +**Implementing a custom provider:** + +```go +type MyProvider struct{} + +func (p *MyProvider) Chunk(ctx context.Context, filename string, content []byte) ([]chisel.Chunk, error) { + // Parse content and extract chunks + return chunks, nil +} + +func (p *MyProvider) Language() chisel.Language { + return "mylang" +} +``` + +## Chunker + +Routes chunking requests to appropriate providers. + +```go +type Chunker struct { + // contains filtered or unexported fields +} +``` + +Created with `chisel.New()`. See [API Reference](1.api.md#new) for methods. 
diff --git a/go.mod b/go.mod index 88a0371..98aab5e 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module github.com/zoobzio/chisel -go 1.24.5 +go 1.24 + +toolchain go1.25.5 diff --git a/markdown/markdown.go b/markdown/markdown.go index a40b47d..0950b65 100644 --- a/markdown/markdown.go +++ b/markdown/markdown.go @@ -22,7 +22,7 @@ func (p *Provider) Language() chisel.Language { } // Chunk splits Markdown content into sections based on headers. -func (p *Provider) Chunk(_ context.Context, filename string, content []byte) ([]chisel.Chunk, error) { +func (p *Provider) Chunk(_ context.Context, _ string, content []byte) ([]chisel.Chunk, error) { text := string(content) lines := strings.Split(text, "\n") diff --git a/python/python.go b/python/python.go index ca93239..125cf1d 100644 --- a/python/python.go +++ b/python/python.go @@ -24,7 +24,7 @@ func (p *Provider) Language() chisel.Language { } // Chunk parses Python source and extracts semantic chunks. -func (p *Provider) Chunk(_ context.Context, filename string, content []byte) ([]chisel.Chunk, error) { +func (p *Provider) Chunk(_ context.Context, _ string, content []byte) ([]chisel.Chunk, error) { parser := sitter.NewParser() parser.SetLanguage(python.GetLanguage()) @@ -55,7 +55,7 @@ func walkNode(node *sitter.Node, content []byte, ctx []string, chunks *[]chisel. // Walk children with class context className := getChildByField(node, "name", content) - newCtx := append(ctx, "class "+className) + newCtx := append(copyContext(ctx), "class "+className) for i := 0; i < int(node.ChildCount()); i++ { child := node.Child(i) if child.Type() == "block" { diff --git a/rust/rust.go b/rust/rust.go index 0ed15f2..42c165e 100644 --- a/rust/rust.go +++ b/rust/rust.go @@ -24,7 +24,7 @@ func (p *Provider) Language() chisel.Language { } // Chunk parses Rust source and extracts semantic chunks. 
-func (p *Provider) Chunk(_ context.Context, filename string, content []byte) ([]chisel.Chunk, error) { +func (p *Provider) Chunk(_ context.Context, _ string, content []byte) ([]chisel.Chunk, error) { parser := sitter.NewParser() parser.SetLanguage(rust.GetLanguage()) @@ -56,9 +56,9 @@ func walkNode(node *sitter.Node, content []byte, ctx []string, chunks *[]chisel. // Get the type being implemented typeName := getImplTypeName(node, content) - newCtx := ctx + newCtx := copyContext(ctx) if typeName != "" { - newCtx = append(ctx, "impl "+typeName) + newCtx = append(newCtx, "impl "+typeName) } // Walk children with impl context diff --git a/testing/README.md b/testing/README.md new file mode 100644 index 0000000..d343b56 --- /dev/null +++ b/testing/README.md @@ -0,0 +1,64 @@ +# Testing + +Test utilities and infrastructure for chisel. + +## Structure + +```text +testing/ +├── helpers.go # Test assertion helpers +├── helpers_test.go # Tests for helpers +├── benchmarks/ # Performance benchmarks +└── integration/ # End-to-end tests +``` + +## Helpers + +The `testing` package provides domain-specific assertion helpers: + +```go +import chitesting "github.com/zoobzio/chisel/testing" + +func TestMyChunker(t *testing.T) { + chunks := getChunks() + + chitesting.AssertChunkCount(t, chunks, 3) + chitesting.AssertHasSymbol(t, chunks, "MyFunction") + chitesting.AssertHasKind(t, chunks, chisel.KindFunction) +} +``` + +### Available Helpers + +| Function | Description | +|----------|-------------| +| `AssertChunkCount(t, chunks, n)` | Assert exact chunk count | +| `AssertHasSymbol(t, chunks, sym)` | Assert symbol exists | +| `AssertHasKind(t, chunks, kind)` | Assert kind exists | +| `FindBySymbol(chunks, sym)` | Find chunk by symbol | +| `FindByKind(chunks, kind)` | Find chunk by kind | +| `CountByKind(chunks, kind)` | Count chunks by kind | + +## Running Tests + +```bash +# All tests +make test + +# Unit tests only (short mode) +make test-unit + +# Integration
tests +make test-integration + +# Benchmarks +make test-bench +``` + +## Coverage + +```bash +make coverage +``` + +Target: 70% project, 80% patch. diff --git a/testing/benchmarks/README.md b/testing/benchmarks/README.md new file mode 100644 index 0000000..0dc04b8 --- /dev/null +++ b/testing/benchmarks/README.md @@ -0,0 +1,50 @@ +# Benchmarks + +Performance benchmarks for chisel providers. + +## Running + +```bash +make test-bench + +# Or directly: +go test -bench=. github.com/zoobzio/chisel/testing/benchmarks -benchmem +``` + +## Current Results + +Representative results on AMD Ryzen 5 3600X (~50-line files): + +| Provider | Time | Memory | Allocations | +|----------|------|--------|-------------| +| Go | 32µs | 17KB | 402 | +| TypeScript | 313µs | 63KB | 579 | +| Python | 328µs | 63KB | 569 | +| Rust | 293µs | 61KB | 566 | +| Markdown | 4µs | 7KB | 45 | + +## Analysis + +- **Go provider** uses stdlib `go/parser`, ~10x faster than tree-sitter +- **Markdown** is fastest (simple string scanning, no AST) +- **Tree-sitter providers** have similar performance characteristics + +## Adding Benchmarks + +Follow the pattern in `benchmarks_test.go`: + +```go +func BenchmarkNewProvider(b *testing.B) { + p := newprovider.New() + ctx := context.Background() + source := []byte(`...`) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := p.Chunk(ctx, "file.ext", source) + if err != nil { + b.Fatal(err) + } + } +} +``` diff --git a/testing/helpers_test.go b/testing/helpers_test.go index ba9b445..876382f 100644 --- a/testing/helpers_test.go +++ b/testing/helpers_test.go @@ -14,7 +14,7 @@ func TestFindBySymbol(t *testing.T) { result := FindBySymbol(chunks, "foo") if result == nil { - t.Error("expected to find foo") + t.Fatal("expected to find foo") } if result.Symbol != "foo" { t.Errorf("Symbol = %q, want %q", result.Symbol, "foo") @@ -34,7 +34,7 @@ func TestFindByKind(t *testing.T) { result := FindByKind(chunks, chisel.KindClass) if result == nil { - t.Error("expected
to find class") + t.Fatal("expected to find class") } if result.Kind != chisel.KindClass { t.Errorf("Kind = %v, want %v", result.Kind, chisel.KindClass) diff --git a/testing/integration/README.md b/testing/integration/README.md new file mode 100644 index 0000000..a85cef9 --- /dev/null +++ b/testing/integration/README.md @@ -0,0 +1,39 @@ +# Integration Tests + +End-to-end tests for chisel. + +## Purpose + +Integration tests verify that: + +- Providers work correctly with real-world code samples +- The `Chunker` correctly routes to providers +- Multi-file scenarios produce expected results + +## Running + +```bash +make test-integration + +# Or directly: +go test -v ./testing/integration/... +``` + +## Structure + +```text +integration/ +├── README.md +├── testdata/ # Sample source files +│ ├── go/ +│ ├── typescript/ +│ ├── python/ +│ └── rust/ +└── integration_test.go +``` + +## Adding Tests + +1. Add sample files to `testdata/[language]/` +2. Add test cases in `integration_test.go` +3. Verify expected chunks match actual output diff --git a/typescript/typescript.go b/typescript/typescript.go index 0ea8b3b..5b2f556 100644 --- a/typescript/typescript.go +++ b/typescript/typescript.go @@ -31,7 +31,7 @@ func (p *Provider) Language() chisel.Language { } // Chunk parses TypeScript/JavaScript source and extracts semantic chunks. -func (p *Provider) Chunk(_ context.Context, filename string, content []byte) ([]chisel.Chunk, error) { +func (p *Provider) Chunk(_ context.Context, _ string, content []byte) ([]chisel.Chunk, error) { parser := sitter.NewParser() parser.SetLanguage(typescript.GetLanguage()) @@ -66,7 +66,7 @@ func walkNode(node *sitter.Node, content []byte, ctx []string, chunks *[]chisel.
// Walk children with class context className := getChildByField(node, "name", content) - newCtx := append(ctx, "class "+className) + newCtx := append(copyContext(ctx), "class "+className) for i := 0; i < int(node.ChildCount()); i++ { walkNode(node.Child(i), content, newCtx, chunks) }