Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,5 @@ index.html
.archdoc.config.json

.idea

benchmarks/runs/*.json
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -972,6 +972,7 @@ Choose your LLM provider and model based on your needs and budget:
- **Alternatives**: openai/gpt-oss-20b

**Ollama (Local LLMs — Free)**

- Run models entirely on your machine (no API key required)
- Supports **Llama 3**, **Mistral**, **Gemma 2**, and more
- Works offline once the model is pulled
Expand Down
19 changes: 19 additions & 0 deletions benchmarks/benchmark-dataset-opus-4.5.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
commit_hash,repo_path,functionalImpact,idealTimeHours,testCoverage,codeQuality,codeComplexity,actualTimeHours,technicalDebtHours,debtReductionHours,notes
e8201970,/Users/eneaxharau/Documents/TechDebtGPT,5.6,6.92,1.7,5,5.2,6.09,4.54,0.74,"Generated by claude-opus-4-5-20251101"
903d6c83,/Users/eneaxharau/Documents/TechDebtGPT,5.3,2.19,2,5.7,2.3,1.44,1.2,0.24,"Generated by claude-opus-4-5-20251101"
13732457,/Users/eneaxharau/Documents/TechDebtGPT,1.4,0.36,1.9,5.1,1.8,0.31,1,0.26,"Generated by claude-opus-4-5-20251101"
883311c7,/Users/eneaxharau/Documents/TechDebtGPT,4.7,3.37,2,5,3.8,3.14,4.09,2.5,"Generated by claude-opus-4-5-20251101"
059bd334,/Users/eneaxharau/Documents/TechDebtGPT,6.4,2.1,1.8,3.6,3.6,1.14,3.59,0.22,"Generated by claude-opus-4-5-20251101"
389915e3,/Users/eneaxharau/Documents/TechDebtGPT,3.5,3.79,1.9,5,4.9,4.41,4.13,2.26,"Generated by claude-opus-4-5-20251101"
4dd230bb,/Users/eneaxharau/Documents/TechDebtGPT,3.8,0.61,1.7,5,3.4,0.68,1.27,0.14,"Generated by claude-opus-4-5-20251101"
c5154cd3,/Users/eneaxharau/Documents/TechDebtGPT,5.6,6,1.2,4.3,5.9,8,4.43,0.84,"Generated by claude-opus-4-5-20251101"
f48a3f22,/Users/eneaxharau/Documents/TechDebtGPT,6.1,7.33,1.2,5.1,4.1,4,3.09,0.35,"Generated by claude-opus-4-5-20251101"
6a07893a,/Users/eneaxharau/Documents/TechDebtGPT,2.8,0.41,2.5,6.1,1.2,0.48,0.27,0.06,"Generated by claude-opus-4-5-20251101"
2a91b7cf,/Users/eneaxharau/Documents/TechDebtGPT,1.5,0.45,1.6,4.4,2.2,0.28,1.22,0,"Generated by claude-opus-4-5-20251101"
69f4988e,/Users/eneaxharau/Documents/TechDebtGPT,5.1,9.41,2,4.7,5.4,8.95,6.55,0,"Generated by claude-opus-4-5-20251101"
a71d319d,/Users/eneaxharau/Documents/TechDebtGPT,3.3,2.4,1.8,4.9,4.7,1.36,2.5,1.82,"Generated by claude-opus-4-5-20251101"
62cea0f6,/Users/eneaxharau/Documents/TechDebtGPT,5.4,1.52,1.9,4.7,3.5,1.04,1.65,0,"Generated by claude-opus-4-5-20251101"
b97b457f,/Users/eneaxharau/Documents/TechDebtGPT,5.6,3.79,1.7,4.5,4.2,2.95,2.33,0,"Generated by claude-opus-4-5-20251101"
a277f897,/Users/eneaxharau/Documents/TechDebtGPT,1.1,0.81,1.8,4.5,3.8,0.91,1.78,0.07,"Generated by claude-opus-4-5-20251101"
3216c050,/Users/eneaxharau/Documents/TechDebtGPT,5.7,2.21,1.2,5.4,4.3,1.11,1.93,0.2,"Generated by claude-opus-4-5-20251101"
bd6a1199,/Users/eneaxharau/Documents/TechDebtGPT,3.5,1.79,1,4.1,3.3,1.66,2.33,0.22,"Generated by claude-opus-4-5-20251101"
271 changes: 271 additions & 0 deletions cli/commands/benchmark-command.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
// cli/commands/benchmark-command.ts
// CLI command for running benchmarks and comparing model performance

import chalk from 'chalk';
import * as fs from 'fs';
import * as path from 'path';
import {
runBenchmark,
listBenchmarkRuns,
loadBenchmarkRuns,
getBenchmarkRunsDir,
} from '../../src/benchmark/benchmark-runner';
import {
printBenchmarkResult,
printBenchmarkList,
printModelComparison,
generateComparisonJSON,
} from '../../src/benchmark/benchmark-reporter';
import { BenchmarkOptions, CompareOptions } from '../../src/benchmark/types';

/**
 * Split the raw benchmark arguments into a subcommand and its parsed options.
 *
 * The first token selects the subcommand: `compare` and `list` are recognised
 * explicitly; anything else (including a leading flag) is treated as a
 * benchmark run and the full argument list is handed to the run parser.
 *
 * @param args - Arguments that follow `benchmark` on the command line.
 * @returns The selected subcommand together with the options parsed for it.
 */
function parseArgs(args: string[]): {
  subcommand: 'run' | 'compare' | 'list';
  options: any;
} {
  const [first, ...rest] = args;

  switch (first) {
    case 'compare':
      // Everything after the subcommand token belongs to `compare`.
      return { subcommand: 'compare', options: parseCompareArgs(rest) };
    case 'list':
      // `list` takes no options; trailing arguments are ignored.
      return { subcommand: 'list', options: {} };
    default:
      // No recognised subcommand: the whole argument list belongs to `run`.
      return { subcommand: 'run', options: parseRunArgs(args) };
  }
}

/**
 * Parse arguments for benchmark run.
 *
 * Recognised flags:
 * - `--dataset` / `-d <path>`: dataset path (stored as given, not resolved here)
 * - `--name` / `-n <name>`: custom run name
 * - `--output` / `-o <path>`: JSON output path
 * - `--concurrency` / `-c <n>`: integer >= 1; other values are ignored
 * - `--depth <mode>`: one of `fast`, `normal`, `deep`; other values are ignored
 * - `--silent` / `-s`: boolean switch
 *
 * Unknown arguments are skipped. A value-taking flag that appears as the last
 * argument (so it has no value) leaves the corresponding option untouched
 * instead of writing `undefined` into it.
 *
 * @param args - Arguments for the run subcommand.
 * @returns Parsed options; `datasetPath` is `''` when no dataset was supplied.
 */
function parseRunArgs(args: string[]): BenchmarkOptions {
  const options: BenchmarkOptions = {
    datasetPath: '',
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--dataset':
      case '-d': {
        // `args[++i]` is undefined when the flag is the final argument.
        const value: string | undefined = args[++i];
        if (value !== undefined) {
          options.datasetPath = value;
        }
        break;
      }
      case '--name':
      case '-n': {
        const value: string | undefined = args[++i];
        if (value !== undefined) {
          options.name = value;
        }
        break;
      }
      case '--output':
      case '-o': {
        const value: string | undefined = args[++i];
        if (value !== undefined) {
          options.outputPath = value;
        }
        break;
      }
      case '--concurrency':
      case '-c': {
        const raw: string | undefined = args[++i];
        const parsed = raw === undefined ? Number.NaN : Number.parseInt(raw, 10);
        // Only accept a usable worker count; anything else keeps the default.
        if (Number.isFinite(parsed) && parsed >= 1) {
          options.concurrency = parsed;
        }
        break;
      }
      case '--depth': {
        const depth: string | undefined = args[++i];
        // Literal comparisons narrow the type, so no cast is needed.
        if (depth === 'fast' || depth === 'normal' || depth === 'deep') {
          options.depthMode = depth;
        }
        break;
      }
      case '--silent':
      case '-s':
        options.silent = true;
        break;
      default:
        // Unrecognised arguments are ignored.
        break;
    }
  }

  return options;
}

/**
 * Parse arguments for benchmark compare.
 *
 * Recognised flags:
 * - `--runs` / `-r <names>`: comma-separated run names; each name is trimmed
 *   and empty entries (from `a,,b` or a trailing comma) are dropped
 * - `--all` / `-a`: boolean switch
 *
 * Unknown arguments are skipped. If `--runs` is the last argument and has no
 * value, `runNames` is left unset rather than throwing.
 *
 * @param args - Arguments that follow the `compare` subcommand.
 * @returns Parsed compare options.
 */
function parseCompareArgs(args: string[]): CompareOptions {
  const options: CompareOptions = {};

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];

    if (arg === '--runs' || arg === '-r') {
      // `args[++i]` is undefined when the flag is the final argument.
      const list: string | undefined = args[++i];
      if (list !== undefined) {
        options.runNames = list
          .split(',')
          .map((s) => s.trim())
          .filter((s) => s.length > 0);
      }
    } else if (arg === '--all' || arg === '-a') {
      options.all = true;
    }
  }

  return options;
}

/**
 * Print usage help for the benchmark command to stdout.
 *
 * Output is grouped into a title, usage forms, run options, compare options
 * and examples, coloured with chalk. Takes no input and returns nothing.
 */
function printUsage(): void {
  // Title (the emoji was mis-encoded and rendered as garbage characters).
  console.log(chalk.cyan('\n📊 Codewave Benchmark Tool\n'));

  // Usage forms
  console.log(chalk.white('Usage:'));
  console.log(
    chalk.gray(' codewave benchmark --dataset <path> Run benchmark against dataset')
  );
  console.log(
    chalk.gray(' codewave benchmark --dataset <path> --name <name> Run with custom name')
  );
  console.log(chalk.gray(' codewave benchmark compare --runs a,b,c Compare specific runs'));
  console.log(chalk.gray(' codewave benchmark compare --all Compare all saved runs'));
  console.log(chalk.gray(' codewave benchmark list List all saved runs'));
  console.log('');

  // Options for the default (run) subcommand
  console.log(chalk.white('Options:'));
  console.log(chalk.gray(' --dataset, -d <path> Path to ground truth CSV dataset'));
  console.log(chalk.gray(' --name, -n <name> Custom name for this benchmark run'));
  console.log(chalk.gray(' --output, -o <path> Path to save JSON results'));
  console.log(
    chalk.gray(' --concurrency, -c <n> Number of commits to evaluate in parallel (default: 1)')
  );
  console.log(chalk.gray(' --depth <mode> Analysis depth: fast, normal, deep'));
  console.log(chalk.gray(' --silent, -s Suppress progress output'));
  console.log('');

  // Options for the compare subcommand
  console.log(chalk.white('Compare Options:'));
  console.log(chalk.gray(' --runs, -r <names> Comma-separated run names to compare'));
  console.log(chalk.gray(' --all, -a Compare all saved benchmark runs'));
  console.log('');

  // Worked examples
  console.log(chalk.white('Examples:'));
  console.log(chalk.gray(' codewave benchmark --dataset ./ground-truth.csv'));
  console.log(chalk.gray(' codewave benchmark --dataset ./data.csv --name "claude-baseline"'));
  console.log(chalk.gray(' codewave benchmark --dataset ./data.csv --concurrency 4'));
  console.log(chalk.gray(' codewave benchmark compare --runs claude-baseline,gpt4-test'));
  console.log(chalk.gray(' codewave benchmark list'));
  console.log('');
}

/**
 * Run benchmark command.
 *
 * Validates that a dataset path was supplied and exists on disk, runs the
 * benchmark, prints the result, and optionally writes the result as JSON.
 * Exits the process with code 1 when the dataset is missing or when the
 * benchmark (or saving its output) throws.
 *
 * @param options - Parsed run options; the object is not modified.
 */
async function runBenchmarkCommand(options: BenchmarkOptions): Promise<void> {
  if (!options.datasetPath) {
    console.log(chalk.red('\n❌ Error: --dataset is required\n'));
    printUsage();
    process.exit(1);
  }

  // Resolve dataset path against the current working directory
  const datasetPath = path.resolve(options.datasetPath);
  if (!fs.existsSync(datasetPath)) {
    console.log(chalk.red(`\n❌ Error: Dataset file not found: ${datasetPath}\n`));
    process.exit(1);
  }

  // Pass a copy carrying the resolved path instead of mutating the caller's object.
  const runOptions: BenchmarkOptions = { ...options, datasetPath };

  console.log(chalk.cyan('\n🚀 Starting benchmark run...\n'));

  try {
    const result = await runBenchmark(runOptions, (message) => {
      if (!runOptions.silent) {
        console.log(message);
      }
    });

    // Print results
    printBenchmarkResult(result);

    // Save JSON output if requested
    if (runOptions.outputPath) {
      const outputPath = path.resolve(runOptions.outputPath);
      // Create the parent directory first so a finished run is not lost to a
      // missing output folder.
      fs.mkdirSync(path.dirname(outputPath), { recursive: true });
      fs.writeFileSync(outputPath, JSON.stringify(result, null, 2));
      console.log(chalk.green(`\n💾 Results saved to: ${outputPath}\n`));
    }
  } catch (error) {
    console.log(
      chalk.red(
        `\n❌ Benchmark failed: ${error instanceof Error ? error.message : String(error)}\n`
      )
    );
    process.exit(1);
  }
}

/**
 * Run compare command.
 *
 * Picks the run names to compare (every saved run for `--all`, otherwise the
 * names given with `--runs`), loads them, prints the comparison, and writes a
 * JSON version of it into the benchmark runs directory under a
 * timestamp-based file name. Exits the process with code 1 when no selection
 * was given or fewer than two runs are available/loaded.
 *
 * @param options - Parsed compare options.
 */
async function runCompareCommand(options: CompareOptions): Promise<void> {
  let runNames: string[];

  if (options.all) {
    // Load all runs
    const runs = listBenchmarkRuns();
    if (runs.length < 2) {
      console.log(chalk.yellow('\n⚠️ Need at least 2 benchmark runs to compare.\n'));
      console.log(
        chalk.gray(' Run: codewave benchmark --dataset <path> to create benchmark runs.\n')
      );
      process.exit(1);
    }
    runNames = runs.map((r) => r.name);
  } else if (options.runNames && options.runNames.length > 0) {
    // Drop repeated names so `--runs a,a` cannot satisfy the two-run minimum
    // below with a single run. NOTE(review): assumes loadBenchmarkRuns returns
    // one result per requested name — confirm.
    runNames = [...new Set(options.runNames)];
  } else {
    console.log(chalk.red('\n❌ Error: Specify --runs or --all for comparison\n'));
    printUsage();
    process.exit(1);
  }

  console.log(chalk.cyan(`\n🔍 Loading ${runNames.length} benchmark runs for comparison...\n`));

  const results = loadBenchmarkRuns(runNames);

  if (results.length < 2) {
    console.log(chalk.red('\n❌ Error: Could not load enough benchmark runs\n'));
    console.log(
      chalk.gray(
        ' Make sure the run names are correct. Use "codewave benchmark list" to see available runs.\n'
      )
    );
    process.exit(1);
  }

  // Print comparison
  printModelComparison(results);

  // Also output JSON comparison to file
  const comparison = generateComparisonJSON(results);
  const comparisonPath = path.join(getBenchmarkRunsDir(), `comparison-${Date.now()}.json`);
  fs.writeFileSync(comparisonPath, JSON.stringify(comparison, null, 2));
  console.log(chalk.gray(` 📄 Comparison JSON saved to: ${comparisonPath}\n`));
}

/**
 * Handle `benchmark list`: fetch the saved benchmark runs and print them.
 */
async function runListCommand(): Promise<void> {
  printBenchmarkList(listBenchmarkRuns());
}

/**
 * Entry point for the `benchmark` CLI command.
 *
 * Prints usage and returns when called with no arguments or when `--help` /
 * `-h` appears anywhere in the argument list. Otherwise parses the arguments
 * and dispatches to the run, compare or list handler.
 *
 * @param args - Arguments that follow `benchmark` on the command line.
 */
export async function runBenchmarkCommandHandler(args: string[]): Promise<void> {
  const wantsHelp =
    args.length === 0 || args.some((token) => token === '--help' || token === '-h');
  if (wantsHelp) {
    printUsage();
    return;
  }

  const parsed = parseArgs(args);

  if (parsed.subcommand === 'run') {
    await runBenchmarkCommand(parsed.options);
  } else if (parsed.subcommand === 'compare') {
    await runCompareCommand(parsed.options);
  } else if (parsed.subcommand === 'list') {
    await runListCommand();
  }
}
Loading