Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 16 additions & 95 deletions apps/cli/src/commands/results/remote.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,11 @@ import {
type EvaluationResult,
type ResultsExportConfig,
type ResultsRepoStatus,
commitAndPushResultsBranch,
createDraftResultsPr,
directPushResults,
directorySizeBytes,
getResultsRepoStatus,
loadConfig,
prepareResultsRepoBranch,
resolveResultsRepoRunsDir,
stageResultsArtifacts,
syncResultsRepo,
} from '@agentv/core';

Expand Down Expand Up @@ -78,15 +75,6 @@ function statusForResult(result: EvaluationResult): 'PASS' | 'FAIL' | 'ERROR' {
return result.score >= DEFAULT_THRESHOLD ? 'PASS' : 'FAIL';
}

/**
 * Reduce an arbitrary string to a fragment made only of letters, digits,
 * `.`, `_`, `/` and `-`.
 *
 * Surrounding whitespace is removed, every run of other characters becomes a
 * single hyphen, repeated slashes collapse to one, and leading/trailing
 * hyphens are stripped. The result is capped at 120 characters.
 *
 * @param value - Raw text to convert.
 * @returns The sanitized fragment; may be empty when nothing usable remains.
 */
function slugify(value: string): string {
  const stripEdgeHyphens = (text: string): string => text.replace(/^-+|-+$/g, '');

  const sanitized = value
    .trim()
    .replace(/[^A-Za-z0-9._/-]+/g, '-')
    .replace(/\/+/g, '/');

  // Strip again after truncating: the 120-character cut can land directly
  // after a hyphen, which would otherwise reintroduce a trailing '-'.
  return stripEdgeHyphens(stripEdgeHyphens(sanitized).slice(0, 120));
}

function getRelativeRunPath(cwd: string, runDir: string): string {
const relative = path.relative(path.join(cwd, '.agentv', 'results', 'runs'), runDir);
if (!relative.startsWith('..') && !path.isAbsolute(relative)) {
Expand All @@ -98,23 +86,6 @@ function getRelativeRunPath(cwd: string, runDir: string): string {
return experiment && experiment !== runName ? path.join(experiment, runName) : runName;
}

/**
 * Derive the results branch name for a run as `<branch_prefix>/<leaf>`.
 *
 * The leaf is the slugified combination of the experiment name (or
 * `default`), an eval label and the run directory's base name. The eval
 * label is the single eval file's base name with its `.eval.yaml`/`.eval.yml`
 * suffix and any remaining extension removed, or `<count>-evals` when the run
 * does not cover exactly one file. If slugifying yields an empty string, the
 * run directory's base name is used as the leaf.
 */
function buildBranchName(
  config: Required<ResultsExportConfig>,
  payload: RemoteExportPayload,
): string {
  const runStamp = path.basename(payload.run_dir);

  let evalLabel: string;
  if (payload.test_files.length === 1) {
    const fileName = path.basename(payload.test_files[0]);
    evalLabel = fileName.replace(/\.eval\.ya?ml$/i, '').replace(/\.[^.]+$/i, '');
  } else {
    evalLabel = `${payload.test_files.length}-evals`;
  }

  const experimentSlug = slugify(payload.experiment ?? 'default');
  const leaf = slugify(`${experimentSlug}-${evalLabel}-${runStamp}`) || runStamp;
  return `${config.branch_prefix}/${leaf}`;
}

function buildCommitTitle(payload: RemoteExportPayload): string {
const passed = payload.results.filter((result) => result.score >= DEFAULT_THRESHOLD).length;
const avgScore =
Expand All @@ -125,35 +96,6 @@ function buildCommitTitle(payload: RemoteExportPayload): string {
return `feat(results): ${experiment} - ${passed}/${payload.results.length} PASS (${avgScore.toFixed(3)})`;
}

/**
 * Render the Markdown body describing a run's results.
 *
 * Emits a `## Results` heading, then one `###` section per entry in
 * `payload.eval_summaries` (a pass/total summary line followed by a
 * test/score/status table, with a placeholder row when the summary has no
 * results), and finally the run directory base name, the experiment name
 * (`default` when unset) and the comma-separated eval files.
 */
function buildPrBody(payload: RemoteExportPayload): string {
  const renderedSections: string[] = [];

  for (const summary of payload.eval_summaries) {
    const rows: string[] = [];
    for (const result of summary.results) {
      rows.push(`| ${result.test_id} | ${result.score.toFixed(3)} | ${result.status} |`);
    }
    // An empty result list still needs one row so the table renders.
    const tableBody = rows.length > 0 ? rows.join('\n') : '| (no results) | 0.000 | ERROR |';

    const sectionLines = [
      `### ${summary.eval_file}`,
      '',
      `Summary: ${summary.passed}/${summary.total} PASS (${summary.avg_score.toFixed(3)})`,
      '',
      '| Test | Score | Status |',
      '|---|---|---|',
      tableBody,
    ];
    renderedSections.push(sectionLines.join('\n'));
  }

  const runName = path.basename(payload.run_dir);
  const experimentName = payload.experiment ?? 'default';
  const evalFiles = payload.test_files.join(', ');

  const bodyLines = [
    '## Results',
    '',
    renderedSections.join('\n\n'),
    '',
    `Run: ${runName}`,
    `Experiment: ${experimentName}`,
    `Eval Files: ${evalFiles}`,
  ];
  return bodyLines.join('\n');
}

async function maybeWarnLargeArtifact(runDir: string): Promise<void> {
const sizeBytes = await directorySizeBytes(runDir);
if (sizeBytes > SIZE_WARNING_BYTES) {
Expand Down Expand Up @@ -279,43 +221,22 @@ export async function maybeAutoExportRunArtifacts(payload: RemoteExportPayload):
try {
await maybeWarnLargeArtifact(payload.run_dir);

const branchName = buildBranchName(config, payload);
const prepared = await prepareResultsRepoBranch(config, branchName);

try {
const relativeRunPath = getRelativeRunPath(payload.cwd, payload.run_dir);
const destinationDir = path.join(prepared.repoDir, config.path, relativeRunPath);
await stageResultsArtifacts({
repoDir: prepared.repoDir,
sourceDir: payload.run_dir,
destinationDir,
});

const commitTitle = buildCommitTitle(payload);
const changed = await commitAndPushResultsBranch({
repoDir: prepared.repoDir,
branchName,
commitMessage: commitTitle,
});

if (!changed) {
console.warn('Warning: results export produced no git changes. Skipping PR creation.');
return;
}

const prUrl = await createDraftResultsPr({
repo: config.repo,
repoDir: prepared.repoDir,
baseBranch: prepared.baseBranch,
branchName,
title: commitTitle,
body: buildPrBody(payload),
});

console.log(`Remote results draft PR created: ${prUrl}`);
} finally {
await prepared.cleanup();
const relativeRunPath = getRelativeRunPath(payload.cwd, payload.run_dir);
const commitTitle = buildCommitTitle(payload);

const pushed = await directPushResults({
config,
sourceDir: payload.run_dir,
destinationPath: relativeRunPath,
commitMessage: commitTitle,
});

if (!pushed) {
console.warn('Warning: results export produced no git changes. Skipping push.');
return;
}

console.log(`Results pushed to ${config.repo} (${config.path}/${relativeRunPath})`);
} catch (error) {
console.warn(`Warning: skipping results export: ${getStatusMessage(error)}`);
console.warn("Warning: Run 'gh auth login' if GitHub authentication is missing.");
Expand Down
7 changes: 4 additions & 3 deletions apps/web/src/content/docs/docs/tools/studio.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -203,11 +203,12 @@ results:
export:
repo: EntityProcess/agentv-evals # GitHub repo (owner/repo or full URL)
path: runs # Directory within the repo
auto_push: true # Push automatically after every eval run
branch_prefix: eval-results # Branch naming prefix (default: eval-results)
auto_push: true # Push directly to base branch after every eval run
```

With `auto_push: true`, every `agentv eval run` or `agentv pipeline bench` automatically creates a draft PR in the configured repo with a structured results table.
With `auto_push: true`, every `agentv eval` or `agentv pipeline bench` pushes results directly to the configured repo's base branch (e.g., `main`). Results appear immediately in Studio without requiring PR merges.

Each run writes to a unique timestamped directory, so concurrent pushes from multiple machines are safe — non-fast-forward conflicts are resolved automatically via rebase retry.

### Authentication

Expand Down
57 changes: 57 additions & 0 deletions packages/core/src/evaluation/results-repo.ts
Original file line number Diff line number Diff line change
Expand Up @@ -396,3 +396,60 @@ export async function createDraftResultsPr(params: {
);
return stdout.trim();
}

/** Maximum number of `git push` attempts made by {@link directPushResults}. */
const DIRECT_PUSH_MAX_RETRIES = 3;

/**
 * Substrings of git's push-rejection output that a rebase onto the remote
 * branch can resolve. Git reports "fetch first" when the remote holds commits
 * the local clone has never fetched (the usual outcome of a concurrent push
 * from another machine) and "non-fast-forward" when the local clone already
 * knows its branch is behind.
 */
const RETRYABLE_PUSH_REJECTIONS = ['non-fast-forward', 'fetch first'] as const;

/** Returns true when a failed push's error text says the remote branch moved ahead. */
function isRetryablePushRejection(message: string): boolean {
  return RETRYABLE_PUSH_REJECTIONS.some((marker) => message.includes(marker));
}

/**
 * Push results directly to the base branch of the results repo.
 *
 * Copies `sourceDir` into `<config.path>/<destinationPath>` inside the cached
 * clone, commits everything, and pushes to the repo's default branch. When
 * the push is rejected because the remote branch moved ahead, the local
 * commit is rebased onto the remote branch and the push is retried, up to
 * `DIRECT_PUSH_MAX_RETRIES` attempts in total. On success the status file is
 * updated with the sync time and a cleared error.
 *
 * @param params.config - Results export configuration (normalized internally).
 * @param params.sourceDir - Local directory whose artifacts are exported.
 * @param params.destinationPath - Path under `config.path` to write into.
 * @param params.commitMessage - Message for the results commit.
 * @returns `true` if artifacts were pushed, `false` if staging produced no changes.
 * @throws Rethrows the push error when it is not a retryable rejection or the
 *   attempts are exhausted; errors from any other git step propagate as-is.
 */
export async function directPushResults(params: {
  readonly config: ResultsExportConfig;
  readonly sourceDir: string;
  readonly destinationPath: string;
  readonly commitMessage: string;
}): Promise<boolean> {
  const normalized = normalizeResultsExportConfig(params.config);
  const repoDir = await ensureResultsRepoClone(normalized);
  const baseBranch = await resolveDefaultBranch(repoDir);
  await updateCacheRepo(repoDir, baseBranch);

  const destinationDir = path.join(repoDir, normalized.path, params.destinationPath);
  await stageResultsArtifacts({
    repoDir,
    sourceDir: params.sourceDir,
    destinationDir,
  });

  await runGit(['add', '--all'], { cwd: repoDir });
  const { stdout: status } = await runGit(['status', '--porcelain'], {
    cwd: repoDir,
    check: false,
  });
  // Empty porcelain output means nothing was staged, so there is nothing to commit.
  if (status.trim().length === 0) {
    return false;
  }

  await runGit(['commit', '-m', params.commitMessage], { cwd: repoDir });

  for (let attempt = 1; attempt <= DIRECT_PUSH_MAX_RETRIES; attempt++) {
    try {
      await runGit(['push', 'origin', baseBranch], { cwd: repoDir });
      updateStatusFile(normalized, {
        last_synced_at: new Date().toISOString(),
        last_error: undefined,
      });
      return true;
    } catch (error) {
      // NOTE(review): detection relies on runGit including git's stderr in the
      // thrown error's message — confirm against runGit's implementation.
      const message = error instanceof Error ? error.message : String(error);
      if (attempt < DIRECT_PUSH_MAX_RETRIES && isRetryablePushRejection(message)) {
        // Replay the local commit on top of the remote branch, then push again.
        await runGit(['pull', '--rebase', 'origin', baseBranch], { cwd: repoDir });
      } else {
        throw error;
      }
    }
  }

  // Not reached: every loop iteration either returns or throws. Kept so the
  // function has an explicit result on all paths.
  return false;
}
1 change: 1 addition & 0 deletions packages/core/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ export {
commitAndPushResultsBranch,
pushResultsRepoBranch,
createDraftResultsPr,
directPushResults,
type CheckedOutResultsRepoBranch,
type PreparedResultsRepoBranch,
type ResultsRepoCachePaths,
Expand Down
Loading