From 3ee546b7ff6f5d6687f9547143095164a8381b7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Martin?= Date: Mon, 8 Jun 2026 13:16:19 +0200 Subject: [PATCH 1/5] ci: align optionals automation guardrails --- .github/workflows/ci.yml | 46 ++---- .github/workflows/commitlint.yml | 24 +-- .github/workflows/publish-tessl.yml | 50 +++--- .github/workflows/release-please.yml | 188 ++++++++++------------ .github/workflows/skill-review.yml | 14 +- renovate.json | 7 +- scripts/check_tessl_token_available.sh | 11 ++ scripts/commitlint_release_pr.sh | 32 ++++ scripts/install_commitlint.sh | 15 ++ scripts/lint_pr_commits.sh | 18 +++ scripts/post_commit_status.sh | 40 +++++ scripts/post_commit_status_for_outcome.sh | 39 +++++ scripts/read_release_pr_output.sh | 51 ++++++ scripts/require_tessl_token.sh | 7 + scripts/test_read_release_pr_output.sh | 97 +++++++++++ scripts/trigger_tessl_publish.sh | 12 ++ scripts/validate_json_files.py | 19 +++ scripts/validate_openai_agent_yaml.py | 25 +++ scripts/validate_publish_ready.sh | 10 ++ scripts/validate_publish_ref.sh | 54 +++++++ scripts/validate_repo.sh | 10 ++ 21 files changed, 578 insertions(+), 191 deletions(-) create mode 100755 scripts/check_tessl_token_available.sh create mode 100755 scripts/commitlint_release_pr.sh create mode 100755 scripts/install_commitlint.sh create mode 100755 scripts/lint_pr_commits.sh create mode 100755 scripts/post_commit_status.sh create mode 100755 scripts/post_commit_status_for_outcome.sh create mode 100755 scripts/read_release_pr_output.sh create mode 100755 scripts/require_tessl_token.sh create mode 100755 scripts/test_read_release_pr_output.sh create mode 100755 scripts/trigger_tessl_publish.sh create mode 100755 scripts/validate_json_files.py create mode 100755 scripts/validate_openai_agent_yaml.py create mode 100755 scripts/validate_publish_ready.sh create mode 100755 scripts/validate_publish_ref.sh create mode 100755 scripts/validate_repo.sh diff --git a/.github/workflows/ci.yml 
b/.github/workflows/ci.yml index ec2684f..d4a4aec 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,12 +21,12 @@ jobs: name: Validate skill and plugin runs-on: ubuntu-latest steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 - name: Setup Tessl CLI uses: tesslio/setup-tessl@25ec223fc0da33b41b8044ff5ab2b85235f4f91e # v2 with: - version: "0.81.2" + version: "0.82.0" - name: Validate skill metadata run: python3 scripts/validate_skill.py skills/java-optionals @@ -35,34 +35,16 @@ jobs: run: python3 scripts/validate_eval_criteria.py evals evals-reference evals-regression - name: Compile validation scripts - run: python3 -m py_compile scripts/validate_skill.py scripts/validate_eval_criteria.py + run: python3 -m py_compile scripts/*.py - name: Check shell scripts - run: bash -n scripts/check_publish_dry_run.sh + run: bash -n scripts/*.sh - name: Parse JSON files - run: | - python3 - <<'PY' - import json - import pathlib - for path in pathlib.Path('.').rglob('*.json'): - json.load(open(path, encoding='utf-8')) - print('JSON ok') - PY - - - name: Parse YAML files - run: | - python3 - <<'PY' - import pathlib - try: - import yaml - except ImportError: - print('PyYAML unavailable; skipping YAML parse') - raise SystemExit(0) - for path in list(pathlib.Path('.').rglob('*.yml')) + list(pathlib.Path('.').rglob('*.yaml')): - yaml.safe_load(open(path, encoding='utf-8')) - print('YAML ok') - PY + run: python3 scripts/validate_json_files.py + + - name: Validate YAML metadata + run: python3 scripts/validate_openai_agent_yaml.py - name: Lint Tessl plugin run: tessl plugin lint . 
@@ -72,25 +54,19 @@ jobs: if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} runs-on: ubuntu-latest steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 - name: Check Tessl token id: tessl-token env: TESSL_TOKEN: ${{ secrets.TESSL_TOKEN }} - run: | - if [ -n "${TESSL_TOKEN:-}" ]; then - echo "available=true" >> "$GITHUB_OUTPUT" - else - echo "available=false" >> "$GITHUB_OUTPUT" - echo "TESSL_TOKEN isn't configured; skipping Tessl publish dry-runs." - fi + run: scripts/check_tessl_token_available.sh - name: Setup Tessl CLI if: ${{ steps.tessl-token.outputs.available == 'true' }} uses: tesslio/setup-tessl@25ec223fc0da33b41b8044ff5ab2b85235f4f91e # v2 with: - version: "0.81.2" + version: "0.82.0" token: ${{ secrets.TESSL_TOKEN }} - name: Check fast publish dry-run diff --git a/.github/workflows/commitlint.yml b/.github/workflows/commitlint.yml index a7b0318..88818ab 100644 --- a/.github/workflows/commitlint.yml +++ b/.github/workflows/commitlint.yml @@ -30,7 +30,7 @@ jobs: name: Commitlint runs-on: ubuntu-latest steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 with: fetch-depth: 0 ref: ${{ github.event.pull_request.head.sha || github.sha }} @@ -40,17 +40,7 @@ jobs: node-version: "24" - name: Prepare commitlint - run: | - set -euo pipefail - commitlint_home="$RUNNER_TEMP/commitlint" - mkdir -p "$commitlint_home" - printf '{"private":true}\n' > "$commitlint_home/package.json" - cp commitlint.config.cjs "$commitlint_home/commitlint.config.cjs" - npm --prefix "$commitlint_home" install --silent \ - @commitlint/cli@21.0.1 \ - @commitlint/config-conventional@21.0.1 - echo "COMMITLINT_BIN=$commitlint_home/node_modules/.bin/commitlint" >> "$GITHUB_ENV" - echo "COMMITLINT_CONFIG=$commitlint_home/commitlint.config.cjs" >> "$GITHUB_ENV" + run: 
scripts/install_commitlint.sh - name: Lint pull request title env: @@ -63,12 +53,4 @@ jobs: PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }} BASE_REF: ${{ inputs.base_ref || 'main' }} EVENT_NAME: ${{ github.event_name }} - run: | - if [ "$EVENT_NAME" = "pull_request" ]; then - "$COMMITLINT_BIN" --config "$COMMITLINT_CONFIG" \ - --from "$PR_BASE_SHA" --to "$PR_HEAD_SHA" --verbose - else - git fetch origin "$BASE_REF" - "$COMMITLINT_BIN" --config "$COMMITLINT_CONFIG" \ - --from "origin/$BASE_REF" --to HEAD --verbose - fi + run: scripts/lint_pr_commits.sh diff --git a/.github/workflows/publish-tessl.yml b/.github/workflows/publish-tessl.yml index 1ea84fd..b5e21fe 100644 --- a/.github/workflows/publish-tessl.yml +++ b/.github/workflows/publish-tessl.yml @@ -4,9 +4,14 @@ on: workflow_dispatch: inputs: ref: - description: "Git ref to publish. Defaults to the selected workflow ref." - required: false + description: "Git ref to publish. Use refs/tags/v for releases." + required: true type: string + allow_non_tag_ref: + description: "Allow publishing a non-v ref. Use only for maintainer-approved recovery." 
+ required: false + type: boolean + default: false release: types: [published] @@ -14,7 +19,7 @@ permissions: contents: read concurrency: - group: publish-tessl-${{ github.event.release.tag_name || github.run_id }} + group: publish-tessl-${{ github.event.release.tag_name || inputs.ref || github.ref }} cancel-in-progress: false jobs: @@ -25,40 +30,45 @@ jobs: steps: - name: Checkout manual ref if: ${{ github.event_name == 'workflow_dispatch' }} - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 with: - ref: ${{ inputs.ref || github.ref }} + ref: ${{ inputs.ref }} - name: Checkout release tag if: ${{ github.event_name == 'release' }} - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 with: ref: ${{ github.event.release.tag_name }} + - name: Show publish ref + run: | + echo "event_name=${{ github.event_name }}" + echo "requested_ref=${{ github.event.release.tag_name || inputs.ref }}" + echo "checked_out_ref=$(git rev-parse --abbrev-ref HEAD)" + echo "checked_out_sha=$(git rev-parse HEAD)" + git describe --tags --always --dirty + + - name: Validate publish ref + env: + EVENT_NAME: ${{ github.event_name }} + RELEASE_TAG: ${{ github.event.release.tag_name }} + MANUAL_REF: ${{ inputs.ref }} + ALLOW_NON_TAG_REF: ${{ inputs.allow_non_tag_ref }} + run: scripts/validate_publish_ref.sh + - name: Require Tessl token env: TESSL_TOKEN: ${{ secrets.TESSL_TOKEN }} - run: | - if [ -z "${TESSL_TOKEN:-}" ]; then - echo "TESSL_TOKEN is required to publish the Tessl plugin." 
>&2 - exit 1 - fi + run: scripts/require_tessl_token.sh - name: Setup Tessl CLI uses: tesslio/setup-tessl@25ec223fc0da33b41b8044ff5ab2b85235f4f91e # v2 with: - version: "0.81.2" + version: "0.82.0" token: ${{ secrets.TESSL_TOKEN }} - name: Validate plugin before publish - run: | - python3 scripts/validate_skill.py skills/java-optionals - python3 scripts/validate_eval_criteria.py evals evals-reference evals-regression - python3 -m py_compile scripts/validate_skill.py scripts/validate_eval_criteria.py - bash -n scripts/check_publish_dry_run.sh - tessl plugin lint . - tessl skill review --threshold 100 skills/java-optionals/SKILL.md - tessl plugin publish --dry-run . + run: scripts/validate_publish_ready.sh - name: Publish plugin run: tessl plugin publish . diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml index 6a378ea..82b631c 100644 --- a/.github/workflows/release-please.yml +++ b/.github/workflows/release-please.yml @@ -28,148 +28,126 @@ jobs: config-file: release-please-config.json manifest-file: .release-please-manifest.json + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 + with: + persist-credentials: false + - name: Trigger Tessl publish if: ${{ steps.release.outputs.release_created == 'true' }} env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} TAG_NAME: ${{ steps.release.outputs.tag_name }} - run: | - set -euo pipefail - - if [ -z "${TAG_NAME:-}" ]; then - echo "Release Please reported a release but did not output tag_name." 
>&2 - exit 1 - fi + run: scripts/trigger_tessl_publish.sh "$TAG_NAME" - gh workflow run publish-tessl.yml --repo "$GITHUB_REPOSITORY" --ref main -f ref="$TAG_NAME" - - - name: Find release PR + - name: Read release PR output id: release-pr env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} GH_REPO: ${{ github.repository }} - run: | - set -euo pipefail - - pr_json="$(gh pr list \ - --state open \ - --head release-please--branches--main--components--java-optionals \ - --json title,headRefName \ - --jq '.[0] // empty')" - - if [ -z "$pr_json" ]; then - echo "No open release PR to check." - echo "found=false" >> "$GITHUB_OUTPUT" - exit 0 - fi - - title="$(printf '%s\n' "$pr_json" | jq -r '.title')" - branch="$(printf '%s\n' "$pr_json" | jq -r '.headRefName')" - - { - echo "found=true" - echo "title<> "$GITHUB_OUTPUT" + RELEASE_PR: ${{ steps.release.outputs.pr }} + run: scripts/read_release_pr_output.sh - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 if: ${{ steps.release-pr.outputs.found == 'true' }} with: fetch-depth: 0 + persist-credentials: false ref: ${{ steps.release-pr.outputs.branch }} - - name: Setup Tessl CLI + - name: Check Tessl token + id: release-pr-tessl-token if: ${{ steps.release-pr.outputs.found == 'true' }} + env: + TESSL_TOKEN: ${{ secrets.TESSL_TOKEN }} + run: scripts/check_tessl_token_available.sh + + - name: Setup Tessl CLI + if: ${{ steps.release-pr.outputs.found == 'true' && steps.release-pr-tessl-token.outputs.available != 'true' }} uses: tesslio/setup-tessl@25ec223fc0da33b41b8044ff5ab2b85235f4f91e # v2 with: - version: "0.81.2" + version: "0.82.0" + + - name: Setup authenticated Tessl CLI + if: ${{ steps.release-pr.outputs.found == 'true' && steps.release-pr-tessl-token.outputs.available == 'true' }} + uses: tesslio/setup-tessl@25ec223fc0da33b41b8044ff5ab2b85235f4f91e # v2 + with: + version: "0.82.0" token: ${{ secrets.TESSL_TOKEN }} - - name: Validate 
release PR + - name: Mark release PR validation pending if: ${{ steps.release-pr.outputs.found == 'true' }} env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - TESSL_TOKEN: ${{ secrets.TESSL_TOKEN }} - STATUS_SHA: ${{ github.sha }} TARGET_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: > + scripts/post_commit_status.sh "$(git rev-parse HEAD)" + "Validate skill and plugin" + pending + "Release PR validation is running" + "$TARGET_URL" + + - name: Validate release PR + if: ${{ steps.release-pr.outputs.found == 'true' }} + id: validate-release-pr + continue-on-error: true + env: + TESSL_TOKEN_AVAILABLE: ${{ steps.release-pr-tessl-token.outputs.available }} run: | - set -euo pipefail - - release_sha="$(git rev-parse HEAD)" - context="Validate skill and plugin" - - set_status() { - local state="$1" - local description="$2" - gh api "repos/${GITHUB_REPOSITORY}/statuses/${release_sha}" \ - -f state="$state" \ - -f context="$context" \ - -f description="$description" \ - -f target_url="$TARGET_URL" >/dev/null - } - - set_status pending "Release PR validation is running" - trap 'code=$?; if [ "$code" -eq 0 ]; then set_status success "Release PR validation passed"; else set_status failure "Release PR validation failed"; fi' EXIT - - python3 scripts/validate_skill.py skills/java-optionals - python3 scripts/validate_eval_criteria.py evals evals-reference evals-regression - python3 -m py_compile scripts/validate_skill.py scripts/validate_eval_criteria.py - bash -n scripts/check_publish_dry_run.sh - tessl plugin lint . - - if [ -n "${TESSL_TOKEN:-}" ]; then + scripts/validate_repo.sh + if [[ "$TESSL_TOKEN_AVAILABLE" == "true" ]]; then bash scripts/check_publish_dry_run.sh . tessl plugin publish --dry-run . - else - echo "TESSL_TOKEN isn't configured; skipping publish dry-run." 
fi + - name: Mark release PR validation result + if: ${{ steps.release-pr.outputs.found == 'true' }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + TARGET_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + VALIDATION_OUTCOME: ${{ steps.validate-release-pr.outcome }} + run: > + scripts/post_commit_status_for_outcome.sh "$(git rev-parse HEAD)" + "Validate skill and plugin" + "$VALIDATION_OUTCOME" + "Release PR validation passed" + "Release PR validation failed" + "$TARGET_URL" + - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6 if: ${{ steps.release-pr.outputs.found == 'true' }} with: node-version: "24" - - name: Commitlint release PR + - name: Mark release PR commitlint pending if: ${{ steps.release-pr.outputs.found == 'true' }} env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + TARGET_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: > + scripts/post_commit_status.sh "$(git rev-parse HEAD)" + "Commitlint" + pending + "Release PR commitlint is running" + "$TARGET_URL" + + - name: Commitlint release PR + if: ${{ steps.release-pr.outputs.found == 'true' }} + id: commitlint-release-pr + continue-on-error: true + env: PR_TITLE: ${{ steps.release-pr.outputs.title }} + run: scripts/commitlint_release_pr.sh "$PR_TITLE" + + - name: Mark release PR commitlint result + if: ${{ steps.release-pr.outputs.found == 'true' }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} TARGET_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - run: | - set -euo pipefail - - release_sha="$(git rev-parse HEAD)" - context="Commitlint" - - set_status() { - local state="$1" - local description="$2" - gh api "repos/${GITHUB_REPOSITORY}/statuses/${release_sha}" \ - -f state="$state" \ - -f context="$context" \ - -f description="$description" \ - -f target_url="$TARGET_URL" >/dev/null - } - - set_status pending "Release PR commitlint is running" - trap 'code=$?; 
if [ "$code" -eq 0 ]; then set_status success "Release PR commitlint passed"; else set_status failure "Release PR commitlint failed"; fi' EXIT - - commitlint_home="$RUNNER_TEMP/commitlint" - mkdir -p "$commitlint_home" - printf '{"private":true}\n' > "$commitlint_home/package.json" - cp commitlint.config.cjs "$commitlint_home/commitlint.config.cjs" - npm --prefix "$commitlint_home" install --silent \ - @commitlint/cli@21.0.1 \ - @commitlint/config-conventional@21.0.1 - - commitlint_bin="$commitlint_home/node_modules/.bin/commitlint" - commitlint_config="$commitlint_home/commitlint.config.cjs" - - printf '%s\n' "$PR_TITLE" | "$commitlint_bin" --config "$commitlint_config" - git fetch origin main - "$commitlint_bin" --config "$commitlint_config" \ - --from origin/main --to HEAD --verbose + COMMITLINT_OUTCOME: ${{ steps.commitlint-release-pr.outcome }} + run: > + scripts/post_commit_status_for_outcome.sh "$(git rev-parse HEAD)" + "Commitlint" + "$COMMITLINT_OUTCOME" + "Release PR commitlint passed" + "Release PR commitlint failed" + "$TARGET_URL" diff --git a/.github/workflows/skill-review.yml b/.github/workflows/skill-review.yml index aee0f7e..1c80e3a 100644 --- a/.github/workflows/skill-review.yml +++ b/.github/workflows/skill-review.yml @@ -27,21 +27,21 @@ jobs: if: ${{ github.event.pull_request.head.repo.full_name == github.repository }} runs-on: ubuntu-latest env: - TESSL_TOKEN: ${{ secrets.TESSL_TOKEN }} + TESSL_TOKEN_AVAILABLE: ${{ secrets.TESSL_TOKEN != '' }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 - name: Skip review when Tessl token is unavailable - if: ${{ env.TESSL_TOKEN == '' }} + if: ${{ env.TESSL_TOKEN_AVAILABLE != 'true' }} run: echo "TESSL_TOKEN isn't configured; skipping Tessl skill review." 
- name: Setup Tessl CLI - if: ${{ env.TESSL_TOKEN != '' }} + if: ${{ env.TESSL_TOKEN_AVAILABLE == 'true' }} uses: tesslio/setup-tessl@25ec223fc0da33b41b8044ff5ab2b85235f4f91e # v2 with: - version: "0.81.2" - token: ${{ env.TESSL_TOKEN }} + version: "0.82.0" + token: ${{ secrets.TESSL_TOKEN }} - name: Review skill - if: ${{ env.TESSL_TOKEN != '' }} + if: ${{ env.TESSL_TOKEN_AVAILABLE == 'true' }} run: tessl skill review --threshold 100 skills/java-optionals/SKILL.md diff --git a/renovate.json b/renovate.json index 7b774f1..de9a5ca 100644 --- a/renovate.json +++ b/renovate.json @@ -21,7 +21,8 @@ { "customType": "regex", "managerFilePatterns": [ - "/^\\.github/workflows/(?:commitlint|release-please)\\.yml$/" + "/^\\.github/workflows/(?:commitlint|release-please)\\.yml$/", + "/^scripts/(?:commitlint_release_pr|install_commitlint)\\.sh$/" ], "matchStrings": [ "(?:\\s+)(?@commitlint/(?:cli|config-conventional))@(?\\d+\\.\\d+\\.\\d+)" @@ -38,8 +39,8 @@ "version:\\s*[\"']?(?\\d+\\.\\d+\\.\\d+)[\"']?" ], "depNameTemplate": "tessl-cli", - "datasourceTemplate": "github-releases", - "packageNameTemplate": "tesslio/cli", + "datasourceTemplate": "npm", + "packageNameTemplate": "tessl", "versioningTemplate": "semver" } ], diff --git a/scripts/check_tessl_token_available.sh b/scripts/check_tessl_token_available.sh new file mode 100755 index 0000000..6def7cc --- /dev/null +++ b/scripts/check_tessl_token_available.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +set -euo pipefail + +: "${GITHUB_OUTPUT:?GITHUB_OUTPUT is required}" + +if [[ -n "${TESSL_TOKEN:-}" ]]; then + echo "available=true" >> "$GITHUB_OUTPUT" +else + echo "available=false" >> "$GITHUB_OUTPUT" + echo "TESSL_TOKEN isn't configured; skipping Tessl publish dry-runs." 
+fi diff --git a/scripts/commitlint_release_pr.sh b/scripts/commitlint_release_pr.sh new file mode 100755 index 0000000..a87b435 --- /dev/null +++ b/scripts/commitlint_release_pr.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: + scripts/commitlint_release_pr.sh + +Lints the Release Please PR title and commits from origin/main to HEAD. +USAGE +} + +if [[ $# -ne 1 ]]; then + usage >&2 + exit 2 +fi + +pr_title="$1" +commitlint_home="${RUNNER_TEMP:-$(mktemp -d)}/commitlint" +mkdir -p "$commitlint_home" +printf '{"private":true}\n' > "$commitlint_home/package.json" +cp commitlint.config.cjs "$commitlint_home/commitlint.config.cjs" +npm --prefix "$commitlint_home" install --silent --ignore-scripts \ + @commitlint/cli@21.0.2 \ + @commitlint/config-conventional@21.0.2 + +commitlint_bin="$commitlint_home/node_modules/.bin/commitlint" +commitlint_config="$commitlint_home/commitlint.config.cjs" + +printf '%s\n' "$pr_title" | "$commitlint_bin" --config "$commitlint_config" +"$commitlint_bin" --config "$commitlint_config" \ + --from origin/main --to HEAD --verbose diff --git a/scripts/install_commitlint.sh b/scripts/install_commitlint.sh new file mode 100755 index 0000000..841217b --- /dev/null +++ b/scripts/install_commitlint.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -euo pipefail + +commitlint_home="${RUNNER_TEMP:-$(mktemp -d)}/commitlint" +mkdir -p "$commitlint_home" +printf '{"private":true}\n' > "$commitlint_home/package.json" +cp commitlint.config.cjs "$commitlint_home/commitlint.config.cjs" +npm --prefix "$commitlint_home" install --silent --ignore-scripts \ + @commitlint/cli@21.0.2 \ + @commitlint/config-conventional@21.0.2 + +{ + echo "COMMITLINT_BIN=$commitlint_home/node_modules/.bin/commitlint" + echo "COMMITLINT_CONFIG=$commitlint_home/commitlint.config.cjs" +} >> "${GITHUB_ENV:?GITHUB_ENV is required}" diff --git a/scripts/lint_pr_commits.sh b/scripts/lint_pr_commits.sh new file mode 100755 index 0000000..36bd85f 
--- /dev/null +++ b/scripts/lint_pr_commits.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +set -euo pipefail + +: "${COMMITLINT_BIN:?COMMITLINT_BIN is required}" +: "${COMMITLINT_CONFIG:?COMMITLINT_CONFIG is required}" +: "${EVENT_NAME:?EVENT_NAME is required}" + +if [[ "$EVENT_NAME" == "pull_request" ]]; then + : "${PR_BASE_SHA:?PR_BASE_SHA is required for pull_request events}" + : "${PR_HEAD_SHA:?PR_HEAD_SHA is required for pull_request events}" + "$COMMITLINT_BIN" --config "$COMMITLINT_CONFIG" \ + --from "$PR_BASE_SHA" --to "$PR_HEAD_SHA" --verbose +else + base_ref="${BASE_REF:-main}" + git fetch origin "$base_ref" + "$COMMITLINT_BIN" --config "$COMMITLINT_CONFIG" \ + --from "origin/$base_ref" --to HEAD --verbose +fi diff --git a/scripts/post_commit_status.sh b/scripts/post_commit_status.sh new file mode 100755 index 0000000..0fa99f6 --- /dev/null +++ b/scripts/post_commit_status.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: + scripts/post_commit_status.sh + +Posts a GitHub commit status for the current repository. Requires GH_TOKEN and GITHUB_REPOSITORY. 
+USAGE +} + +if [[ $# -ne 5 ]]; then + usage >&2 + exit 2 +fi + +sha="$1" +context="$2" +state="$3" +description="$4" +target_url="$5" + +: "${GH_TOKEN:?GH_TOKEN is required}" +: "${GITHUB_REPOSITORY:?GITHUB_REPOSITORY is required}" + +case "$state" in + pending|success|failure|error) + ;; + *) + echo "Unsupported commit status state: $state" >&2 + exit 2 + ;; +esac + +gh api "repos/${GITHUB_REPOSITORY}/statuses/${sha}" \ + -f state="$state" \ + -f context="$context" \ + -f description="$description" \ + -f target_url="$target_url" >/dev/null diff --git a/scripts/post_commit_status_for_outcome.sh b/scripts/post_commit_status_for_outcome.sh new file mode 100755 index 0000000..7ab1def --- /dev/null +++ b/scripts/post_commit_status_for_outcome.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: + scripts/post_commit_status_for_outcome.sh +USAGE +} + +if [[ $# -ne 6 ]]; then + usage >&2 + exit 2 +fi + +sha="$1" +context="$2" +outcome="$3" +success_description="$4" +failure_description="$5" +target_url="$6" + +case "$outcome" in + success) + state="success" + description="$success_description" + ;; + failure|cancelled|skipped) + state="failure" + description="$failure_description" + ;; + *) + echo "Unsupported GitHub Actions outcome: $outcome" >&2 + exit 2 + ;; +esac + +scripts/post_commit_status.sh "$sha" "$context" "$state" "$description" "$target_url" +[[ "$outcome" == "success" ]] diff --git a/scripts/read_release_pr_output.sh b/scripts/read_release_pr_output.sh new file mode 100755 index 0000000..36c2072 --- /dev/null +++ b/scripts/read_release_pr_output.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +set -euo pipefail + +: "${GITHUB_OUTPUT:?GITHUB_OUTPUT is required}" + +release_pr="${RELEASE_PR:-}" +if [[ -z "$release_pr" || "$release_pr" == "null" ]]; then + echo "Release Please did not emit a PR output; checking for an unchanged open release PR." 
+ release_branch="release-please--branches--main--components--java-optionals" + release_branch_prefix="release-please--branches--main" + release_pr="$(gh pr list \ + --state open \ + --base main \ + --head "$release_branch" \ + --json title,headRefName \ + --jq 'map({title, headBranchName: .headRefName}) | .[0] // empty')" + + if [[ -z "$release_pr" ]]; then + release_pr="$(gh pr list \ + --state open \ + --base main \ + --limit 100 \ + --json title,headRefName \ + --jq "map(select(.headRefName | startswith(\"$release_branch_prefix\")) | {title, headBranchName: .headRefName}) | sort_by(.headBranchName) | .[0] // empty")" + fi + + if [[ -z "$release_pr" ]]; then + echo "No open release PR to check." + echo "found=false" >> "$GITHUB_OUTPUT" + exit 0 + fi +fi + +title="$(printf '%s\n' "$release_pr" | jq -r '.title')" +branch="$(printf '%s\n' "$release_pr" | jq -r '.headBranchName')" + +if [[ -z "$title" || "$title" == "null" || -z "$branch" || "$branch" == "null" ]]; then + echo "Release Please returned a release PR without title or headBranchName." >&2 + printf '%s\n' "$release_pr" >&2 + exit 1 +fi + +{ + echo "found=true" + echo "title<> "$GITHUB_OUTPUT" diff --git a/scripts/require_tessl_token.sh b/scripts/require_tessl_token.sh new file mode 100755 index 0000000..b958eea --- /dev/null +++ b/scripts/require_tessl_token.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ -z "${TESSL_TOKEN:-}" ]]; then + echo "TESSL_TOKEN is required to publish the Tessl plugin." >&2 + exit 1 +fi diff --git a/scripts/test_read_release_pr_output.sh b/scripts/test_read_release_pr_output.sh new file mode 100755 index 0000000..756c709 --- /dev/null +++ b/scripts/test_read_release_pr_output.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +set -euo pipefail + +repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +tmp_dir="$(mktemp -d)" +trap 'rm -rf "$tmp_dir"' EXIT + +write_gh_stub() { + local exact_output="$1" + local fallback_output="${2:-}" + mkdir -p "$tmp_dir/bin" + cat > "$tmp_dir/bin/gh" <<'STUB' +#!/usr/bin/env bash +set -euo pipefail + +if [[ "$*" != pr\ list* ]]; then + echo "unexpected gh invocation: $*" >&2 + exit 1 +fi + +if [[ "$*" == *"--head release-please--branches--main--components--java-optionals"* ]]; then + printf '%s\n' "${GH_STUB_EXACT_OUTPUT:-}" +else + printf '%s\n' "${GH_STUB_FALLBACK_OUTPUT:-}" +fi +STUB + chmod +x "$tmp_dir/bin/gh" + GH_STUB_EXACT_OUTPUT="$exact_output" + GH_STUB_FALLBACK_OUTPUT="$fallback_output" +} + +run_case() { + local name="$1" + local release_pr="$2" + local gh_exact_output="$3" + local gh_fallback_output="$4" + local expected_found="$5" + local expected_branch="${6:-}" + + local output_file="$tmp_dir/$name.out" + : > "$output_file" + write_gh_stub "$gh_exact_output" "$gh_fallback_output" + + ( + export PATH="$tmp_dir/bin:$PATH" + export GITHUB_OUTPUT="$output_file" + export RELEASE_PR="$release_pr" + export GH_STUB_EXACT_OUTPUT + export GH_STUB_FALLBACK_OUTPUT + "$repo_root/scripts/read_release_pr_output.sh" + ) + + if ! grep -qx "found=$expected_found" "$output_file"; then + echo "case '$name' expected found=$expected_found" >&2 + cat "$output_file" >&2 + exit 1 + fi + + if [[ -n "$expected_branch" ]] && ! 
grep -qx "$expected_branch" "$output_file"; then + echo "case '$name' expected branch output '$expected_branch'" >&2 + cat "$output_file" >&2 + exit 1 + fi +} + +run_case \ + "release-pr-json" \ + '{"title":"chore: release 1.2.3","headBranchName":"release-please--branches--main--components--java-optionals"}' \ + "" \ + "" \ + "true" \ + "release-please--branches--main--components--java-optionals" + +run_case \ + "fallback-exact-branch-without-labels" \ + "" \ + '{"title":"chore: release 1.2.3","headBranchName":"release-please--branches--main--components--java-optionals"}' \ + "" \ + "true" \ + "release-please--branches--main--components--java-optionals" + +run_case \ + "fallback-prefix-without-labels" \ + "" \ + "" \ + '{"title":"chore: release 1.2.3","headBranchName":"release-please--branches--main--components--java-optionals"}' \ + "true" \ + "release-please--branches--main--components--java-optionals" + +run_case \ + "fallback-empty" \ + "" \ + "" \ + "" \ + "false" + +echo "read_release_pr_output smoke tests passed" diff --git a/scripts/trigger_tessl_publish.sh b/scripts/trigger_tessl_publish.sh new file mode 100755 index 0000000..e771cdd --- /dev/null +++ b/scripts/trigger_tessl_publish.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail + +: "${GITHUB_REPOSITORY:?GITHUB_REPOSITORY is required}" + +tag_name="${1:-}" +if [[ -z "$tag_name" ]]; then + echo "Release Please reported a release but did not output tag_name." 
>&2 + exit 1 +fi + +gh workflow run publish-tessl.yml --repo "$GITHUB_REPOSITORY" --ref main -f ref="refs/tags/${tag_name}" diff --git a/scripts/validate_json_files.py b/scripts/validate_json_files.py new file mode 100755 index 0000000..eeb54b3 --- /dev/null +++ b/scripts/validate_json_files.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 +"""Parse every JSON file in the repository.""" + +from __future__ import annotations + +import json +from pathlib import Path + + +def main() -> int: + for path in Path(".").rglob("*.json"): + with path.open(encoding="utf-8") as file: + json.load(file) + print("JSON ok") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/validate_openai_agent_yaml.py b/scripts/validate_openai_agent_yaml.py new file mode 100755 index 0000000..7873ba8 --- /dev/null +++ b/scripts/validate_openai_agent_yaml.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +"""Validate the supported subset of OpenAI agent YAML metadata.""" + +from __future__ import annotations + +from pathlib import Path + +from validate_skill import parse_openai_agent_metadata + + +def main() -> int: + failures: list[str] = [] + for path in Path("skills").glob("*/agents/openai.yaml"): + _, metadata_failures = parse_openai_agent_metadata(path.read_text(encoding="utf-8")) + failures.extend(f"{path}: {failure}" for failure in metadata_failures) + if failures: + for failure in failures: + print(failure) + return 1 + print("YAML metadata ok") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/validate_publish_ready.sh b/scripts/validate_publish_ready.sh new file mode 100755 index 0000000..998a2f7 --- /dev/null +++ b/scripts/validate_publish_ready.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -euo pipefail + +python3 scripts/validate_skill.py skills/java-optionals +python3 scripts/validate_eval_criteria.py evals evals-reference evals-regression +python3 -m py_compile scripts/*.py +bash -n scripts/*.sh +tessl plugin lint 
. +tessl skill review --threshold 100 skills/java-optionals/SKILL.md +tessl plugin publish --dry-run . diff --git a/scripts/validate_publish_ref.sh b/scripts/validate_publish_ref.sh new file mode 100755 index 0000000..9b04b00 --- /dev/null +++ b/scripts/validate_publish_ref.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +set -euo pipefail + +event_name="${EVENT_NAME:-}" +release_tag="${RELEASE_TAG:-}" +manual_ref="${MANUAL_REF:-}" +allow_non_tag_ref="${ALLOW_NON_TAG_REF:-false}" + +plugin_version="$(python3 - <<'PY' +import json +print(json.load(open(".tessl-plugin/plugin.json", encoding="utf-8"))["version"]) +PY +)" +expected_tag="v${plugin_version}" + +echo "plugin_version=${plugin_version}" +echo "expected_tag=${expected_tag}" + +if [[ "$event_name" == "release" ]]; then + if [[ "$release_tag" != "$expected_tag" ]]; then + echo "Release tag '${release_tag}' must match plugin version tag '${expected_tag}'." >&2 + exit 1 + fi + exit 0 +fi + +if [[ -z "$manual_ref" ]]; then + echo "Manual publish requires an explicit ref input." >&2 + exit 1 +fi + +git fetch --force --tags origin '+refs/tags/*:refs/tags/*' + +if [[ "$manual_ref" == "refs/tags/${expected_tag}" ]]; then + exit 0 +fi + +if [[ "$manual_ref" == "$expected_tag" ]]; then + echo "Manual release publishes must use fully qualified tag ref 'refs/tags/${expected_tag}'." >&2 + exit 1 +fi + +tag_ref="${manual_ref#refs/tags/}" +if git show-ref --verify --quiet "refs/tags/${tag_ref}"; then + echo "Manual release tag '${manual_ref}' must match 'refs/tags/${expected_tag}'." >&2 + exit 1 +fi + +if [[ "$allow_non_tag_ref" != "true" ]]; then + echo "Manual publish from non-tag ref '${manual_ref}' requires allow_non_tag_ref=true." >&2 + exit 1 +fi + +echo "Publishing non-tag ref '${manual_ref}' with explicit maintainer override." 
diff --git a/scripts/validate_repo.sh b/scripts/validate_repo.sh new file mode 100755 index 0000000..e7c4a1e --- /dev/null +++ b/scripts/validate_repo.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -euo pipefail + +python3 scripts/validate_skill.py skills/java-optionals +python3 scripts/validate_eval_criteria.py evals evals-reference evals-regression +python3 scripts/validate_json_files.py +python3 scripts/validate_openai_agent_yaml.py +python3 -m py_compile scripts/*.py +bash -n scripts/*.sh +tessl plugin lint . From b4dbc2b163890331907b5aff20ec77430ff01344 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Martin?= Date: Mon, 8 Jun 2026 13:16:24 +0200 Subject: [PATCH 2/5] test(evals): add suite guardrail tooling --- .gitignore | 2 + .../criteria.json | 1 + .../criteria.json | 1 + scripts/classify_eval_result.py | 233 +++++++++++ scripts/run_eval_suite.sh | 202 ++++++++++ scripts/validate_eval_criteria.py | 368 +++++++++++++++++- 6 files changed, 806 insertions(+), 1 deletion(-) create mode 100755 scripts/classify_eval_result.py create mode 100755 scripts/run_eval_suite.sh diff --git a/.gitignore b/.gitignore index b8fb173..e1cff3b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ .DS_Store *.tgz +__pycache__/ +*.py[cod] .tessl/cache/ .tessl/tmp/ .codex/ diff --git a/evals-regression/48-baseline-solved-workpad-feature-cleanup/criteria.json b/evals-regression/48-baseline-solved-workpad-feature-cleanup/criteria.json index 21b496f..89c033c 100644 --- a/evals-regression/48-baseline-solved-workpad-feature-cleanup/criteria.json +++ b/evals-regression/48-baseline-solved-workpad-feature-cleanup/criteria.json @@ -40,6 +40,7 @@ } ], "metadata": { + "evidence_type": "solved_regression", "invocation": "explicit", "task_type": "implementation" } diff --git a/evals-regression/49-baseline-solved-workspace-prompt-feature/criteria.json b/evals-regression/49-baseline-solved-workspace-prompt-feature/criteria.json index 1027ace..510eeba 100644 --- 
a/evals-regression/49-baseline-solved-workspace-prompt-feature/criteria.json +++ b/evals-regression/49-baseline-solved-workspace-prompt-feature/criteria.json @@ -46,6 +46,7 @@ } ], "metadata": { + "evidence_type": "solved_regression", "invocation": "explicit", "task_type": "implementation" } diff --git a/scripts/classify_eval_result.py b/scripts/classify_eval_result.py new file mode 100755 index 0000000..ba78636 --- /dev/null +++ b/scripts/classify_eval_result.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 +"""Classify one hosted eval scenario into main, reference, regression, or fix-required. + +The script reads Tessl `eval view --json` output and applies the repository's +suite policy. It is intentionally conservative: promote to main only when an +isolated run shows clean with-context behavior and a delta meeting the +repository main promotion floor. +""" + +from __future__ import annotations + +import argparse +import json +import re +import sys +from pathlib import Path +from typing import Any + + +DEFAULT_MAIN_DELTA_FLOOR = 30.0 + + +def error(message: str) -> int: + print(f"error: {message}", file=sys.stderr) + return 1 + + +def score(solution: dict[str, Any]) -> tuple[float, float]: + results = solution.get("assessmentResults") or [] + earned = sum(float(item.get("score") or 0) for item in results) + maximum = sum(float(item.get("max_score") or item.get("maxScore") or 0) for item in results) + return earned, maximum + + +def normalized(text: str) -> str: + return " ".join(re.findall(r"[a-z0-9]+", text.lower())) + + +def scenario_text_from_dir(path: Path | None) -> str: + if path is None: + return "" + parts: list[str] = [] + for name in ("task.md", "criteria.json", "capability.txt"): + file_path = path / name + if file_path.is_file(): + parts.append(file_path.read_text(encoding="utf-8")) + return "\n".join(parts) + + +def scenario_metadata_from_dir(path: Path | None) -> dict[str, Any]: + if path is None: + return {} + criteria_path = path / "criteria.json" + 
if not criteria_path.is_file(): + return {} + try: + data = json.loads(criteria_path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + return {} + metadata = data.get("metadata") + if not isinstance(metadata, dict): + return {} + return metadata + + +def is_skill_context_dependent(text: str) -> bool: + lowered = text.lower() + context_terms = ( + "skill bundle", + "skill package", + "skill-provided", + "skill-only context", + "agent instructions", + "from the skill", + "from the skill bundle", + "bundled reference", + "bundled reference text", + "exact skill-provided text", + "exact wording", + "exact text", + "exact scan", + "exact scan header", + "exact checklist", + "exact procedure", + "exact command", + "scan command from the skill", + "hard-stop rg scan command", + ) + required_terms = ( + "exact", + "skill-provided", + "skill-only context", + "skill package", + "agent instructions", + "from the skill", + "bundled reference", + ) + return any(term in lowered for term in context_terms) and any( + term in lowered for term in required_terms + ) + + +def find_scenario(data: dict[str, Any], query: str | None) -> dict[str, Any]: + scenarios = data.get("data", {}).get("attributes", {}).get("scenarios", []) + if not isinstance(scenarios, list): + raise ValueError("run JSON does not contain data.attributes.scenarios") + if not scenarios: + raise ValueError("run JSON contains no scenarios") + if query is None: + if len(scenarios) != 1: + raise ValueError("run contains multiple scenarios; pass --scenario") + return scenarios[0] + + query_norm = normalized(query) + matches = [] + for scenario in scenarios: + title = scenario.get("shortDescription") or "" + task = scenario.get("task") or "" + haystack = normalized(f"{title}\n{task}") + if query_norm in haystack: + matches.append(scenario) + if len(matches) != 1: + raise ValueError(f"expected exactly one scenario match for {query!r}, found {len(matches)}") + return matches[0] + + +def classify( + *, + 
with_score: tuple[float, float] | None, + without_score: tuple[float, float] | None, + skill_context_dependent: bool, + main_delta_floor: float, +) -> tuple[str, str]: + if with_score is None: + return "fix-required", "with-context result is missing; run with context before classifying" + + with_earned, with_max = with_score + if with_max <= 0: + return "fix-required", "with-context max score is zero; scoring did not finish cleanly" + with_percent = 100 * with_earned / with_max + + if with_percent < 100: + return ( + "fix-required", + "with-context is below 100%; fix the skill or eval and rerun targeted before choosing a suite", + ) + + if skill_context_dependent: + return ( + "regression", + "skill-context-dependent recall is only fair as with-context regression coverage", + ) + + if without_score is None: + return "reference", "without-context result is missing; run both variants before lift classification" + + without_earned, without_max = without_score + if without_max <= 0: + return "reference", "without-context max score is zero; baseline scoring did not finish cleanly" + without_percent = 100 * without_earned / without_max + + if without_percent == 100: + return "regression", "both variants scored 100%; keep as with-context safety coverage" + + delta = with_percent - without_percent + if delta >= main_delta_floor: + return "main", f"clean with-context result and {delta:.1f} pp delta meets main floor {main_delta_floor:.1f} pp" + + return "reference", f"clean with-context result but {delta:.1f} pp delta is below main floor {main_delta_floor:.1f} pp" + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("run_json", type=Path, help="Path to Tessl eval view --json output") + parser.add_argument("--scenario", help="Scenario title, directory name, or distinctive text") + parser.add_argument( + "--scenario-dir", + type=Path, + help="Local scenario directory for skill-context-dependent detection", + ) + 
parser.add_argument( + "--main-delta-floor", + type=float, + default=DEFAULT_MAIN_DELTA_FLOOR, + help="Minimum percentage-point delta required for main promotion", + ) + args = parser.parse_args() + + try: + data = json.loads(args.run_json.read_text(encoding="utf-8")) + scenario = find_scenario(data, args.scenario) + except (OSError, json.JSONDecodeError, ValueError) as exc: + return error(str(exc)) + + solutions = {solution.get("variant"): score(solution) for solution in scenario.get("solutions", [])} + with_score = solutions.get("usage-spec") or solutions.get("with-context") + without_score = solutions.get("baseline") or solutions.get("without-context") + + title = scenario.get("shortDescription") or "(untitled scenario)" + task_text = scenario.get("task") or "" + local_text = scenario_text_from_dir(args.scenario_dir) + local_metadata = scenario_metadata_from_dir(args.scenario_dir) + skill_context_dependent = ( + local_metadata.get("evidence_type") == "skill_context_dependent" + or is_skill_context_dependent(f"{title}\n{task_text}\n{local_text}") + ) + + suite, reason = classify( + with_score=with_score, + without_score=without_score, + skill_context_dependent=skill_context_dependent, + main_delta_floor=args.main_delta_floor, + ) + + def fmt(value: tuple[float, float] | None) -> str: + if value is None: + return "missing" + earned, maximum = value + if maximum <= 0: + return f"{earned:g}/{maximum:g}" + return f"{earned:g}/{maximum:g} ({100 * earned / maximum:.1f}%)" + + print(f"scenario: {title}") + print(f"with-context: {fmt(with_score)}") + print(f"without-context: {fmt(without_score)}") + print(f"skill-context-dependent: {'yes' if skill_context_dependent else 'no'}") + print(f"recommended-suite: {suite}") + print(f"reason: {reason}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/run_eval_suite.sh b/scripts/run_eval_suite.sh new file mode 100755 index 0000000..c75ebf7 --- /dev/null +++ b/scripts/run_eval_suite.sh @@ 
-0,0 +1,202 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: + scripts/run_eval_suite.sh [scenario ...] [-- tessl eval run args...] + +Runs hosted Tessl evals with the repository's variant policy: + main -> without-context and with-context + reference -> without-context and with-context + regression -> with-context only + +Examples: + scripts/run_eval_suite.sh main -- --label "main check" + scripts/run_eval_suite.sh reference 01-display-name -- --label "targeted reference" + scripts/run_eval_suite.sh regression -- --label "regression safety" + +Do not pass --variant. This script chooses variants from the suite purpose. +USAGE +} + +print_suite_scenarios() { + local dir="$1" + local scenario + + for scenario in "$dir"/*; do + if [[ -d "$scenario" ]]; then + printf ' %s\n' "$(basename "$scenario")" + fi + done | sort +} + +if [[ $# -lt 1 ]]; then + usage >&2 + exit 2 +fi + +suite="$1" +shift + +case "$suite" in + main) + source_dir="evals" + variants=(--variant without-context --variant with-context) + ;; + reference) + source_dir="evals-reference" + variants=(--variant without-context --variant with-context) + ;; + regression) + source_dir="evals-regression" + variants=(--variant with-context) + ;; + -h|--help|help) + usage + exit 0 + ;; + *) + echo "Unknown suite: $suite" >&2 + usage >&2 + exit 2 + ;; +esac + +scenarios=() +extra_args=() +while [[ $# -gt 0 ]]; do + case "$1" in + --) + shift + extra_args=("$@") + break + ;; + --variant|--variant=*) + echo "Do not pass --variant; scripts/run_eval_suite.sh chooses variants by suite." >&2 + exit 2 + ;; + *) + scenarios+=("$1") + shift + ;; + esac +done + +for arg in "${extra_args[@]}"; do + case "$arg" in + --variant|--variant=*) + echo "Do not pass --variant; scripts/run_eval_suite.sh chooses variants by suite." >&2 + exit 2 + ;; + esac +done + +if ! command -v tessl >/dev/null 2>&1; then + echo "tessl CLI is required to run hosted evals." 
>&2 + exit 127 +fi + +repo_root="$(git rev-parse --show-toplevel 2>/dev/null || pwd)" +source_path="$repo_root/$source_dir" +if [[ ! -d "$source_path" ]]; then + echo "Missing suite directory: $source_path" >&2 + exit 1 +fi + +has_agent=false +for arg in "${extra_args[@]}"; do + case "$arg" in + --agent|--agent=*) + has_agent=true + ;; + esac +done + +agent_args=() +if [[ "$has_agent" == false ]]; then + agent_args=(--agent claude:claude-sonnet-4-6) +fi + +if [[ "$suite" == "main" && "${#scenarios[@]}" -eq 0 ]]; then + echo "Running main eval suite from the linked plugin path." + echo "Scenarios:" + print_suite_scenarios "$source_path" + echo "Variants: ${variants[*]}" + + ( + cd "$repo_root" + tessl eval run "${agent_args[@]}" "${variants[@]}" "${extra_args[@]}" . + ) + exit 0 +fi + +tmp_dir="$(mktemp -d)" +backup_evals="$tmp_dir/evals-original" +staged_evals="$repo_root/evals" +if [[ "$suite" == "main" ]]; then + source_path="$backup_evals" +fi + +restore() { + set +e + if [[ -d "$backup_evals" ]]; then + rm -rf "$staged_evals" + mv "$backup_evals" "$staged_evals" + fi + rm -rf "$tmp_dir" +} +trap restore EXIT + +mv "$staged_evals" "$backup_evals" +mkdir -p "$staged_evals" + +copy_scenario() { + local requested="$1" + local candidate + + if [[ -d "$source_path/$requested" ]]; then + candidate="$source_path/$requested" + elif [[ -d "$requested" ]]; then + candidate="$(cd "$requested" && pwd)" + else + local base + base="$(basename "$requested")" + if [[ -d "$source_path/$base" ]]; then + candidate="$source_path/$base" + else + echo "Unknown $suite scenario: $requested" >&2 + exit 1 + fi + fi + + cp -a "$candidate" "$staged_evals/" +} + +if [[ "${#scenarios[@]}" -eq 0 ]]; then + found=false + for scenario in "$source_path"/*; do + if [[ -d "$scenario" ]]; then + found=true + cp -a "$scenario" "$staged_evals/" + fi + done + if [[ "$found" == false ]]; then + echo "No scenarios found in $source_path" >&2 + exit 1 + fi +else + for scenario in "${scenarios[@]}"; do + 
copy_scenario "$scenario" + done +fi + +echo "Running $suite eval suite from the linked plugin path with a temporary evals/ staging area." +echo "Scenarios:" +print_suite_scenarios "$staged_evals" +echo "Variants: ${variants[*]}" + +( + cd "$repo_root" + tessl eval run "${agent_args[@]}" "${variants[@]}" "${extra_args[@]}" . +) diff --git a/scripts/validate_eval_criteria.py b/scripts/validate_eval_criteria.py index 9a455c4..2bca33d 100755 --- a/scripts/validate_eval_criteria.py +++ b/scripts/validate_eval_criteria.py @@ -45,12 +45,81 @@ "redact", ) CRITERION_CATEGORIES = {"safety", "optional_quality", "maintainability"} +EVIDENCE_TYPES = {"ordinary_lift", "solved_regression", "skill_context_dependent"} +INTERNAL_LABEL_ALLOW_PATTERNS = ( + r"\bdo not deduct\b.{0,120}\b(?:hard[- ]stop|checklist|scan|marker|skill)\b", + r"\baward full credit\b.{0,120}\b(?:hard[- ]stop|checklist|scan|marker|skill)\b", + r"\ballow(?:s|ed)?\b.{0,120}\b(?:hard[- ]stop|checklist|scan|marker|skill)\b", + r"\bbrief(?:ly)? 
uses\b.{0,120}\b(?:hard[- ]stop|checklist|scan|marker|skill)\b", +) EXPLICIT_INVOCATION_PATTERNS = ( r"\$java-optionals\b", r"\buse\s+java-optionals\b", r"\buse\s+the\s+java-optionals\s+skill\b", r"\bjava-optionals\s+skill\b", ) +IDENTIFIER_STOP_WORDS = { + "abstractmap", + "api", + "arraylist", + "bigdecimal", + "boolean", + "class", + "collectors", + "comparator", + "completablefuture", + "comparing", + "double", + "exception", + "filter", + "function", + "gatherers", + "hashmap", + "integer", + "intoptional", + "java", + "list", + "long", + "longoptional", + "map", + "object", + "objects", + "optional", + "parallel", + "paralleloptional", + "predicate", + "record", + "runtimeexception", + "set", + "simpleimmutableentry", + "sorted", + "string", + "stream", + "streams", + "system", + "throw", + "tolist", + "total", + "null", + "unsupportedoperationexception", + "void", +} +SCENARIO_REFERENCE_FILES = ( + Path("README.md"), + Path("CONTRIBUTING.md"), + Path(".github/pull_request_template.md"), + Path("evals/NUMBERING.md"), + Path("evals-reference/NUMBERING.md"), + Path("evals-regression/NUMBERING.md"), + Path("evals-regression/README.md"), +) +SCENARIO_REFERENCE_DIRS = (Path("docs"),) +AGENT_DOC_FORBIDDEN_EXTERNAL_HISTORY_PATTERNS = ( + re.compile(r"\bissue\s+#?\d+\b", re.IGNORECASE), + re.compile(r"\bpr\s+#?\d+\b", re.IGNORECASE), + re.compile(r"https://github\.com/[^)\s]+/(?:issues|pull)/\d+", re.IGNORECASE), + re.compile(r"\b019e[a-f0-9-]{20,}\b", re.IGNORECASE), +) def error(message: str) -> int: @@ -92,6 +161,103 @@ def text_of(item: dict[str, Any]) -> str: return f"{item.get('name', '')} {item.get('description', '')}".lower() +def normalized_words(text: str) -> list[str]: + return re.findall(r"[a-z0-9]+", text.lower()) + + +def normalized_text(text: str) -> str: + return " ".join(normalized_words(text)) + + +def ngrams(words: list[str], size: int) -> set[tuple[str, ...]]: + if len(words) < size: + return set() + return {tuple(words[index : index + size]) for 
index in range(len(words) - size + 1)} + + +def code_like_text(text: str) -> str: + chunks = re.findall(r"```(?:[A-Za-z0-9_-]+)?\n(.*?)```", text, flags=re.DOTALL) + chunks.extend(re.findall(r"`([^`\n]+)`", text)) + return "\n".join(chunks) + + +def task_similarity(left: str, right: str) -> float: + left_words = normalized_words(left) + right_words = normalized_words(right) + left_text = " ".join(left_words) + right_text = " ".join(right_words) + if not left_text or not right_text: + return 0.0 + exact_ratio = 1.0 if left_text == right_text else 0.0 + left_grams = ngrams(left_words, 8) + right_grams = ngrams(right_words, 8) + if not left_grams or not right_grams: + return exact_ratio + overlap = len(left_grams & right_grams) / min(len(left_grams), len(right_grams)) + return max(exact_ratio, overlap) + + +def domain_identifiers(text: str) -> set[str]: + identifiers = set(re.findall(r"\b[A-Za-z_][A-Za-z0-9_]*\b", code_like_text(text))) + result: set[str] = set() + for identifier in identifiers: + lowered = identifier.lower() + if lowered in IDENTIFIER_STOP_WORDS or len(identifier) < 4: + continue + if identifier.isupper() and len(identifier) <= 6: + continue + result.add(identifier) + return result + + +def api_markers(text: str) -> set[str]: + markers = set( + re.findall( + r"\b[A-Z][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)+\b" + r"|\b[A-Za-z_][A-Za-z0-9_]*::[A-Za-z_][A-Za-z0-9_]*\b", + text, + ) + ) + return {marker for marker in markers if marker not in {"System.out"}} + + +def is_skill_context_dependent_text(text: str) -> bool: + lowered = text.lower() + context_terms = ( + "skill bundle", + "skill package", + "skill-provided", + "skill-only context", + "agent instructions", + "from the skill", + "from the skill bundle", + "bundled reference", + "bundled reference text", + "exact skill-provided text", + "exact wording", + "exact text", + "exact scan", + "exact scan header", + "exact checklist", + "exact procedure", + "exact command", + "scan command from the 
skill", + "hard-stop rg scan command", + ) + required_terms = ( + "exact", + "skill-provided", + "skill-only context", + "skill package", + "agent instructions", + "from the skill", + "bundled reference", + ) + return any(term in lowered for term in context_terms) and any( + term in lowered for term in required_terms + ) + + def validate_scenario(scenario: Path, main_eval_root: Path | None) -> list[str]: failures: list[str] = [] task_file = scenario / "task.md" @@ -174,18 +340,61 @@ def validate_scenario(scenario: Path, main_eval_root: Path | None) -> list[str]: if any(word in haystack for word in BEHAVIOR_WORDS): behavior_score += max_score + task_text = task_file.read_text(encoding="utf-8") if task_file.is_file() else "" metadata = data.get("metadata") if not isinstance(metadata, dict): failures.append(f"{criteria_file}: missing metadata object") metadata = {} invocation = metadata.get("invocation") task_type = metadata.get("task_type") + evidence_type = metadata.get("evidence_type") if invocation not in {"natural", "explicit"}: failures.append(f"{criteria_file}: metadata.invocation must be natural or explicit") if task_type not in {"implementation", "cleanup", "review"}: failures.append(f"{criteria_file}: metadata.task_type must be implementation, cleanup, or review") + if evidence_type is not None and evidence_type not in EVIDENCE_TYPES: + failures.append( + f"{criteria_file}: metadata.evidence_type must be one of {sorted(EVIDENCE_TYPES)}" + ) + if scenario.parent.name == "evals-regression" and evidence_type not in { + "solved_regression", + "skill_context_dependent", + }: + failures.append( + f"{criteria_file}: regression scenarios must set metadata.evidence_type to " + "solved_regression or skill_context_dependent" + ) + scenario_text = f"{scenario.name}\n{task_text}\n{json.dumps(data, sort_keys=True)}" + detected_skill_context = is_skill_context_dependent_text(scenario_text) + if evidence_type == "skill_context_dependent" and scenario.parent.name != 
"evals-regression": + failures.append( + f"{criteria_file}: metadata.evidence_type=skill_context_dependent must live in evals-regression" + ) + if evidence_type == "solved_regression" and scenario.parent.name != "evals-regression": + failures.append( + f"{criteria_file}: metadata.evidence_type=solved_regression must live in evals-regression" + ) + if evidence_type == "ordinary_lift" and scenario.parent.name == "evals-regression": + failures.append( + f"{criteria_file}: metadata.evidence_type=ordinary_lift must live in evals or evals-reference" + ) + if evidence_type != "skill_context_dependent" and detected_skill_context: + failures.append( + f"{criteria_file}: scenario appears skill-context-dependent; set " + "metadata.evidence_type to skill_context_dependent and keep it in evals-regression" + ) + if evidence_type != "skill_context_dependent" and task_type == "review": + for index, item in enumerate(checklist, start=1): + if not isinstance(item, dict): + continue + criterion_text = text_of(item) + if any(re.search(pattern, criterion_text) for pattern in INTERNAL_LABEL_ALLOW_PATTERNS): + failures.append( + f"{criteria_file}: checklist item {index} appears to allow internal workflow " + "labels in ordinary review output; prohibit them or move the scenario to " + "skill-context-dependent regression if exact workflow wording is required" + ) - task_text = task_file.read_text(encoding="utf-8") if task_file.is_file() else "" if task_text and not re.search(r"\bAssume Java\s+\d+\b", task_text): failures.append(f"{task_file}: task must state the Java version to assume, e.g. 
'Assume Java 17.'") has_explicit_invocation = invocation_from_task(task_text) @@ -212,6 +421,139 @@ def validate_scenario(scenario: Path, main_eval_root: Path | None) -> list[str]: return failures +def validate_cross_suite_duplicates(dirs: list[Path]) -> list[str]: + failures: list[str] = [] + active = [scenario for scenario in dirs if scenario.parent.name == "evals"] + reference = [ + scenario + for scenario in dirs + if scenario.parent.name in {"evals-reference", "evals-regression"} + ] + for active_scenario in active: + if not (active_scenario / "task.md").is_file(): + continue + active_task = (active_scenario / "task.md").read_text(encoding="utf-8") + for reference_scenario in reference: + if not (reference_scenario / "task.md").is_file(): + continue + reference_task = (reference_scenario / "task.md").read_text(encoding="utf-8") + similarity = task_similarity(active_task, reference_task) + if similarity >= 0.85: + failures.append( + f"{active_scenario}: task.md is too similar to {reference_scenario} " + f"(normalized task overlap {similarity:.2f})" + ) + return failures + + +def validate_runtime_reference_overlap(dirs: list[Path]) -> list[str]: + failures: list[str] = [] + references_root = Path("skills/java-optionals/references") + if not references_root.exists(): + return failures + + runtime_text = "\n".join( + path.read_text(encoding="utf-8") for path in sorted(references_root.glob("*.md")) + ) + runtime_identifiers = domain_identifiers(runtime_text) + runtime_words = normalized_words(runtime_text) + runtime_grams = ngrams(runtime_words, 12) + + for scenario in dirs: + if scenario.parent.name != "evals": + continue + task_file = scenario / "task.md" + if not task_file.exists(): + continue + task_text = task_file.read_text(encoding="utf-8") + if is_skill_context_dependent_text(f"{scenario.name}\n{task_text}"): + continue + + task_identifiers = domain_identifiers(task_text) + shared_identifiers = sorted(task_identifiers & runtime_identifiers) + task_words = 
normalized_words(task_text) + task_grams = ngrams(task_words, 12) + long_overlap_count = len(task_grams & runtime_grams) + + if long_overlap_count >= 8 and len(shared_identifiers) >= 8: + failures.append( + f"{scenario}: task.md overlaps runtime references too closely; shared identifiers: " + f"{', '.join(shared_identifiers[:12])}" + ) + return failures + + +def runtime_reference_overlap_warnings(dirs: list[Path]) -> list[str]: + warnings: list[str] = [] + references_root = Path("skills/java-optionals/references") + if not references_root.exists(): + return warnings + + runtime_text = "\n".join( + path.read_text(encoding="utf-8") for path in sorted(references_root.glob("*.md")) + ) + runtime_identifiers = domain_identifiers(runtime_text) + runtime_api_markers = api_markers(runtime_text) + runtime_words = normalized_words(runtime_text) + runtime_grams = ngrams(runtime_words, 12) + + for scenario in dirs: + if scenario.parent.name != "evals": + continue + task_file = scenario / "task.md" + criteria_file = scenario / "criteria.json" + if not task_file.exists() or not criteria_file.exists(): + continue + task_text = task_file.read_text(encoding="utf-8") + criteria_text = criteria_file.read_text(encoding="utf-8") + combined_text = f"{task_text}\n{criteria_text}" + if is_skill_context_dependent_text(f"{scenario.name}\n{combined_text}"): + continue + + shared_identifiers = sorted(domain_identifiers(combined_text) & runtime_identifiers) + shared_api_markers = sorted(api_markers(combined_text) & runtime_api_markers) + combined_grams = ngrams(normalized_words(combined_text), 12) + long_overlap_count = len(combined_grams & runtime_grams) + repeated_api_shape = ( + "blocking" in combined_text.lower() + and "bounded" in combined_text.lower() + and "Gatherers.mapConcurrent" in shared_api_markers + and "Map.entry" in shared_api_markers + ) + if ( + len(shared_identifiers) >= 4 + or (len(shared_identifiers) >= 3 and long_overlap_count) + or repeated_api_shape + ): + 
warnings.append( + f"{scenario}: task.md plus criteria.json are close to runtime references; " + f"document a focused-coverage rationale if intentional. Shared identifiers: " + f"{', '.join(shared_identifiers[:12]) or '(none)'}; shared API markers: " + f"{', '.join(shared_api_markers[:12]) or '(none)'}" + ) + + return warnings + + +def validate_scenario_path_references() -> list[str]: + failures: list[str] = [] + files = [path for path in SCENARIO_REFERENCE_FILES if path.exists()] + for directory in SCENARIO_REFERENCE_DIRS: + if directory.exists(): + files.extend(sorted(directory.rglob("*.md"))) + + pattern = re.compile(r"`?((?:evals|evals-reference|evals-regression)/[A-Za-z0-9_.\-/]+)`?") + for path in files: + text = path.read_text(encoding="utf-8") + for match in pattern.finditer(text): + candidate = match.group(1).rstrip(".,);:") + if "*" in candidate: + continue + if not Path(candidate).exists(): + failures.append(f"{path}: stale scenario path reference {candidate!r}") + return failures + + def validate_runtime_references() -> list[str]: failures: list[str] = [] root = Path("skills/java-optionals/references") @@ -227,6 +569,23 @@ def validate_runtime_references() -> list[str]: return failures +def validate_agent_docs_self_contained() -> list[str]: + failures: list[str] = [] + root = Path("docs/agents") + if not root.exists(): + return failures + for path in sorted(root.glob("*.md")): + text = path.read_text(encoding="utf-8") + for pattern in AGENT_DOC_FORBIDDEN_EXTERNAL_HISTORY_PATTERNS: + match = pattern.search(text) + if match: + failures.append( + f"{path}: docs/agents must be self-contained; remove external history " + f"reference {match.group(0)!r}" + ) + return failures + + def validate_numbering(root: Path) -> list[str]: failures: list[str] = [] if not root.exists(): @@ -326,7 +685,14 @@ def main() -> int: for path in paths: if path.is_dir(): failures.extend(validate_numbering(path)) + failures.extend(validate_cross_suite_duplicates(dirs)) + warnings = 
runtime_reference_overlap_warnings(dirs) + failures.extend(validate_scenario_path_references()) failures.extend(validate_runtime_references()) + failures.extend(validate_agent_docs_self_contained()) + + for warning in warnings: + print(f"warning: {warning}", file=sys.stderr) if failures: for failure in failures: From 424a96c4f9ff991d63d40309cc4b9286d3c76088 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Martin?= Date: Mon, 8 Jun 2026 13:16:45 +0200 Subject: [PATCH 3/5] docs(evals): document suite workflow policy --- CONTRIBUTING.md | 61 +++++++- docs/agents/evals.md | 38 ++++- docs/agents/maintaining-agent-docs.md | 9 ++ docs/agents/workflow.md | 209 ++++++++++++++++++++------ evals-regression/README.md | 43 +++++- 5 files changed, 302 insertions(+), 58 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4a2c667..5fb183a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -90,8 +90,8 @@ Run these before committing skill, eval, README, package, script, or CI changes: ```bash python3 scripts/validate_skill.py skills/java-optionals python3 scripts/validate_eval_criteria.py evals evals-reference evals-regression -python3 -m py_compile scripts/validate_skill.py scripts/validate_eval_criteria.py -bash -n scripts/check_publish_dry_run.sh +python3 -m py_compile scripts/*.py +bash -n scripts/*.sh tessl plugin lint . ``` @@ -196,10 +196,23 @@ history simple and use one clear commit for a focused pull request. Hosted evals are useful when a change affects the skill behavior, benchmark scenarios, or README score claims. They require Tessl authentication and a linked Tessl project. -If you have your own Tessl workspace, link your checkout to your own project and run: +Use Sonnet 4.6 for this repository's main eval checks. Prefer `scripts/run_eval_suite.sh`; it runs +from the plugin root and chooses variants by suite purpose. 
+ +Run hosted eval variants by suite purpose: + +- `evals/`: run both `without-context` and `with-context`; these runs support public lift + reporting. Use `scripts/run_eval_suite.sh main`. +- `evals-reference/`: run both `without-context` and `with-context`; these runs decide whether a + scenario has meaningful lift or should move suites. Use `scripts/run_eval_suite.sh reference`. +- `evals-regression/`: run `with-context` only by default; these runs are safety checks, not lift + discovery. Run regression `without-context` only when deliberately checking whether a scenario + should move back to `evals-reference/`. Use `scripts/run_eval_suite.sh regression`. + +For example: ```bash -tessl eval run --variant with-context --variant without-context . +scripts/run_eval_suite.sh main ``` If you don't have a Tessl workspace, that's fine. Open the pull request with the local check results, @@ -207,9 +220,9 @@ and a maintainer can run the hosted evals before release. In this repository, the main eval set lives in `evals/` and is used for public lift reporting. `evals-reference/` contains candidate and diagnostic coverage that helps tune the skill or decide -what to promote later. `evals-regression/` contains scenarios that are consistently solved by both -with-context and without-context; these are useful safety checks, but they do not directly drive the -main lift claim. +what to promote later. `evals-regression/` contains solved scenarios and skill-context-dependent +checks; these are useful with-context safety checks, but they do not directly drive the main lift +claim. The Java Optional skill is broadly about Optional correctness, readability, fallback timing, boundary handling, stream interop, primitive Optional usage, and avoiding cleanup changes that @@ -235,10 +248,37 @@ baseline-solved scenarios just to improve lift. Move repeatedly baseline-solved `evals-regression/` only when hosted evidence shows both variants are consistently 100%. 
Keep low-delta but still diagnostic scenarios in `evals-reference/`. +Use `metadata.evidence_type` when scenario placement needs to be explicit: + +- `ordinary_lift`: both variants are fair to compare, so the scenario can live in main or reference. + This value is invalid in `evals-regression/`. +- `solved_regression`: hosted history shows both variants solve the scenario at 100%. This value is + invalid outside `evals-regression/`. +- `skill_context_dependent`: the scenario requires exact skill-provided text, commands, procedures, + checklists, headers, or bundled reference text. It must live in `evals-regression/`. + +When adding a new scenario, classify it from an isolated hosted run: + +```bash +tessl eval view --json > /tmp/eval-run.json +scripts/classify_eval_result.py /tmp/eval-run.json --scenario-dir <scenario-dir> +``` + +The main promotion floor is 30 percentage points. Treat that as maintainer policy for future +promotion or demotion decisions, not as a current hosted benchmark result. Old hosted deltas are +historical evidence only until rerun against the current active suite membership, denominator, +commit/ref, natural/explicit split, and pinned CLI behavior. + When with-context is below 100%, keep the scenario wherever it already lives. Fix the skill or eval there, then rerun only that targeted scenario until it is clean before running broader suites. After targeted failures are clean, run `evals/` for the main score, relevant `evals-reference/` scenarios -for nearby behavior, and `evals-regression/` only for final release safety or broad changes. +with both variants for nearby behavior, and `evals-regression/` with context only for final release +safety or broad changes. + +Every substantive eval scenario edit requires a targeted hosted rerun of that scenario before the PR +is ready.
A pure move between `evals/`, `evals-reference/`, and `evals-regression/` does not need a +hosted rerun when `task.md`, scoring criteria, and `capability.txt` are unchanged except for +suite-placement metadata or numbering notes; run local validators and update suite totals instead. Runtime skill references must not contain eval inventories, expected answers, score rubrics, hosted run IDs, or benchmark claims. Put maintainer-only eval history in `docs/agents/`. @@ -271,6 +311,11 @@ Before merging a release pull request: - confirm `README.md` stays user-focused; - confirm contributor-only process details live here. +Manual Tessl publishing is for maintainer-approved recovery only. Dispatch `publish-tessl.yml` with +an explicit `ref`; normal releases should use the fully qualified tag that matches +`.tessl-plugin/plugin.json` as `refs/tags/v<version>`. Publishing a branch or other non-tag ref +requires the workflow's explicit `allow_non_tag_ref` override. + ## Dependency Updates Renovate keeps GitHub Actions, commitlint, and pinned action digests current. Major updates need diff --git a/docs/agents/evals.md b/docs/agents/evals.md index d91f542..c5287c3 100644 --- a/docs/agents/evals.md +++ b/docs/agents/evals.md @@ -41,15 +41,29 @@ benchmark claims, or scoring rules. - Include evals where the agent writes new Optional code, not only reviews or refactors snippets. - Review-only or no-op evals must still require a concrete artifact, such as `review.md`, so empty answers can't pass by accident. +- Skill-context-dependent evals require information that only comes from the skill package or agent + instructions, such as exact wording, commands, procedures, checklists, headers, or bundled + reference text. Keep them in `evals-regression/` once with-context is 100%, regardless of the + without-context score.
Do not count them in the main or reference lift score, do not describe them + as natural activation or independent Java Optional reasoning, and do not call weighted checklist + items hard gates. - Keep three eval buckets: - `evals/` is the main eval set used for public lift reporting. - `evals-reference/` is for candidate, diagnostic, and broad coverage scenarios that may still help tune or promote future main evals. - `evals-regression/` is for scenarios that hosted history shows are consistently solved by both - with-context and without-context. These protect against regressions but should not be part of + with-context and without-context, plus skill-context-dependent checks that are only fair as + with-context regression coverage. These protect against regressions but should not be part of normal lift discovery runs. - Every scenario directory must contain `task.md`, `criteria.json`, and `capability.txt`. - Every `criteria.json` must classify `metadata.invocation` and `metadata.task_type`. +- Use `metadata.evidence_type` when scenario placement needs to be explicit: + - `ordinary_lift`: an ordinary main or reference scenario where both variants are fair to compare. + This value is invalid in `evals-regression/`. + - `solved_regression`: a regression scenario moved because hosted evidence shows both variants + repeatedly score 100%. + - `skill_context_dependent`: a regression scenario that requires skill-package or agent-instruction + context, so without-context comparison is not fair. - Every main eval criterion must classify `category` as `safety`, `optional_quality`, or `maintainability`. - Main eval implementation scenarios need compile/artifact checks and behavior checks as safety @@ -88,6 +102,12 @@ benchmark claims, or scoring rules. reference depending on coverage and weighting. - `with-context = 100` and `without-context = 100` repeatedly: candidate for `evals-regression/`. 
+- A new scenario should not move to main unless its percentage-point delta is at least 30 percentage + points and it improves capability coverage. Treat 30 pp as maintainer policy for future promotion + or demotion decisions, not as a current hosted benchmark result. Old hosted deltas are historical + evidence only; do not use them for release-readiness claims, public score/lift claims, or current + benchmark claims until they are rerun against the current active suite membership, denominator, + commit/ref, natural/explicit split, and pinned CLI behavior. - Don't hide scenarios merely because the baseline solves them. Move them to `evals-regression/` only when they're consistently solved by both variants and are better as safety-net coverage than lift or diagnostic evidence. @@ -99,14 +119,26 @@ benchmark claims, or scoring rules. as active documentation. Keep current policy in these docs and use git history for old answer keys, replay logs, and one-off run details. - Keep hosted eval usage minimal while preserving confidence: - - For skill or eval changes, first run only the affected scenario directories, with both variants - when lift or regression risk matters. + - Use `scripts/run_eval_suite.sh` so variants match suite purpose and runs use the plugin context. + - Main and reference scenarios run with both variants. + - Regression scenarios run with context only by default. Run regression without-context only when + intentionally checking whether a scenario should move back to reference. + - For skill or eval changes, first run only the affected scenario directories. - If any affected with-context result is below 100%, keep rerunning only those targeted scenarios after fixes until they are clean. - Then run `evals/` for the main score. - Run relevant `evals-reference/` scenarios when deciding promotion or checking nearby behavior. - Run `evals-regression/` as a final safety check before release or after broad changes, not on every tuning loop. 
+ - A pure move between `evals/`, `evals-reference/`, and `evals-regression/` does not need a hosted + rerun when `task.md`, scoring criteria, and `capability.txt` are unchanged except for + suite-placement metadata or numbering notes. + +Current active suite structure: + +- `evals/`: 4 scenarios, 360 checklist points, 3 natural and 1 explicit. +- `evals-reference/`: 46 scenarios, 2470 checklist points, broad candidate and diagnostic coverage. +- `evals-regression/`: 2 scenarios, 200 checklist points, with-context safety coverage. ## Checks diff --git a/docs/agents/maintaining-agent-docs.md b/docs/agents/maintaining-agent-docs.md index fb25d8e..a0b7916 100644 --- a/docs/agents/maintaining-agent-docs.md +++ b/docs/agents/maintaining-agent-docs.md @@ -39,6 +39,15 @@ keep. Don't auto-resolve. When editing these docs, delete or rewrite anything redundant, vague, or too obvious to be actionable. +## Self-Contained Docs + +Files under `docs/agents/` must be self-contained maintainer guidance. Do not require agents to load +GitHub issues, pull requests, transcripts, hosted run pages, or other external history to understand +the rule. Avoid specific issue numbers, PR numbers, hosted run IDs, and links to those records. If +historical evidence matters, summarize the relevant decision, scores, and durable rationale in the +doc itself. External links are acceptable only for stable public references or current data that must +be fetched fresh. + ## Minimal Template For New Pages ```md diff --git a/docs/agents/workflow.md b/docs/agents/workflow.md index 11a7566..bd681ef 100644 --- a/docs/agents/workflow.md +++ b/docs/agents/workflow.md @@ -2,74 +2,197 @@ ## Scope -Use this for day-to-day work in this repository: auth checks, validation, commits, and pushes. +Use this for day-to-day work in this repository: auth checks, validation, commits, pushes, and +release-readiness. 
## Rules - If a Tessl or GitHub command fails because auth, login, workspace, or permission state appears - missing, re-check after the user says they changed it. Don't keep assuming the old state. -- When the maintainer explicitly asks for autonomous pull request work, carry it through - implementation, validation, commit, push, and PR creation unless they ask to stop earlier. -- Before committing changes to the skill, README, evals, package metadata, scripts, CI, agent docs, - or this file, run: + missing, re-check after the user says they changed it. +- When the maintainer explicitly asks for autonomous repository work, carry it through + implementation, validation, commit, push, and pull request creation unless they ask to stop + earlier. +- Before committing changes to the skill, README, evals, package metadata, scripts, CI, or agent + docs, run: ```bash python3 scripts/validate_skill.py skills/java-optionals python3 scripts/validate_eval_criteria.py evals evals-reference evals-regression - python3 -m py_compile scripts/validate_skill.py scripts/validate_eval_criteria.py - bash -n scripts/check_publish_dry_run.sh + python3 -m py_compile scripts/*.py + bash -n scripts/*.sh tessl plugin lint . bash scripts/check_publish_dry_run.sh . tessl plugin publish --dry-run --bump patch . tessl plugin publish --dry-run . ``` -- After a version has been published, the dry-run may fail only because that exact version already - exists. For docs-only changes that don't need a new plugin release, record that as expected and don't - bump the version. For skill, eval, or package changes that should be published, bump the - version before publishing again. - -- PR CI runs tokenless validation and plugin lint. Authenticated Tessl publish dry-runs run only on - trusted `main` pushes and release/publish workflows. The optional skill review workflow runs - `tessl skill review --threshold 100` when `TESSL_TOKEN` is configured. 
-- Use `tessl plugin publish --dry-run --bump patch .` as a PR-safe local/manual dry-run when the - current manifest version is already published. Release publishing uses exact-version - `tessl plugin publish --dry-run .` immediately before `tessl plugin publish .`. -- Tessl release publishing runs in the `tessl-release` GitHub environment. Configure required - reviewers or other environment protection rules in the repository settings when the plan supports - them. -- Keep the review threshold at 100 while the skill passes that bar without weakening its Java - guidance. Don't remove useful Java guidance only to improve the review score. -- For skill behavior or eval changes, run hosted evals with the smallest useful set first. Start - with targeted affected scenarios: + If `tessl plugin publish --dry-run .` fails only because the current manifest version has already + been published, record that as expected for ordinary non-release changes and rely on the + patch-bump dry-run for PR safety. For skill, eval, package, or release changes that should publish + a new version, let Release Please bump the version before the exact-version publish check. + +- For skill behavior or eval changes, run hosted evals with Sonnet 4.6, but start with the smallest + useful set to conserve Tessl daily rate-limit budget. Use `scripts/run_eval_suite.sh` so the run + uses plugin context and the right variant policy. + + If any eval scenario's `task.md`, `criteria.json`, or `capability.txt` changed, run that exact + scenario before finishing the PR. A pure move between `evals/`, `evals-reference/`, and + `evals-regression/` does not need a hosted rerun when the task, scoring criteria, and capability + text are unchanged except for suite-placement metadata or numbering notes; run local validators + and update suite totals instead. 
The with-context result for every substantively changed scenario + must be 100% before broader suite results, benchmark claims, or release-readiness claims are + trusted. This rule applies even when the edit looks like a prompt cleanup or metadata-only scoring + clarification. If Tessl hosted evals are unavailable, the PR must document the blocker and + remaining targeted runs; do not make benchmark or release-readiness claims until those runs pass. + + Run targeted affected main or reference scenarios with both variants: + + ```bash + scripts/run_eval_suite.sh main + scripts/run_eval_suite.sh reference + ``` + + Run targeted affected regression scenarios with context only by default: ```bash - tessl eval run --variant with-context --variant without-context + scripts/run_eval_suite.sh regression ``` If any targeted with-context result is below 100%, fix the skill or eval and rerun only those targeted scenarios until they are clean. Then run the main eval set: ```bash - tessl eval run --variant with-context --variant without-context . + scripts/run_eval_suite.sh main + ``` + + For runtime skill text or runtime reference changes, progressively widen the hosted checks before + calling the PR done: first affected scenarios, then the full main suite, then every reference + scenario with both variants, then every regression scenario with context only. The final post-change + evidence must show 100% with context for every retained scenario in every suite. Run regression + without-context only when intentionally checking whether a scenario should move back to reference. + If a broad run finds isolated failures, fix and rerun those scenarios targeted after the fix before + spending rate-limit budget on another broad suite run; once targeted failures are clean, finish the + remaining broad suites that have not yet run against the final skill state. If Tessl hosted evals + are unavailable or rate-limited, document the exact missing runs and do not call the PR + release-ready. 
+ + Release evals only cover the published main suite. After any runtime skill text or runtime + reference change, a successful publish run is not enough by itself: before saying the release or + repository is done, also verify that every reference scenario has run with both variants and every + regression scenario has run with context only against the final skill state. These runs may be + split across targeted and suite runs to conserve Tessl quota, but they must be after the last + runtime-context change. If quota, auth, or hosted availability blocks the broad reference or + regression checks, open or update a GitHub issue with the exact missing commands, run IDs already + completed, and the blocking condition. + +- When adding or moving one scenario, classify it from the isolated run before choosing the final + suite: + + ```bash + tessl eval view --json > /tmp/eval-run.json + scripts/classify_eval_result.py /tmp/eval-run.json --scenario-dir <scenario-dir> + ``` + + Follow the recommendation unless the pull request documents a maintainer-approved override. + +- Run the Tessl skill review at threshold 100 when changing runtime skill content: + + ```bash + tessl skill review --threshold 100 skills/java-optionals/SKILL.md ``` - Run relevant `evals-reference/` scenarios for nearby behavior and `evals-regression/` only as a - final safety check before release or after broad changes. - Pull request titles and commits must use Conventional Commits. Release Please uses them to update - `CHANGELOG.md`, `.tessl-plugin/plugin.json`, and GitHub releases. When Release Please creates a - release with `GITHUB_TOKEN`, the normal `release: published` trigger does not fire, so the Release - Please workflow dispatches `.github/workflows/publish-tessl.yml` with the created tag. Tessl - publishing still happens only in `.github/workflows/publish-tessl.yml`. -- Renovate manages GitHub Actions, action digests, commitlint packages, and the pinned Tessl CLI - version in workflows.
Keep `minimumReleaseAge` at 7 days with `internalChecksFilter: "strict"` so - Renovate waits before creating branches or PRs for updates that haven't passed the age gate. Keep - custom managers only for dependencies Renovate can't detect natively: commitlint packages installed - inside workflow shell commands, and the Tessl CLI version passed to `tesslio/setup-tessl`. Don't - add Maven, Docker, or vendored Tessl dependency rules unless those files exist here. - -- For maintainer-requested automation tasks where the user has asked for a pull request, commit and - push finished changes. Otherwise, don't push without explicit instruction. + `CHANGELOG.md`, `.tessl-plugin/plugin.json`, and GitHub releases. + Any change that could affect hosted lift, baseline score, with-context score, skill activation, runtime skill behavior, active eval tasks, active eval criteria, or active eval membership must be made in a separate commit. Label it as lift-sensitive in the PR summary and include a revert strategy. + - Use `fix(skill): ...` for corrections to `skills/java-optionals/SKILL.md` or files it links as + runtime references. + - Use `feat(skill): ...` when adding a new runtime capability or materially broader skill behavior. + - Use `test(evals): ...` when adding, moving, or reclassifying scenarios without changing their + scoring intent. + - Use `fix(evals): ...` when correcting a flawed task, criterion, score interpretation, or unfair + eval expectation. + - Use `docs: ...` only for user/contributor/agent docs that do not change runtime skill behavior + and do not change eval scoring or suite membership. + - Use the PR title type/scope for the highest-impact change in the PR; if runtime skill behavior + changed, the PR title should normally be `fix(skill)` or `feat(skill)`, not `docs`. 
+ + When Release Please creates a release with `GITHUB_TOKEN`, the normal `release: published` trigger + does not fire, so the Release Please workflow dispatches `.github/workflows/publish-tessl.yml` with + the created tag. Tessl publishing still happens only in `.github/workflows/publish-tessl.yml`. + That workflow requires an explicit publish ref and validates that release tags match + `.tessl-plugin/plugin.json` as `v<version>`. + Release Please PRs created or updated with `GITHUB_TOKEN` may not trigger ordinary `pull_request` + workflows, so `.github/workflows/release-please.yml` also posts the required release-PR + `Commitlint` and `Validate skill and plugin` statuses. Prefer the Release Please `pr` output for + those statuses, and fall back to the existing pending release PR only when Release Please emits no + PR output because the PR was unchanged. +- When the maintainer asks for a release, keep Release Please as the source of truth. Do not edit + `CHANGELOG.md`, `.release-please-manifest.json`, `.tessl-plugin/plugin.json`, tags, or GitHub + releases by hand unless the maintainer explicitly asks to repair broken release state. + + If a Release Please PR is already open: + + ```bash + gh pr list --state open --author "github-actions[bot]" \ + --head release-please--branches--main--components--java-optionals + gh pr checks <pr-number> --fail-fast=false + ``` + + Make sure the PR only contains Release Please files (`CHANGELOG.md`, + `.release-please-manifest.json`, `.tessl-plugin/plugin.json`) and that required checks pass. If + the release PR has no checks because it was just created, rerun the Release Please workflow for the + current `main` run so it finds the existing PR and attaches the validation statuses. Then merge the + release PR with the repository's linear-history merge method, normally squash merge, and wait for + `.github/workflows/publish-tessl.yml` to finish.
+ + If no Release Please PR is open: + + ```bash + git status --short --branch + git log --oneline "$(git describe --tags --abbrev=0)"..main + gh run list --workflow release-please.yml --limit 5 + ``` + + If unreleased commits already include a releasable Conventional Commit such as `fix:` or `feat:`, + rerun or trigger the Release Please workflow on `main` and wait for the release PR. If the only + unreleased commits are non-releasable types such as `docs:`, `test:`, or `chore:`, and the + maintainer still wants a new published version, create an empty releasable commit that accurately + describes why a release is needed, for example: + + ```bash + git commit --allow-empty -m "fix(evals): publish updated main eval suite" + git push origin main + ``` + + Then let Release Please open the release PR, validate it, merge it, and wait for the Tessl publish + workflow. After the publish run completes, confirm the GitHub release, Tessl latest version, and + that no stale Release Please PR or branch remains. + + For a maintainer-approved manual publish, dispatch `publish-tessl.yml` with an explicit `ref`. + Normal releases should use the fully qualified release tag, which must equal + `refs/tags/v<version>`. Publishing a branch or other non-tag ref requires setting + `allow_non_tag_ref=true`; use that only for recovery when Release Please cannot complete the + normal handoff. + + If the release contains any runtime skill text or runtime reference change, do not stop after the + registry main eval passes. Confirm the post-change eval evidence also includes: + + ```bash + scripts/run_eval_suite.sh reference + scripts/run_eval_suite.sh regression + ``` + + `reference` must be run with both variants through the wrapper. `regression` must be run with + context only through the wrapper. If these broad suite runs were already completed after the final + runtime-context commit, reuse those run IDs; otherwise run them before reporting the release as + complete.
The completion report must state the main release eval run plus the reference and + regression run IDs, or link the GitHub issue that records why the remaining checks are blocked. +- The GitHub repository is public. Keep docs, metadata, license, security policy, and contribution + workflow public-safe. +- Keep `.tessl-plugin/plugin.json` public with `"private": false`, but do not run a real Tessl + publish unless the maintainer explicitly asks for publication. +- For maintainer-requested automation tasks where the user has asked for GitHub state, commit and + push finished changes. ## References diff --git a/evals-regression/README.md b/evals-regression/README.md index ca61b82..b25c232 100644 --- a/evals-regression/README.md +++ b/evals-regression/README.md @@ -1,12 +1,47 @@ # Regression Evals This directory is for scenarios that are consistently solved by both the with-context and -without-context variants in hosted runs. +without-context variants in hosted runs, plus skill-context-dependent scenarios that are only fair as +with-context regression checks. Keep these scenarios out of the main lift score and out of normal reference-candidate runs. Run them as a final safety check before release, after broad skill changes, or when the changed area is directly related to one of these scenarios. -Do not move a scenario here just because it currently fails with context. If with-context is below -100%, keep the scenario in its current suite, fix the skill or eval in place, and run that scenario -targeted until it is clean before moving on to broader eval runs. +Run regression evals with context only by default: + +```bash +scripts/run_eval_suite.sh regression +``` + +Do not run regression `without-context` during normal maintenance. Without-context regression runs +are only useful when deliberately checking whether a scenario should move back to +`evals-reference/`. + +With-context must be 100% for every regression scenario. 
Do not move a scenario here just because it +currently fails with context. If with-context is below 100%, keep the scenario in its current suite, +fix the skill or eval in place, and run that scenario targeted until it is clean before moving on to +broader eval runs. + +Skill-context-dependent scenarios also live here. They require exact skill-provided text, commands, +procedures, checklists, headers, or bundled reference text. Use their with-context results as +regression coverage and do not count their without-context scores as fair lift evidence, regardless +of how the without-context variant happens to score. + +Mark skill-context-dependent scenarios with: + +```json +"metadata": { + "evidence_type": "skill_context_dependent" +} +``` + +Mark solved regression scenarios with: + +```json +"metadata": { + "evidence_type": "solved_regression" +} +``` + +Every regression scenario must declare either `solved_regression` or `skill_context_dependent`. From 6b650d96f8688724a4ff5ac9022480356b68350f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Martin?= Date: Mon, 8 Jun 2026 13:17:13 +0200 Subject: [PATCH 4/5] docs: restore pull request template guidance --- .github/pull_request_template.md | 52 +++++++++++++------------------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index c4e21dc..dac15ce 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -46,10 +46,9 @@ Checks most contributors can run: - [ ] `python3 scripts/validate_skill.py skills/java-optionals` - [ ] `python3 scripts/validate_eval_criteria.py evals evals-reference evals-regression` -- [ ] `python3 -m py_compile scripts/validate_skill.py scripts/validate_eval_criteria.py` -- [ ] `bash -n scripts/check_publish_dry_run.sh` +- [ ] `python3 -m py_compile scripts/*.py` +- [ ] `bash -n scripts/*.sh` - [ ] `tessl plugin lint .` -- [ ] `markdownlint`, if Markdown changed - [ ] Manual 
rendered-doc or example review, if docs or examples changed Tessl-authenticated checks: @@ -57,10 +56,14 @@ Tessl-authenticated checks: - [ ] `bash scripts/check_publish_dry_run.sh .` - [ ] `tessl plugin publish --dry-run --bump patch .` - [ ] `tessl skill review --threshold 100 skills/java-optionals/SKILL.md`, if skill text or references changed -- [ ] Targeted `tessl eval run --variant with-context --variant without-context `, if - skill behavior or evals changed -- [ ] Full/main `tessl eval run --variant with-context --variant without-context .`, if benchmark - claims changed or targeted with-context results are clean +- [ ] Targeted main/reference `scripts/run_eval_suite.sh `, if skill behavior or those evals changed +- [ ] Targeted regression `scripts/run_eval_suite.sh regression `, if regression evals changed +- [ ] Every substantively changed eval scenario was rerun targeted and reached 100% with context, or the PR explains the Tessl blocker and remaining work +- [ ] Runtime skill/reference changes only: full `scripts/run_eval_suite.sh reference` was run after the final runtime-context change, or the PR links the blocker issue +- [ ] Runtime skill/reference changes only: full `scripts/run_eval_suite.sh regression` was run after the final runtime-context change, or the PR links the blocker issue +- [ ] Pure eval suite moves did not change task wording, scoring criteria, or capability text beyond suite-placement metadata/numbering notes +- [ ] `scripts/classify_eval_result.py --scenario-dir `, if a scenario was added or moved between suites +- [ ] Full/main `scripts/run_eval_suite.sh main`, if benchmark claims changed or targeted with-context results are clean `bash scripts/check_publish_dry_run.sh .`, `tessl skill review`, and hosted Tessl evals require Tessl authentication. Hosted evals also require a linked Tessl project. If you can't run one of @@ -83,30 +86,17 @@ explain why. 
## Review Checklist -- [ ] Docs updated, or N/A -- [ ] Evals updated, or N/A -- [ ] Scenario directories include `task.md`, `criteria.json`, and `capability.txt`, or N/A -- [ ] Scenario invocation style is classified as natural or explicit, or N/A -- [ ] Natural activation prompts don't explicitly invoke the skill, or N/A -- [ ] Explicit invocation prompts are labeled as explicit, or N/A -- [ ] Main eval criteria include compile/artifact checks, or N/A -- [ ] Main eval criteria include behavior correctness checks, or N/A -- [ ] Runtime references contain no eval answer keys, scenario inventory, hosted run IDs, or fixed - score claims -- [ ] If any with-context result was below 100%, targeted failing scenarios were fixed and rerun - before broader eval suites -- [ ] Java baseline compatibility has been considered, or N/A -- [ ] `OptionalInt`, `OptionalLong`, and `OptionalDouble` guidance has been considered, or N/A -- [ ] Optional-producing stream terminals and collectors are covered, or N/A -- [ ] Java 26 Javadocs were checked for Optional-family coverage, or N/A -- [ ] Valid README package-runner instructions were preserved, or N/A -- [ ] Tessl package commands match the verified plugin package format -- [ ] Full/reference eval reporting is not hidden or cherry-picked -- [ ] Tessl checks were run, or unavailability is documented -- [ ] PR title or squash title uses Conventional Commits -- [ ] Redaction checked: no Tessl tokens, GitHub tokens, package manager tokens, private repository - links, private eval artifacts, private registry/workspace links, local host paths, or - proprietary Java source +- [ ] The change is scoped to the sections, skill files, evals, or workflows described above. +- [ ] Validation that applies to this change is checked above, or any unavailable check is explained. +- [ ] If Java Optional guidance changed, Java baseline compatibility, fallback timing, null interop, primitive Optionals, and checked boundaries were considered. 
+- [ ] If evals or benchmark claims changed, the eval scenarios remain fair and do not leak answer keys, run IDs, or fixed score claims into runtime references. +- [ ] If runtime skill text or references changed, hosted checks were widened from targeted affected scenarios to main/reference/regression as described in `docs/agents/workflow.md`, or any Tessl blocker is documented. +- [ ] If a runtime skill/reference change was released, the final report includes the published main eval run plus post-change reference and regression run IDs, or a blocker issue for missing broad suites. +- [ ] Main and reference evals were run with both variants when hosted evals were needed; regression evals were run with context only unless reclassification back to reference was being checked. +- [ ] New or moved eval scenarios follow the classifier recommendation, or the PR explains the maintainer-approved override. +- [ ] Every retained eval scenario has a 100% with-context result, or any below-100 result is documented as blocking follow-up rather than classified/reportable coverage. +- [ ] PR title or squash title uses Conventional Commits. +- [ ] Redaction checked: no tokens, private links, private eval artifacts, local host paths, or proprietary Java source. 
## AI Assistance (if used) From b2cd1e1aadc3a11d32e66f569737371f76f74c2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Martin?= Date: Mon, 8 Jun 2026 13:17:13 +0200 Subject: [PATCH 5/5] docs: clarify optionals public metadata --- .tessl-plugin/plugin.json | 4 ++-- docs/agents/project-identity.md | 12 +++++++++--- docs/agents/public-metadata.md | 17 ++++++++++++----- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/.tessl-plugin/plugin.json b/.tessl-plugin/plugin.json index 9103921..8532c18 100644 --- a/.tessl-plugin/plugin.json +++ b/.tessl-plugin/plugin.json @@ -1,8 +1,8 @@ { "name": "martinfrancois/java-optionals", "version": "1.0.0", - "description": "Help AI coding agents use Java Optional well in new code and cleanups, without replacing one antipattern with another.", - "summary": "Help AI coding agents use Java Optional well in new code and cleanups, without replacing one antipattern with another.", + "description": "Help AI coding agents use Java Optional well in new code, review, and cleanup without replacing one antipattern with another.", + "summary": "Help AI coding agents use Java Optional well in new code, review, and cleanup without replacing one antipattern with another.", "repository": "https://github.com/martinfrancois/java-optionals-skill", "homepage": "https://github.com/martinfrancois/java-optionals-skill#readme", "license": "MIT", diff --git a/docs/agents/project-identity.md b/docs/agents/project-identity.md index 3be3ebb..08dc247 100644 --- a/docs/agents/project-identity.md +++ b/docs/agents/project-identity.md @@ -10,14 +10,20 @@ Use this when naming the repository, skill, package, workspace, or public source - Skill name: `java-optionals`. - Tessl package name: `martinfrancois/java-optionals`. - Tessl workspace: `martinfrancois`. -- If the Tessl project needs to be recreated, use: +- The GitHub repository is public. 
Keep public docs free of private paths, local transcript paths, + unpublished workspace details, and secret references. +- The Tessl plugin manifest is public with `"private": false`; keep it that way unless the + maintainer asks to return to private package metadata. +- If `tessl project repair` cannot relink this checkout and the Tessl project needs to be + recreated, use the pinned CLI project command: ```bash tessl project create --workspace martinfrancois java-optionals-skill ``` -- The GitHub repository is public. Keep public docs free of private paths, local transcript paths, - unpublished workspace details, and secret references. + Do this only for project identity recovery. It is not part of normal plugin publishing, release, + or eval execution. + - Keep the project independent of company naming. Don't add company names to the repo name, package name, README, or public metadata unless the user asks. - Don't mention private gists or secret references in public docs. Public origin links should point diff --git a/docs/agents/public-metadata.md b/docs/agents/public-metadata.md index 2b8e1ea..4f05ed0 100644 --- a/docs/agents/public-metadata.md +++ b/docs/agents/public-metadata.md @@ -8,17 +8,24 @@ topics. ## Rules - GitHub description should be short, clickable, and user-benefit focused. -- Current preferred shape: "Help AI coding agents use Java Optional well in new code and cleanups, - without replacing one antipattern with another." -- Use the maximum useful number of relevant discoverability topics when the repo becomes public. +- Current preferred wording: "Help AI coding agents use Java Optional well in new code, review, and + cleanup without replacing one antipattern with another." +- Use the maximum useful number of relevant discoverability topics for the public repository. - If asked about topics, report how many GitHub repositories exist for each topic when you can. 
- Before calling the repo OSS-ready, check for a license, no private/secret references, a user-focused README, contributor docs, passing lint, and benchmark claims that match the current evals. - Tessl packaging currently uses `.tessl-plugin/plugin.json`. Keep docs, scripts, workflows, and release config aligned with plugin terminology unless official docs and CLI behavior change. -- This repository currently uses `.tessl-plugin/plugin.json` as the active manifest. Do not add - `tile.json` unless current Tessl docs and CLI behavior require it. +- Do not add `tile.json` unless current Tessl docs and CLI behavior require it. +- The workflow-pinned Tessl CLI version accepts the current plugin format with + `.tessl-plugin/plugin.json`. + `tessl plugin lint .`, `tessl plugin publish --dry-run --skip-evals .`, and + `tessl plugin publish --dry-run --bump patch .` are the authority for package validity here. + `tessl plugin pack` must include `skills/java-optionals/SKILL.md` and the referenced files under + `skills/java-optionals/references/`. Do not add a `skills` field or migrate to `tile.json` unless + those pinned CLI checks or current official docs prove the active skill is not included, + discoverable, or publishable. ## References