diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index f22bc8b..4165f75 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -83,6 +83,21 @@ jobs: } >> "$GITHUB_OUTPUT" fi + # Verify that the mcpchecker JSON output format is compatible with our + # doc-generation script. This catches breaking changes in mcpchecker's + # output schema before they reach the weekly model-evaluation workflow. + - name: Validate eval script compatibility + if: always() && steps.parse_results.outcome == 'success' + run: | + RESULTS_FILE="e2e-tests/mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json" + if [ -f "$RESULTS_FILE" ]; then + ./scripts/update-model-evaluation.sh \ + --model-id "smoke-test" \ + --results "$RESULTS_FILE" + # Revert the doc file. + git checkout docs/model-evaluation.md + fi + - name: Upload test artifacts if: always() id: upload_artifacts diff --git a/e2e-tests/README.md b/e2e-tests/README.md index fcbef57..0e4dc54 100644 --- a/e2e-tests/README.md +++ b/e2e-tests/README.md @@ -70,10 +70,10 @@ Results are saved to `mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json`. ```bash # Summary -jq '.[] | {taskName, taskPassed}' mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json +jq '.results[] | {taskName, taskPassed}' mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json # Tool calls -jq '[.[] | .callHistory.ToolCalls[]? | {name: .request.Params.name, arguments: .request.Params.arguments}]' mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json +jq '[.results[] | .callHistory.ToolCalls[]? | {name: .request.Params.name, arguments: .request.Params.arguments}]' mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json ``` ## Test Cases diff --git a/scripts/update-model-evaluation.sh b/scripts/update-model-evaluation.sh index fa6e032..c2c3d56 100755 --- a/scripts/update-model-evaluation.sh +++ b/scripts/update-model-evaluation.sh @@ -76,8 +76,8 @@ END_MARKER="" # Generate the markdown block generate_block() { local total passed - total=$(jq 'length' "${RESULTS_FILE}") - passed=$(jq '[.[] | select(.taskPassed == true)] | length' "${RESULTS_FILE}") + total=$(jq '.results | length' "${RESULTS_FILE}") + passed=$(jq '[.results[] | select(.taskPassed == true)] | length' "${RESULTS_FILE}") local pct=$((100 * passed / total)) echo "${START_MARKER}" @@ -93,7 +93,7 @@ generate_block() { # Generate table rows jq -r ' - to_entries[] | + .results | to_entries[] | .key as $i | .value | ($i + 1) as $num | @@ -123,8 +123,8 @@ generate_block() { # Token totals local input_tokens output_tokens - input_tokens=$(jq '[.[].tokenEstimate.inputTokens] | add' "${RESULTS_FILE}") - output_tokens=$(jq '[.[].tokenEstimate.outputTokens] | add' "${RESULTS_FILE}") + input_tokens=$(jq '[.results[].tokenEstimate.inputTokens] | add' "${RESULTS_FILE}") + output_tokens=$(jq '[.results[].tokenEstimate.outputTokens] | add' "${RESULTS_FILE}") echo "**Total input tokens**: ${input_tokens} | **Total output tokens**: ${output_tokens}" echo "" echo "${END_MARKER}"