From f168f2ed5e4403d585ab3c37563529949bbda17a Mon Sep 17 00:00:00 2001 From: Garrett Spong Date: Wed, 17 Jun 2026 12:42:08 -0600 Subject: [PATCH] Add llm perf matrix workflow automation --- .github/scripts/llm-matrix/sync_matrix.sh | 31 ++++ .github/workflows/sync-llm-matrix-keyless.yml | 135 ++++++++++++++++++ ...large-language-model-performance-matrix.md | 30 ++-- .../open-source-models.csv | 2 + .../proprietary-models.csv | 6 + 5 files changed, 185 insertions(+), 19 deletions(-) create mode 100755 .github/scripts/llm-matrix/sync_matrix.sh create mode 100644 .github/workflows/sync-llm-matrix-keyless.yml create mode 100644 solutions/security/ai/llm-performance-matrix/open-source-models.csv create mode 100644 solutions/security/ai/llm-performance-matrix/proprietary-models.csv diff --git a/.github/scripts/llm-matrix/sync_matrix.sh b/.github/scripts/llm-matrix/sync_matrix.sh new file mode 100755 index 0000000000..28e29a58f7 --- /dev/null +++ b/.github/scripts/llm-matrix/sync_matrix.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Downloads generated LLM performance matrix CSVs from the GCS bucket populated by +# the Kibana `kibana-evals-security-matrix` Buildkite pipeline and copies them into +# the docs source tree. +# +# Required env: +# MATRIX_GCS_BUCKET GCS bucket name (no gs:// prefix). +# Optional env: +# MATRIX_DOMAIN Source prefix domain (default: security). +# MATRIX_PREFIX 'latest' (serverless/weekly) or a Stack version (default: latest). +# DEST_DIR Destination directory for the CSVs (default: solutions/security/ai/llm-performance-matrix).
+ +BUCKET_NAME="${MATRIX_GCS_BUCKET:?MATRIX_GCS_BUCKET is required}" +MATRIX_DOMAIN="${MATRIX_DOMAIN:-security}" +MATRIX_PREFIX="${MATRIX_PREFIX:-latest}" +DEST_DIR="${DEST_DIR:-solutions/security/ai/llm-performance-matrix}" + +SRC="gs://${BUCKET_NAME}/${MATRIX_DOMAIN}/${MATRIX_PREFIX}" + +mkdir -p "$DEST_DIR" + +for f in proprietary-models.csv open-source-models.csv; do + echo "Downloading ${SRC}/${f} -> ${DEST_DIR}/${f}" + gsutil cp "${SRC}/${f}" "${DEST_DIR}/${f}" + echo "OUTPUT_CSV_PATH=${DEST_DIR}/${f}" # Marker line: the workflow's sync step greps for this prefix to build its add-paths list. +done + +echo "Synced LLM matrix CSVs from ${SRC} into ${DEST_DIR}" diff --git a/.github/workflows/sync-llm-matrix-keyless.yml b/.github/workflows/sync-llm-matrix-keyless.yml new file mode 100644 index 0000000000..e9a90ccee1 --- /dev/null +++ b/.github/workflows/sync-llm-matrix-keyless.yml @@ -0,0 +1,135 @@ +name: Sync LLM performance matrix + +on: + # Weekly schedule keeps the serverless (latest) matrix fresh. + # Runs Tuesdays at 09:00 UTC, after the Kibana weekly evals + matrix generation pipelines. + schedule: + - cron: '0 9 * * 2' + + # Manual escape hatch for versioned (Stack release) updates. + workflow_dispatch: + inputs: + version: + description: 'Stack version to sync (e.g. 9.2). Leave blank to sync the serverless "latest" matrix to main.' 
+ required: false + default: '' + type: string + dry_run: + description: 'Dry run (skip PR creation)' + required: false + default: false + type: boolean + +permissions: + contents: write + pull-requests: write + id-token: write # Required for OIDC token authentication + +jobs: + sync-matrix: + runs-on: ubuntu-latest + + steps: + - name: Resolve target prefix and base branch + id: resolve + env: { VERSION: '${{ github.event.inputs.version }}' } # Passed via env so the user-supplied input is never spliced into the shell script (script injection). Empty on scheduled runs. + run: | + if [ -n "$VERSION" ]; then + echo "prefix=$VERSION" >> "$GITHUB_OUTPUT" + echo "base=$VERSION" >> "$GITHUB_OUTPUT" + echo "branch=automated/llm-matrix-sync-$VERSION" >> "$GITHUB_OUTPUT" + echo "title=[Automation] Update Security LLM performance matrix ($VERSION)" >> "$GITHUB_OUTPUT" + else + echo "prefix=latest" >> "$GITHUB_OUTPUT" + echo "base=main" >> "$GITHUB_OUTPUT" + echo "branch=automated/llm-matrix-sync-latest" >> "$GITHUB_OUTPUT" + echo "title=[Automation] Update Security LLM performance matrix" >> "$GITHUB_OUTPUT" + fi + + - name: Checkout repository + uses: actions/checkout@v6 + with: + ref: ${{ steps.resolve.outputs.base }} + + # Keyless authentication using Workload Identity Federation. 
+ - name: Authenticate to Google Cloud + uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093 # v3.0.0 + with: + workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ vars.GCP_SERVICE_ACCOUNT_EMAIL }} + project_id: ${{ vars.GCP_PROJECT_ID }} + access_token_scopes: 'https://www.googleapis.com/auth/devstorage.read_only' + + - name: Set up Cloud SDK + uses: google-github-actions/setup-gcloud@v2 + + - name: Sync matrix CSVs from GCS + id: run-sync + env: + MATRIX_GCS_BUCKET: ${{ vars.LLM_MATRIX_GCS_BUCKET }} + MATRIX_DOMAIN: security + MATRIX_PREFIX: ${{ steps.resolve.outputs.prefix }} + DEST_DIR: solutions/security/ai/llm-performance-matrix + run: | + chmod +x .github/scripts/llm-matrix/sync_matrix.sh + OUTPUT=$(.github/scripts/llm-matrix/sync_matrix.sh) + echo "$OUTPUT" + + ADD_PATHS="" + while IFS= read -r line; do + path="${line#OUTPUT_CSV_PATH=}" + if [ -n "$ADD_PATHS" ]; then + ADD_PATHS="${ADD_PATHS} + ${path}" + else + ADD_PATHS="${path}" + fi + done < <(echo "$OUTPUT" | grep '^OUTPUT_CSV_PATH=') + + echo "add_paths<<EOF" >> "$GITHUB_OUTPUT" # Opens the multiline output value; closed by the EOF line below. + echo "$ADD_PATHS" >> "$GITHUB_OUTPUT" + echo "EOF" >> "$GITHUB_OUTPUT" + + - name: Create Pull Request + if: github.event.inputs.dry_run != 'true' + uses: peter-evans/create-pull-request@c0f553fe549906ede9cf27b5156039d195d2ece0 # v8.1.0 + with: + token: ${{ github.token }} + base: ${{ steps.resolve.outputs.base }} + branch: ${{ steps.resolve.outputs.branch }} + delete-branch: true + title: ${{ steps.resolve.outputs.title }} + commit-message: | + Update Security LLM performance matrix CSVs + + Source prefix: ${{ steps.resolve.outputs.prefix }} + Workflow run: ${{ github.run_id }} + body: | + ## Summary + + Automated update to the [Security LLM performance matrix](${{ github.server_url }}/${{ github.repository }}/blob/${{ steps.resolve.outputs.base }}/solutions/security/ai/large-language-model-performance-matrix.md) CSVs, generated from Elastic Security LLM evaluation results. 
+ + - **Source prefix:** `${{ steps.resolve.outputs.prefix }}` + - **Generated by:** Kibana `kibana-evals-security-matrix` Buildkite pipeline -> GCS -> this workflow + + ### Review checklist + + - [ ] Review the CSV diffs (scores + Not recommended cells) + - [ ] Verify model lineup and column taxonomy + - [ ] **Merge this PR once approved** + + --- + + Automated update from the [sync-llm-matrix workflow](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}). + add-paths: | + ${{ steps.run-sync.outputs.add_paths }} + + - name: Upload CSV artifacts + if: always() && steps.run-sync.outcome == 'success' # Still runs if PR creation fails, but skipped when the sync itself failed and add_paths is empty. + uses: actions/upload-artifact@v6 + with: + name: llm-matrix-csv-${{ github.run_number }} + path: | + ${{ steps.run-sync.outputs.add_paths }} + retention-days: 30 + if-no-files-found: warn diff --git a/solutions/security/ai/large-language-model-performance-matrix.md b/solutions/security/ai/large-language-model-performance-matrix.md index 54640bab28..c6d5a4a04d 100644 --- a/solutions/security/ai/large-language-model-performance-matrix.md +++ b/solutions/security/ai/large-language-model-performance-matrix.md @@ -16,36 +16,28 @@ products: This page summarizes internal test results comparing large language models (LLMs) across {{elastic-sec}} [AI chat](/explore-analyze/ai-features/ai-chat-experiences.md) and AI-powered feature use cases. These ratings apply equally whether you're using [AI Assistant](/solutions/security/ai/ai-assistant.md) or [Agent Builder](/solutions/security/ai/agent-builder/agent-builder.md). To learn more about these use cases, refer to [AI-powered features](/explore-analyze/ai-features.md#security-features). ::::{important} -Higher scores indicate better performance. A score of 10 on a task means the model met or exceeded all task-specific benchmarks. +Higher scores indicate better performance. A score of 10 on a task means the model met or exceeded all task-specific benchmarks. Models with a score of "Not recommended" failed testing. 
This could be due to various issues, including context window constraints. :::: +% The tables below are generated automatically from Elastic Security LLM evaluation +% results. Do not edit them by hand: update the CSVs via the `Sync LLM performance matrix` +% automation (see .github/workflows/sync-llm-matrix-keyless.yml), which pulls the +% latest artifacts produced by the `kibana-evals-security-matrix` Buildkite pipeline. ## Proprietary models [_proprietary_models] Models from third-party LLM providers. -| **Model** | **Alerts** | **Security Knowledge** | **{{esql}} Query Generation** | **Knowledge Base Retrieval** | **Attack Discovery** | **Automatic Migration** | **Average Score** | -| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | -| **Opus 4.6** | 8.9 | 9.5 | 8.5 | 8.42 | 8.7 | 10 | **9** | -| **Sonnet 4.5** | 8.6 | 7.6 | 7.7 | 7.23 | 8 | 10 | **8.19** | -| **Opus 4.5** | 9 | 8.2 | 7.5 | 7.94 | 8.5 | 7.3 | **8.07** | -| **GPT 5.2** | 8.6 | 6.6 | 8 | 6 | 8.5 | 10 | **7.95** | -| **Sonnet 4** | 7.5 | 7.4 | 8 | 7.85 | 7 | 7.5 | **7.54** | -| **Sonnet 4.6** | 9.3 | 9.5 | 8.4 | 7.45 | Not recommended | 10 | **7.44** | -| **Sonnet 3.7** | 7.4 | 6.9 | 6.1 | 7.04 | 7 | 9.7 | **7.36** | -| **GPT 5.1** | 9.3 | 4.3 | 7.2 | 6 | 6.5 | 9.8 | **7.18** | -| **GPT 4.1 Mini** | 6.5 | 6.4 | 6 | 6.96 | 4.5 | 9.9 | **6.71** | -| **Gemini 2.5 Flash** | 7.8 | 6.2 | 4.4 | 5.71 | 6 | 9.81 | **6.65** | -| **Gemini 2.5 Pro** | 8 | 5.6 | 1.9 | 5.3 | 8.7 | 6.3 | **5.97** | -| **GPT 4.1** | 7.4 | 5.7 | 4.4 | 5.85 | 8 | 3.1 | **5.74** | +:::{csv-include} llm-performance-matrix/proprietary-models.csv +:caption: Scroll horizontally to view more information. +::: ## Open-source models [_open_source_models] Models you can [deploy yourself](/explore-analyze/ai-features/llm-guides/local-llms-overview.md). 
-| **Model** | **Alerts** | **Security Knowledge** | **{{esql}} Query Generation** | **Knowledge Base Retrieval** | **Attack Discovery** | **Automatic Migration** | **Average Score** | -| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | -| **GPT OSS 120B** | 7.6 | 3.7 | 5.5 | 6 | 3.5 | 9.7 | **6** | -| **GPT OSS 20b** | 8.2 | 1.5 | 2.5 | Not recommended | Not recommended | Not recommended | **2.03** | +:::{csv-include} llm-performance-matrix/open-source-models.csv +:caption: Scroll horizontally to view more information. +::: diff --git a/solutions/security/ai/llm-performance-matrix/open-source-models.csv b/solutions/security/ai/llm-performance-matrix/open-source-models.csv new file mode 100644 index 0000000000..9e82dc846f --- /dev/null +++ b/solutions/security/ai/llm-performance-matrix/open-source-models.csv @@ -0,0 +1,2 @@ +Model,Alert Triage,Detection Engineering,Investigation,KB Retrieval,Workflow Execution,Overall +GPT OSS 120B,7.31,1.81,6.94,6.79,5.17,5.6 diff --git a/solutions/security/ai/llm-performance-matrix/proprietary-models.csv b/solutions/security/ai/llm-performance-matrix/proprietary-models.csv new file mode 100644 index 0000000000..c33e12a973 --- /dev/null +++ b/solutions/security/ai/llm-performance-matrix/proprietary-models.csv @@ -0,0 +1,6 @@ +Model,Alert Triage,Detection Engineering,Investigation,KB Retrieval,Workflow Execution,Overall +Claude Sonnet 4.6,10,4.88,6.44,6.26,10,7.52 +Claude Opus 4.6,10,4.31,6.58,6.41,9.71,7.4 +Gemini 3.1 Pro,10,4.69,6.21,6.02,9.62,7.31 +GPT-5.4,10,4.41,6.83,6.67,8.6,7.3 +Gemini 3.0 Flash,8.43,4.09,5.71,5.49,9.14,6.57