elastic · spong · Jun 17, 2026 · Jun 17, 2026
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# Fetches the generated LLM performance matrix CSVs (written to a GCS bucket by
+# the Kibana `kibana-evals-security-matrix` Buildkite pipeline) and places them
+# in the docs source tree.
+#
+# Environment:
+#   MATRIX_GCS_BUCKET   (required) GCS bucket name, without the gs:// prefix.
+#   MATRIX_DOMAIN       (optional) Source prefix domain. Default: security.
+#   MATRIX_PREFIX       (optional) 'latest' (serverless/weekly) or a Stack
+#                       version. Default: latest.
+#   DEST_DIR            (optional) Directory the CSVs are written to.
+#                       Default: solutions/security/ai/llm-performance-matrix.
+#
+# Stdout: one progress line per file, one `OUTPUT_CSV_PATH=<path>` line per
+# downloaded file, and a closing summary line.
+# Exit status: non-zero when MATRIX_GCS_BUCKET is unset/empty or when any
+# command fails (the script runs under `set -e`).
+
+main() {
+  local bucket="${MATRIX_GCS_BUCKET:?MATRIX_GCS_BUCKET is required}"
+  local domain="${MATRIX_DOMAIN:-security}"
+  local prefix="${MATRIX_PREFIX:-latest}"
+  local dest_dir="${DEST_DIR:-solutions/security/ai/llm-performance-matrix}"
+  local src_uri="gs://${bucket}/${domain}/${prefix}"
+  local -a csv_names=(proprietary-models.csv open-source-models.csv)
+  local name remote_csv local_csv
+
+  mkdir -p "$dest_dir"
+
+  for name in "${csv_names[@]}"; do
+    remote_csv="${src_uri}/${name}"
+    local_csv="${dest_dir}/${name}"
+    printf '%s\n' "Downloading ${remote_csv} -> ${local_csv}"
+    gsutil cp "$remote_csv" "$local_csv"
+    # Marker line: lets a caller recover the list of files that were written.
+    printf '%s\n' "OUTPUT_CSV_PATH=${local_csv}"
+  done
+
+  printf '%s\n' "Synced LLM matrix CSVs from ${src_uri} into ${dest_dir}"
+}
+
+main "$@"
@@ -0,0 +1,135 @@
+# Downloads the generated Security LLM performance matrix CSVs from GCS and
+# opens a pull request that updates them in this repository.
+name: Sync LLM performance matrix
+
+on:
+  # Weekly schedule keeps the serverless (latest) matrix fresh.
+  # Runs Tuesday after the Kibana weekly evals + matrix generation pipelines.
+  # '0 9 * * 2' = minute 0, hour 9, every Tuesday (GitHub schedules are in UTC).
+  schedule:
+    - cron: '0 9 * * 2'
+
+  # Manual trigger for versioned (Stack release) updates.
+  workflow_dispatch:
+    inputs:
+      # Free-form text; when non-empty it selects both the source prefix and
+      # the base branch of the resulting PR (see the `resolve` step).
+      version:
+        description: 'Stack version to sync (e.g. 9.2). Leave blank to sync the serverless "latest" matrix to main.'
+        required: false
+        default: ''
+        type: string
+      # When true, the "Create Pull Request" step is skipped; the CSVs are
+      # still downloaded and uploaded as a workflow artifact.
+      dry_run:
+        description: 'Dry run (skip PR creation)'
+        required: false
+        default: false
+        type: boolean
+
+permissions:
+  # contents + pull-requests: presumably needed by the create-pull-request
+  # step, which uses github.token to push a branch and open/update the PR.
+  contents: write
+  pull-requests: write
+  id-token: write # Required for OIDC token authentication
+
+jobs:
+  sync-matrix:
+    runs-on: ubuntu-latest
+
+    steps:
+      # Works out where to read from and where to send the PR.
+      #   - version given: it is used as the GCS prefix, the base branch and
+      #     the suffix of the PR branch name, and is appended to the PR title.
+      #   - version blank (including scheduled runs): prefix "latest", base
+      #     "main".
+      # Step outputs: prefix, base, branch, title.
+      - name: Resolve target prefix and base branch
+        id: resolve
+        env:
+          # Hand the free-form input to the script through the environment
+          # instead of expanding it into the script text, so its contents can
+          # never be parsed as shell syntax.
+          VERSION: ${{ github.event.inputs.version }}
+        run: |
+          if [ -n "$VERSION" ]; then
+            # The value ends up in a git ref, a branch name, a GCS object path
+            # and $GITHUB_OUTPUT, so reject anything outside a conservative
+            # character set (this also rules out embedded newlines).
+            case "$VERSION" in
+              *[!A-Za-z0-9._/-]*)
+                echo "::error::Invalid version input: only letters, digits, '.', '_', '-' and '/' are allowed."
+                exit 1
+                ;;
+            esac
+            prefix="$VERSION"
+            base="$VERSION"
+            branch="automated/llm-matrix-sync-$VERSION"
+            title="[Automation] Update Security LLM performance matrix ($VERSION)"
+          else
+            prefix="latest"
+            base="main"
+            branch="automated/llm-matrix-sync-latest"
+            title="[Automation] Update Security LLM performance matrix"
+          fi
+
+          {
+            echo "prefix=$prefix"
+            echo "base=$base"
+            echo "branch=$branch"
+            echo "title=$title"
+          } >> "$GITHUB_OUTPUT"
+
+      # Check out the branch the PR will target (the `base` output of the
+      # resolve step) so the downloaded CSVs are written on top of it.
+      # NOTE(review): the sync step runs a script from this checkout, so the
+      # script must also exist on any version branch used as base — confirm.
+      # NOTE(review): referenced by tag, whereas the auth and
+      # create-pull-request actions in this workflow are pinned to commit SHAs.
+      - name: Checkout repository
+        uses: actions/checkout@v6
+        with:
+          ref: ${{ steps.resolve.outputs.base }}
+
+      # Keyless authentication using Workload Identity Federation.
+      # Provider, service account and project come from repository/organization
+      # variables (`vars.*`), not secrets.
+      # NOTE(review): `access_token_scopes` presumably only takes effect when
+      # `token_format: access_token` is also set — confirm against the auth
+      # action's documentation.
+      - name: Authenticate to Google Cloud
+        uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093 # v3.0.0
+        with:
+          workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
+          service_account: ${{ vars.GCP_SERVICE_ACCOUNT_EMAIL }}
+          project_id: ${{ vars.GCP_PROJECT_ID }}
+          access_token_scopes: 'https://www.googleapis.com/auth/devstorage.read_only'
+
+      # Installs the Google Cloud SDK; presumably this is what provides the
+      # `gsutil` command used by the sync script.
+      # NOTE(review): referenced by tag rather than a pinned commit SHA.
+      - name: Set up Cloud SDK
+        uses: google-github-actions/setup-gcloud@v2
+
+      # Runs the sync script and publishes the paths of the downloaded files
+      # as the multi-line `add_paths` step output (one path per line).
+      - name: Sync matrix CSVs from GCS
+        id: run-sync
+        env:
+          MATRIX_GCS_BUCKET: ${{ vars.LLM_MATRIX_GCS_BUCKET }}
+          MATRIX_DOMAIN: security
+          MATRIX_PREFIX: ${{ steps.resolve.outputs.prefix }}
+          DEST_DIR: solutions/security/ai/llm-performance-matrix
+        run: |
+          chmod +x .github/scripts/llm-matrix/sync_matrix.sh
+          # Capture stdout so the OUTPUT_CSV_PATH markers can be parsed, then
+          # replay it so the progress messages still appear in the job log.
+          OUTPUT=$(.github/scripts/llm-matrix/sync_matrix.sh)
+          echo "$OUTPUT"
+
+          # Keep only the marker lines and strip the marker itself, leaving
+          # one path per line. Done in a single sed call so the result does
+          # not depend on how this YAML block is indented.
+          ADD_PATHS=$(printf '%s\n' "$OUTPUT" | sed -n 's/^OUTPUT_CSV_PATH=//p')
+
+          # Multi-line step output syntax: name<<DELIMITER ... DELIMITER.
+          {
+            echo "add_paths<<EOF"
+            echo "$ADD_PATHS"
+            echo "EOF"
+          } >> "$GITHUB_OUTPUT"
+
+      # Commits the files listed in the run-sync step's `add_paths` output to
+      # the branch chosen by the resolve step and opens a PR against the
+      # resolved base branch.
+      # Skipped only when a manual run sets dry_run to true; scheduled runs
+      # carry no inputs, so the condition below holds for them.
+      # NOTE(review): the PR is created with the default github.token;
+      # presumably pull requests opened with that token do not trigger other
+      # workflows (e.g. CI on the PR) — confirm this is acceptable.
+      - name: Create Pull Request
+        if: github.event.inputs.dry_run != 'true'
+        uses: peter-evans/create-pull-request@c0f553fe549906ede9cf27b5156039d195d2ece0 # v8.1.0
+        with:
+          token: ${{ github.token }}
+          base: ${{ steps.resolve.outputs.base }}
+          branch: ${{ steps.resolve.outputs.branch }}
+          delete-branch: true
+          title: ${{ steps.resolve.outputs.title }}
+          commit-message: |
+            Update Security LLM performance matrix CSVs
+
+            Source prefix: ${{ steps.resolve.outputs.prefix }}
+            Workflow run: ${{ github.run_id }}
+          body: |
+            ## Summary
+
+            Automated update to the [Security LLM performance matrix](/solutions/security/ai/large-language-model-performance-matrix.md) CSVs, generated from Elastic Security LLM evaluation results.
+
+            - **Source prefix:** `${{ steps.resolve.outputs.prefix }}`
+            - **Generated by:** Kibana `kibana-evals-security-matrix` Buildkite pipeline -> GCS -> this workflow
+
+            ### Review checklist
+
+            - [ ] Review the CSV diffs (scores + Not recommended cells)
+            - [ ] Verify model lineup and column taxonomy
+            - [ ] **Merge this PR once approved**
+
+            ---
+
+            Automated update from the [sync-llm-matrix workflow](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}).
+          add-paths: |
+            ${{ steps.run-sync.outputs.add_paths }}
+
+      # Attaches the downloaded CSVs to the workflow run so they can be
+      # inspected without a PR (e.g. after a dry run). `always()` makes the
+      # step run even when an earlier step failed.
+      # NOTE(review): if the run-sync step fails before writing its output,
+      # `add_paths` is empty and there is presumably nothing to upload;
+      # `if-no-files-found: warn` is set so missing files only warn.
+      - name: Upload CSV artifacts
+        if: always()
+        uses: actions/upload-artifact@v6
+        with:
+          name: llm-matrix-csv-${{ github.run_number }}
+          path: |
+            ${{ steps.run-sync.outputs.add_paths }}
+          retention-days: 30
+          if-no-files-found: warn
@@ -16,36 +16,28 @@ products:
 This page summarizes internal test results comparing large language models (LLMs) across {{elastic-sec}} [AI chat](/explore-analyze/ai-features/ai-chat-experiences.md) and AI-powered feature use cases. These ratings apply equally whether you're using [AI Assistant](/solutions/security/ai/ai-assistant.md) or [Agent Builder](/solutions/security/ai/agent-builder/agent-builder.md). To learn more about these use cases, refer to [AI-powered features](/explore-analyze/ai-features.md#security-features).
 
 ::::{important}
-Higher scores indicate better performance. A score of 10 on a task means the model met or exceeded all task-specific benchmarks. 
+Higher scores indicate better performance. A score of 10 on a task means the model met or exceeded all task-specific benchmarks.
 
 Models with a score of "Not recommended" failed testing. This could be due to various issues, including context window constraints.
 ::::
 
+% The tables below are generated automatically from Elastic Security LLM evaluation
+% results. Do not edit them by hand: update the CSVs via the
+% `Sync LLM performance matrix` workflow (see
+% .github/workflows/sync-llm-matrix-keyless.yml), which pulls the latest artifacts
+% produced by the `kibana-evals-security-matrix` Buildkite pipeline.
 
 ## Proprietary models [_proprietary_models]
 
 Models from third-party LLM providers.
 
-| **Model** | **Alerts** | **Security Knowledge** | **{{esql}} Query Generation** | **Knowledge Base Retrieval** | **Attack Discovery** | **Automatic Migration** | **Average Score** |
-| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
-| **Opus 4.6** | 8.9 | 9.5 | 8.5 | 8.42 | 8.7 | 10 | **9** |
-| **Sonnet 4.5** | 8.6 | 7.6 | 7.7 | 7.23 | 8 | 10 | **8.19** |
-| **Opus 4.5** | 9 | 8.2 | 7.5 | 7.94 | 8.5 | 7.3 | **8.07** |
-| **GPT 5.2** | 8.6 | 6.6 | 8 | 6 | 8.5 | 10 | **7.95** |
-| **Sonnet 4** | 7.5 | 7.4 | 8 | 7.85 | 7 | 7.5 | **7.54** |
-| **Sonnet 4.6** | 9.3 | 9.5 | 8.4 | 7.45 | Not recommended | 10 | **7.44** |
-| **Sonnet 3.7** | 7.4 | 6.9 | 6.1 | 7.04 | 7 | 9.7 | **7.36** |
-| **GPT 5.1** | 9.3 | 4.3 | 7.2 | 6 | 6.5 | 9.8 | **7.18** |
-| **GPT 4.1 Mini** | 6.5 | 6.4 | 6 | 6.96 | 4.5 | 9.9 | **6.71** |
-| **Gemini 2.5 Flash** | 7.8 | 6.2 | 4.4 | 5.71 | 6 | 9.81 | **6.65** |
-| **Gemini 2.5 Pro** | 8 | 5.6 | 1.9 | 5.3 | 8.7 | 6.3 | **5.97** |
-| **GPT 4.1** | 7.4 | 5.7 | 4.4 | 5.85 | 8 | 3.1 | **5.74** |
+:::{csv-include} llm-performance-matrix/proprietary-models.csv
+:caption: Scroll horizontally to view more information.
+:::
 
 ## Open-source models [_open_source_models]
 
 Models you can [deploy yourself](/explore-analyze/ai-features/llm-guides/local-llms-overview.md).
 
-| **Model** | **Alerts** | **Security Knowledge** | **{{esql}} Query Generation** | **Knowledge Base Retrieval** | **Attack Discovery** | **Automatic Migration** | **Average Score** |
-| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
-| **GPT OSS 120B** | 7.6 | 3.7 | 5.5 | 6 | 3.5 | 9.7 | **6** |
-| **GPT OSS 20b** | 8.2 | 1.5 | 2.5 | Not recommended | Not recommended | Not recommended | **2.03** |
+:::{csv-include} llm-performance-matrix/open-source-models.csv
+:caption: Scroll horizontally to view more information.
+:::
@@ -0,0 +1,2 @@
+Model,Alert Triage,Detection Engineering,Investigation,KB Retrieval,Workflow Execution,Overall
+GPT OSS 120B,7.31,1.81,6.94,6.79,5.17,5.6
@@ -0,0 +1,6 @@
+Model,Alert Triage,Detection Engineering,Investigation,KB Retrieval,Workflow Execution,Overall
+Claude Sonnet 4.6,10,4.88,6.44,6.26,10,7.52
+Claude Opus 4.6,10,4.31,6.58,6.41,9.71,7.4
+Gemini 3.1 Pro,10,4.69,6.21,6.02,9.62,7.31
+GPT-5.4,10,4.41,6.83,6.67,8.6,7.3
+Gemini 3.0 Flash,8.43,4.09,5.71,5.49,9.14,6.57
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Model,Alert Triage,Detection Engineering,Investigation,KB Retrieval,Workflow Execution,Overall
		GPT OSS 120B,7.31,1.81,6.94,6.79,5.17,5.6