From c159c0c926ff194d2fba67cc0f6e0a69ec167c3d Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Tue, 24 Mar 2026 14:14:37 +0100
Subject: [PATCH 01/23] Add CI script and hardened skill for AI-driven E2E
 tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a locked-down Claude Code setup for running AI E2E tests in CI:

- CI entry point (Scripts/ci/run-ai-e2e-tests.sh) that manages the full
  lifecycle: simulator, WDA, Claude Code with --allowedTools, results
- Wrapper scripts (wda-curl.sh, wp-api.sh, launch-app.sh) that replace
  raw curl — validate methods, reject path traversal, read credentials
  from env vars so Claude never sees them in commands
- CI-specific skill (ci-test-runner) with all WDA interaction patterns
  using wrapper scripts instead of raw curl

Ref: AINFRA-2176

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .claude/skills/ci-test-runner/SKILL.md | 415 +++++++++++++++++++++++++
 Scripts/ci/launch-app.sh               |  25 ++
 Scripts/ci/run-ai-e2e-tests.sh         | 171 ++++++++++
 Scripts/ci/wda-curl.sh                 |  40 +++
 Scripts/ci/wp-api.sh                   |  50 +++
 5 files changed, 701 insertions(+)
 create mode 100644 .claude/skills/ci-test-runner/SKILL.md
 create mode 100755 Scripts/ci/launch-app.sh
 create mode 100755 Scripts/ci/run-ai-e2e-tests.sh
 create mode 100755 Scripts/ci/wda-curl.sh
 create mode 100755 Scripts/ci/wp-api.sh

diff --git a/.claude/skills/ci-test-runner/SKILL.md b/.claude/skills/ci-test-runner/SKILL.md
new file mode 100644
index 000000000000..5e3e8e306079
--- /dev/null
+++ b/.claude/skills/ci-test-runner/SKILL.md
@@ -0,0 +1,415 @@
+---
+name: ci-test-runner
+description: >-
+  CI-hardened E2E test runner for WordPress/Jetpack iOS. Use when the prompt
+  mentions "ci-test-runner" or asks to run AI E2E tests in CI mode. Drives
+  the iOS Simulator through wrapper scripts with a locked-down tool set.
+---
+
+# CI Test Runner
+
+Run plain-language E2E test cases against the WordPress or Jetpack iOS app
+on an iOS Simulator. All external interactions use wrapper scripts — no raw
+curl, no arbitrary shell commands.
+
+## Environment
+
+All values are pre-set as environment variables by the CI script. You do NOT
+need to ask for credentials or configure anything.
+
+| Env var | Description |
+|---------|-------------|
+| `SIMULATOR_UDID` | Booted simulator UDID |
+| `WDA_SESSION_ID` | Active WebDriverAgent session ID |
+| `WDA_PORT` | WDA port (default 8100) |
+| `APP_BUNDLE_ID` | `org.wordpress` or `com.automattic.jetpack` |
+| `SITE_URL` | WordPress test site URL |
+| `WP_USERNAME` | WordPress username |
+| `WP_APP_PASSWORD` | WordPress application password |
+
+These are also read by the wrapper scripts, so you do not need to pass
+credentials as command arguments.
+
+## Available Commands
+
+You have exactly these commands available:
+
+| Command | Purpose |
+|---------|---------|
+| `./Scripts/ci/wda-curl.sh METHOD PATH [BODY]` | HTTP to WDA (localhost only) |
+| `./Scripts/ci/wp-api.sh METHOD PATH [BODY]` | WordPress REST API (auth handled) |
+| `./Scripts/ci/launch-app.sh` | (Re)launch app with test credentials |
+| `xcrun simctl terminate $SIMULATOR_UDID $APP_BUNDLE_ID` | Kill app |
+| `xcrun simctl io $SIMULATOR_UDID screenshot PATH` | Take screenshot |
+| `sleep N` | Wait N seconds |
+| `mkdir -p Tests/AgentTests/results/...` | Create results directories |
+
+## WDA Interactions
+
+WDA is already running. A session ID is in the `WDA_SESSION_ID` env var and
+also provided in the prompt.
+
+### Get Accessibility Tree
+
+```bash
+# Compact text format (~25 KB) — use this by default
+./Scripts/ci/wda-curl.sh GET '/source?format=description'
+
+# Structured JSON (~375 KB) — use when you need precise rect coordinates
+./Scripts/ci/wda-curl.sh GET '/source?format=json'
+```
+
+**Note:** `wda-curl.sh` returns raw JSON. The tree content is inside the
+`value` field. For the description format, parse the `value` string from the
+JSON response to get the indented tree text.
+
+The description format returns lines like:
+```
+NavigationBar, 0x105351660, {{0.0, 62.0}, {402.0, 54.0}}, identifier: 'my-site-navigation-bar'
+  Button, 0x105351a20, {{16.0, 62.0}, {44.0, 44.0}}, identifier: 'BackButton', label: 'Site Name'
+  StaticText, 0x105351b40, {{178.7, 73.7}, {44.7, 20.7}}, label: 'Posts'
+```
+
+### Computing Tap Coordinates
+
+Parse the frame `{{x, y}, {width, height}}` from the description tree:
+
+```
+tap_x = x + width / 2
+tap_y = y + height / 2
+```
+
+### Session Management
+
+If WDA actions return HTTP 4xx errors, the session may have expired. Create
+a new one:
+
+```bash
+./Scripts/ci/wda-curl.sh POST /session '{"capabilities":{"alwaysMatch":{}}}'
+```
+
+Extract `value.sessionId` from the JSON response and use it in subsequent
+action paths.
+
+### Tap
+
+```bash
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/actions" '{
+  "actions": [{
+    "type": "pointer",
+    "id": "finger1",
+    "parameters": {"pointerType": "touch"},
+    "actions": [
+      {"type": "pointerMove", "duration": 0, "x": X, "y": Y},
+      {"type": "pointerDown"},
+      {"type": "pointerUp"}
+    ]
+  }]
+}'
+```
+
+### Tap Element by Accessibility ID
+
+```bash
+# Find element
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/elements" '{
+  "using": "accessibility id",
+  "value": "IDENTIFIER"
+}'
+
+# Click it (ELEMENT_ID from response value[0].ELEMENT)
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/element/${ELEMENT_ID}/click"
+```
+
+### Long Press
+
+```bash
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/actions" '{
+  "actions": [{
+    "type": "pointer",
+    "id": "finger1",
+    "parameters": {"pointerType": "touch"},
+    "actions": [
+      {"type": "pointerMove", "duration": 0, "x": X, "y": Y},
+      {"type": "pointerDown"},
+      {"type": "pause", "duration": 1000},
+      {"type": "pointerUp"}
+    ]
+  }]
+}'
+```
+
+### Swipe
+
+```bash
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/actions" '{
+  "actions": [{
+    "type": "pointer",
+    "id": "finger1",
+    "parameters": {"pointerType": "touch"},
+    "actions": [
+      {"type": "pointerMove", "duration": 0, "x": X1, "y": Y1},
+      {"type": "pointerDown"},
+      {"type": "pointerMove", "duration": 500, "x": X2, "y": Y2},
+      {"type": "pointerUp"}
+    ]
+  }]
+}'
+```
+
+**Swipe direction guide** (given screen size `W x H`):
+- **Up** (scroll down): from `(W/2, H/2 + H/6)` to `(W/2, H/2 - H/6)`
+- **Down** (scroll up): from `(W/2, H/2 - H/6)` to `(W/2, H/2 + H/6)`
+- **Left**: from `(W/2 + W/4, H/2)` to `(W/2 - W/4, H/2)`
+- **Right**: from `(W/2 - W/4, H/2)` to `(W/2 + W/4, H/2)`
+- **Back** (from left edge): from `(5, H/2)` to `(W*2/3, H/2)`
+
+### Type Text
+
+```bash
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/wda/keys" '{
+  "value": ["h","e","l","l","o"]
+}'
+```
+
+An element must be focused first (tap a text field before typing).
+
+### Clear Text Field
+
+```bash
+# Select all (Ctrl+A)
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/wda/keys" '{"value": ["\u0001"]}'
+# Delete
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/wda/keys" '{"value": ["\u007F"]}'
+```
+
+### Press Hardware Button
+
+```bash
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/wda/pressButton" '{"name": "home"}'
+```
+
+## WordPress REST API
+
+Use `wp-api.sh` for all REST API calls. Authentication is handled by the
+script — do not pass credentials.
+
+```bash
+# Search for a post
+./Scripts/ci/wp-api.sh GET 'wp/v2/posts?search=My+Post&status=publish'
+
+# Create a category
+./Scripts/ci/wp-api.sh POST wp/v2/categories '{"name":"Test Category"}'
+
+# Delete a post (force = skip trash)
+./Scripts/ci/wp-api.sh DELETE 'wp/v2/posts/123?force=true'
+
+# Create a tag
+./Scripts/ci/wp-api.sh POST wp/v2/tags '{"name":"Test Tag"}'
+```
+
+## Navigation Strategy
+
+**Always prefer the accessibility tree over screenshots.**
+
+### Finding Elements
+
+Use this priority order:
+1. **`identifier` / `name`** — most stable, developer-assigned
+2. **`label`** — accessibility label, user-visible text
+3. **`type` + context** — e.g., "Button inside NavigationBar"
+4. **Partial matching** — label contains target text
+5. **Positional heuristics** — last resort
+
+### Screen Size
+
+The root node's frame in the tree gives screen dimensions (e.g., `{{0, 0}, {393, 852}}`).
+
+### Waiting for UI Stability
+
+After every action (tap, swipe, type), wait 0.5–1 second then re-fetch the
+tree. Do not use fixed long sleeps. Instead, poll:
+
+1. Fetch the tree
+2. Check if expected element or screen is present
+3. If not, `sleep 1` and retry
+4. After 10 retries (10 seconds), declare element not found
+
+### Scroll View Navigation
+
+1. Fetch tree, search for target element
+2. If found, tap it
+3. If not, swipe up from `(screen_width - 30, screen_height / 2)` to scroll
+4. Re-fetch tree and search again
+5. If tree is identical after scroll, you've hit the bottom — stop
+
+### Back Navigation
+
+- **Primary**: find a Button inside NavigationBar, tap it
+- **Fallback**: edge swipe from `(5, H/2)` to `(W*2/3, H/2)`
+
+### Tab Bar Navigation
+
+Look for an element whose type contains `TabBar` in the tree. Its children
+are the individual tabs. Tap the tab you need to switch to.
+
+### System Alert Handling
+
+If actions fail, check the tree for `Alert` or `Sheet` elements. Dismiss
+with "Allow", "Don't Allow", "OK", or "Cancel" before retrying.
+
+### App Crash Recovery
+
+If the tree looks unexpected or actions consistently fail:
+1. Relaunch with `./Scripts/ci/launch-app.sh`
+2. Wait 3 seconds
+3. Create a new WDA session if needed
+4. Continue the test
+
+## Test Execution Flow
+
+### Step 1: Discover Tests
+
+Use `Glob` to find all `*.md` files in the test directory provided in the
+prompt. Sort alphabetically. Print:
+
+```
+Discovered N test(s):
+- create-blank-page.md
+- text-post-publish.md
+```
+
+If none found, write a results.md noting this and stop.
+
+### Step 2: Initialize
+
+The results directory is provided in the prompt. Create subdirectories:
+
+```bash
+mkdir -p ${RESULTS_DIR}/screenshots
+```
+
+### Step 3: Run Each Test Sequentially
+
+For each test file:
+
+#### 3a. Relaunch app
+
+```bash
+./Scripts/ci/launch-app.sh
+sleep 3
+```
+
+#### 3b. Check login state
+
+Fetch the tree. If the app shows a login screen:
+1. Tap "Enter your existing site address"
+2. Type the site URL (from the prompt)
+3. Tap Continue
+4. Wait 3 seconds for auto-login
+
+If the app shows the logged-in state (My Site), skip login.
+
+#### 3c. Read test file
+
+Use `Read` to get the test case markdown. Parse the sections:
+- **Prerequisites** — setup steps (REST API or UI)
+- **Steps** — actions to perform
+- **Verification (REST API)** — REST API assertions (if present)
+- **Cleanup (REST API)** — REST API cleanup (if present)
+- **Expected Outcome** — what success looks like
+
+#### 3d. Fulfill prerequisites
+
+For REST API prerequisites (create categories, tags, posts), use
+`./Scripts/ci/wp-api.sh`. For UI prerequisites like "logged in", the
+relaunch in 3a handles it.
+
+If a prerequisite cannot be fulfilled, mark the test as FAIL with reason
+"Prerequisite not met: <details>", skip steps 3e–3f, run cleanup (3g) if
+present, then record the result (3h).
+
+#### 3e. Execute steps
+
+Follow the numbered steps using WDA commands. After each action, wait
+briefly and re-fetch the tree to verify the UI changed as expected.
+
+#### 3f. Verify (if section present)
+
+If the test has a `## Verification (REST API)` section, use `wp-api.sh`
+to verify. The verification MUST succeed for the test to pass.
+
+#### 3g. Cleanup (if section present)
+
+If the test has a `## Cleanup (REST API)` section, use `wp-api.sh` to
+clean up. Always run cleanup regardless of pass/fail.
+
+#### 3h. Record result
+
+Write a per-test result file at `${RESULTS_DIR}/<test-name>.md`:
+
+On pass:
+```
+### PASS: <Test Title>
+Passed.
+```
+
+On fail — take a screenshot first:
+```bash
+xcrun simctl io $SIMULATOR_UDID screenshot ${RESULTS_DIR}/screenshots/<test-name>-failure.png
+```
+Then write:
+```
+### FAIL: <Test Title>
+**Reason:** <what went wrong>
+**Screenshot:** screenshots/<test-name>-failure.png
+```
+
+#### 3i. Print status
+
+```
+[2/5] PASS: create-blank-page
+```
+or:
+```
+[2/5] FAIL: create-blank-page — Element "Publish" not found
+```
+
+### Step 4: Assemble Final Results
+
+Read all per-test result files. Write `${RESULTS_DIR}/results.md`:
+
+```markdown
+# Test Results
+
+- **Date:** YYYY-MM-DD HH:mm
+- **App:** <app name>
+- **Site:** <site_url>
+- **Total:** N | **Passed:** P | **Failed:** F
+
+## Results
+
+<per-test results concatenated>
+```
+
+### Step 5: Print Summary
+
+```
+Test run complete.
+Total: N | Passed: P | Failed: F
+Results: Tests/AgentTests/results/<timestamp>/results.md
+```
+
+## Important Rules
+
+- **The app MUST already be built and installed** on the simulator. The CI
+  pipeline handles building. This skill only drives tests.
+- **NEVER stop on failure.** Always continue to the next test.
+- **Always run cleanup** regardless of pass/fail.
+- **Prefer the accessibility tree** over screenshots for navigation.
+- **After every action**, wait 0.5–1s then re-fetch the tree.
+- **For scrolling**, swipe from the right edge (`screen_width - 30`) to
+  avoid tapping interactive elements.
+- **Use `duration: 1000`** (1 second) for swipes on tappable items.
+- **Coordinates are in points**, not pixels — use tree coordinates, not
+  screenshot dimensions.
diff --git a/Scripts/ci/launch-app.sh b/Scripts/ci/launch-app.sh
new file mode 100755
index 000000000000..a1b8a8192018
--- /dev/null
+++ b/Scripts/ci/launch-app.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Launch the app on the simulator with test credentials.
+# Takes no arguments — all values come from environment variables.
+#
+# Usage: launch-app.sh
+#
+# Environment (required):
+#   SIMULATOR_UDID   Simulator UDID
+#   APP_BUNDLE_ID    App bundle ID (org.wordpress or com.automattic.jetpack)
+#   SITE_URL         WordPress site URL
+#   WP_USERNAME      WordPress username
+#   WP_APP_PASSWORD  WordPress application password
+set -euo pipefail
+
+: "${SIMULATOR_UDID:?SIMULATOR_UDID is required}"
+: "${APP_BUNDLE_ID:?APP_BUNDLE_ID is required}"
+: "${SITE_URL:?SITE_URL is required}"
+: "${WP_USERNAME:?WP_USERNAME is required}"
+: "${WP_APP_PASSWORD:?WP_APP_PASSWORD is required}"
+
+exec xcrun simctl launch --terminate-running-process \
+  "$SIMULATOR_UDID" "$APP_BUNDLE_ID" \
+  -ui-test-site-url "$SITE_URL" \
+  -ui-test-site-user "$WP_USERNAME" \
+  -ui-test-site-pass "$WP_APP_PASSWORD"
diff --git a/Scripts/ci/run-ai-e2e-tests.sh b/Scripts/ci/run-ai-e2e-tests.sh
new file mode 100755
index 000000000000..8bda48749315
--- /dev/null
+++ b/Scripts/ci/run-ai-e2e-tests.sh
@@ -0,0 +1,171 @@
+#!/usr/bin/env bash
+# Run AI-driven E2E tests on an iOS Simulator using Claude Code.
+#
+# This script manages the full lifecycle:
+#   1. Install Claude Code (if needed)
+#   2. Detect or boot a simulator
+#   3. Start WebDriverAgent and create a session
+#   4. Run Claude Code with a locked-down tool allowlist
+#   5. Stop WebDriverAgent
+#   6. Exit with the test result code
+#
+# Required environment variables:
+#   ANTHROPIC_API_KEY   Claude API key
+#   SITE_URL            WordPress test site URL
+#   WP_USERNAME         WordPress username
+#   WP_APP_PASSWORD     WordPress application password
+#
+# Optional environment variables:
+#   APP                 wordpress | jetpack (default: jetpack)
+#   SIMULATOR_NAME      Simulator to boot if none running (default: iPhone 16)
+#   WDA_PORT            WebDriverAgent port (default: 8100)
+#   CLAUDE_MAX_TURNS    Max Claude Code tool-use turns (default: 200)
+#   TEST_DIR            Test directory (default: Tests/AgentTests/ui-tests)
+#   CLAUDE_MODEL        Model to use (default: claude-sonnet-4-20250514)
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+cd "$REPO_ROOT"
+
+# ── Required env vars ────────────────────────────────────────────────
+: "${ANTHROPIC_API_KEY:?Set ANTHROPIC_API_KEY}"
+: "${SITE_URL:?Set SITE_URL}"
+: "${WP_USERNAME:?Set WP_USERNAME}"
+: "${WP_APP_PASSWORD:?Set WP_APP_PASSWORD}"
+
+# ── Defaults ─────────────────────────────────────────────────────────
+APP="${APP:-jetpack}"
+SIMULATOR_NAME="${SIMULATOR_NAME:-iPhone 16}"
+WDA_PORT="${WDA_PORT:-8100}"
+CLAUDE_MAX_TURNS="${CLAUDE_MAX_TURNS:-200}"
+TEST_DIR="${TEST_DIR:-Tests/AgentTests/ui-tests}"
+CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-20250514}"
+
+case "$APP" in
+  wordpress) BUNDLE_ID="org.wordpress" ;;
+  jetpack)   BUNDLE_ID="com.automattic.jetpack" ;;
+  *) echo "Error: APP must be 'wordpress' or 'jetpack', got '$APP'" >&2; exit 1 ;;
+esac
+
+# ── Locate WDA scripts ──────────────────────────────────────────────
+WDA_START="$REPO_ROOT/.claude/skills/ios-sim-navigation/scripts/wda-start.rb"
+WDA_STOP="$REPO_ROOT/.claude/skills/ios-sim-navigation/scripts/wda-stop.rb"
+
+if [ ! -f "$WDA_START" ]; then
+  echo "Error: WDA start script not found at $WDA_START" >&2
+  exit 1
+fi
+
+# ── Step 1: Install Claude Code ─────────────────────────────────────
+if ! command -v claude &>/dev/null; then
+  echo "Installing Claude Code..."
+  npm install -g @anthropic-ai/claude-code
+fi
+echo "Claude Code: $(claude --version 2>/dev/null || echo 'unknown')"
+
+# ── Step 2: Detect or boot simulator ────────────────────────────────
+get_booted_udid() {
+  xcrun simctl list devices booted -j 2>/dev/null \
+    | ruby -rjson -e '
+        data = JSON.parse(STDIN.read)
+        data.fetch("devices", {}).each_value do |devs|
+          devs.each { |d| (puts d["udid"]; exit) if d["state"] == "Booted" }
+        end
+      ' 2>/dev/null || true
+}
+
+UDID="$(get_booted_udid)"
+
+if [ -z "$UDID" ]; then
+  echo "No booted simulator found. Booting '$SIMULATOR_NAME'..."
+  xcrun simctl boot "$SIMULATOR_NAME"
+  sleep 5
+  UDID="$(get_booted_udid)"
+fi
+
+if [ -z "$UDID" ]; then
+  echo "Error: could not find a booted simulator" >&2
+  exit 1
+fi
+echo "Simulator UDID: $UDID"
+
+# ── Step 3: Start WDA ───────────────────────────────────────────────
+echo "Starting WebDriverAgent on port $WDA_PORT..."
+ruby "$WDA_START" --udid "$UDID" --port "$WDA_PORT"
+
+# Create a WDA session
+SESSION_ID="$(curl -s -X POST "http://localhost:${WDA_PORT}/session" \
+  -H 'Content-Type: application/json' \
+  -d '{"capabilities":{"alwaysMatch":{}}}' \
+  | ruby -rjson -e 'puts JSON.parse(STDIN.read).dig("value", "sessionId")')"
+
+if [ -z "$SESSION_ID" ]; then
+  echo "Error: failed to create WDA session" >&2
+  ruby "$WDA_STOP" --port "$WDA_PORT" 2>/dev/null || true
+  exit 1
+fi
+echo "WDA Session: $SESSION_ID"
+
+# ── Step 4: Export env vars for wrapper scripts and Claude ───────────
+export SIMULATOR_UDID="$UDID"
+export WDA_SESSION_ID="$SESSION_ID"
+export WDA_PORT
+export APP_BUNDLE_ID="$BUNDLE_ID"
+export SITE_URL
+export WP_USERNAME
+export WP_APP_PASSWORD
+
+# ── Step 5: Prepare results directory ────────────────────────────────
+TIMESTAMP="$(date +%Y-%m-%d-%H%M)"
+RESULTS_DIR="Tests/AgentTests/results/${TIMESTAMP}"
+mkdir -p "$RESULTS_DIR"
+
+# ── Step 6: Run Claude Code ──────────────────────────────────────────
+PROMPT="Run all AI E2E test cases in ${TEST_DIR}/ using the ci-test-runner skill.
+
+Environment (already set as env vars, also available to wrapper scripts):
+- App: ${APP} (bundle ID: ${BUNDLE_ID})
+- Simulator UDID: ${UDID}
+- WDA Session ID: ${SESSION_ID}
+- WDA Port: ${WDA_PORT}
+- Site URL: ${SITE_URL}
+- Username: ${WP_USERNAME}
+- Results directory: ${RESULTS_DIR}
+- Screenshots directory: ${RESULTS_DIR}/screenshots"
+
+CLAUDE_EXIT=0
+claude --print \
+  --model "$CLAUDE_MODEL" \
+  --max-turns "$CLAUDE_MAX_TURNS" \
+  --allowedTools "Read" \
+  --allowedTools "Glob(Tests/AgentTests/**)" \
+  --allowedTools "Write(Tests/AgentTests/results/*)" \
+  --allowedTools "Bash(./Scripts/ci/wda-curl.sh *)" \
+  --allowedTools "Bash(./Scripts/ci/wp-api.sh *)" \
+  --allowedTools "Bash(./Scripts/ci/launch-app.sh)" \
+  --allowedTools "Bash(xcrun simctl terminate *)" \
+  --allowedTools "Bash(xcrun simctl io * screenshot Tests/AgentTests/results/*)" \
+  --allowedTools "Bash(sleep *)" \
+  --allowedTools "Bash(mkdir -p Tests/AgentTests/results/*)" \
+  --prompt "$PROMPT" \
+  || CLAUDE_EXIT=$?
+
+# ── Step 7: Stop WDA ────────────────────────────────────────────────
+echo "Stopping WebDriverAgent..."
+ruby "$WDA_STOP" --port "$WDA_PORT" 2>/dev/null || true
+
+# ── Step 8: Report results ───────────────────────────────────────────
+RESULTS_FILE="${RESULTS_DIR}/results.md"
+if [ -f "$RESULTS_FILE" ]; then
+  echo ""
+  echo "═══════════════════════════════════════"
+  echo "  Test Results: ${RESULTS_DIR}/results.md"
+  echo "═══════════════════════════════════════"
+  echo ""
+  cat "$RESULTS_FILE"
+else
+  echo "Warning: no results.md found at $RESULTS_FILE"
+fi
+
+exit "$CLAUDE_EXIT"
diff --git a/Scripts/ci/wda-curl.sh b/Scripts/ci/wda-curl.sh
new file mode 100755
index 000000000000..07ed3cfce4b1
--- /dev/null
+++ b/Scripts/ci/wda-curl.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# Constrained HTTP proxy to WebDriverAgent on localhost.
+# All WDA interactions from Claude Code go through this script.
+#
+# Usage: wda-curl.sh <METHOD> <PATH> [JSON_BODY]
+#
+# Examples:
+#   wda-curl.sh GET  /status
+#   wda-curl.sh GET  '/source?format=description'
+#   wda-curl.sh POST /session '{"capabilities":{"alwaysMatch":{}}}'
+#   wda-curl.sh POST /session/ID/actions '{"actions":[...]}'
+#
+# Environment:
+#   WDA_PORT  WDA port (default: 8100)
+set -euo pipefail
+
+METHOD="${1:?Usage: wda-curl.sh METHOD PATH [BODY]}"
+URL_PATH="${2:?Usage: wda-curl.sh METHOD PATH [BODY]}"
+BODY="${3:-}"
+PORT="${WDA_PORT:-8100}"
+
+case "$METHOD" in
+  GET|POST) ;;
+  *) echo "Error: method must be GET or POST, got '$METHOD'" >&2; exit 1 ;;
+esac
+
+# Ensure path starts with /
+if [[ "$URL_PATH" != /* ]]; then
+  URL_PATH="/${URL_PATH}"
+fi
+
+if [ -n "$BODY" ]; then
+  exec curl -s -X "$METHOD" \
+    -H 'Content-Type: application/json' \
+    -d "$BODY" \
+    "http://localhost:${PORT}${URL_PATH}"
+else
+  exec curl -s -X "$METHOD" \
+    "http://localhost:${PORT}${URL_PATH}"
+fi
diff --git a/Scripts/ci/wp-api.sh b/Scripts/ci/wp-api.sh
new file mode 100755
index 000000000000..6acb73170c41
--- /dev/null
+++ b/Scripts/ci/wp-api.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+# Constrained WordPress REST API client.
+# Handles authentication internally — Claude Code never sees credentials.
+#
+# Usage: wp-api.sh <METHOD> <API_PATH> [JSON_BODY]
+#
+# Examples:
+#   wp-api.sh GET  'wp/v2/posts?search=My+Post'
+#   wp-api.sh POST  wp/v2/posts '{"title":"Test","status":"publish"}'
+#   wp-api.sh DELETE 'wp/v2/posts/123?force=true'
+#
+# Environment (required):
+#   SITE_URL        WordPress site URL (e.g., https://example.com)
+#   WP_USERNAME     WordPress username
+#   WP_APP_PASSWORD WordPress application password
+set -euo pipefail
+
+METHOD="${1:?Usage: wp-api.sh METHOD API_PATH [BODY]}"
+API_PATH="${2:?Usage: wp-api.sh METHOD API_PATH [BODY]}"
+BODY="${3:-}"
+
+: "${SITE_URL:?SITE_URL is required}"
+: "${WP_USERNAME:?WP_USERNAME is required}"
+: "${WP_APP_PASSWORD:?WP_APP_PASSWORD is required}"
+
+case "$METHOD" in
+  GET|POST|PUT|DELETE) ;;
+  *) echo "Error: method must be GET, POST, PUT, or DELETE, got '$METHOD'" >&2; exit 1 ;;
+esac
+
+# Reject path traversal
+if [[ "$API_PATH" == *..* ]]; then
+  echo "Error: path traversal ('..') is not allowed" >&2
+  exit 1
+fi
+
+# Strip leading slash if present
+API_PATH="${API_PATH#/}"
+
+if [ -n "$BODY" ]; then
+  exec curl -s -X "$METHOD" \
+    -u "${WP_USERNAME}:${WP_APP_PASSWORD}" \
+    -H 'Content-Type: application/json' \
+    -d "$BODY" \
+    "${SITE_URL}/wp-json/${API_PATH}"
+else
+  exec curl -s -X "$METHOD" \
+    -u "${WP_USERNAME}:${WP_APP_PASSWORD}" \
+    "${SITE_URL}/wp-json/${API_PATH}"
+fi

From ab7665d478525addd2192cf78b34cc2ed77ea3b6 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Tue, 24 Mar 2026 14:36:16 +0100
Subject: [PATCH 02/23] Add Buildkite pipeline step for AI E2E tests

Merge the CI entry point into a single .buildkite/commands script that:
- Checks for "Testing" label on PR (skips early if missing)
- Downloads build artifacts and installs app on simulator
- Runs Claude Code with locked-down --allowedTools

Added as an inline step in pipeline.yml (depends on build_jetpack,
soft_fail, 30min timeout). Remove the separate Scripts/ci entry point.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../commands}/run-ai-e2e-tests.sh             | 84 ++++++++++++++-----
 .buildkite/pipeline.yml                       | 18 ++++
 2 files changed, 79 insertions(+), 23 deletions(-)
 rename {Scripts/ci => .buildkite/commands}/run-ai-e2e-tests.sh (65%)

diff --git a/Scripts/ci/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
similarity index 65%
rename from Scripts/ci/run-ai-e2e-tests.sh
rename to .buildkite/commands/run-ai-e2e-tests.sh
index 8bda48749315..05fdcff835ef 100755
--- a/Scripts/ci/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -2,12 +2,14 @@
 # Run AI-driven E2E tests on an iOS Simulator using Claude Code.
 #
 # This script manages the full lifecycle:
-#   1. Install Claude Code (if needed)
-#   2. Detect or boot a simulator
-#   3. Start WebDriverAgent and create a session
-#   4. Run Claude Code with a locked-down tool allowlist
-#   5. Stop WebDriverAgent
-#   6. Exit with the test result code
+#   1. Check for "Testing" label on PR (Buildkite only, skips if missing)
+#   2. Download build artifacts and install app (Buildkite only)
+#   3. Install Claude Code (if needed)
+#   4. Detect or boot a simulator
+#   5. Start WebDriverAgent and create a session
+#   6. Run Claude Code with a locked-down tool allowlist
+#   7. Stop WebDriverAgent
+#   8. Exit with the test result code
 #
 # Required environment variables:
 #   ANTHROPIC_API_KEY   Claude API key
@@ -22,12 +24,25 @@
 #   CLAUDE_MAX_TURNS    Max Claude Code tool-use turns (default: 200)
 #   TEST_DIR            Test directory (default: Tests/AgentTests/ui-tests)
 #   CLAUDE_MODEL        Model to use (default: claude-sonnet-4-20250514)
+
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 cd "$REPO_ROOT"
 
+# ── Label gate (Buildkite only) ─────────────────────────────────────
+if [ -n "${BUILDKITE_PULL_REQUEST_LABELS:-}" ]; then
+  echo "--- 🏷 Checking for 'Testing' label"
+
+  if ! echo ";${BUILDKITE_PULL_REQUEST_LABELS};" | grep -q ";Testing;"; then
+    echo "PR does not have the 'Testing' label. Skipping."
+    echo "Add the label and re-run this step to trigger AI E2E tests."
+    exit 0
+  fi
+  echo "'Testing' label found."
+fi
+
 # ── Required env vars ────────────────────────────────────────────────
 : "${ANTHROPIC_API_KEY:?Set ANTHROPIC_API_KEY}"
 : "${SITE_URL:?Set SITE_URL}"
@@ -48,6 +63,16 @@ case "$APP" in
   *) echo "Error: APP must be 'wordpress' or 'jetpack', got '$APP'" >&2; exit 1 ;;
 esac
 
+# ── Artifact download (Buildkite only) ───────────────────────────────
+if [ -n "${BUILDKITE:-}" ]; then
+  echo "--- 📦 Downloading Build Artifacts"
+  download_artifact "build-products-${APP}.tar"
+  tar -xf "build-products-${APP}.tar"
+
+  echo "--- :rubygems: Setting up Gems"
+  install_gems
+fi
+
 # ── Locate WDA scripts ──────────────────────────────────────────────
 WDA_START="$REPO_ROOT/.claude/skills/ios-sim-navigation/scripts/wda-start.rb"
 WDA_STOP="$REPO_ROOT/.claude/skills/ios-sim-navigation/scripts/wda-stop.rb"
@@ -57,14 +82,16 @@ if [ ! -f "$WDA_START" ]; then
   exit 1
 fi
 
-# ── Step 1: Install Claude Code ─────────────────────────────────────
+# ── Install Claude Code ─────────────────────────────────────────────
 if ! command -v claude &>/dev/null; then
-  echo "Installing Claude Code..."
+  echo "--- 🤖 Installing Claude Code"
   npm install -g @anthropic-ai/claude-code
 fi
 echo "Claude Code: $(claude --version 2>/dev/null || echo 'unknown')"
 
-# ── Step 2: Detect or boot simulator ────────────────────────────────
+# ── Detect or boot simulator ────────────────────────────────────────
+echo "--- 📱 Setting up Simulator"
+
 get_booted_udid() {
   xcrun simctl list devices booted -j 2>/dev/null \
     | ruby -rjson -e '
@@ -90,11 +117,24 @@ if [ -z "$UDID" ]; then
 fi
 echo "Simulator UDID: $UDID"
 
-# ── Step 3: Start WDA ───────────────────────────────────────────────
-echo "Starting WebDriverAgent on port $WDA_PORT..."
+# ── Install app on simulator (Buildkite only) ────────────────────────
+if [ -n "${BUILDKITE:-}" ]; then
+  APP_DISPLAY_NAME="Jetpack"
+  [ "$APP" = "wordpress" ] && APP_DISPLAY_NAME="WordPress"
+
+  APP_PATH=$(find DerivedData/Build/Products -name "${APP_DISPLAY_NAME}.app" -path "*Debug-iphonesimulator*" | head -1)
+  if [ -z "$APP_PATH" ]; then
+    echo "Error: ${APP_DISPLAY_NAME}.app not found in build products" >&2
+    exit 1
+  fi
+  echo "Installing $APP_PATH on simulator..."
+  xcrun simctl install "$UDID" "$APP_PATH"
+fi
+
+# ── Start WDA ────────────────────────────────────────────────────────
+echo "--- 🔌 Starting WebDriverAgent"
 ruby "$WDA_START" --udid "$UDID" --port "$WDA_PORT"
 
-# Create a WDA session
 SESSION_ID="$(curl -s -X POST "http://localhost:${WDA_PORT}/session" \
   -H 'Content-Type: application/json' \
   -d '{"capabilities":{"alwaysMatch":{}}}' \
@@ -107,7 +147,7 @@ if [ -z "$SESSION_ID" ]; then
 fi
 echo "WDA Session: $SESSION_ID"
 
-# ── Step 4: Export env vars for wrapper scripts and Claude ───────────
+# ── Export env vars for wrapper scripts and Claude ───────────────────
 export SIMULATOR_UDID="$UDID"
 export WDA_SESSION_ID="$SESSION_ID"
 export WDA_PORT
@@ -116,12 +156,14 @@ export SITE_URL
 export WP_USERNAME
 export WP_APP_PASSWORD
 
-# ── Step 5: Prepare results directory ────────────────────────────────
+# ── Prepare results directory ────────────────────────────────────────
 TIMESTAMP="$(date +%Y-%m-%d-%H%M)"
 RESULTS_DIR="Tests/AgentTests/results/${TIMESTAMP}"
 mkdir -p "$RESULTS_DIR"
 
-# ── Step 6: Run Claude Code ──────────────────────────────────────────
+# ── Run Claude Code ──────────────────────────────────────────────────
+echo "--- 🧪 Running AI E2E Tests"
+
 PROMPT="Run all AI E2E test cases in ${TEST_DIR}/ using the ci-test-runner skill.
 
 Environment (already set as env vars, also available to wrapper scripts):
@@ -151,18 +193,14 @@ claude --print \
   --prompt "$PROMPT" \
   || CLAUDE_EXIT=$?
 
-# ── Step 7: Stop WDA ────────────────────────────────────────────────
-echo "Stopping WebDriverAgent..."
+# ── Stop WDA ─────────────────────────────────────────────────────────
+echo "--- 🧹 Cleanup"
 ruby "$WDA_STOP" --port "$WDA_PORT" 2>/dev/null || true
 
-# ── Step 8: Report results ───────────────────────────────────────────
+# ── Report results ───────────────────────────────────────────────────
+echo "--- 🚦 Results"
 RESULTS_FILE="${RESULTS_DIR}/results.md"
 if [ -f "$RESULTS_FILE" ]; then
-  echo ""
-  echo "═══════════════════════════════════════"
-  echo "  Test Results: ${RESULTS_DIR}/results.md"
-  echo "═══════════════════════════════════════"
-  echo ""
   cat "$RESULTS_FILE"
 else
   echo "Warning: no results.md found at $RESULTS_FILE"
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index f74e5cee4cfa..62c099cac071 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -138,6 +138,24 @@ steps:
         command: .buildkite/commands/lint-localized-strings-format.sh
         plugins: [$CI_TOOLKIT_PLUGIN]
 
+  #################
+  # AI E2E Tests (requires "Testing" label on PR)
+  #################
+  - label: "🤖 AI E2E Tests"
+    command: .buildkite/commands/run-ai-e2e-tests.sh
+    depends_on: "build_jetpack"
+    if: "build.pull_request.id != null"
+    soft_fail: true
+    timeout_in_minutes: 30
+    plugins: [$CI_TOOLKIT_PLUGIN]
+    env:
+      APP: jetpack
+    artifact_paths:
+      - "Tests/AgentTests/results/**/*"
+    notify:
+      - github_commit_status:
+          context: "AI E2E Tests"
+
   #################
   # Claude Build Analysis - dynamically uploaded so Build result conditions evaluate at runtime after the wait
   #################

From 026eebf058dc0cd0c784d54c3122d4e0a58acf50 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Tue, 24 Mar 2026 17:29:47 +0100
Subject: [PATCH 03/23] Use [[ instead of [ for conditional tests

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .buildkite/commands/run-ai-e2e-tests.sh | 20 ++++++++++----------
 Scripts/ci/wda-curl.sh                  |  2 +-
 Scripts/ci/wp-api.sh                    |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index 05fdcff835ef..71e725910155 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -32,7 +32,7 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 cd "$REPO_ROOT"
 
 # ── Label gate (Buildkite only) ─────────────────────────────────────
-if [ -n "${BUILDKITE_PULL_REQUEST_LABELS:-}" ]; then
+if [[-n "${BUILDKITE_PULL_REQUEST_LABELS:-}" ]]; then
   echo "--- 🏷 Checking for 'Testing' label"
 
   if ! echo ";${BUILDKITE_PULL_REQUEST_LABELS};" | grep -q ";Testing;"; then
@@ -64,7 +64,7 @@ case "$APP" in
 esac
 
 # ── Artifact download (Buildkite only) ───────────────────────────────
-if [ -n "${BUILDKITE:-}" ]; then
+if [[-n "${BUILDKITE:-}" ]]; then
   echo "--- 📦 Downloading Build Artifacts"
   download_artifact "build-products-${APP}.tar"
   tar -xf "build-products-${APP}.tar"
@@ -77,7 +77,7 @@ fi
 WDA_START="$REPO_ROOT/.claude/skills/ios-sim-navigation/scripts/wda-start.rb"
 WDA_STOP="$REPO_ROOT/.claude/skills/ios-sim-navigation/scripts/wda-stop.rb"
 
-if [ ! -f "$WDA_START" ]; then
+if [[! -f "$WDA_START" ]]; then
   echo "Error: WDA start script not found at $WDA_START" >&2
   exit 1
 fi
@@ -104,26 +104,26 @@ get_booted_udid() {
 
 UDID="$(get_booted_udid)"
 
-if [ -z "$UDID" ]; then
+if [[-z "$UDID" ]]; then
   echo "No booted simulator found. Booting '$SIMULATOR_NAME'..."
   xcrun simctl boot "$SIMULATOR_NAME"
   sleep 5
   UDID="$(get_booted_udid)"
 fi
 
-if [ -z "$UDID" ]; then
+if [[-z "$UDID" ]]; then
   echo "Error: could not find a booted simulator" >&2
   exit 1
 fi
 echo "Simulator UDID: $UDID"
 
 # ── Install app on simulator (Buildkite only) ────────────────────────
-if [ -n "${BUILDKITE:-}" ]; then
+if [[-n "${BUILDKITE:-}" ]]; then
   APP_DISPLAY_NAME="Jetpack"
-  [ "$APP" = "wordpress" ] && APP_DISPLAY_NAME="WordPress"
+  [[ "$APP" = "wordpress" ]] && APP_DISPLAY_NAME="WordPress"
 
   APP_PATH=$(find DerivedData/Build/Products -name "${APP_DISPLAY_NAME}.app" -path "*Debug-iphonesimulator*" | head -1)
-  if [ -z "$APP_PATH" ]; then
+  if [[-z "$APP_PATH" ]]; then
     echo "Error: ${APP_DISPLAY_NAME}.app not found in build products" >&2
     exit 1
   fi
@@ -140,7 +140,7 @@ SESSION_ID="$(curl -s -X POST "http://localhost:${WDA_PORT}/session" \
   -d '{"capabilities":{"alwaysMatch":{}}}' \
   | ruby -rjson -e 'puts JSON.parse(STDIN.read).dig("value", "sessionId")')"
 
-if [ -z "$SESSION_ID" ]; then
+if [[-z "$SESSION_ID" ]]; then
   echo "Error: failed to create WDA session" >&2
   ruby "$WDA_STOP" --port "$WDA_PORT" 2>/dev/null || true
   exit 1
@@ -200,7 +200,7 @@ ruby "$WDA_STOP" --port "$WDA_PORT" 2>/dev/null || true
 # ── Report results ───────────────────────────────────────────────────
 echo "--- 🚦 Results"
 RESULTS_FILE="${RESULTS_DIR}/results.md"
-if [ -f "$RESULTS_FILE" ]; then
+if [[-f "$RESULTS_FILE" ]]; then
   cat "$RESULTS_FILE"
 else
   echo "Warning: no results.md found at $RESULTS_FILE"
diff --git a/Scripts/ci/wda-curl.sh b/Scripts/ci/wda-curl.sh
index 07ed3cfce4b1..d7043b953556 100755
--- a/Scripts/ci/wda-curl.sh
+++ b/Scripts/ci/wda-curl.sh
@@ -29,7 +29,7 @@ if [[ "$URL_PATH" != /* ]]; then
   URL_PATH="/${URL_PATH}"
 fi
 
-if [ -n "$BODY" ]; then
+if [[ -n "$BODY" ]]; then
   exec curl -s -X "$METHOD" \
     -H 'Content-Type: application/json' \
     -d "$BODY" \
diff --git a/Scripts/ci/wp-api.sh b/Scripts/ci/wp-api.sh
index 6acb73170c41..e413fb7cef62 100755
--- a/Scripts/ci/wp-api.sh
+++ b/Scripts/ci/wp-api.sh
@@ -37,7 +37,7 @@ fi
 # Strip leading slash if present
 API_PATH="${API_PATH#/}"
 
-if [ -n "$BODY" ]; then
+if [[ -n "$BODY" ]]; then
   exec curl -s -X "$METHOD" \
     -u "${WP_USERNAME}:${WP_APP_PASSWORD}" \
     -H 'Content-Type: application/json' \

From 756becca6b672a6ae5223508cec8878213870ab7 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Wed, 25 Mar 2026 22:07:10 +0100
Subject: [PATCH 04/23] Fix label check (comma-separated), broken [[ syntax,
 and missing npm

- BUILDKITE_PULL_REQUEST_LABELS is comma-separated, not semicolon-separated
- Fix missing spaces after [[ in conditional tests
- Install Node.js via brew if npm is not available
- Add explicit return to get_booted_udid function
- Drop emoji from the "---" log section headers

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .buildkite/commands/run-ai-e2e-tests.sh | 43 ++++++++++++++-----------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index 71e725910155..2cb22c7e97ce 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -32,10 +32,10 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 cd "$REPO_ROOT"
 
 # ── Label gate (Buildkite only) ─────────────────────────────────────
-if [[-n "${BUILDKITE_PULL_REQUEST_LABELS:-}" ]]; then
-  echo "--- 🏷 Checking for 'Testing' label"
+if [[ -n "${BUILDKITE_PULL_REQUEST_LABELS:-}" ]]; then
+  echo "--- Checking for 'Testing' label"
 
-  if ! echo ";${BUILDKITE_PULL_REQUEST_LABELS};" | grep -q ";Testing;"; then
+  if ! echo ",${BUILDKITE_PULL_REQUEST_LABELS}," | grep -qF ",Testing,"; then
     echo "PR does not have the 'Testing' label. Skipping."
     echo "Add the label and re-run this step to trigger AI E2E tests."
     exit 0
@@ -64,12 +64,12 @@ case "$APP" in
 esac
 
 # ── Artifact download (Buildkite only) ───────────────────────────────
-if [[-n "${BUILDKITE:-}" ]]; then
-  echo "--- 📦 Downloading Build Artifacts"
+if [[ -n "${BUILDKITE:-}" ]]; then
+  echo "--- Downloading Build Artifacts"
   download_artifact "build-products-${APP}.tar"
   tar -xf "build-products-${APP}.tar"
 
-  echo "--- :rubygems: Setting up Gems"
+  echo "--- Setting up Gems"
   install_gems
 fi
 
@@ -77,20 +77,24 @@ fi
 WDA_START="$REPO_ROOT/.claude/skills/ios-sim-navigation/scripts/wda-start.rb"
 WDA_STOP="$REPO_ROOT/.claude/skills/ios-sim-navigation/scripts/wda-stop.rb"
 
-if [[! -f "$WDA_START" ]]; then
+if [[ ! -f "$WDA_START" ]]; then
   echo "Error: WDA start script not found at $WDA_START" >&2
   exit 1
 fi
 
 # ── Install Claude Code ─────────────────────────────────────────────
 if ! command -v claude &>/dev/null; then
-  echo "--- 🤖 Installing Claude Code"
+  echo "--- Installing Claude Code"
+  if ! command -v npm &>/dev/null; then
+    echo "npm not found, installing Node.js via Homebrew..."
+    brew install node
+  fi
   npm install -g @anthropic-ai/claude-code
 fi
 echo "Claude Code: $(claude --version 2>/dev/null || echo 'unknown')"
 
 # ── Detect or boot simulator ────────────────────────────────────────
-echo "--- 📱 Setting up Simulator"
+echo "--- Setting up Simulator"
 
 get_booted_udid() {
   xcrun simctl list devices booted -j 2>/dev/null \
@@ -100,30 +104,31 @@ get_booted_udid() {
           devs.each { |d| (puts d["udid"]; exit) if d["state"] == "Booted" }
         end
       ' 2>/dev/null || true
+  return 0
 }
 
 UDID="$(get_booted_udid)"
 
-if [[-z "$UDID" ]]; then
+if [[ -z "$UDID" ]]; then
   echo "No booted simulator found. Booting '$SIMULATOR_NAME'..."
   xcrun simctl boot "$SIMULATOR_NAME"
   sleep 5
   UDID="$(get_booted_udid)"
 fi
 
-if [[-z "$UDID" ]]; then
+if [[ -z "$UDID" ]]; then
   echo "Error: could not find a booted simulator" >&2
   exit 1
 fi
 echo "Simulator UDID: $UDID"
 
 # ── Install app on simulator (Buildkite only) ────────────────────────
-if [[-n "${BUILDKITE:-}" ]]; then
+if [[ -n "${BUILDKITE:-}" ]]; then
   APP_DISPLAY_NAME="Jetpack"
   [[ "$APP" = "wordpress" ]] && APP_DISPLAY_NAME="WordPress"
 
   APP_PATH=$(find DerivedData/Build/Products -name "${APP_DISPLAY_NAME}.app" -path "*Debug-iphonesimulator*" | head -1)
-  if [[-z "$APP_PATH" ]]; then
+  if [[ -z "$APP_PATH" ]]; then
     echo "Error: ${APP_DISPLAY_NAME}.app not found in build products" >&2
     exit 1
   fi
@@ -132,7 +137,7 @@ if [[-n "${BUILDKITE:-}" ]]; then
 fi
 
 # ── Start WDA ────────────────────────────────────────────────────────
-echo "--- 🔌 Starting WebDriverAgent"
+echo "--- Starting WebDriverAgent"
 ruby "$WDA_START" --udid "$UDID" --port "$WDA_PORT"
 
 SESSION_ID="$(curl -s -X POST "http://localhost:${WDA_PORT}/session" \
@@ -140,7 +145,7 @@ SESSION_ID="$(curl -s -X POST "http://localhost:${WDA_PORT}/session" \
   -d '{"capabilities":{"alwaysMatch":{}}}' \
   | ruby -rjson -e 'puts JSON.parse(STDIN.read).dig("value", "sessionId")')"
 
-if [[-z "$SESSION_ID" ]]; then
+if [[ -z "$SESSION_ID" ]]; then
   echo "Error: failed to create WDA session" >&2
   ruby "$WDA_STOP" --port "$WDA_PORT" 2>/dev/null || true
   exit 1
@@ -162,7 +167,7 @@ RESULTS_DIR="Tests/AgentTests/results/${TIMESTAMP}"
 mkdir -p "$RESULTS_DIR"
 
 # ── Run Claude Code ──────────────────────────────────────────────────
-echo "--- 🧪 Running AI E2E Tests"
+echo "--- Running AI E2E Tests"
 
 PROMPT="Run all AI E2E test cases in ${TEST_DIR}/ using the ci-test-runner skill.
 
@@ -194,13 +199,13 @@ claude --print \
   || CLAUDE_EXIT=$?
 
 # ── Stop WDA ─────────────────────────────────────────────────────────
-echo "--- 🧹 Cleanup"
+echo "--- Cleanup"
 ruby "$WDA_STOP" --port "$WDA_PORT" 2>/dev/null || true
 
 # ── Report results ───────────────────────────────────────────────────
-echo "--- 🚦 Results"
+echo "--- Results"
 RESULTS_FILE="${RESULTS_DIR}/results.md"
-if [[-f "$RESULTS_FILE" ]]; then
+if [[ -f "$RESULTS_FILE" ]]; then
   cat "$RESULTS_FILE"
 else
   echo "Warning: no results.md found at $RESULTS_FILE"

From 564c05ad135402736b2cacddfda6f42b8b26b120 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Wed, 25 Mar 2026 22:42:43 +0100
Subject: [PATCH 05/23] Clone and build WebDriverAgent if not present on CI
 agent

Extract WDA build to a separate build-wda.sh script for clarity.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .buildkite/commands/build-wda.sh        | 27 +++++++++++++++++++++++++
 .buildkite/commands/run-ai-e2e-tests.sh |  4 ++++
 2 files changed, 31 insertions(+)
 create mode 100755 .buildkite/commands/build-wda.sh

diff --git a/.buildkite/commands/build-wda.sh b/.buildkite/commands/build-wda.sh
new file mode 100755
index 000000000000..0445b9bbfab1
--- /dev/null
+++ b/.buildkite/commands/build-wda.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+# Clone and build WebDriverAgent for iOS Simulator testing.
+#
+# Skips if WDA is already built at .build/WebDriverAgent/.
+#
+# Required:
+#   SIMULATOR_NAME  Simulator name for the build destination (e.g., iPhone 16)
+
+set -euo pipefail
+
+SIMULATOR_NAME="${SIMULATOR_NAME:?Set SIMULATOR_NAME}"
+WDA_PROJECT=".build/WebDriverAgent/WebDriverAgent.xcodeproj"
+
+if [[ -d "$WDA_PROJECT" ]]; then
+  echo "WebDriverAgent already built, skipping."
+  return 0 2>/dev/null || exit 0
+fi
+
+mkdir -p .build
+git clone --depth 1 https://github.com/appium/WebDriverAgent.git .build/WebDriverAgent
+
+xcodebuild build-for-testing \
+  -project "$WDA_PROJECT" \
+  -scheme WebDriverAgentRunner \
+  -destination "platform=iOS Simulator,name=$SIMULATOR_NAME" \
+  CODE_SIGNING_ALLOWED=NO \
+  | tail -1
diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index 2cb22c7e97ce..64ec475e72a8 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -136,6 +136,10 @@ if [[ -n "${BUILDKITE:-}" ]]; then
   xcrun simctl install "$UDID" "$APP_PATH"
 fi
 
+# ── Build WebDriverAgent (if not present) ────────────────────────────
+echo "--- Building WebDriverAgent"
+"$(dirname "$0")/build-wda.sh"
+
 # ── Start WDA ────────────────────────────────────────────────────────
 echo "--- Starting WebDriverAgent"
 ruby "$WDA_START" --udid "$UDID" --port "$WDA_PORT"

From ceeff2d906d043b064233aa797f4761163317e3f Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Thu, 26 Mar 2026 10:54:36 +0100
Subject: [PATCH 06/23] Export SIMULATOR_NAME so build-wda.sh can read it

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .buildkite/commands/run-ai-e2e-tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index 64ec475e72a8..2d132abd9850 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -51,7 +51,7 @@ fi
 
 # ── Defaults ─────────────────────────────────────────────────────────
 APP="${APP:-jetpack}"
-SIMULATOR_NAME="${SIMULATOR_NAME:-iPhone 16}"
+export SIMULATOR_NAME="${SIMULATOR_NAME:-iPhone 16}"
 WDA_PORT="${WDA_PORT:-8100}"
 CLAUDE_MAX_TURNS="${CLAUDE_MAX_TURNS:-200}"
 TEST_DIR="${TEST_DIR:-Tests/AgentTests/ui-tests}"

From d7cffef3e9894e852863fd04757844280cf66103 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Thu, 26 Mar 2026 11:41:14 +0100
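Subject: [PATCH 07/23] Harden Claude AI E2E runner

- Run each markdown test file separately, creating a WDA session per test
- Restrict Claude's allowed tools to the Scripts/ci wrapper scripts and sleep
- Pin the Claude Code version (CLAUDE_CODE_EXPECTED_VERSION, default 2.1.84)
- Lower the default CLAUDE_MAX_TURNS from 200 to 120
- Stop WebDriverAgent from an EXIT trap so it is cleaned up on every exit path
- Fail early when TEST_DIR is missing or contains no markdown test files
- Resolve the simulator by name and pass its UDID to build-wda.sh
- build-wda.sh: skip the build only when an .xctestrun artifact exists, and
  fail if the build finishes without producing one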
---
 .buildkite/commands/build-wda.sh        |  62 +++-
 .buildkite/commands/run-ai-e2e-tests.sh | 383 +++++++++++++++++-------
 .buildkite/pipeline.yml                 |   2 +
 .claude/settings.json                   |  24 +-
 .claude/skills/ci-test-runner/SKILL.md  | 350 ++++++----------------
 Scripts/ci/assemble-ai-test-results.rb  |  40 +++
 Scripts/ci/create-wda-session.rb        |  22 ++
 Scripts/ci/find-booted-simulator.rb     |  38 +++
 Scripts/ci/inspect-ai-test-case.rb      |  39 +++
 Scripts/ci/launch-app.sh                |   5 +
 Scripts/ci/read-ai-test-result.rb       |  13 +
 Scripts/ci/record-ai-test-result.sh     |  27 ++
 Scripts/ci/take-ai-test-screenshot.sh   |  21 ++
 Scripts/ci/wda-curl.sh                  |  12 +-
 Scripts/ci/wp-api.sh                    |  55 +++-
 Scripts/ci/write-ai-test-result.rb      |  24 ++
 16 files changed, 715 insertions(+), 402 deletions(-)
 create mode 100755 Scripts/ci/assemble-ai-test-results.rb
 create mode 100644 Scripts/ci/create-wda-session.rb
 create mode 100644 Scripts/ci/find-booted-simulator.rb
 create mode 100644 Scripts/ci/inspect-ai-test-case.rb
 create mode 100644 Scripts/ci/read-ai-test-result.rb
 create mode 100755 Scripts/ci/record-ai-test-result.sh
 create mode 100755 Scripts/ci/take-ai-test-screenshot.sh
 create mode 100755 Scripts/ci/write-ai-test-result.rb

diff --git a/.buildkite/commands/build-wda.sh b/.buildkite/commands/build-wda.sh
index 0445b9bbfab1..5b079b0c6c30 100755
--- a/.buildkite/commands/build-wda.sh
+++ b/.buildkite/commands/build-wda.sh
@@ -1,27 +1,69 @@
 #!/usr/bin/env bash
 # Clone and build WebDriverAgent for iOS Simulator testing.
 #
-# Skips if WDA is already built at .build/WebDriverAgent/.
+# Skips the build only when a usable build-for-testing artifact already exists.
 #
-# Required:
+# Required (one of):
+#   SIMULATOR_UDID  Simulator UDID for the build destination
 #   SIMULATOR_NAME  Simulator name for the build destination (e.g., iPhone 16)
+#
+# Optional:
+#   WEBDRIVERAGENT_REPO_URL  Repo URL (default: appium/WebDriverAgent)
+#   WEBDRIVERAGENT_REF       Git ref or commit to build (default: current remote HEAD / existing checkout)
 
 set -euo pipefail
 
-SIMULATOR_NAME="${SIMULATOR_NAME:?Set SIMULATOR_NAME}"
-WDA_PROJECT=".build/WebDriverAgent/WebDriverAgent.xcodeproj"
+if [[ -z "${SIMULATOR_UDID:-}" && -z "${SIMULATOR_NAME:-}" ]]; then
+  echo "Error: set SIMULATOR_UDID or SIMULATOR_NAME" >&2
+  exit 1
+fi
+
+WDA_DIR=".build/WebDriverAgent"
+WDA_PROJECT="${WDA_DIR}/WebDriverAgent.xcodeproj"
+WDA_DERIVED_DATA="${WDA_DIR}/DerivedData"
+WEBDRIVERAGENT_REPO_URL="${WEBDRIVERAGENT_REPO_URL:-https://github.com/appium/WebDriverAgent.git}"
+WEBDRIVERAGENT_REF="${WEBDRIVERAGENT_REF:-}"
 
-if [[ -d "$WDA_PROJECT" ]]; then
-  echo "WebDriverAgent already built, skipping."
-  return 0 2>/dev/null || exit 0
+if [[ -n "${SIMULATOR_UDID:-}" ]]; then
+  DESTINATION="platform=iOS Simulator,id=${SIMULATOR_UDID}"
+else
+  DESTINATION="platform=iOS Simulator,name=${SIMULATOR_NAME}"
 fi
 
-mkdir -p .build
-git clone --depth 1 https://github.com/appium/WebDriverAgent.git .build/WebDriverAgent
+ensure_wda_checkout() {
+  mkdir -p .build
+
+  if [[ ! -d "${WDA_DIR}/.git" ]]; then
+    git clone --depth 1 "${WEBDRIVERAGENT_REPO_URL}" "${WDA_DIR}"
+  fi
+
+  if [[ -n "${WEBDRIVERAGENT_REF}" ]]; then
+    git -C "${WDA_DIR}" fetch --depth 1 origin "${WEBDRIVERAGENT_REF}"
+    git -C "${WDA_DIR}" checkout --detach "${WEBDRIVERAGENT_REF}"
+  fi
+}
+
+has_built_artifacts() {
+  [[ -d "${WDA_DERIVED_DATA}/Build/Products" ]] && \
+    find "${WDA_DERIVED_DATA}/Build/Products" -name '*.xctestrun' -print -quit | grep -q .
+}
+
+ensure_wda_checkout
+
+if [[ -d "$WDA_PROJECT" ]] && has_built_artifacts; then
+  echo "WebDriverAgent already built, skipping."
+  exit 0
+fi
 
 xcodebuild build-for-testing \
   -project "$WDA_PROJECT" \
   -scheme WebDriverAgentRunner \
-  -destination "platform=iOS Simulator,name=$SIMULATOR_NAME" \
+  -destination "$DESTINATION" \
+  -derivedDataPath "$WDA_DERIVED_DATA" \
   CODE_SIGNING_ALLOWED=NO \
   | tail -1
+
+if ! has_built_artifacts; then
+  echo "Error: WebDriverAgent build completed without an .xctestrun artifact" >&2
+  exit 1
+fi
diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index 2d132abd9850..4ba11cefbc38 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -1,15 +1,16 @@
 #!/usr/bin/env bash
-# Run AI-driven E2E tests on an iOS Simulator using Claude Code.
+# Run AI-driven E2E tests on an iOS Simulator using Claude Code with a
+# tightly scoped set of wrapper scripts and runner-side result enforcement.
 #
 # This script manages the full lifecycle:
 #   1. Check for "Testing" label on PR (Buildkite only, skips if missing)
 #   2. Download build artifacts and install app (Buildkite only)
 #   3. Install Claude Code (if needed)
-#   4. Detect or boot a simulator
-#   5. Start WebDriverAgent and create a session
-#   6. Run Claude Code with a locked-down tool allowlist
-#   7. Stop WebDriverAgent
-#   8. Exit with the test result code
+#   4. Resolve a specific simulator UDID
+#   5. Start WebDriverAgent
+#   6. Run each markdown test file separately with locked-down wrappers
+#   7. Enforce verification / cleanup / final-result contracts per test
+#   8. Stop WebDriverAgent and print results
 #
 # Required environment variables:
 #   ANTHROPIC_API_KEY   Claude API key
@@ -18,20 +19,32 @@
 #   WP_APP_PASSWORD     WordPress application password
 #
 # Optional environment variables:
-#   APP                 wordpress | jetpack (default: jetpack)
-#   SIMULATOR_NAME      Simulator to boot if none running (default: iPhone 16)
-#   WDA_PORT            WebDriverAgent port (default: 8100)
-#   CLAUDE_MAX_TURNS    Max Claude Code tool-use turns (default: 200)
-#   TEST_DIR            Test directory (default: Tests/AgentTests/ui-tests)
-#   CLAUDE_MODEL        Model to use (default: claude-sonnet-4-20250514)
+#   APP                            wordpress | jetpack (default: jetpack)
+#   SIMULATOR_NAME                 Simulator to boot if none running (default: iPhone 16)
+#   WDA_PORT                       WebDriverAgent port (default: 8100)
+#   CLAUDE_MAX_TURNS               Max Claude Code tool-use turns (default: 120)
+#   TEST_DIR                       Test directory (default: Tests/AgentTests/ui-tests)
+#   CLAUDE_MODEL                   Model to use (default: claude-sonnet-4-20250514)
+#   CLAUDE_CODE_EXPECTED_VERSION   Claude Code version to install (default: 2.1.84)
+#   CLAUDE_CODE_NPM_SPEC           npm package spec for Claude Code
 
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 cd "$REPO_ROOT"
+WDA_STARTED=0
 
-# ── Label gate (Buildkite only) ─────────────────────────────────────
+cleanup_wda() {
+  if [[ "$WDA_STARTED" -eq 1 ]]; then
+    echo "--- Cleanup"
+    ruby "$WDA_STOP" --port "$WDA_PORT" 2>/dev/null || true
+  fi
+}
+
+trap cleanup_wda EXIT
+
+# ── Label gate (Buildkite only) ────────────────────────────────────────
 if [[ -n "${BUILDKITE_PULL_REQUEST_LABELS:-}" ]]; then
   echo "--- Checking for 'Testing' label"
 
@@ -43,19 +56,21 @@ if [[ -n "${BUILDKITE_PULL_REQUEST_LABELS:-}" ]]; then
   echo "'Testing' label found."
 fi
 
-# ── Required env vars ────────────────────────────────────────────────
+# ── Required env vars ──────────────────────────────────────────────────
 : "${ANTHROPIC_API_KEY:?Set ANTHROPIC_API_KEY}"
 : "${SITE_URL:?Set SITE_URL}"
 : "${WP_USERNAME:?Set WP_USERNAME}"
 : "${WP_APP_PASSWORD:?Set WP_APP_PASSWORD}"
 
-# ── Defaults ─────────────────────────────────────────────────────────
+# ── Defaults ───────────────────────────────────────────────────────────
 APP="${APP:-jetpack}"
 export SIMULATOR_NAME="${SIMULATOR_NAME:-iPhone 16}"
 WDA_PORT="${WDA_PORT:-8100}"
-CLAUDE_MAX_TURNS="${CLAUDE_MAX_TURNS:-200}"
+CLAUDE_MAX_TURNS="${CLAUDE_MAX_TURNS:-120}"
 TEST_DIR="${TEST_DIR:-Tests/AgentTests/ui-tests}"
 CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-20250514}"
+CLAUDE_CODE_EXPECTED_VERSION="${CLAUDE_CODE_EXPECTED_VERSION:-2.1.84}"
+CLAUDE_CODE_NPM_SPEC="${CLAUDE_CODE_NPM_SPEC:-@anthropic-ai/claude-code@${CLAUDE_CODE_EXPECTED_VERSION}}"
 
 case "$APP" in
   wordpress) BUNDLE_ID="org.wordpress" ;;
@@ -63,7 +78,7 @@ case "$APP" in
   *) echo "Error: APP must be 'wordpress' or 'jetpack', got '$APP'" >&2; exit 1 ;;
 esac
 
-# ── Artifact download (Buildkite only) ───────────────────────────────
+# ── Artifact download (Buildkite only) ─────────────────────────────────
 if [[ -n "${BUILDKITE:-}" ]]; then
   echo "--- Downloading Build Artifacts"
   download_artifact "build-products-${APP}.tar"
@@ -73,7 +88,12 @@ if [[ -n "${BUILDKITE:-}" ]]; then
   install_gems
 fi
 
-# ── Locate WDA scripts ──────────────────────────────────────────────
+if [[ ! -d "$TEST_DIR" ]]; then
+  echo "Error: test directory not found: $TEST_DIR" >&2
+  exit 1
+fi
+
+# ── Locate WDA scripts ─────────────────────────────────────────────────
 WDA_START="$REPO_ROOT/.claude/skills/ios-sim-navigation/scripts/wda-start.rb"
 WDA_STOP="$REPO_ROOT/.claude/skills/ios-sim-navigation/scripts/wda-stop.rb"
 
@@ -82,137 +102,274 @@ if [[ ! -f "$WDA_START" ]]; then
   exit 1
 fi
 
-# ── Install Claude Code ─────────────────────────────────────────────
-if ! command -v claude &>/dev/null; then
-  echo "--- Installing Claude Code"
+write_result_file() {
+  local status="$1"
+  local reason="$2"
+  local screenshot_rel="${3:-}"
+
+  ruby Scripts/ci/write-ai-test-result.rb \
+    "$AI_TEST_RESULT_FILE" \
+    "$AI_TEST_TITLE" \
+    "$AI_TEST_FILE" \
+    "$status" \
+    "$reason" \
+    "$screenshot_rel"
+}
+
+result_field() {
+  local key="$1"
+  ruby Scripts/ci/read-ai-test-result.rb "$AI_TEST_RESULT_FILE" "$key"
+}
+
+recorded_result_count() {
+  if [[ -f "$AI_TEST_RESULT_EVENTS_FILE" ]]; then
+    awk 'END { print NR + 0 }' "$AI_TEST_RESULT_EVENTS_FILE"
+  else
+    echo 0
+  fi
+}
+
+successful_rest_calls() {
+  local purpose="$1"
+  if [[ -f "$AI_TEST_USAGE_FILE" ]]; then
+    awk -F '\t' -v purpose="$purpose" '$1 == purpose && $3 == "1" { count += 1 } END { print count + 0 }' "$AI_TEST_USAGE_FILE"
+  else
+    echo 0
+  fi
+}
+
+join_reasons() {
+  local joined=""
+  local reason
+  for reason in "$@"; do
+    if [[ -n "$joined" ]]; then
+      joined="${joined}; ${reason}"
+    else
+      joined="$reason"
+    fi
+  done
+  printf '%s' "$joined"
+}
+
+# ── Install Claude Code ────────────────────────────────────────────────
+if ! command -v claude &>/dev/null || ! claude --version 2>/dev/null | grep -Fq "$CLAUDE_CODE_EXPECTED_VERSION"; then
+  echo "--- Installing Claude Code (${CLAUDE_CODE_NPM_SPEC})"
   if ! command -v npm &>/dev/null; then
     echo "npm not found, installing Node.js via Homebrew..."
     brew install node
   fi
-  npm install -g @anthropic-ai/claude-code
+  npm install -g "$CLAUDE_CODE_NPM_SPEC"
 fi
 echo "Claude Code: $(claude --version 2>/dev/null || echo 'unknown')"
 
-# ── Detect or boot simulator ────────────────────────────────────────
+# CI permissions are defined explicitly here. Do not rely on
+# .claude/settings.json for the Buildkite execution path.
+CLAUDE_ALLOWED_TOOLS=(
+  --allowedTools "Bash(./Scripts/ci/launch-app.sh)"
+  --allowedTools "Bash(./Scripts/ci/wda-curl.sh *)"
+  --allowedTools "Bash(./Scripts/ci/wp-api.sh *)"
+  --allowedTools "Bash(./Scripts/ci/take-ai-test-screenshot.sh *)"
+  --allowedTools "Bash(./Scripts/ci/record-ai-test-result.sh *)"
+  --allowedTools "Bash(sleep *)"
+)
+
+# ── Resolve simulator ──────────────────────────────────────────────────
 echo "--- Setting up Simulator"
 
-get_booted_udid() {
-  xcrun simctl list devices booted -j 2>/dev/null \
-    | ruby -rjson -e '
-        data = JSON.parse(STDIN.read)
-        data.fetch("devices", {}).each_value do |devs|
-          devs.each { |d| (puts d["udid"]; exit) if d["state"] == "Booted" }
-        end
-      ' 2>/dev/null || true
-  return 0
-}
-
-UDID="$(get_booted_udid)"
-
-if [[ -z "$UDID" ]]; then
-  echo "No booted simulator found. Booting '$SIMULATOR_NAME'..."
-  xcrun simctl boot "$SIMULATOR_NAME"
-  sleep 5
-  UDID="$(get_booted_udid)"
+SIMULATOR_UDID="$(ruby Scripts/ci/find-booted-simulator.rb "$SIMULATOR_NAME" 2>/dev/null || true)"
+if [[ -z "$SIMULATOR_UDID" ]]; then
+  echo "No booted simulator named '$SIMULATOR_NAME' found. Booting..."
+  xcrun simctl boot "$SIMULATOR_NAME" 2>/dev/null || true
+  SIMULATOR_UDID="$(ruby Scripts/ci/find-booted-simulator.rb "$SIMULATOR_NAME" 30 1 2>/dev/null || true)"
 fi
 
-if [[ -z "$UDID" ]]; then
-  echo "Error: could not find a booted simulator" >&2
+if [[ -z "$SIMULATOR_UDID" ]]; then
+  echo "Error: could not find a booted simulator named '$SIMULATOR_NAME'" >&2
   exit 1
 fi
-echo "Simulator UDID: $UDID"
+export SIMULATOR_UDID
+echo "Simulator UDID: $SIMULATOR_UDID"
 
-# ── Install app on simulator (Buildkite only) ────────────────────────
+# ── Install app on simulator (Buildkite only) ─────────────────────────
 if [[ -n "${BUILDKITE:-}" ]]; then
   APP_DISPLAY_NAME="Jetpack"
   [[ "$APP" = "wordpress" ]] && APP_DISPLAY_NAME="WordPress"
 
-  APP_PATH=$(find DerivedData/Build/Products -name "${APP_DISPLAY_NAME}.app" -path "*Debug-iphonesimulator*" | head -1)
+  APP_PATH="$(find DerivedData/Build/Products -name "${APP_DISPLAY_NAME}.app" -path "*Debug-iphonesimulator*" | head -1)"
   if [[ -z "$APP_PATH" ]]; then
     echo "Error: ${APP_DISPLAY_NAME}.app not found in build products" >&2
     exit 1
   fi
   echo "Installing $APP_PATH on simulator..."
-  xcrun simctl install "$UDID" "$APP_PATH"
+  xcrun simctl install "$SIMULATOR_UDID" "$APP_PATH"
 fi
 
-# ── Build WebDriverAgent (if not present) ────────────────────────────
+# ── Build and start WDA ────────────────────────────────────────────────
 echo "--- Building WebDriverAgent"
 "$(dirname "$0")/build-wda.sh"
 
-# ── Start WDA ────────────────────────────────────────────────────────
 echo "--- Starting WebDriverAgent"
-ruby "$WDA_START" --udid "$UDID" --port "$WDA_PORT"
-
-SESSION_ID="$(curl -s -X POST "http://localhost:${WDA_PORT}/session" \
-  -H 'Content-Type: application/json' \
-  -d '{"capabilities":{"alwaysMatch":{}}}' \
-  | ruby -rjson -e 'puts JSON.parse(STDIN.read).dig("value", "sessionId")')"
-
-if [[ -z "$SESSION_ID" ]]; then
-  echo "Error: failed to create WDA session" >&2
-  ruby "$WDA_STOP" --port "$WDA_PORT" 2>/dev/null || true
+ruby "$WDA_START" --udid "$SIMULATOR_UDID" --port "$WDA_PORT"
+WDA_STARTED=1
+
+RESULTS_DIR="Tests/AgentTests/results/$(date +%Y-%m-%d-%H%M)"
+RESULTS_JSON_DIR="${RESULTS_DIR}/.results"
+RESULT_EVENTS_DIR="${RESULTS_DIR}/.result-events"
+USAGE_DIR="${RESULTS_DIR}/.rest-api-usage"
+SCREENSHOTS_DIR="${RESULTS_DIR}/screenshots"
+mkdir -p "$RESULTS_JSON_DIR" "$RESULT_EVENTS_DIR" "$USAGE_DIR" "$SCREENSHOTS_DIR"
+
+TEST_FILES=()
+while IFS= read -r test_file; do
+  TEST_FILES+=("$test_file")
+done < <(find "$TEST_DIR" -maxdepth 1 -type f -name '*.md' | sort)
+
+if [[ ${#TEST_FILES[@]} -eq 0 ]]; then
+  cat > "${RESULTS_DIR}/results.md" <<EOF
+# Test Results
+
+- **Date:** $(date +%Y-%m-%d\ %H:%M)
+- **App:** ${APP}
+- **Site:** ${SITE_URL}
+- **Total:** 0 | **Passed:** 0 | **Failed:** 0
+
+## Results
+
+No markdown test files were found in ${TEST_DIR}.
+EOF
+  echo "Error: no markdown test files found in ${TEST_DIR}" >&2
   exit 1
 fi
-echo "WDA Session: $SESSION_ID"
-
-# ── Export env vars for wrapper scripts and Claude ───────────────────
-export SIMULATOR_UDID="$UDID"
-export WDA_SESSION_ID="$SESSION_ID"
-export WDA_PORT
-export APP_BUNDLE_ID="$BUNDLE_ID"
-export SITE_URL
-export WP_USERNAME
-export WP_APP_PASSWORD
-
-# ── Prepare results directory ────────────────────────────────────────
-TIMESTAMP="$(date +%Y-%m-%d-%H%M)"
-RESULTS_DIR="Tests/AgentTests/results/${TIMESTAMP}"
-mkdir -p "$RESULTS_DIR"
-
-# ── Run Claude Code ──────────────────────────────────────────────────
+
 echo "--- Running AI E2E Tests"
 
-PROMPT="Run all AI E2E test cases in ${TEST_DIR}/ using the ci-test-runner skill.
+declare -a RESULT_FILES=()
+OVERALL_EXIT=0
+
+for index in "${!TEST_FILES[@]}"; do
+  AI_TEST_FILE="${TEST_FILES[$index]}"
+  AI_TEST_TITLE="$(ruby Scripts/ci/inspect-ai-test-case.rb "$AI_TEST_FILE" title)"
+  AI_TEST_SLUG="$(ruby Scripts/ci/inspect-ai-test-case.rb "$AI_TEST_FILE" slug)"
+  AI_TEST_RESULT_FILE="${RESULTS_JSON_DIR}/${AI_TEST_SLUG}.json"
+  AI_TEST_RESULT_EVENTS_FILE="${RESULT_EVENTS_DIR}/${AI_TEST_SLUG}.log"
+  AI_TEST_USAGE_FILE="${USAGE_DIR}/${AI_TEST_SLUG}.log"
+  AI_TEST_RESULTS_DIR="$RESULTS_DIR"
+  AI_TEST_SCREENSHOTS_DIR="$SCREENSHOTS_DIR"
+  export AI_TEST_FILE AI_TEST_TITLE AI_TEST_SLUG AI_TEST_RESULT_FILE AI_TEST_RESULT_EVENTS_FILE AI_TEST_USAGE_FILE AI_TEST_RESULTS_DIR AI_TEST_SCREENSHOTS_DIR
+
+  rm -f "$AI_TEST_RESULT_FILE" "$AI_TEST_RESULT_EVENTS_FILE" "$AI_TEST_USAGE_FILE"
+
+  VERIFICATION_EXPECTED="$(ruby Scripts/ci/inspect-ai-test-case.rb "$AI_TEST_FILE" section-present verification)"
+  CLEANUP_EXPECTED="$(ruby Scripts/ci/inspect-ai-test-case.rb "$AI_TEST_FILE" section-present cleanup)"
+  export WDA_SESSION_ID=""
+  WDA_SESSION_ID="$(ruby Scripts/ci/create-wda-session.rb "$WDA_PORT" 2>/dev/null || true)"
+
+  echo
+  echo "============================================================"
+  echo "[$((index + 1))/${#TEST_FILES[@]}] ${AI_TEST_TITLE}"
+  echo "============================================================"
+
+  if [[ -z "$WDA_SESSION_ID" ]]; then
+    write_result_file fail "Failed to create a WebDriverAgent session before test execution"
+    RESULT_FILES+=("$AI_TEST_RESULT_FILE")
+    OVERALL_EXIT=1
+    continue
+  fi
+  export WDA_SESSION_ID
+
+  TEST_CONTENT="$(cat "$AI_TEST_FILE")"
+  PROMPT="$(cat <<EOF
+Execute exactly one AI-driven iOS UI test case against the ${APP} app.
 
-Environment (already set as env vars, also available to wrapper scripts):
-- App: ${APP} (bundle ID: ${BUNDLE_ID})
-- Simulator UDID: ${UDID}
-- WDA Session ID: ${SESSION_ID}
+Environment:
+- App bundle ID: ${BUNDLE_ID}
+- Simulator UDID: ${SIMULATOR_UDID}
 - WDA Port: ${WDA_PORT}
+- WDA Session ID: ${WDA_SESSION_ID}
 - Site URL: ${SITE_URL}
 - Username: ${WP_USERNAME}
-- Results directory: ${RESULTS_DIR}
-- Screenshots directory: ${RESULTS_DIR}/screenshots"
-
-CLAUDE_EXIT=0
-claude --print \
-  --model "$CLAUDE_MODEL" \
-  --max-turns "$CLAUDE_MAX_TURNS" \
-  --allowedTools "Read" \
-  --allowedTools "Glob(Tests/AgentTests/**)" \
-  --allowedTools "Write(Tests/AgentTests/results/*)" \
-  --allowedTools "Bash(./Scripts/ci/wda-curl.sh *)" \
-  --allowedTools "Bash(./Scripts/ci/wp-api.sh *)" \
-  --allowedTools "Bash(./Scripts/ci/launch-app.sh)" \
-  --allowedTools "Bash(xcrun simctl terminate *)" \
-  --allowedTools "Bash(xcrun simctl io * screenshot Tests/AgentTests/results/*)" \
-  --allowedTools "Bash(sleep *)" \
-  --allowedTools "Bash(mkdir -p Tests/AgentTests/results/*)" \
-  --prompt "$PROMPT" \
-  || CLAUDE_EXIT=$?
-
-# ── Stop WDA ─────────────────────────────────────────────────────────
-echo "--- Cleanup"
-ruby "$WDA_STOP" --port "$WDA_PORT" 2>/dev/null || true
-
-# ── Report results ───────────────────────────────────────────────────
+- Verification required: $( [[ "$VERIFICATION_EXPECTED" == "1" ]] && echo yes || echo no )
+- Cleanup required: $( [[ "$CLEANUP_EXPECTED" == "1" ]] && echo yes || echo no )
+
+Available commands:
+- ./Scripts/ci/launch-app.sh
+- ./Scripts/ci/wda-curl.sh METHOD PATH [JSON_BODY]
+- ./Scripts/ci/wp-api.sh PURPOSE METHOD PATH [JSON_BODY]
+- ./Scripts/ci/take-ai-test-screenshot.sh LABEL
+- ./Scripts/ci/record-ai-test-result.sh STATUS REASON [SCREENSHOT_RELATIVE_PATH]
+- sleep N
+
+Rules:
+- Start by running ./Scripts/ci/launch-app.sh, then sleep 3, then fetch the accessibility tree.
+- Use the accessibility tree instead of screenshots whenever possible.
+- Use wp-api.sh with purpose=setup for prerequisites, purpose=verification for verification work, and purpose=cleanup for cleanup work.
+- If you fail the test, take a screenshot first and pass the returned relative path to record-ai-test-result.sh.
+- You must call record-ai-test-result.sh exactly once before you stop.
+- Keep reasons short and single-line so they are safe to store in CI output.
+
+Test case:
+
+${TEST_CONTENT}
+EOF
+)"
+
+  CLAUDE_EXIT=0
+  claude --print \
+    --model "$CLAUDE_MODEL" \
+    --max-turns "$CLAUDE_MAX_TURNS" \
+    "${CLAUDE_ALLOWED_TOOLS[@]}" \
+    --prompt "$PROMPT" \
+    || CLAUDE_EXIT=$?
+
+  declare -a ENFORCEMENT_REASONS=()
+  if [[ $CLAUDE_EXIT -ne 0 ]]; then
+    ENFORCEMENT_REASONS+=("Claude exited with code ${CLAUDE_EXIT}")
+  fi
+
+  RESULT_CALLS="$(recorded_result_count)"
+  if [[ "$RESULT_CALLS" -eq 0 ]]; then
+    ENFORCEMENT_REASONS+=('Claude did not record a final result')
+  elif [[ "$RESULT_CALLS" -gt 1 ]]; then
+    ENFORCEMENT_REASONS+=('Claude recorded multiple final results')
+  fi
+
+  if [[ "$VERIFICATION_EXPECTED" == "1" && "$(successful_rest_calls verification)" -eq 0 ]]; then
+    ENFORCEMENT_REASONS+=('verification section was declared but no successful verification REST call completed')
+  fi
+
+  if [[ "$CLEANUP_EXPECTED" == "1" && "$(successful_rest_calls cleanup)" -eq 0 ]]; then
+    ENFORCEMENT_REASONS+=('cleanup section was declared but no successful cleanup REST call completed')
+  fi
+
+  if [[ ! -f "$AI_TEST_RESULT_FILE" ]]; then
+    write_result_file fail "$(join_reasons "${ENFORCEMENT_REASONS[@]}")"
+  elif [[ ${#ENFORCEMENT_REASONS[@]} -gt 0 ]]; then
+    CURRENT_REASON="$(result_field reason)"
+    CURRENT_SCREENSHOT="$(result_field screenshot)"
+    if [[ -n "$CURRENT_REASON" ]]; then
+      ENFORCED_REASON="${CURRENT_REASON}. Runner enforcement: $(join_reasons "${ENFORCEMENT_REASONS[@]}")"
+    else
+      ENFORCED_REASON="Runner enforcement: $(join_reasons "${ENFORCEMENT_REASONS[@]}")"
+    fi
+    write_result_file fail "$ENFORCED_REASON" "$CURRENT_SCREENSHOT"
+  fi
+
+  RESULT_STATUS="$(result_field status)"
+  RESULT_REASON="$(result_field reason)"
+  echo "[${RESULT_STATUS^^}] ${AI_TEST_TITLE}"
+  echo "  ${RESULT_REASON}"
+
+  if [[ "$RESULT_STATUS" != "pass" ]]; then
+    OVERALL_EXIT=1
+  fi
+
+  RESULT_FILES+=("$AI_TEST_RESULT_FILE")
+done
+
+ruby Scripts/ci/assemble-ai-test-results.rb "$RESULTS_DIR" "$APP" "$SITE_URL" "${RESULT_FILES[@]}"
+
+# ── Report results ─────────────────────────────────────────────────────
 echo "--- Results"
-RESULTS_FILE="${RESULTS_DIR}/results.md"
-if [[ -f "$RESULTS_FILE" ]]; then
-  cat "$RESULTS_FILE"
-else
-  echo "Warning: no results.md found at $RESULTS_FILE"
-fi
+cat "${RESULTS_DIR}/results.md"
 
-exit "$CLAUDE_EXIT"
+exit "$OVERALL_EXIT"
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 62c099cac071..9ef86d3a4a47 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -150,6 +150,8 @@ steps:
     plugins: [$CI_TOOLKIT_PLUGIN]
     env:
       APP: jetpack
+      CLAUDE_CODE_EXPECTED_VERSION: "2.1.84"
+      CLAUDE_CODE_NPM_SPEC: "@anthropic-ai/claude-code@2.1.84"
     artifact_paths:
       - "Tests/AgentTests/results/**/*"
     notify:
diff --git a/.claude/settings.json b/.claude/settings.json
index 049ec26d33c2..8139d5e1f503 100644
--- a/.claude/settings.json
+++ b/.claude/settings.json
@@ -1,17 +1,15 @@
 {
   "permissions": {
-    "allow": [
-      "Bash(cat:*)",
-      "Bash(ls:*)",
-      "Bash(rg:*)",
-      "Bash(find:*)",
-      "Bash(grep:*)",
-      "Bash(head:*)",
-      "Bash(tail:*)",
-      "Bash(wc:*)",
-      "Bash(tree:*)",
-      "Bash(git:log,status,diff,branch)"
-    ],
-    "deny": []
+    "deny": [
+      "Read(./.env)",
+      "Read(./.env.*)",
+      "Read(./.git/**)",
+      "Read(./DerivedData/**)",
+      "Read(./build/**)",
+      "Read(./build-products-*.tar)",
+      "Read(./**/*.mobileprovision)",
+      "Read(./**/*.p12)",
+      "Read(./**/*secret*)"
+    ]
   }
 }
diff --git a/.claude/skills/ci-test-runner/SKILL.md b/.claude/skills/ci-test-runner/SKILL.md
index 5e3e8e306079..d761002a27eb 100644
--- a/.claude/skills/ci-test-runner/SKILL.md
+++ b/.claude/skills/ci-test-runner/SKILL.md
@@ -1,34 +1,36 @@
 ---
 name: ci-test-runner
 description: >-
-  CI-hardened E2E test runner for WordPress/Jetpack iOS. Use when the prompt
-  mentions "ci-test-runner" or asks to run AI E2E tests in CI mode. Drives
-  the iOS Simulator through wrapper scripts with a locked-down tool set.
+  CI-hardened single-test runner for WordPress/Jetpack iOS. Use when the prompt
+  contains one test case and the available tools are the constrained Scripts/ci
+  wrappers from Buildkite.
 ---
 
 # CI Test Runner
 
-Run plain-language E2E test cases against the WordPress or Jetpack iOS app
-on an iOS Simulator. All external interactions use wrapper scripts — no raw
-curl, no arbitrary shell commands.
+Run exactly one markdown UI test case against the WordPress or Jetpack iOS app
+in a booted simulator. The shell runner owns test discovery, result assembly,
+and contract enforcement. Your job is to drive the app and record one final
+result for the current test.
 
 ## Environment
 
-All values are pre-set as environment variables by the CI script. You do NOT
-need to ask for credentials or configure anything.
+All values are pre-set by the shell runner:
 
 | Env var | Description |
 |---------|-------------|
 | `SIMULATOR_UDID` | Booted simulator UDID |
-| `WDA_SESSION_ID` | Active WebDriverAgent session ID |
-| `WDA_PORT` | WDA port (default 8100) |
+| `WDA_SESSION_ID` | Active WebDriverAgent session ID for this test |
+| `WDA_PORT` | WDA port |
 | `APP_BUNDLE_ID` | `org.wordpress` or `com.automattic.jetpack` |
 | `SITE_URL` | WordPress test site URL |
 | `WP_USERNAME` | WordPress username |
-| `WP_APP_PASSWORD` | WordPress application password |
+| `AI_TEST_TITLE` | Current test title |
 
-These are also read by the wrapper scripts, so you do not need to pass
-credentials as command arguments.
+The current test case markdown is included in the prompt. Use that content
+directly instead of trying to locate the file on disk.
+
+Do not ask for credentials or try to read files directly.
 
 ## Available Commands
 
@@ -36,60 +38,47 @@ You have exactly these commands available:
 
 | Command | Purpose |
 |---------|---------|
-| `./Scripts/ci/wda-curl.sh METHOD PATH [BODY]` | HTTP to WDA (localhost only) |
-| `./Scripts/ci/wp-api.sh METHOD PATH [BODY]` | WordPress REST API (auth handled) |
-| `./Scripts/ci/launch-app.sh` | (Re)launch app with test credentials |
-| `xcrun simctl terminate $SIMULATOR_UDID $APP_BUNDLE_ID` | Kill app |
-| `xcrun simctl io $SIMULATOR_UDID screenshot PATH` | Take screenshot |
-| `sleep N` | Wait N seconds |
-| `mkdir -p Tests/AgentTests/results/...` | Create results directories |
+| `./Scripts/ci/launch-app.sh` | Relaunch app with test credentials and UI-test flags |
+| `./Scripts/ci/wda-curl.sh METHOD PATH [BODY]` | Allowed WDA calls only |
+| `./Scripts/ci/wp-api.sh PURPOSE METHOD PATH [BODY]` | REST API calls with purpose `setup`, `verification`, or `cleanup` |
+| `./Scripts/ci/take-ai-test-screenshot.sh LABEL` | Capture a failure screenshot and print its relative path |
+| `./Scripts/ci/record-ai-test-result.sh STATUS REASON [SCREENSHOT]` | Record the final `pass` or `fail` result |
+| `sleep N` | Wait for UI stability |
 
 ## WDA Interactions
 
-WDA is already running. A session ID is in the `WDA_SESSION_ID` env var and
-also provided in the prompt.
+WDA is already running. A session ID is in `WDA_SESSION_ID`.
 
 ### Get Accessibility Tree
 
 ```bash
-# Compact text format (~25 KB) — use this by default
+# Compact text format — use this by default
 ./Scripts/ci/wda-curl.sh GET '/source?format=description'
 
-# Structured JSON (~375 KB) — use when you need precise rect coordinates
+# Structured JSON — only when you truly need precise rect coordinates
 ./Scripts/ci/wda-curl.sh GET '/source?format=json'
 ```
 
-**Note:** `wda-curl.sh` returns raw JSON. The tree content is inside the
-`value` field. For the description format, parse the `value` string from the
-JSON response to get the indented tree text.
-
-The description format returns lines like:
-```
-NavigationBar, 0x105351660, {{0.0, 62.0}, {402.0, 54.0}}, identifier: 'my-site-navigation-bar'
-  Button, 0x105351a20, {{16.0, 62.0}, {44.0, 44.0}}, identifier: 'BackButton', label: 'Site Name'
-  StaticText, 0x105351b40, {{178.7, 73.7}, {44.7, 20.7}}, label: 'Posts'
-```
+The tree content is inside the JSON `value` field.
 
 ### Computing Tap Coordinates
 
-Parse the frame `{{x, y}, {width, height}}` from the description tree:
+Parse a frame like `{{x, y}, {width, height}}` from the tree:
 
-```
+```text
 tap_x = x + width / 2
 tap_y = y + height / 2
 ```
 
 ### Session Management
 
-If WDA actions return HTTP 4xx errors, the session may have expired. Create
-a new one:
+If WDA starts returning 4xx session errors, create a fresh session:
 
 ```bash
 ./Scripts/ci/wda-curl.sh POST /session '{"capabilities":{"alwaysMatch":{}}}'
 ```
 
-Extract `value.sessionId` from the JSON response and use it in subsequent
-action paths.
+Extract `value.sessionId` from the JSON response and use it in later paths.
 
 ### Tap
 
@@ -111,17 +100,19 @@ action paths.
 ### Tap Element by Accessibility ID
 
 ```bash
-# Find element
 ./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/elements" '{
   "using": "accessibility id",
   "value": "IDENTIFIER"
 }'
+```
 
-# Click it (ELEMENT_ID from response value[0].ELEMENT)
+Then click the returned element with:
+
+```bash
 ./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/element/${ELEMENT_ID}/click"
 ```
 
-### Long Press
+### Swipe
 
 ```bash
 ./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/actions" '{
@@ -130,16 +121,18 @@ action paths.
     "id": "finger1",
     "parameters": {"pointerType": "touch"},
     "actions": [
-      {"type": "pointerMove", "duration": 0, "x": X, "y": Y},
+      {"type": "pointerMove", "duration": 0, "x": X1, "y": Y1},
       {"type": "pointerDown"},
-      {"type": "pause", "duration": 1000},
+      {"type": "pointerMove", "duration": 500, "x": X2, "y": Y2},
       {"type": "pointerUp"}
     ]
   }]
 }'
 ```
 
-### Swipe
+For vertical scrolling, use `x = screen_width - 30` to avoid hitting tappable UI.
+
+### Long Press
 
 ```bash
 ./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/actions" '{
@@ -148,22 +141,15 @@ action paths.
     "id": "finger1",
     "parameters": {"pointerType": "touch"},
     "actions": [
-      {"type": "pointerMove", "duration": 0, "x": X1, "y": Y1},
+      {"type": "pointerMove", "duration": 0, "x": X, "y": Y},
       {"type": "pointerDown"},
-      {"type": "pointerMove", "duration": 500, "x": X2, "y": Y2},
+      {"type": "pause", "duration": 1000},
       {"type": "pointerUp"}
     ]
   }]
 }'
 ```
 
-**Swipe direction guide** (given screen size `W x H`):
-- **Up** (scroll down): from `(W/2, H/2 + H/6)` to `(W/2, H/2 - H/6)`
-- **Down** (scroll up): from `(W/2, H/2 - H/6)` to `(W/2, H/2 + H/6)`
-- **Left**: from `(W/2 + W/4, H/2)` to `(W/2 - W/4, H/2)`
-- **Right**: from `(W/2 - W/4, H/2)` to `(W/2 + W/4, H/2)`
-- **Back** (from left edge): from `(5, H/2)` to `(W*2/3, H/2)`
-
 ### Type Text
 
 ```bash
@@ -172,14 +158,12 @@ action paths.
 }'
 ```
 
-An element must be focused first (tap a text field before typing).
+Tap a text field first so it has focus.
 
 ### Clear Text Field
 
 ```bash
-# Select all (Ctrl+A)
 ./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/wda/keys" '{"value": ["\u0001"]}'
-# Delete
 ./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/wda/keys" '{"value": ["\u007F"]}'
 ```
 
@@ -189,227 +173,89 @@ An element must be focused first (tap a text field before typing).
 ./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/wda/pressButton" '{"name": "home"}'
 ```
 
-## WordPress REST API
-
-Use `wp-api.sh` for all REST API calls. Authentication is handled by the
-script — do not pass credentials.
-
-```bash
-# Search for a post
-./Scripts/ci/wp-api.sh GET 'wp/v2/posts?search=My+Post&status=publish'
-
-# Create a category
-./Scripts/ci/wp-api.sh POST wp/v2/categories '{"name":"Test Category"}'
-
-# Delete a post (force = skip trash)
-./Scripts/ci/wp-api.sh DELETE 'wp/v2/posts/123?force=true'
-
-# Create a tag
-./Scripts/ci/wp-api.sh POST wp/v2/tags '{"name":"Test Tag"}'
-```
-
 ## Navigation Strategy
 
-**Always prefer the accessibility tree over screenshots.**
+Always prefer the accessibility tree over screenshots.
 
 ### Finding Elements
 
 Use this priority order:
-1. **`identifier` / `name`** — most stable, developer-assigned
-2. **`label`** — accessibility label, user-visible text
-3. **`type` + context** — e.g., "Button inside NavigationBar"
-4. **Partial matching** — label contains target text
-5. **Positional heuristics** — last resort
-
-### Screen Size
-
-The root node's frame in the tree gives screen dimensions (e.g., `{{0, 0}, {393, 852}}`).
+1. Accessibility identifier
+2. Visible label text
+3. Type plus surrounding context
+4. Partial label matching
+5. Positional heuristics as a last resort
 
 ### Waiting for UI Stability
 
-After every action (tap, swipe, type), wait 0.5–1 second then re-fetch the
-tree. Do not use fixed long sleeps. Instead, poll:
+After every action:
+1. Wait briefly, usually `sleep 1`
+2. Re-fetch the tree
+3. Confirm the expected screen or element is now present
 
-1. Fetch the tree
-2. Check if expected element or screen is present
-3. If not, `sleep 1` and retry
-4. After 10 retries (10 seconds), declare element not found
+Do not rely on long blind sleeps when polling is enough.
 
 ### Scroll View Navigation
 
-1. Fetch tree, search for target element
-2. If found, tap it
-3. If not, swipe up from `(screen_width - 30, screen_height / 2)` to scroll
-4. Re-fetch tree and search again
-5. If tree is identical after scroll, you've hit the bottom — stop
+1. Fetch the tree and search for the target element.
+2. If found, tap it.
+3. If not, swipe up from the right edge.
+4. Re-fetch the tree and search again.
+5. If the tree is unchanged after a scroll, assume you reached the end.
+
+### Screen Size
+
+Use the root node frame from the tree to derive screen dimensions.
 
 ### Back Navigation
 
-- **Primary**: find a Button inside NavigationBar, tap it
-- **Fallback**: edge swipe from `(5, H/2)` to `(W*2/3, H/2)`
+- Primary: tap a navigation bar back button
+- Fallback: swipe from the left edge toward the center
 
 ### Tab Bar Navigation
 
-Look for elements with type containing `TabBar` in the tree. Its children
-are the individual tabs. Tap the tab you need to switch to.
+Look for `TabBar` elements in the tree and tap the needed tab.
 
 ### System Alert Handling
 
-If actions fail, check the tree for `Alert` or `Sheet` elements. Dismiss
-with "Allow", "Don't Allow", "OK", or "Cancel" before retrying.
+If actions fail unexpectedly, check for `Alert` or `Sheet` elements and dismiss
+them before retrying.
 
 ### App Crash Recovery
 
-If the tree looks unexpected or actions consistently fail:
+If the tree looks wrong or actions consistently fail:
 1. Relaunch with `./Scripts/ci/launch-app.sh`
 2. Wait 3 seconds
-3. Create a new WDA session if needed
-4. Continue the test
-
-## Test Execution Flow
-
-### Step 1: Discover Tests
-
-Use `Glob` to find all `*.md` files in the test directory provided in the
-prompt. Sort alphabetically. Print:
-
-```
-Discovered N test(s):
-- create-blank-page.md
-- text-post-publish.md
-```
-
-If none found, write a results.md noting this and stop.
-
-### Step 2: Initialize
-
-The results directory is provided in the prompt. Create subdirectories:
-
-```bash
-mkdir -p ${RESULTS_DIR}/screenshots
-```
-
-### Step 3: Run Each Test Sequentially
-
-For each test file:
-
-#### 3a. Relaunch app
-
-```bash
-./Scripts/ci/launch-app.sh
-sleep 3
-```
-
-#### 3b. Check login state
-
-Fetch the tree. If the app shows a login screen:
-1. Tap "Enter your existing site address"
-2. Type the site URL (from the prompt)
-3. Tap Continue
-4. Wait 3 seconds for auto-login
-
-If the app shows the logged-in state (My Site), skip login.
-
-#### 3c. Read test file
-
-Use `Read` to get the test case markdown. Parse the sections:
-- **Prerequisites** — setup steps (REST API or UI)
-- **Steps** — actions to perform
-- **Verification (REST API)** — REST API assertions (if present)
-- **Cleanup (REST API)** — REST API cleanup (if present)
-- **Expected Outcome** — what success looks like
-
-#### 3d. Fulfill prerequisites
-
-For REST API prerequisites (create categories, tags, posts), use
-`./Scripts/ci/wp-api.sh`. For UI prerequisites like "logged in", the
-relaunch in 3a handles it.
-
-If a prerequisite cannot be fulfilled, mark the test as FAIL with reason
-"Prerequisite not met: <details>" and skip directly to step 3h (record
-result).
-
-#### 3e. Execute steps
-
-Follow the numbered steps using WDA commands. After each action, wait
-briefly and re-fetch the tree to verify the UI changed as expected.
-
-#### 3f. Verify (if section present)
-
-If the test has a `## Verification (REST API)` section, use `wp-api.sh`
-to verify. The verification MUST succeed for the test to pass.
-
-#### 3g. Cleanup (if section present)
-
-If the test has a `## Cleanup (REST API)` section, use `wp-api.sh` to
-clean up. Always run cleanup regardless of pass/fail.
-
-#### 3h. Record result
-
-Write a per-test result file at `${RESULTS_DIR}/<test-name>.md`:
-
-On pass:
-```
-### PASS: <Test Title>
-Passed.
-```
-
-On fail — take a screenshot first:
-```bash
-xcrun simctl io $SIMULATOR_UDID screenshot Tests/AgentTests/results/${TIMESTAMP}/screenshots/<test-name>-failure.png
-```
-Then write:
-```
-### FAIL: <Test Title>
-**Reason:** <what went wrong>
-**Screenshot:** screenshots/<test-name>-failure.png
-```
-
-#### 3i. Print status
-
-```
-[2/5] PASS: create-blank-page
-```
-or:
-```
-[2/5] FAIL: create-blank-page — Element "Publish" not found
-```
-
-### Step 4: Assemble Final Results
-
-Read all per-test result files. Write `${RESULTS_DIR}/results.md`:
-
-```markdown
-# Test Results
-
-- **Date:** YYYY-MM-DD HH:mm
-- **App:** <app name>
-- **Site:** <site_url>
-- **Total:** N | **Passed:** P | **Failed:** F
-
-## Results
-
-<per-test results concatenated>
-```
-
-### Step 5: Print Summary
-
-```
-Test run complete.
-Total: N | Passed: P | Failed: F
-Results: Tests/AgentTests/results/<timestamp>/results.md
-```
+3. Re-fetch the tree
+4. Create a new WDA session if the old one expired
+
+## Single-Test Flow
+
+1. Start with `./Scripts/ci/launch-app.sh`, then `sleep 3`, then inspect the tree.
+2. Read the current test case carefully. It may include `Prerequisites`, `Steps`, `Verification`, `Cleanup`, `Expected Outcome`, or similar sections.
+3. Fulfill prerequisites using UI actions or `wp-api.sh setup ...`.
+4. If a prerequisite cannot be fulfilled, fail the test with reason `Prerequisite not met: <details>`.
+5. Execute the numbered test steps, verifying UI changes after each action.
+6. Use the expected outcome to confirm you reached the intended end state.
+7. Run any verification work with `wp-api.sh verification ...`.
+8. Run any cleanup work with `wp-api.sh cleanup ...`, even after failures.
+9. If the test fails, take a screenshot first and pass the returned relative path into `record-ai-test-result.sh`.
+10. Call `record-ai-test-result.sh` exactly once before stopping. Always pass a reason; `Passed.` is enough for a normal pass.
+11. Keep the recorded reason short and single-line.
+
+## Login Constraints
+
+- This CI flow is for a self-hosted site login path.
+- The app may already be logged in. If the tree already shows the logged-in state, skip login.
+- Prefer the self-hosted site address flow.
+- If a login screen is shown, tap `Enter your existing site address`, type the site URL, tap continue, then wait 2-3 seconds and re-fetch the tree for the logged-in state.
+- Do not switch into a WordPress.com email/password flow unless the test case explicitly requires it.
+- Do not invent credentials or ask for them.
 
 ## Important Rules
 
-- **The app MUST already be built and installed** on the simulator. The CI
-  pipeline handles building. This skill only drives tests.
-- **NEVER stop on failure.** Always continue to the next test.
-- **Always run cleanup** regardless of pass/fail.
-- **Prefer the accessibility tree** over screenshots for navigation.
-- **After every action**, wait 0.5–1s then re-fetch the tree.
-- **For scrolling**, swipe from the right edge (`screen_width - 30`) to
-  avoid tapping interactive elements.
-- **Use `duration: 1000`** (1 second) for swipes on tappable items.
-- **Coordinates are in points**, not pixels — use tree coordinates, not
-  screenshot dimensions.
+- The app is expected to already be built and installed on the simulator.
+- Never try to read or write arbitrary files.
+- Never call `record-ai-test-result.sh` more than once.
+- Never skip declared verification or cleanup work.
+- Never use screenshots as the primary navigation source when the tree is enough.
diff --git a/Scripts/ci/assemble-ai-test-results.rb b/Scripts/ci/assemble-ai-test-results.rb
new file mode 100755
index 000000000000..d165db32a1a8
--- /dev/null
+++ b/Scripts/ci/assemble-ai-test-results.rb
@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# Assemble per-test JSON result files into RESULTS_DIR/results.md.
+#
+# Usage: assemble-ai-test-results.rb RESULTS_DIR APP SITE_URL [RESULT_FILES...]
+#
+# Each result file is a JSON object with `status`, `title`, `reason`,
+# `test_file` and an optional `screenshot`.
+
+require 'json'
+
+results_dir, app, site_url, *result_files = ARGV
+
+abort 'Usage: assemble-ai-test-results.rb RESULTS_DIR APP SITE_URL [RESULT_FILES...]' if site_url.nil?
+
+results = result_files.map do |path|
+  JSON.parse(File.read(path))
+end
+
+# Anything that is not an explicit pass is rendered as FAIL below, so count it
+# as failed here too; this keeps Total == Passed + Failed.
+passed = results.count { |result| result['status'] == 'pass' }
+failed = results.length - passed
+
+lines = [
+  '# Test Results',
+  '',
+  "- **Date:** #{Time.now.strftime('%Y-%m-%d %H:%M')}",
+  "- **App:** #{app}",
+  "- **Site:** #{site_url}",
+  "- **Total:** #{results.length} | **Passed:** #{passed} | **Failed:** #{failed}",
+  '',
+  '## Results',
+  ''
+]
+
+results.each do |result|
+  status_label = result.fetch('status') == 'pass' ? 'PASS' : 'FAIL'
+  lines << "### #{status_label}: #{result.fetch('title')}"
+  lines << "**Reason:** #{result.fetch('reason')}"
+  lines << "**Test File:** #{result.fetch('test_file')}"
+  # Skip the screenshot line when the field is absent or blank.
+  unless result['screenshot'].to_s.empty?
+    lines << "**Screenshot:** #{result.fetch('screenshot')}"
+  end
+  lines << ''
+end
+
+File.write(File.join(results_dir, 'results.md'), "#{lines.join("\n")}\n")
diff --git a/Scripts/ci/create-wda-session.rb b/Scripts/ci/create-wda-session.rb
new file mode 100644
index 000000000000..3da77647f067
--- /dev/null
+++ b/Scripts/ci/create-wda-session.rb
@@ -0,0 +1,34 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# Create a WebDriverAgent session and print its session ID to stdout.
+#
+# Usage: create-wda-session.rb [PORT]   (PORT defaults to 8100)
+#
+# Exits non-zero without printing anything when WDA does not answer with a
+# 2xx response or the response carries no session ID.
+
+require 'json'
+require 'net/http'
+require 'uri'
+
+port = Integer(ARGV[0] || 8100)
+uri = URI("http://localhost:#{port}/session")
+request = Net::HTTP::Post.new(uri)
+request['Content-Type'] = 'application/json'
+request.body = JSON.generate(capabilities: { alwaysMatch: {} })
+
+response = Net::HTTP.start(uri.hostname, uri.port, read_timeout: 30, open_timeout: 10) do |http|
+  http.request(request)
+end
+
+# Only a 2xx response means the session was created; 4xx/5xx are errors.
+exit 1 unless response.code.to_i.between?(200, 299)
+
+parsed = JSON.parse(response.body)
+# `value` may be a non-Hash on unexpected payloads, so guard before reading it.
+value = parsed['value']
+session_id = (value['sessionId'] if value.is_a?(Hash)) || parsed['sessionId']
+exit 1 if session_id.to_s.empty?
+
+print(session_id.to_s)
diff --git a/Scripts/ci/find-booted-simulator.rb b/Scripts/ci/find-booted-simulator.rb
new file mode 100644
index 000000000000..67f33992366a
--- /dev/null
+++ b/Scripts/ci/find-booted-simulator.rb
@@ -0,0 +1,38 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require 'json'
+require 'open3'
+
+requested_name = ARGV[0].to_s
+wait_seconds = ARGV[1].to_f
+poll_interval = ARGV[2].to_f
+poll_interval = 1.0 if poll_interval <= 0
+deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + [wait_seconds, 0].max
+
+loop do
+  output, status = Open3.capture2('xcrun', 'simctl', 'list', 'devices', 'booted', '-j')
+  exit 1 unless status.success?
+
+  data = JSON.parse(output)
+  devices = data.fetch('devices', {}).each_value.flat_map do |list|
+    list.select { |device| device['state'] == 'Booted' }
+  end
+
+  device = if requested_name.empty?
+             devices.first
+           else
+             devices.find { |entry| entry['name'] == requested_name }
+           end
+
+  if device
+    print(device['udid'])
+    exit 0
+  end
+
+  break if wait_seconds <= 0 || Process.clock_gettime(Process::CLOCK_MONOTONIC) >= deadline
+
+  sleep poll_interval
+end
+
+exit 1
diff --git a/Scripts/ci/inspect-ai-test-case.rb b/Scripts/ci/inspect-ai-test-case.rb
new file mode 100644
index 000000000000..a1ba0e805df0
--- /dev/null
+++ b/Scripts/ci/inspect-ai-test-case.rb
@@ -0,0 +1,39 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+file_path = ARGV[0]
+command = ARGV[1]
+
+abort 'Usage: inspect-ai-test-case.rb FILE_PATH COMMAND [ARGS...]' if file_path.nil? || command.nil?
+
+content = File.read(file_path)
+sections = {}
+current_name = nil
+buffer = []
+
+content.each_line do |line|
+  heading = line.match(/^##\s+(.+)$/)
+  if heading
+    sections[current_name] = buffer.join.strip if current_name
+    current_name = heading[1].strip
+    buffer = []
+  elsif current_name
+    buffer << line
+  end
+end
+sections[current_name] = buffer.join.strip if current_name
+
+case command
+when 'title'
+  title = content[/^#\s+(.+)$/, 1] || File.basename(file_path, '.md')
+  print title
+when 'slug'
+  slug = File.basename(file_path, '.md').downcase.gsub(/[^a-z0-9]+/, '-').gsub(/\A-+|-+\z/, '')
+  print slug
+when 'section-present'
+  pattern = Regexp.new(ARGV.fetch(2), Regexp::IGNORECASE)
+  matched = sections.any? { |name, body| name.match?(pattern) && !body.to_s.strip.empty? }
+  print(matched ? '1' : '0')
+else
+  abort "Unknown command: #{command}"
+end
diff --git a/Scripts/ci/launch-app.sh b/Scripts/ci/launch-app.sh
index a1b8a8192018..e7f677249523 100755
--- a/Scripts/ci/launch-app.sh
+++ b/Scripts/ci/launch-app.sh
@@ -20,6 +20,11 @@ set -euo pipefail
 
 exec xcrun simctl launch --terminate-running-process \
   "$SIMULATOR_UDID" "$APP_BUNDLE_ID" \
+  -ui-testing YES \
+  -ui-test-reset-everything YES \
+  -ui-test-disable-prompts YES \
+  -ui-test-disable-animations YES \
+  -ui-test-disable-migration YES \
   -ui-test-site-url "$SITE_URL" \
   -ui-test-site-user "$WP_USERNAME" \
   -ui-test-site-pass "$WP_APP_PASSWORD"
diff --git a/Scripts/ci/read-ai-test-result.rb b/Scripts/ci/read-ai-test-result.rb
new file mode 100644
index 000000000000..984ee7d5976c
--- /dev/null
+++ b/Scripts/ci/read-ai-test-result.rb
@@ -0,0 +1,13 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require 'json'
+
+result_file = ARGV[0]
+key = ARGV[1]
+
+abort 'Usage: read-ai-test-result.rb RESULT_FILE KEY' if result_file.nil? || key.nil?
+
+data = JSON.parse(File.read(result_file))
+value = data[key]
+print(value.nil? ? '' : value.to_s)
diff --git a/Scripts/ci/record-ai-test-result.sh b/Scripts/ci/record-ai-test-result.sh
new file mode 100755
index 000000000000..19e0624d2815
--- /dev/null
+++ b/Scripts/ci/record-ai-test-result.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# Record the final pass/fail status for the current AI-driven test case.
+#
+# Usage: record-ai-test-result.sh <pass|fail> <reason> [screenshot-relative-path]
+#
+# Appends a "<epoch>\t<status>" line to AI_TEST_RESULT_EVENTS_FILE and then
+# hands the result to Scripts/ci/write-ai-test-result.rb.
+set -euo pipefail
+
+STATUS="${1:?Usage: record-ai-test-result.sh <pass|fail> <reason> [screenshot-relative-path]}"
+REASON="${2:?Usage: record-ai-test-result.sh <pass|fail> <reason> [screenshot-relative-path]}"
+SCREENSHOT_REL="${3:-}"
+
+: "${AI_TEST_RESULT_FILE:?AI_TEST_RESULT_FILE is required}"
+: "${AI_TEST_RESULT_EVENTS_FILE:?AI_TEST_RESULT_EVENTS_FILE is required}"
+: "${AI_TEST_TITLE:?AI_TEST_TITLE is required}"
+: "${AI_TEST_FILE:?AI_TEST_FILE is required}"
+
+# Reject anything other than the documented statuses before touching the
+# events log, so a malformed call is not recorded as a result event.
+case "$STATUS" in
+  pass|fail) ;;
+  *) echo "Error: status must be pass or fail, got '$STATUS'" >&2; exit 1 ;;
+esac
+
+mkdir -p "$(dirname "$AI_TEST_RESULT_EVENTS_FILE")"
+printf '%s\t%s\n' "$(date +%s)" "$STATUS" >> "$AI_TEST_RESULT_EVENTS_FILE"
+
+ruby Scripts/ci/write-ai-test-result.rb \
+  "$AI_TEST_RESULT_FILE" \
+  "$AI_TEST_TITLE" \
+  "$AI_TEST_FILE" \
+  "$STATUS" \
+  "$REASON" \
+  "$SCREENSHOT_REL"
+
+echo "Recorded ${STATUS} result for ${AI_TEST_TITLE}"
diff --git a/Scripts/ci/take-ai-test-screenshot.sh b/Scripts/ci/take-ai-test-screenshot.sh
new file mode 100755
index 000000000000..5707db729e1e
--- /dev/null
+++ b/Scripts/ci/take-ai-test-screenshot.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+# Capture a screenshot for the current AI-driven test case and print the
+# relative path that should be stored in the result metadata.
+#
+# Usage: take-ai-test-screenshot.sh <label>
+#
+# Environment (required):
+#   SIMULATOR_UDID           Simulator to capture from
+#   AI_TEST_RESULTS_DIR      Directory the printed path is made relative to
+#   AI_TEST_SCREENSHOTS_DIR  Directory the PNG is written into
+#   AI_TEST_SLUG             File-name prefix for the current test
+set -euo pipefail
+
+LABEL="${1:?Usage: take-ai-test-screenshot.sh <label>}"
+
+: "${SIMULATOR_UDID:?SIMULATOR_UDID is required}"
+: "${AI_TEST_RESULTS_DIR:?AI_TEST_RESULTS_DIR is required}"
+: "${AI_TEST_SCREENSHOTS_DIR:?AI_TEST_SCREENSHOTS_DIR is required}"
+: "${AI_TEST_SLUG:?AI_TEST_SLUG is required}"
+
+# printf (not echo) so no trailing newline reaches tr and becomes a stray "_".
+safe_label="$(printf '%s' "$LABEL" | tr -cs '[:alnum:]_-' '_')"
+mkdir -p "$AI_TEST_SCREENSHOTS_DIR"
+
+# BSD/macOS mktemp only substitutes X's at the very end of the template, so
+# reserve a unique extensionless name first and then add the .png suffix.
+placeholder="$(mktemp "${AI_TEST_SCREENSHOTS_DIR}/${AI_TEST_SLUG}-${safe_label}-XXXXXX")"
+absolute_path="${placeholder}.png"
+mv -- "$placeholder" "$absolute_path"
+xcrun simctl io "$SIMULATOR_UDID" screenshot "$absolute_path" >/dev/null
+
+# Quote the prefix so it is stripped literally rather than treated as a glob.
+relative_path="${absolute_path#"${AI_TEST_RESULTS_DIR}"/}"
+echo "$relative_path"
diff --git a/Scripts/ci/wda-curl.sh b/Scripts/ci/wda-curl.sh
index d7043b953556..b9fcf778aa03 100755
--- a/Scripts/ci/wda-curl.sh
+++ b/Scripts/ci/wda-curl.sh
@@ -29,12 +29,20 @@ if [[ "$URL_PATH" != /* ]]; then
   URL_PATH="/${URL_PATH}"
 fi
 
+case "$URL_PATH" in
+  /status|/session|/source\?format=description|/source\?format=json|/session/*/actions|/session/*/elements|/session/*/element/*/click|/session/*/wda/keys|/session/*/wda/pressButton) ;;
+  *)
+    echo "Error: WDA path '$URL_PATH' is not allowed" >&2
+    exit 1
+    ;;
+esac
+
 if [[ -n "$BODY" ]]; then
-  exec curl -s -X "$METHOD" \
+  exec curl -sS --max-time 30 -X "$METHOD" \
     -H 'Content-Type: application/json' \
     -d "$BODY" \
     "http://localhost:${PORT}${URL_PATH}"
 else
-  exec curl -s -X "$METHOD" \
+  exec curl -sS --max-time 30 -X "$METHOD" \
     "http://localhost:${PORT}${URL_PATH}"
 fi
diff --git a/Scripts/ci/wp-api.sh b/Scripts/ci/wp-api.sh
index e413fb7cef62..0fd10af67b68 100755
--- a/Scripts/ci/wp-api.sh
+++ b/Scripts/ci/wp-api.sh
@@ -1,27 +1,36 @@
 #!/usr/bin/env bash
 # Constrained WordPress REST API client.
-# Handles authentication internally — Claude Code never sees credentials.
+# Handles authentication internally and records whether setup / verification /
+# cleanup calls succeeded for the current test.
 #
-# Usage: wp-api.sh <METHOD> <API_PATH> [JSON_BODY]
+# Usage: wp-api.sh <PURPOSE> <METHOD> <API_PATH> [JSON_BODY]
 #
 # Examples:
-#   wp-api.sh GET  'wp/v2/posts?search=My+Post'
-#   wp-api.sh POST  wp/v2/posts '{"title":"Test","status":"publish"}'
-#   wp-api.sh DELETE 'wp/v2/posts/123?force=true'
+#   wp-api.sh setup GET 'wp/v2/posts?search=My+Post'
+#   wp-api.sh verification POST wp/v2/posts '{"title":"Test","status":"publish"}'
+#   wp-api.sh cleanup DELETE 'wp/v2/posts/123?force=true'
 #
 # Environment (required):
 #   SITE_URL        WordPress site URL (e.g., https://example.com)
 #   WP_USERNAME     WordPress username
 #   WP_APP_PASSWORD WordPress application password
+#   AI_TEST_USAGE_FILE  Path to the per-test usage log
 set -euo pipefail
 
-METHOD="${1:?Usage: wp-api.sh METHOD API_PATH [BODY]}"
-API_PATH="${2:?Usage: wp-api.sh METHOD API_PATH [BODY]}"
-BODY="${3:-}"
+PURPOSE="${1:?Usage: wp-api.sh PURPOSE METHOD API_PATH [BODY]}"
+METHOD="${2:?Usage: wp-api.sh PURPOSE METHOD API_PATH [BODY]}"
+API_PATH="${3:?Usage: wp-api.sh PURPOSE METHOD API_PATH [BODY]}"
+BODY="${4:-}"
 
 : "${SITE_URL:?SITE_URL is required}"
 : "${WP_USERNAME:?WP_USERNAME is required}"
 : "${WP_APP_PASSWORD:?WP_APP_PASSWORD is required}"
+: "${AI_TEST_USAGE_FILE:?AI_TEST_USAGE_FILE is required}"
+
+case "$PURPOSE" in
+  setup|verification|cleanup) ;;
+  *) echo "Error: purpose must be setup, verification, or cleanup, got '$PURPOSE'" >&2; exit 1 ;;
+esac
 
 case "$METHOD" in
   GET|POST|PUT|DELETE) ;;
@@ -34,17 +43,39 @@ if [[ "$API_PATH" == *..* ]]; then
   exit 1
 fi
 
+if [[ "$API_PATH" == http://* || "$API_PATH" == https://* ]]; then
+  echo "Error: absolute URLs are not allowed" >&2
+  exit 1
+fi
+
 # Strip leading slash if present
 API_PATH="${API_PATH#/}"
+API_PATH="${API_PATH#wp-json/}"
+
+tmp_body="$(mktemp)"
+trap 'rm -f "$tmp_body"' EXIT
+
+log_usage() {
+  printf '%s\t%s\t%s\n' "$PURPOSE" "$1" "$2" >> "$AI_TEST_USAGE_FILE"
+}
 
 if [[ -n "$BODY" ]]; then
-  exec curl -s -X "$METHOD" \
+  status_code="$(curl -sS --max-time 30 -o "$tmp_body" -w '%{http_code}' -X "$METHOD" \
     -u "${WP_USERNAME}:${WP_APP_PASSWORD}" \
     -H 'Content-Type: application/json' \
     -d "$BODY" \
-    "${SITE_URL}/wp-json/${API_PATH}"
+    "${SITE_URL}/wp-json/${API_PATH}")"
 else
-  exec curl -s -X "$METHOD" \
+  status_code="$(curl -sS --max-time 30 -o "$tmp_body" -w '%{http_code}' -X "$METHOD" \
     -u "${WP_USERNAME}:${WP_APP_PASSWORD}" \
-    "${SITE_URL}/wp-json/${API_PATH}"
+    "${SITE_URL}/wp-json/${API_PATH}")"
 fi
+
+if [[ "$status_code" =~ ^2[0-9][0-9]$ ]]; then
+  log_usage "$status_code" 1
+else
+  log_usage "$status_code" 0
+fi
+
+printf 'HTTP %s\n' "$status_code"
+cat "$tmp_body"
diff --git a/Scripts/ci/write-ai-test-result.rb b/Scripts/ci/write-ai-test-result.rb
new file mode 100755
index 000000000000..0fdfe20eac88
--- /dev/null
+++ b/Scripts/ci/write-ai-test-result.rb
@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require 'json'
+require 'fileutils'
+require 'time'
+
+result_file, title, test_file, status, reason, screenshot = ARGV
+
+abort 'Usage: write-ai-test-result.rb RESULT_FILE TITLE TEST_FILE STATUS REASON [SCREENSHOT]' if reason.nil?
+abort "Status must be 'pass' or 'fail'" unless %w[pass fail].include?(status)
+
+FileUtils.mkdir_p(File.dirname(result_file))
+
+payload = {
+  'status' => status,
+  'title' => title,
+  'test_file' => test_file,
+  'reason' => reason,
+  'screenshot' => screenshot.to_s.empty? ? nil : screenshot,
+  'updated_at' => Time.now.utc.iso8601
+}
+
+File.write(result_file, "#{JSON.pretty_generate(payload)}\n")

From 7caf9e2d149d3c9493ff1c9f08b6c4fab11c363c Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Thu, 26 Mar 2026 14:21:05 +0100
Subject: [PATCH 08/23] Use APP_BUNDLE_ID consistently

---
 .buildkite/commands/run-ai-e2e-tests.sh    | 7 ++++---
 .claude/skills/ai-test-runner/SKILL.md     | 4 ++--
 .claude/skills/ios-sim-navigation/SKILL.md | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index 4ba11cefbc38..b8b0a340da79 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -73,10 +73,11 @@ CLAUDE_CODE_EXPECTED_VERSION="${CLAUDE_CODE_EXPECTED_VERSION:-2.1.84}"
 CLAUDE_CODE_NPM_SPEC="${CLAUDE_CODE_NPM_SPEC:-@anthropic-ai/claude-code@${CLAUDE_CODE_EXPECTED_VERSION}}"
 
 case "$APP" in
-  wordpress) BUNDLE_ID="org.wordpress" ;;
-  jetpack)   BUNDLE_ID="com.automattic.jetpack" ;;
+  wordpress) APP_BUNDLE_ID="org.wordpress" ;;
+  jetpack)   APP_BUNDLE_ID="com.automattic.jetpack" ;;
   *) echo "Error: APP must be 'wordpress' or 'jetpack', got '$APP'" >&2; exit 1 ;;
 esac
+export APP_BUNDLE_ID
 
 # ── Artifact download (Buildkite only) ─────────────────────────────────
 if [[ -n "${BUILDKITE:-}" ]]; then
@@ -282,7 +283,7 @@ for index in "${!TEST_FILES[@]}"; do
 Execute exactly one AI-driven iOS UI test case against the ${APP} app.
 
 Environment:
-- App bundle ID: ${BUNDLE_ID}
+- App bundle ID: ${APP_BUNDLE_ID}
 - Simulator UDID: ${SIMULATOR_UDID}
 - WDA Port: ${WDA_PORT}
 - WDA Session ID: ${WDA_SESSION_ID}
diff --git a/.claude/skills/ai-test-runner/SKILL.md b/.claude/skills/ai-test-runner/SKILL.md
index c8f611c1dd9d..0828d9640920 100644
--- a/.claude/skills/ai-test-runner/SKILL.md
+++ b/.claude/skills/ai-test-runner/SKILL.md
@@ -96,7 +96,7 @@ Use the ios-sim-navigation skill for WDA interaction reference.
 
 ## Context
 
-- App Bundle ID: <BUNDLE_ID>
+- App Bundle ID: <APP_BUNDLE_ID>
 - WDA Session ID: <SESSION_ID>
 - Simulator UDID: <UDID>
 - Test file: <TEST_FILE_PATH> (absolute path)
@@ -117,7 +117,7 @@ Use the ios-sim-navigation skill for WDA interaction reference.
 2. **Relaunch the app** for a clean state:
 
    ```bash
-   xcrun simctl launch --terminate-running-process <UDID> <BUNDLE_ID> \
+   xcrun simctl launch --terminate-running-process <UDID> <APP_BUNDLE_ID> \
      -ui-test-site-url <SITE_URL> \
      -ui-test-site-user <USERNAME> \
      -ui-test-site-pass <APPLICATION_PASSWORD>
diff --git a/.claude/skills/ios-sim-navigation/SKILL.md b/.claude/skills/ios-sim-navigation/SKILL.md
index 0ec4ab7efab1..96cb6311aeb4 100644
--- a/.claude/skills/ios-sim-navigation/SKILL.md
+++ b/.claude/skills/ios-sim-navigation/SKILL.md
@@ -366,7 +366,7 @@ If actions consistently fail or the tree looks unexpected, the app may have cras
 xcrun simctl list devices booted
 
 # Re-launch the app
-xcrun simctl launch <UDID> <BUNDLE_ID>
+xcrun simctl launch <UDID> <APP_BUNDLE_ID>
 ```
 
 After re-launching, create a new WDA session before continuing.

From 325193c192b4b6c6c331580abb36e85edb9ce838 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Thu, 26 Mar 2026 14:56:02 +0100
Subject: [PATCH 09/23] Normalize CI site URLs and extend WDA startup timeout

---
 .buildkite/commands/run-ai-e2e-tests.sh             | 13 ++++++++++++-
 .../skills/ios-sim-navigation/scripts/wda-start.rb  |  6 +++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index b8b0a340da79..9e444de24aaf 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -35,6 +35,15 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 cd "$REPO_ROOT"
 WDA_STARTED=0
 
+normalize_site_url() {
+  local site_url="$1"
+  if [[ "$site_url" == http://* || "$site_url" == https://* ]]; then
+    printf '%s' "$site_url"
+  else
+    printf 'https://%s' "$site_url"
+  fi
+}
+
 cleanup_wda() {
   if [[ "$WDA_STARTED" -eq 1 ]]; then
     echo "--- Cleanup"
@@ -61,6 +70,7 @@ fi
 : "${SITE_URL:?Set SITE_URL}"
 : "${WP_USERNAME:?Set WP_USERNAME}"
 : "${WP_APP_PASSWORD:?Set WP_APP_PASSWORD}"
+export SITE_URL="$(normalize_site_url "$SITE_URL")"
 
 # ── Defaults ───────────────────────────────────────────────────────────
 APP="${APP:-jetpack}"
@@ -71,6 +81,7 @@ TEST_DIR="${TEST_DIR:-Tests/AgentTests/ui-tests}"
 CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-20250514}"
 CLAUDE_CODE_EXPECTED_VERSION="${CLAUDE_CODE_EXPECTED_VERSION:-2.1.84}"
 CLAUDE_CODE_NPM_SPEC="${CLAUDE_CODE_NPM_SPEC:-@anthropic-ai/claude-code@${CLAUDE_CODE_EXPECTED_VERSION}}"
+WDA_START_TIMEOUT="${WDA_START_TIMEOUT:-120}"
 
 case "$APP" in
   wordpress) APP_BUNDLE_ID="org.wordpress" ;;
@@ -210,7 +221,7 @@ echo "--- Building WebDriverAgent"
 "$(dirname "$0")/build-wda.sh"
 
 echo "--- Starting WebDriverAgent"
-ruby "$WDA_START" --udid "$SIMULATOR_UDID" --port "$WDA_PORT"
+ruby "$WDA_START" --udid "$SIMULATOR_UDID" --port "$WDA_PORT" --timeout "$WDA_START_TIMEOUT"
 WDA_STARTED=1
 
 RESULTS_DIR="Tests/AgentTests/results/$(date +%Y-%m-%d-%H%M)"
diff --git a/.claude/skills/ios-sim-navigation/scripts/wda-start.rb b/.claude/skills/ios-sim-navigation/scripts/wda-start.rb
index 68c5dd8e3ee0..4d45c3b7ea56 100755
--- a/.claude/skills/ios-sim-navigation/scripts/wda-start.rb
+++ b/.claude/skills/ios-sim-navigation/scripts/wda-start.rb
@@ -9,6 +9,7 @@
 # Options:
 #   --udid <UDID>   Target a specific simulator (default: first booted)
 #   --port <PORT>   WDA port (default: 8100)
+#   --timeout <S>   Seconds to wait for WDA to become ready (default: 60)
 #
 # Exit codes:
 #   0  WDA started successfully
@@ -20,6 +21,7 @@
 require "json"
 
 DEFAULT_PORT = 8100
+DEFAULT_TIMEOUT = 60
 
 def get_booted_udid
   output = `xcrun simctl list devices booted -j 2>/dev/null`
@@ -55,11 +57,13 @@ def wda_running?(port)
 
 udid = nil
 port = DEFAULT_PORT
+timeout = DEFAULT_TIMEOUT
 
 parser = OptionParser.new do |opts|
   opts.banner = "Usage: wda-start.rb [options]"
   opts.on("--udid UDID", "Target a specific simulator") { |v| udid = v }
   opts.on("--port PORT", Integer, "WDA port (default: 8100)") { |v| port = v }
+  opts.on("--timeout SECONDS", Integer, "Seconds to wait for WDA to become ready (default: 60)") { |v| timeout = v }
 end
 parser.parse!
 
@@ -100,7 +104,7 @@ def wda_running?(port)
 Process.detach(pid)
 
 # Wait for WDA to become ready
-max_wait = 60
+max_wait = timeout
 interval = 2
 elapsed = 0
 

From a2143f0a4ef1f90eade856452640c732a085008b Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Thu, 26 Mar 2026 17:06:49 +0100
Subject: [PATCH 10/23] Use built WDA artifacts in CI and extend AI timeout

---
 .buildkite/pipeline.yml                                | 2 +-
 .claude/skills/ios-sim-navigation/scripts/wda-start.rb | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 9ef86d3a4a47..daa5679acdb2 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -146,7 +146,7 @@ steps:
     depends_on: "build_jetpack"
     if: "build.pull_request.id != null"
     soft_fail: true
-    timeout_in_minutes: 30
+    timeout_in_minutes: 60
     plugins: [$CI_TOOLKIT_PLUGIN]
     env:
       APP: jetpack
diff --git a/.claude/skills/ios-sim-navigation/scripts/wda-start.rb b/.claude/skills/ios-sim-navigation/scripts/wda-start.rb
index 4d45c3b7ea56..f8e6986e1afc 100755
--- a/.claude/skills/ios-sim-navigation/scripts/wda-start.rb
+++ b/.claude/skills/ios-sim-navigation/scripts/wda-start.rb
@@ -77,6 +77,7 @@ def wda_running?(port)
 
 # Find the WDA project
 wda_project = File.join(Dir.pwd, ".build", "WebDriverAgent", "WebDriverAgent.xcodeproj")
+wda_derived_data = File.join(Dir.pwd, ".build", "WebDriverAgent", "DerivedData")
 unless File.exist?(wda_project)
   $stderr.puts "Error: WebDriverAgent project not found at #{wda_project}"
   $stderr.puts "Clone it: git clone https://github.com/appium/WebDriverAgent.git .build/WebDriverAgent"
@@ -89,6 +90,7 @@ def wda_running?(port)
   "-project", wda_project,
   "-scheme", "WebDriverAgentRunner",
   "-destination", "id=#{udid}",
+  "-derivedDataPath", wda_derived_data,
   "USE_PORT=#{port}",
   "CODE_SIGNING_ALLOWED=NO"
 ]

From 745c37af7dc9667da1e3536eb5b8bf46c587fef2 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Thu, 26 Mar 2026 19:23:42 +0100
Subject: [PATCH 11/23] Fix Claude CLI invocation and bash 3 status output

---
 .buildkite/commands/run-ai-e2e-tests.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index 9e444de24aaf..7fe05f446c67 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -330,7 +330,7 @@ EOF
     --model "$CLAUDE_MODEL" \
     --max-turns "$CLAUDE_MAX_TURNS" \
     "${CLAUDE_ALLOWED_TOOLS[@]}" \
-    --prompt "$PROMPT" \
+    "$PROMPT" \
     || CLAUDE_EXIT=$?
 
   declare -a ENFORCEMENT_REASONS=()
@@ -368,7 +368,8 @@ EOF
 
   RESULT_STATUS="$(result_field status)"
   RESULT_REASON="$(result_field reason)"
-  echo "[${RESULT_STATUS^^}] ${AI_TEST_TITLE}"
+  RESULT_STATUS_LABEL="$(printf '%s' "$RESULT_STATUS" | tr '[:lower:]' '[:upper:]')"
+  echo "[${RESULT_STATUS_LABEL}] ${AI_TEST_TITLE}"
   echo "  ${RESULT_REASON}"
 
   if [[ "$RESULT_STATUS" != "pass" ]]; then

From b98598f8f49941e3912e0c8ff435c109375ddbbd Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Thu, 26 Mar 2026 19:27:44 +0100
Subject: [PATCH 12/23] Use Claude Sonnet 4.6 by default

---
 .buildkite/commands/run-ai-e2e-tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index 7fe05f446c67..51350c8b8dc5 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -78,7 +78,7 @@ export SIMULATOR_NAME="${SIMULATOR_NAME:-iPhone 16}"
 WDA_PORT="${WDA_PORT:-8100}"
 CLAUDE_MAX_TURNS="${CLAUDE_MAX_TURNS:-120}"
 TEST_DIR="${TEST_DIR:-Tests/AgentTests/ui-tests}"
-CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-20250514}"
+CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-6}"
 CLAUDE_CODE_EXPECTED_VERSION="${CLAUDE_CODE_EXPECTED_VERSION:-2.1.84}"
 CLAUDE_CODE_NPM_SPEC="${CLAUDE_CODE_NPM_SPEC:-@anthropic-ai/claude-code@${CLAUDE_CODE_EXPECTED_VERSION}}"
 WDA_START_TIMEOUT="${WDA_START_TIMEOUT:-120}"

From 36b52f92634814db3dcf0f15c456122f207f4923 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Thu, 26 Mar 2026 19:44:20 +0100
Subject: [PATCH 13/23] Pass Claude prompt after option terminator

---
 .buildkite/commands/run-ai-e2e-tests.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index 51350c8b8dc5..26fe8d6fee3a 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -24,7 +24,7 @@
 #   WDA_PORT                       WebDriverAgent port (default: 8100)
 #   CLAUDE_MAX_TURNS               Max Claude Code tool-use turns (default: 120)
 #   TEST_DIR                       Test directory (default: Tests/AgentTests/ui-tests)
-#   CLAUDE_MODEL                   Model to use (default: claude-sonnet-4-20250514)
+#   CLAUDE_MODEL                   Model to use (default: claude-sonnet-4-6)
 #   CLAUDE_CODE_EXPECTED_VERSION   Claude Code version to install (default: 2.1.84)
 #   CLAUDE_CODE_NPM_SPEC           npm package spec for Claude Code
 
@@ -330,6 +330,7 @@ EOF
     --model "$CLAUDE_MODEL" \
     --max-turns "$CLAUDE_MAX_TURNS" \
     "${CLAUDE_ALLOWED_TOOLS[@]}" \
+    -- \
     "$PROMPT" \
     || CLAUDE_EXIT=$?
 

From c9c34db32bf75f12f540cd92b1a3839f96d97219 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Thu, 26 Mar 2026 22:54:58 +0100
Subject: [PATCH 14/23] Stream Claude AI E2E progress

---
 .buildkite/commands/run-ai-e2e-tests.sh | 24 ++++++++--
 Scripts/ci/ai-test-progress.sh          | 12 +++++
 Scripts/ci/launch-app.sh                | 11 ++++-
 Scripts/ci/record-ai-test-result.sh     |  5 +++
 Scripts/ci/take-ai-test-screenshot.sh   |  5 +++
 Scripts/ci/wda-curl.sh                  | 58 +++++++++++++++++++++++++
 Scripts/ci/wp-api.sh                    |  6 +++
 7 files changed, 116 insertions(+), 5 deletions(-)
 create mode 100644 Scripts/ci/ai-test-progress.sh

diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index 26fe8d6fee3a..7aa39d72eef6 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -45,12 +45,21 @@ normalize_site_url() {
 }
 
 cleanup_wda() {
+  stop_progress_tail
   if [[ "$WDA_STARTED" -eq 1 ]]; then
     echo "--- Cleanup"
     ruby "$WDA_STOP" --port "$WDA_PORT" 2>/dev/null || true
   fi
 }
 
+stop_progress_tail() {
+  if [[ -n "${AI_TEST_PROGRESS_TAIL_PID:-}" ]]; then
+    kill "$AI_TEST_PROGRESS_TAIL_PID" 2>/dev/null || true
+    wait "$AI_TEST_PROGRESS_TAIL_PID" 2>/dev/null || true
+    unset AI_TEST_PROGRESS_TAIL_PID
+  fi
+}
+
 trap cleanup_wda EXIT
 
 # ── Label gate (Buildkite only) ────────────────────────────────────────
@@ -228,8 +237,9 @@ RESULTS_DIR="Tests/AgentTests/results/$(date +%Y-%m-%d-%H%M)"
 RESULTS_JSON_DIR="${RESULTS_DIR}/.results"
 RESULT_EVENTS_DIR="${RESULTS_DIR}/.result-events"
 USAGE_DIR="${RESULTS_DIR}/.rest-api-usage"
+PROGRESS_DIR="${RESULTS_DIR}/.progress"
 SCREENSHOTS_DIR="${RESULTS_DIR}/screenshots"
-mkdir -p "$RESULTS_JSON_DIR" "$RESULT_EVENTS_DIR" "$USAGE_DIR" "$SCREENSHOTS_DIR"
+mkdir -p "$RESULTS_JSON_DIR" "$RESULT_EVENTS_DIR" "$USAGE_DIR" "$PROGRESS_DIR" "$SCREENSHOTS_DIR"
 
 TEST_FILES=()
 while IFS= read -r test_file; do
@@ -265,11 +275,13 @@ for index in "${!TEST_FILES[@]}"; do
   AI_TEST_RESULT_FILE="${RESULTS_JSON_DIR}/${AI_TEST_SLUG}.json"
   AI_TEST_RESULT_EVENTS_FILE="${RESULT_EVENTS_DIR}/${AI_TEST_SLUG}.log"
   AI_TEST_USAGE_FILE="${USAGE_DIR}/${AI_TEST_SLUG}.log"
+  AI_TEST_PROGRESS_FILE="${PROGRESS_DIR}/${AI_TEST_SLUG}.log"
   AI_TEST_RESULTS_DIR="$RESULTS_DIR"
   AI_TEST_SCREENSHOTS_DIR="$SCREENSHOTS_DIR"
-  export AI_TEST_FILE AI_TEST_TITLE AI_TEST_SLUG AI_TEST_RESULT_FILE AI_TEST_RESULT_EVENTS_FILE AI_TEST_USAGE_FILE AI_TEST_RESULTS_DIR AI_TEST_SCREENSHOTS_DIR
+  export AI_TEST_FILE AI_TEST_TITLE AI_TEST_SLUG AI_TEST_RESULT_FILE AI_TEST_RESULT_EVENTS_FILE AI_TEST_USAGE_FILE AI_TEST_PROGRESS_FILE AI_TEST_RESULTS_DIR AI_TEST_SCREENSHOTS_DIR
 
-  rm -f "$AI_TEST_RESULT_FILE" "$AI_TEST_RESULT_EVENTS_FILE" "$AI_TEST_USAGE_FILE"
+  rm -f "$AI_TEST_RESULT_FILE" "$AI_TEST_RESULT_EVENTS_FILE" "$AI_TEST_USAGE_FILE" "$AI_TEST_PROGRESS_FILE"
+  : > "$AI_TEST_PROGRESS_FILE"
 
   VERIFICATION_EXPECTED="$(ruby Scripts/ci/inspect-ai-test-case.rb "$AI_TEST_FILE" section-present verification)"
   CLEANUP_EXPECTED="$(ruby Scripts/ci/inspect-ai-test-case.rb "$AI_TEST_FILE" section-present cleanup)"
@@ -291,6 +303,8 @@ for index in "${!TEST_FILES[@]}"; do
 
   TEST_CONTENT="$(cat "$AI_TEST_FILE")"
   PROMPT="$(cat <<EOF
+Use the ci-test-runner Claude Code skill for this task.
+
 Execute exactly one AI-driven iOS UI test case against the ${APP} app.
 
 Environment:
@@ -318,6 +332,7 @@ Rules:
 - If you fail the test, take a screenshot first and pass the returned relative path to record-ai-test-result.sh.
 - You must call record-ai-test-result.sh exactly once before you stop.
 - Keep reasons short and single-line so they are safe to store in CI output.
+- Do not spend turns narrating a plan. Use the available commands to act.
 
 Test case:
 
@@ -326,6 +341,8 @@ EOF
 )"
 
   CLAUDE_EXIT=0
+  tail -n 0 -f "$AI_TEST_PROGRESS_FILE" &
+  AI_TEST_PROGRESS_TAIL_PID=$!
   claude --print \
     --model "$CLAUDE_MODEL" \
     --max-turns "$CLAUDE_MAX_TURNS" \
@@ -333,6 +350,7 @@ EOF
     -- \
     "$PROMPT" \
     || CLAUDE_EXIT=$?
+  stop_progress_tail
 
   declare -a ENFORCEMENT_REASONS=()
   if [[ $CLAUDE_EXIT -ne 0 ]]; then
diff --git a/Scripts/ci/ai-test-progress.sh b/Scripts/ci/ai-test-progress.sh
new file mode 100644
index 000000000000..0c96eb1d4116
--- /dev/null
+++ b/Scripts/ci/ai-test-progress.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+log_ai_test_progress() {
+  local message="${1:-}"
+
+  if [[ -z "$message" || -z "${AI_TEST_PROGRESS_FILE:-}" ]]; then
+    return 0
+  fi
+
+  mkdir -p "$(dirname "$AI_TEST_PROGRESS_FILE")"
+  printf '[%s] %s\n' "$(date +%H:%M:%S)" "$message" >> "$AI_TEST_PROGRESS_FILE"
+}
diff --git a/Scripts/ci/launch-app.sh b/Scripts/ci/launch-app.sh
index e7f677249523..374f9d88c4e9 100755
--- a/Scripts/ci/launch-app.sh
+++ b/Scripts/ci/launch-app.sh
@@ -12,13 +12,17 @@
 #   WP_APP_PASSWORD  WordPress application password
 set -euo pipefail
 
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+# shellcheck source=Scripts/ci/ai-test-progress.sh
+source "$SCRIPT_DIR/ai-test-progress.sh"
+
 : "${SIMULATOR_UDID:?SIMULATOR_UDID is required}"
 : "${APP_BUNDLE_ID:?APP_BUNDLE_ID is required}"
 : "${SITE_URL:?SITE_URL is required}"
 : "${WP_USERNAME:?WP_USERNAME is required}"
 : "${WP_APP_PASSWORD:?WP_APP_PASSWORD is required}"
 
-exec xcrun simctl launch --terminate-running-process \
+launch_output="$(xcrun simctl launch --terminate-running-process \
   "$SIMULATOR_UDID" "$APP_BUNDLE_ID" \
   -ui-testing YES \
   -ui-test-reset-everything YES \
@@ -27,4 +31,7 @@ exec xcrun simctl launch --terminate-running-process \
   -ui-test-disable-migration YES \
   -ui-test-site-url "$SITE_URL" \
   -ui-test-site-user "$WP_USERNAME" \
-  -ui-test-site-pass "$WP_APP_PASSWORD"
+  -ui-test-site-pass "$WP_APP_PASSWORD")"
+
+log_ai_test_progress "Launched ${APP_BUNDLE_ID}"
+printf '%s\n' "$launch_output"
diff --git a/Scripts/ci/record-ai-test-result.sh b/Scripts/ci/record-ai-test-result.sh
index 19e0624d2815..6a5fa6e7b1eb 100755
--- a/Scripts/ci/record-ai-test-result.sh
+++ b/Scripts/ci/record-ai-test-result.sh
@@ -4,6 +4,10 @@
 # Usage: record-ai-test-result.sh <pass|fail> <reason> [screenshot-relative-path]
 set -euo pipefail
 
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+# shellcheck source=Scripts/ci/ai-test-progress.sh
+source "$SCRIPT_DIR/ai-test-progress.sh"
+
 STATUS="${1:?Usage: record-ai-test-result.sh <pass|fail> <reason> [screenshot-relative-path]}"
 REASON="${2:?Usage: record-ai-test-result.sh <pass|fail> <reason> [screenshot-relative-path]}"
 SCREENSHOT_REL="${3:-}"
@@ -24,4 +28,5 @@ ruby Scripts/ci/write-ai-test-result.rb \
   "$REASON" \
   "$SCREENSHOT_REL"
 
+log_ai_test_progress "Test result: $(printf '%s' "$STATUS" | tr '[:lower:]' '[:upper:]') — ${REASON}"
 echo "Recorded ${STATUS} result for ${AI_TEST_TITLE}"
diff --git a/Scripts/ci/take-ai-test-screenshot.sh b/Scripts/ci/take-ai-test-screenshot.sh
index 5707db729e1e..e6935b374c28 100755
--- a/Scripts/ci/take-ai-test-screenshot.sh
+++ b/Scripts/ci/take-ai-test-screenshot.sh
@@ -5,6 +5,10 @@
 # Usage: take-ai-test-screenshot.sh <label>
 set -euo pipefail
 
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+# shellcheck source=Scripts/ci/ai-test-progress.sh
+source "$SCRIPT_DIR/ai-test-progress.sh"
+
 LABEL="${1:?Usage: take-ai-test-screenshot.sh <label>}"
 
 : "${SIMULATOR_UDID:?SIMULATOR_UDID is required}"
@@ -18,4 +22,5 @@ absolute_path="$(mktemp "${AI_TEST_SCREENSHOTS_DIR}/${AI_TEST_SLUG}-${safe_label
 xcrun simctl io "$SIMULATOR_UDID" screenshot "$absolute_path" >/dev/null
 
 relative_path="${absolute_path#${AI_TEST_RESULTS_DIR}/}"
+log_ai_test_progress "Screenshot: ${relative_path}"
 echo "$relative_path"
diff --git a/Scripts/ci/wda-curl.sh b/Scripts/ci/wda-curl.sh
index b9fcf778aa03..ea21fd4f183f 100755
--- a/Scripts/ci/wda-curl.sh
+++ b/Scripts/ci/wda-curl.sh
@@ -14,6 +14,10 @@
 #   WDA_PORT  WDA port (default: 8100)
 set -euo pipefail
 
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+# shellcheck source=Scripts/ci/ai-test-progress.sh
+source "$SCRIPT_DIR/ai-test-progress.sh"
+
 METHOD="${1:?Usage: wda-curl.sh METHOD PATH [BODY]}"
 URL_PATH="${2:?Usage: wda-curl.sh METHOD PATH [BODY]}"
 BODY="${3:-}"
@@ -37,6 +41,60 @@ case "$URL_PATH" in
     ;;
 esac
 
+log_request() {
+  case "$URL_PATH" in
+    /source\?format=description)
+      log_ai_test_progress 'Fetched accessibility tree'
+      ;;
+    /source\?format=json)
+      log_ai_test_progress 'Fetched accessibility tree JSON'
+      ;;
+    /session)
+      log_ai_test_progress 'Creating WDA session'
+      ;;
+    /session/*/elements)
+      local using
+      local value
+      using="$(printf '%s' "$BODY" | ruby -rjson -e 'body = STDIN.read; data = JSON.parse(body); print data["using"].to_s' 2>/dev/null || true)"
+      value="$(printf '%s' "$BODY" | ruby -rjson -e 'body = STDIN.read; data = JSON.parse(body); print data["value"].to_s' 2>/dev/null || true)"
+      if [[ -n "$using" || -n "$value" ]]; then
+        log_ai_test_progress "Find element using ${using:-unknown}: ${value:-<empty>}"
+      else
+        log_ai_test_progress 'Find element'
+      fi
+      ;;
+    /session/*/element/*/click)
+      log_ai_test_progress 'Clicked element'
+      ;;
+    /session/*/wda/keys)
+      local key_count
+      key_count="$(printf '%s' "$BODY" | ruby -rjson -e 'body = STDIN.read; data = JSON.parse(body); values = data["value"]; print(values.is_a?(Array) ? values.length : 0)' 2>/dev/null || true)"
+      if [[ -n "$key_count" && "$key_count" != "0" ]]; then
+        log_ai_test_progress "Typed ${key_count} key(s)"
+      else
+        log_ai_test_progress 'Typed keys'
+      fi
+      ;;
+    /session/*/wda/pressButton)
+      local button_name
+      button_name="$(printf '%s' "$BODY" | ruby -rjson -e 'body = STDIN.read; data = JSON.parse(body); print data["name"].to_s' 2>/dev/null || true)"
+      if [[ -n "$button_name" ]]; then
+        log_ai_test_progress "Pressed ${button_name} button"
+      else
+        log_ai_test_progress 'Pressed hardware button'
+      fi
+      ;;
+    /session/*/actions)
+      log_ai_test_progress 'Performed touch action'
+      ;;
+    /status)
+      log_ai_test_progress 'Checked WDA status'
+      ;;
+  esac
+}
+
+log_request
+
 if [[ -n "$BODY" ]]; then
   exec curl -sS --max-time 30 -X "$METHOD" \
     -H 'Content-Type: application/json' \
diff --git a/Scripts/ci/wp-api.sh b/Scripts/ci/wp-api.sh
index 0fd10af67b68..37fce107e9bc 100755
--- a/Scripts/ci/wp-api.sh
+++ b/Scripts/ci/wp-api.sh
@@ -17,6 +17,10 @@
 #   AI_TEST_USAGE_FILE  Path to the per-test usage log
 set -euo pipefail
 
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+# shellcheck source=Scripts/ci/ai-test-progress.sh
+source "$SCRIPT_DIR/ai-test-progress.sh"
+
 PURPOSE="${1:?Usage: wp-api.sh PURPOSE METHOD API_PATH [BODY]}"
 METHOD="${2:?Usage: wp-api.sh PURPOSE METHOD API_PATH [BODY]}"
 API_PATH="${3:?Usage: wp-api.sh PURPOSE METHOD API_PATH [BODY]}"
@@ -77,5 +81,7 @@ else
   log_usage "$status_code" 0
 fi
 
+log_ai_test_progress "REST ${PURPOSE} ${METHOD} /wp-json/${API_PATH} -> ${status_code}"
+
 printf 'HTTP %s\n' "$status_code"
 cat "$tmp_body"

From cad6432bb306a4b3ff2f19d16300ef06fe7b5794 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Thu, 26 Mar 2026 23:28:49 +0100
Subject: [PATCH 15/23] Fix Rubocop errors

---
 Scripts/ci/assemble-ai-test-results.rb | 4 +---
 Scripts/ci/create-wda-session.rb       | 2 +-
 Scripts/ci/find-booted-simulator.rb    | 0
 Scripts/ci/inspect-ai-test-case.rb     | 0
 Scripts/ci/read-ai-test-result.rb      | 0
 5 files changed, 2 insertions(+), 4 deletions(-)
 mode change 100644 => 100755 Scripts/ci/create-wda-session.rb
 mode change 100644 => 100755 Scripts/ci/find-booted-simulator.rb
 mode change 100644 => 100755 Scripts/ci/inspect-ai-test-case.rb
 mode change 100644 => 100755 Scripts/ci/read-ai-test-result.rb

diff --git a/Scripts/ci/assemble-ai-test-results.rb b/Scripts/ci/assemble-ai-test-results.rb
index d165db32a1a8..8155cc9adba1 100755
--- a/Scripts/ci/assemble-ai-test-results.rb
+++ b/Scripts/ci/assemble-ai-test-results.rb
@@ -31,9 +31,7 @@
   lines << "### #{status_label}: #{result.fetch('title')}"
   lines << "**Reason:** #{result.fetch('reason')}"
   lines << "**Test File:** #{result.fetch('test_file')}"
-  if result['screenshot']
-    lines << "**Screenshot:** #{result.fetch('screenshot')}"
-  end
+  lines << "**Screenshot:** #{result.fetch('screenshot')}" if result['screenshot']
   lines << ''
 end
 
diff --git a/Scripts/ci/create-wda-session.rb b/Scripts/ci/create-wda-session.rb
old mode 100644
new mode 100755
index 3da77647f067..dedbe8358b35
--- a/Scripts/ci/create-wda-session.rb
+++ b/Scripts/ci/create-wda-session.rb
@@ -19,4 +19,4 @@
 
 parsed = JSON.parse(response.body)
 session_id = parsed.dig('value', 'sessionId') || parsed['sessionId']
-print(session_id.to_s)
+print(session_id)
diff --git a/Scripts/ci/find-booted-simulator.rb b/Scripts/ci/find-booted-simulator.rb
old mode 100644
new mode 100755
diff --git a/Scripts/ci/inspect-ai-test-case.rb b/Scripts/ci/inspect-ai-test-case.rb
old mode 100644
new mode 100755
diff --git a/Scripts/ci/read-ai-test-result.rb b/Scripts/ci/read-ai-test-result.rb
old mode 100644
new mode 100755

From c1febf01d8e608e5dd0a0cb7141d94f84adc4c53 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Fri, 27 Mar 2026 12:55:01 +0100
Subject: [PATCH 16/23] Attempt to simplify Claude E2E test running

---
 .buildkite/commands/run-ai-e2e-tests.sh |  36 +--
 .claude/skills/ci-test-runner/SKILL.md  | 301 ++++++++----------------
 2 files changed, 112 insertions(+), 225 deletions(-)

diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index 7aa39d72eef6..d14a54083bff 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -87,7 +87,7 @@ export SIMULATOR_NAME="${SIMULATOR_NAME:-iPhone 16}"
 WDA_PORT="${WDA_PORT:-8100}"
 CLAUDE_MAX_TURNS="${CLAUDE_MAX_TURNS:-120}"
 TEST_DIR="${TEST_DIR:-Tests/AgentTests/ui-tests}"
-CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-6}"
+CLAUDE_MODEL="${CLAUDE_MODEL:-claude-haiku-4-5}"
 CLAUDE_CODE_EXPECTED_VERSION="${CLAUDE_CODE_EXPECTED_VERSION:-2.1.84}"
 CLAUDE_CODE_NPM_SPEC="${CLAUDE_CODE_NPM_SPEC:-@anthropic-ai/claude-code@${CLAUDE_CODE_EXPECTED_VERSION}}"
 WDA_START_TIMEOUT="${WDA_START_TIMEOUT:-120}"
@@ -302,12 +302,14 @@ for index in "${!TEST_FILES[@]}"; do
   export WDA_SESSION_ID
 
   TEST_CONTENT="$(cat "$AI_TEST_FILE")"
+  SKILL_CONTENT="$(cat .claude/skills/ci-test-runner/SKILL.md | tail -n +8)"
   PROMPT="$(cat <<EOF
-Use the ci-test-runner Claude Code skill for this task.
+${SKILL_CONTENT}
 
-Execute exactly one AI-driven iOS UI test case against the ${APP} app.
+---
+
+## Environment
 
-Environment:
 - App bundle ID: ${APP_BUNDLE_ID}
 - Simulator UDID: ${SIMULATOR_UDID}
 - WDA Port: ${WDA_PORT}
@@ -317,33 +319,21 @@ Environment:
 - Verification required: $( [[ "$VERIFICATION_EXPECTED" == "1" ]] && echo yes || echo no )
 - Cleanup required: $( [[ "$CLEANUP_EXPECTED" == "1" ]] && echo yes || echo no )
 
-Available commands:
-- ./Scripts/ci/launch-app.sh
-- ./Scripts/ci/wda-curl.sh METHOD PATH [JSON_BODY]
-- ./Scripts/ci/wp-api.sh PURPOSE METHOD PATH [JSON_BODY]
-- ./Scripts/ci/take-ai-test-screenshot.sh LABEL
-- ./Scripts/ci/record-ai-test-result.sh STATUS REASON [SCREENSHOT_RELATIVE_PATH]
-- sleep N
-
-Rules:
-- Start by running ./Scripts/ci/launch-app.sh, then sleep 3, then fetch the accessibility tree.
-- Use the accessibility tree instead of screenshots whenever possible.
-- Use wp-api.sh with purpose=setup for prerequisites, purpose=verification for verification work, and purpose=cleanup for cleanup work.
-- If you fail the test, take a screenshot first and pass the returned relative path to record-ai-test-result.sh.
-- You must call record-ai-test-result.sh exactly once before you stop.
-- Keep reasons short and single-line so they are safe to store in CI output.
-- Do not spend turns narrating a plan. Use the available commands to act.
-
-Test case:
+## Test Case
 
 ${TEST_CONTENT}
+
+---
+
+Execute this test now. Start with ./Scripts/ci/launch-app.sh, then sleep 3, then fetch the accessibility tree.
 EOF
 )"
 
   CLAUDE_EXIT=0
   tail -n 0 -f "$AI_TEST_PROGRESS_FILE" &
   AI_TEST_PROGRESS_TAIL_PID=$!
-  claude --print \
+  NO_COLOR=1 claude --print \
+    --bare \
     --model "$CLAUDE_MODEL" \
     --max-turns "$CLAUDE_MAX_TURNS" \
     "${CLAUDE_ALLOWED_TOOLS[@]}" \
diff --git a/.claude/skills/ci-test-runner/SKILL.md b/.claude/skills/ci-test-runner/SKILL.md
index d761002a27eb..742d3b8a6138 100644
--- a/.claude/skills/ci-test-runner/SKILL.md
+++ b/.claude/skills/ci-test-runner/SKILL.md
@@ -8,254 +8,151 @@ description: >-
 
 # CI Test Runner
 
-Run exactly one markdown UI test case against the WordPress or Jetpack iOS app
-in a booted simulator. The shell runner owns test discovery, result assembly,
-and contract enforcement. Your job is to drive the app and record one final
-result for the current test.
+Drive the app through one UI test case. Every response must contain tool calls.
+Do not narrate plans — act.
 
-## Environment
-
-All values are pre-set by the shell runner:
-
-| Env var | Description |
-|---------|-------------|
-| `SIMULATOR_UDID` | Booted simulator UDID |
-| `WDA_SESSION_ID` | Active WebDriverAgent session ID for this test |
-| `WDA_PORT` | WDA port |
-| `APP_BUNDLE_ID` | `org.wordpress` or `com.automattic.jetpack` |
-| `SITE_URL` | WordPress test site URL |
-| `WP_USERNAME` | WordPress username |
-| `AI_TEST_TITLE` | Current test title |
-
-The current test case markdown is included in the prompt. Use that content
-directly instead of trying to locate the file on disk.
-
-Do not ask for credentials or try to read files directly.
-
-## Available Commands
-
-You have exactly these commands available:
+## Commands
 
 | Command | Purpose |
 |---------|---------|
-| `./Scripts/ci/launch-app.sh` | Relaunch app with test credentials and UI-test flags |
-| `./Scripts/ci/wda-curl.sh METHOD PATH [BODY]` | Allowed WDA calls only |
-| `./Scripts/ci/wp-api.sh PURPOSE METHOD PATH [BODY]` | REST API calls with purpose `setup`, `verification`, or `cleanup` |
-| `./Scripts/ci/take-ai-test-screenshot.sh LABEL` | Capture a failure screenshot and print its relative path |
-| `./Scripts/ci/record-ai-test-result.sh STATUS REASON [SCREENSHOT]` | Record the final `pass` or `fail` result |
-| `sleep N` | Wait for UI stability |
+| `./Scripts/ci/launch-app.sh` | Relaunch app with test credentials |
+| `./Scripts/ci/wda-curl.sh METHOD PATH [BODY]` | WDA HTTP calls (patterns below) |
+| `./Scripts/ci/wp-api.sh PURPOSE METHOD PATH [BODY]` | REST API with purpose `setup`, `verification`, or `cleanup` |
+| `./Scripts/ci/take-ai-test-screenshot.sh LABEL` | Screenshot (use only on failure) |
+| `./Scripts/ci/record-ai-test-result.sh STATUS REASON [SCREENSHOT]` | Record final result — call exactly once |
+| `sleep N` | Wait N seconds |
 
-## WDA Interactions
+## WDA Patterns
 
-WDA is already running. A session ID is in `WDA_SESSION_ID`.
+Session ID is in `$WDA_SESSION_ID`.
 
-### Get Accessibility Tree
+### Fetch accessibility tree
 
 ```bash
-# Compact text format — use this by default
 ./Scripts/ci/wda-curl.sh GET '/source?format=description'
-
-# Structured JSON — only when you truly need precise rect coordinates
-./Scripts/ci/wda-curl.sh GET '/source?format=json'
 ```
 
-The tree content is inside the JSON `value` field.
-
-### Computing Tap Coordinates
-
-Parse a frame like `{{x, y}, {width, height}}` from the tree:
-
-```text
-tap_x = x + width / 2
-tap_y = y + height / 2
-```
+Returns a text tree. Each element has type, frame `{{x, y}, {width, height}}`,
+optional identifier and label. The root node frame gives screen dimensions.
 
-### Session Management
+### Tap by coordinates (preferred — saves a turn)
 
-If WDA starts returning 4xx session errors, create a fresh session:
+Compute center from frame: `X = x + width/2`, `Y = y + height/2`, then:
 
 ```bash
-./Scripts/ci/wda-curl.sh POST /session '{"capabilities":{"alwaysMatch":{}}}'
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/actions" \
+  '{"actions":[{"type":"pointer","id":"f1","parameters":{"pointerType":"touch"},"actions":[{"type":"pointerMove","duration":0,"x":X,"y":Y},{"type":"pointerDown"},{"type":"pointerUp"}]}]}'
 ```
 
-Extract `value.sessionId` from the JSON response and use it in later paths.
-
-### Tap
+### Tap by accessibility ID (when you know the exact ID)
 
 ```bash
-./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/actions" '{
-  "actions": [{
-    "type": "pointer",
-    "id": "finger1",
-    "parameters": {"pointerType": "touch"},
-    "actions": [
-      {"type": "pointerMove", "duration": 0, "x": X, "y": Y},
-      {"type": "pointerDown"},
-      {"type": "pointerUp"}
-    ]
-  }]
-}'
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/elements" \
+  '{"using":"accessibility id","value":"IDENTIFIER"}'
 ```
 
-### Tap Element by Accessibility ID
+Then click using the element ID from `value[0].ELEMENT` in the response:
 
 ```bash
-./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/elements" '{
-  "using": "accessibility id",
-  "value": "IDENTIFIER"
-}'
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/element/ELEMENT_ID/click"
 ```
 
-Then click the returned element with:
-
-```bash
-./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/element/${ELEMENT_ID}/click"
-```
+### Type text
 
-### Swipe
+Tap the field first to focus it, then:
 
 ```bash
-./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/actions" '{
-  "actions": [{
-    "type": "pointer",
-    "id": "finger1",
-    "parameters": {"pointerType": "touch"},
-    "actions": [
-      {"type": "pointerMove", "duration": 0, "x": X1, "y": Y1},
-      {"type": "pointerDown"},
-      {"type": "pointerMove", "duration": 500, "x": X2, "y": Y2},
-      {"type": "pointerUp"}
-    ]
-  }]
-}'
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/wda/keys" \
+  '{"value":["h","e","l","l","o"]}'
 ```
 
-For vertical scrolling, use `x = screen_width - 30` to avoid hitting tappable UI.
+### Clear text field
 
-### Long Press
+Select all then delete:
 
 ```bash
-./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/actions" '{
-  "actions": [{
-    "type": "pointer",
-    "id": "finger1",
-    "parameters": {"pointerType": "touch"},
-    "actions": [
-      {"type": "pointerMove", "duration": 0, "x": X, "y": Y},
-      {"type": "pointerDown"},
-      {"type": "pause", "duration": 1000},
-      {"type": "pointerUp"}
-    ]
-  }]
-}'
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/wda/keys" '{"value":["\u0001"]}'
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/wda/keys" '{"value":["\u007F"]}'
 ```
 
-### Type Text
+### Swipe / scroll
 
 ```bash
-./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/wda/keys" '{
-  "value": ["h","e","l","l","o"]
-}'
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/actions" \
+  '{"actions":[{"type":"pointer","id":"f1","parameters":{"pointerType":"touch"},"actions":[{"type":"pointerMove","duration":0,"x":X1,"y":Y1},{"type":"pointerDown"},{"type":"pointerMove","duration":500,"x":X2,"y":Y2},{"type":"pointerUp"}]}]}'
 ```
 
-Tap a text field first so it has focus.
+- Scroll down: swipe from lower y to upper y. Use `x = screen_width - 30`.
+- Back gesture: swipe from `(5, H/2)` to `(W*2/3, H/2)`.
+- If the tree is unchanged after a scroll, you reached the end.
 
-### Clear Text Field
+### Long press
+
+Same as tap but add a pause between down and up:
 
 ```bash
-./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/wda/keys" '{"value": ["\u0001"]}'
-./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/wda/keys" '{"value": ["\u007F"]}'
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/actions" \
+  '{"actions":[{"type":"pointer","id":"f1","parameters":{"pointerType":"touch"},"actions":[{"type":"pointerMove","duration":0,"x":X,"y":Y},{"type":"pointerDown"},{"type":"pause","duration":1000},{"type":"pointerUp"}]}]}'
 ```
 
-### Press Hardware Button
+### Press hardware button
 
 ```bash
-./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/wda/pressButton" '{"name": "home"}'
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/wda/pressButton" '{"name":"home"}'
 ```
 
-## Navigation Strategy
-
-Always prefer the accessibility tree over screenshots.
-
-### Finding Elements
-
-Use this priority order:
-1. Accessibility identifier
-2. Visible label text
-3. Type plus surrounding context
-4. Partial label matching
-5. Positional heuristics as a last resort
-
-### Waiting for UI Stability
-
-After every action:
-1. Wait briefly, usually `sleep 1`
-2. Re-fetch the tree
-3. Confirm the expected screen or element is now present
-
-Do not rely on long blind sleeps when polling is enough.
-
-### Scroll View Navigation
-
-1. Fetch the tree and search for the target element.
-2. If found, tap it.
-3. If not, swipe up from the right edge.
-4. Re-fetch the tree and search again.
-5. If the tree is unchanged after a scroll, assume you reached the end.
-
-### Screen Size
-
-Use the root node frame from the tree to derive screen dimensions.
-
-### Back Navigation
-
-- Primary: tap a navigation bar back button
-- Fallback: swipe from the left edge toward the center
-
-### Tab Bar Navigation
-
-Look for `TabBar` elements in the tree and tap the needed tab.
-
-### System Alert Handling
-
-If actions fail unexpectedly, check for `Alert` or `Sheet` elements and dismiss
-them before retrying.
-
-### App Crash Recovery
-
-If the tree looks wrong or actions consistently fail:
-1. Relaunch with `./Scripts/ci/launch-app.sh`
-2. Wait 3 seconds
-3. Re-fetch the tree
-4. Create a new WDA session if the old one expired
-
-## Single-Test Flow
-
-1. Start with `./Scripts/ci/launch-app.sh`, then `sleep 3`, then inspect the tree.
-2. Read the current test case carefully. It may include `Prerequisites`, `Steps`, `Verification`, `Cleanup`, `Expected Outcome`, or similar sections.
-3. Fulfill prerequisites using UI actions or `wp-api.sh setup ...`.
-4. If a prerequisite cannot be fulfilled, fail the test with reason `Prerequisite not met: <details>`.
-5. Execute the numbered test steps, verifying UI changes after each action.
-6. Use the expected outcome to confirm you reached the intended end state.
-7. Run any verification work with `wp-api.sh verification ...`.
-8. Run any cleanup work with `wp-api.sh cleanup ...`, even after failures.
-9. If the test fails, take a screenshot first and pass the returned relative path into `record-ai-test-result.sh`.
-10. Call `record-ai-test-result.sh` exactly once before stopping. Always pass a reason; `Passed.` is enough for a normal pass.
-11. Keep the recorded reason short and single-line.
-
-## Login Constraints
-
-- This CI flow is for a self-hosted site login path.
-- The app may already be logged in. If the tree already shows the logged-in state, skip login.
-- Prefer the self-hosted site address flow.
-- If a login screen is shown, tap `Enter your existing site address`, type the site URL, tap continue, then wait 2-3 seconds and re-fetch the tree for the logged-in state.
-- Do not switch into a WordPress.com email/password flow unless the test case explicitly requires it.
-- Do not invent credentials or ask for them.
-
-## Important Rules
-
-- The app is expected to already be built and installed on the simulator.
-- Never try to read or write arbitrary files.
-- Never call `record-ai-test-result.sh` more than once.
-- Never skip declared verification or cleanup work.
-- Never use screenshots as the primary navigation source when the tree is enough.
+## Test Flow
+
+1. `./Scripts/ci/launch-app.sh`, then `sleep 3`, then fetch the tree.
+2. If the tree shows a login/prologue screen, follow the Login Flow below.
+   If already logged in (e.g., My Site tab visible), skip login.
+3. Execute the test steps. After each action, `sleep 1` then fetch the tree
+   to confirm the UI changed before proceeding.
+4. Run verification with `./Scripts/ci/wp-api.sh verification ...` if required.
+5. Run cleanup with `./Scripts/ci/wp-api.sh cleanup ...` if required.
+6. Call `./Scripts/ci/record-ai-test-result.sh pass "Short reason"`.
+   On failure, take a screenshot first and pass its path.
+
+## Login Flow
+
+1. Tap `Prologue Self Hosted Button` (accessibility ID)
+2. Tap the `Site address` field
+3. Type the site host (without scheme, e.g., `example.com`)
+4. Tap `Site Address Next Button`
+5. `sleep 3`, fetch tree — you should see the logged-in state
+
+Never use the WordPress.com flow. Never type a password — it is passed via
+launch arguments.
+
+## Handling Common Situations
+
+- **System alerts** (permissions, tracking): Check the tree for `Alert` or
+  `Sheet` elements. Tap "Allow", "OK", or "Don't Allow" to dismiss, then retry.
+- **Loading states**: If the tree shows a spinner, `sleep 2` and re-fetch.
+- **Back navigation**: Tap the back button in the NavigationBar, or use the
+  back swipe gesture as a fallback.
+- **WDA session expired** (4xx errors): Create a new session:
+  ```bash
+  ./Scripts/ci/wda-curl.sh POST /session '{"capabilities":{"alwaysMatch":{}}}'
+  ```
+  Use `value.sessionId` from the response for subsequent calls.
+- **App crash**: Re-run `./Scripts/ci/launch-app.sh`, `sleep 3`, re-fetch tree.
+
+## Element Finding Priority
+
+1. Accessibility identifier (most stable)
+2. Label text
+3. Type + context (e.g., Button inside NavigationBar)
+4. Partial label match
+5. Coordinates from the tree as last resort
+
+## Rules
+
+- **Act, don't narrate.** Every response must contain tool calls.
+- **Prefer coordinate taps** from the tree — they save a turn vs find+click.
+- **Screenshots only on failure.** Do not screenshot during normal flow.
+- **Do not undo to recover from mistakes.** Move forward or fail the test.
+  Only use undo/redo if the test case specifically asks for it.
+- **Do not skip verification or cleanup** if the test case declares them.
+- **Call record-ai-test-result.sh exactly once.** Keep the reason short and
+  single-line.

From d5a5f0f62c870b5e83857c5ce049b58dfcc87094 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Fri, 27 Mar 2026 14:54:13 +0100
Subject: [PATCH 17/23] Back to Sonnet 4.6

---
 .buildkite/commands/run-ai-e2e-tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index d14a54083bff..e3644b5abfc2 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -87,7 +87,7 @@ export SIMULATOR_NAME="${SIMULATOR_NAME:-iPhone 16}"
 WDA_PORT="${WDA_PORT:-8100}"
 CLAUDE_MAX_TURNS="${CLAUDE_MAX_TURNS:-120}"
 TEST_DIR="${TEST_DIR:-Tests/AgentTests/ui-tests}"
-CLAUDE_MODEL="${CLAUDE_MODEL:-claude-haiku-4-5}"
+CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-6}"
 CLAUDE_CODE_EXPECTED_VERSION="${CLAUDE_CODE_EXPECTED_VERSION:-2.1.84}"
 CLAUDE_CODE_NPM_SPEC="${CLAUDE_CODE_NPM_SPEC:-@anthropic-ai/claude-code@${CLAUDE_CODE_EXPECTED_VERSION}}"
 WDA_START_TIMEOUT="${WDA_START_TIMEOUT:-120}"

From fc0ebff189074c9c3b0f0dc31af7adcb59e9a532 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Fri, 27 Mar 2026 18:37:49 +0100
Subject: [PATCH 18/23] Add tap-element.sh, reduce max turns, and extend
 timeout

- New tap-element.sh combines find+click into a single call, cutting
  turns per tap from 2-3 to 1. Tries accessibility ID first, falls
  back to label.
- Reduce CLAUDE_MAX_TURNS from 120 to 80 so failed tests bail out
  faster (gem completes most tests in 15-55 turns).
- Extend Buildkite timeout from 60 to 90 minutes to ensure all 11
  tests can complete.
- Update ci-test-runner skill to promote tap-element.sh as the
  preferred tap method.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .buildkite/commands/run-ai-e2e-tests.sh |  5 +-
 .buildkite/pipeline.yml                 |  2 +-
 .claude/skills/ci-test-runner/SKILL.md  | 32 ++++++------
 Scripts/ci/tap-element.sh               | 69 +++++++++++++++++++++++++
 4 files changed, 88 insertions(+), 20 deletions(-)
 create mode 100755 Scripts/ci/tap-element.sh

diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index e3644b5abfc2..723e9614069c 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -22,7 +22,7 @@
 #   APP                            wordpress | jetpack (default: jetpack)
 #   SIMULATOR_NAME                 Simulator to boot if none running (default: iPhone 16)
 #   WDA_PORT                       WebDriverAgent port (default: 8100)
-#   CLAUDE_MAX_TURNS               Max Claude Code tool-use turns (default: 120)
+#   CLAUDE_MAX_TURNS               Max Claude Code tool-use turns (default: 80)
 #   TEST_DIR                       Test directory (default: Tests/AgentTests/ui-tests)
 #   CLAUDE_MODEL                   Model to use (default: claude-sonnet-4-6)
 #   CLAUDE_CODE_EXPECTED_VERSION   Claude Code version to install (default: 2.1.84)
@@ -85,7 +85,7 @@ export SITE_URL="$(normalize_site_url "$SITE_URL")"
 APP="${APP:-jetpack}"
 export SIMULATOR_NAME="${SIMULATOR_NAME:-iPhone 16}"
 WDA_PORT="${WDA_PORT:-8100}"
-CLAUDE_MAX_TURNS="${CLAUDE_MAX_TURNS:-120}"
+CLAUDE_MAX_TURNS="${CLAUDE_MAX_TURNS:-80}"
 TEST_DIR="${TEST_DIR:-Tests/AgentTests/ui-tests}"
 CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-6}"
 CLAUDE_CODE_EXPECTED_VERSION="${CLAUDE_CODE_EXPECTED_VERSION:-2.1.84}"
@@ -188,6 +188,7 @@ echo "Claude Code: $(claude --version 2>/dev/null || echo 'unknown')"
 CLAUDE_ALLOWED_TOOLS=(
   --allowedTools "Bash(./Scripts/ci/launch-app.sh)"
   --allowedTools "Bash(./Scripts/ci/wda-curl.sh *)"
+  --allowedTools "Bash(./Scripts/ci/tap-element.sh *)"
   --allowedTools "Bash(./Scripts/ci/wp-api.sh *)"
   --allowedTools "Bash(./Scripts/ci/take-ai-test-screenshot.sh *)"
   --allowedTools "Bash(./Scripts/ci/record-ai-test-result.sh *)"
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index daa5679acdb2..f4292680a44b 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -146,7 +146,7 @@ steps:
     depends_on: "build_jetpack"
     if: "build.pull_request.id != null"
     soft_fail: true
-    timeout_in_minutes: 60
+    timeout_in_minutes: 90
     plugins: [$CI_TOOLKIT_PLUGIN]
     env:
       APP: jetpack
diff --git a/.claude/skills/ci-test-runner/SKILL.md b/.claude/skills/ci-test-runner/SKILL.md
index 742d3b8a6138..e309b985ff83 100644
--- a/.claude/skills/ci-test-runner/SKILL.md
+++ b/.claude/skills/ci-test-runner/SKILL.md
@@ -16,7 +16,8 @@ Do not narrate plans — act.
 | Command | Purpose |
 |---------|---------|
 | `./Scripts/ci/launch-app.sh` | Relaunch app with test credentials |
-| `./Scripts/ci/wda-curl.sh METHOD PATH [BODY]` | WDA HTTP calls (patterns below) |
+| `./Scripts/ci/tap-element.sh IDENTIFIER_OR_LABEL` | Find element by accessibility ID or label and tap it (one call) |
+| `./Scripts/ci/wda-curl.sh METHOD PATH [BODY]` | Raw WDA HTTP calls (for actions, typing, scrolling — see patterns below) |
 | `./Scripts/ci/wp-api.sh PURPOSE METHOD PATH [BODY]` | REST API with purpose `setup`, `verification`, or `cleanup` |
 | `./Scripts/ci/take-ai-test-screenshot.sh LABEL` | Screenshot (use only on failure) |
 | `./Scripts/ci/record-ai-test-result.sh STATUS REASON [SCREENSHOT]` | Record final result — call exactly once |
@@ -35,26 +36,22 @@ Session ID is in `$WDA_SESSION_ID`.
 Returns a text tree. Each element has type, frame `{{x, y}, {width, height}}`,
 optional identifier and label. The root node frame gives screen dimensions.
 
-### Tap by coordinates (preferred — saves a turn)
-
-Compute center from frame: `X = x + width/2`, `Y = y + height/2`, then:
+### Tap element by ID or label (preferred — one call)
 
 ```bash
-./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/actions" \
-  '{"actions":[{"type":"pointer","id":"f1","parameters":{"pointerType":"touch"},"actions":[{"type":"pointerMove","duration":0,"x":X,"y":Y},{"type":"pointerDown"},{"type":"pointerUp"}]}]}'
+./Scripts/ci/tap-element.sh 'Prologue Self Hosted Button'
 ```
 
-### Tap by accessibility ID (when you know the exact ID)
+Finds the element by accessibility ID first, then by label as fallback, and
+taps it. Use this for all taps where you know the identifier or label.
 
-```bash
-./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/elements" \
-  '{"using":"accessibility id","value":"IDENTIFIER"}'
-```
+### Tap by coordinates (when no ID/label, or for precise positioning)
 
-Then click using the element ID from `value[0].ELEMENT` in the response:
+Compute center from frame: `X = x + width/2`, `Y = y + height/2`, then:
 
 ```bash
-./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/element/ELEMENT_ID/click"
+./Scripts/ci/wda-curl.sh POST "/session/${WDA_SESSION_ID}/actions" \
+  '{"actions":[{"type":"pointer","id":"f1","parameters":{"pointerType":"touch"},"actions":[{"type":"pointerMove","duration":0,"x":X,"y":Y},{"type":"pointerDown"},{"type":"pointerUp"}]}]}'
 ```
 
 ### Type text
@@ -115,10 +112,10 @@ Same as tap but add a pause between down and up:
 
 ## Login Flow
 
-1. Tap `Prologue Self Hosted Button` (accessibility ID)
-2. Tap the `Site address` field
+1. `./Scripts/ci/tap-element.sh 'Prologue Self Hosted Button'`
+2. `./Scripts/ci/tap-element.sh 'Site address'`
 3. Type the site host (without scheme, e.g., `example.com`)
-4. Tap `Site Address Next Button`
+4. `./Scripts/ci/tap-element.sh 'Site Address Next Button'`
 5. `sleep 3`, fetch tree — you should see the logged-in state
 
 Never use the WordPress.com flow. Never type a password — it is passed via
@@ -149,7 +146,8 @@ launch arguments.
 ## Rules
 
 - **Act, don't narrate.** Every response must contain tool calls.
-- **Prefer coordinate taps** from the tree — they save a turn vs find+click.
+- **Use `tap-element.sh`** whenever you know the element's identifier or label.
+  Fall back to coordinate taps only when there's no usable ID/label.
 - **Screenshots only on failure.** Do not screenshot during normal flow.
 - **Do not undo to recover from mistakes.** Move forward or fail the test.
   Only use undo/redo if the test case specifically asks for it.
diff --git a/Scripts/ci/tap-element.sh b/Scripts/ci/tap-element.sh
new file mode 100755
index 000000000000..a4a72f68f8ed
--- /dev/null
+++ b/Scripts/ci/tap-element.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+# Find an element by accessibility ID or label and tap it in one call.
+# Combines the find-element + click WDA calls that otherwise cost two turns.
+#
+# Usage: tap-element.sh <IDENTIFIER_OR_LABEL>
+#
+# Tries accessibility ID first, then label. Prints the WDA click response on
+# success, or an error message plus the last find response on failure.
+#
+# Environment:
+#   WDA_PORT        WDA port (default: 8100)
+#   WDA_SESSION_ID  Active WDA session ID
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+# shellcheck source=Scripts/ci/ai-test-progress.sh
+source "$SCRIPT_DIR/ai-test-progress.sh"
+
+SELECTOR="${1:?Usage: tap-element.sh IDENTIFIER_OR_LABEL}"
+PORT="${WDA_PORT:-8100}"
+SESSION="${WDA_SESSION_ID:?WDA_SESSION_ID is required}"
+BASE="http://localhost:${PORT}/session/${SESSION}"
+
+# POST a find-elements query ($1 = strategy, $2 = selector). Ruby JSON-encodes
+# the body (args read as UTF-8) so quotes/backslashes in $2 cannot corrupt it.
+find_elements() {
+  local body
+  body="$(ruby -rjson -e 'u, v = ARGV.map { |a| a.dup.force_encoding("UTF-8") }; print JSON.generate("using" => u, "value" => v)' -- "$1" "$2")" || return 1
+  curl -sS --max-time 10 -X POST -H 'Content-Type: application/json' \
+    -d "$body" "${BASE}/elements"
+}
+
+# POST a click to the WDA element whose id is $1; curl's output goes to stdout.
+click_element() {
+  curl -sS --max-time 10 -X POST "${BASE}/element/${1}/click"
+}
+
+extract_element_id() {
+  # Read a WDA find-elements response on stdin and print the id of the first
+  # match, or nothing. WDA keys the id as "ELEMENT"; any other key is taken as
+  # a fallback. Malformed input is treated as "no match" (empty output, exit 0):
+  # an uncaught Ruby error would abort the caller under set -e/pipefail with
+  # no message, because stderr is discarded here.
+  ruby -rjson -e '
+    data = (JSON.parse(STDIN.read) rescue nil)
+    first = data["value"][0] if data.is_a?(Hash) && data["value"].is_a?(Array)
+    print(first["ELEMENT"] || first.values.first) if first.is_a?(Hash)
+  ' 2>/dev/null
+}
+
+# Locator strategies in priority order: accessibility ID, then label.
+element_id=""
+for strategy in "accessibility id" "link text"; do
+  response="$(find_elements "$strategy" "$SELECTOR")"
+  element_id="$(printf '%s' "$response" | extract_element_id)"
+  if [[ -n "$element_id" ]]; then
+    break
+  fi
+done
+
+# Nothing matched: log it, echo the last WDA response for debugging, and fail.
+if [[ -z "$element_id" ]]; then
+  log_ai_test_progress "Element not found: ${SELECTOR}"
+  echo "Error: element not found for '${SELECTOR}'" >&2
+  echo "$response"
+  exit 1
+fi
+log_ai_test_progress "Tapped element '${SELECTOR}'"
+click_element "$element_id"

From 791448c48533a024be18c53d2be58fd831912319 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Fri, 27 Mar 2026 19:12:05 +0100
Subject: [PATCH 19/23] Raise max turns to 100 and limit screenshots to
 failures only
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Increase CLAUDE_MAX_TURNS from 80 to 100 — 80 was too tight for
  complex tests like scheduled post that need date picker interaction.
- Hard-cap screenshots at 3 per test in take-ai-test-screenshot.sh.
  After the limit, the script returns a message instead of capturing.
- Strengthen the skill to make clear that screenshots are only for
  recording failures, never for UI inspection during normal flow.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .buildkite/commands/run-ai-e2e-tests.sh |  4 ++--
 .claude/skills/ci-test-runner/SKILL.md  |  7 +++++--
 Scripts/ci/take-ai-test-screenshot.sh   | 11 +++++++++++
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index 723e9614069c..95da3553e40f 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -22,7 +22,7 @@
 #   APP                            wordpress | jetpack (default: jetpack)
 #   SIMULATOR_NAME                 Simulator to boot if none running (default: iPhone 16)
 #   WDA_PORT                       WebDriverAgent port (default: 8100)
-#   CLAUDE_MAX_TURNS               Max Claude Code tool-use turns (default: 80)
+#   CLAUDE_MAX_TURNS               Max Claude Code tool-use turns (default: 100)
 #   TEST_DIR                       Test directory (default: Tests/AgentTests/ui-tests)
 #   CLAUDE_MODEL                   Model to use (default: claude-sonnet-4-6)
 #   CLAUDE_CODE_EXPECTED_VERSION   Claude Code version to install (default: 2.1.84)
@@ -85,7 +85,7 @@ export SITE_URL="$(normalize_site_url "$SITE_URL")"
 APP="${APP:-jetpack}"
 export SIMULATOR_NAME="${SIMULATOR_NAME:-iPhone 16}"
 WDA_PORT="${WDA_PORT:-8100}"
-CLAUDE_MAX_TURNS="${CLAUDE_MAX_TURNS:-80}"
+CLAUDE_MAX_TURNS="${CLAUDE_MAX_TURNS:-100}"
 TEST_DIR="${TEST_DIR:-Tests/AgentTests/ui-tests}"
 CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-6}"
 CLAUDE_CODE_EXPECTED_VERSION="${CLAUDE_CODE_EXPECTED_VERSION:-2.1.84}"
diff --git a/.claude/skills/ci-test-runner/SKILL.md b/.claude/skills/ci-test-runner/SKILL.md
index e309b985ff83..c99916bb8e6c 100644
--- a/.claude/skills/ci-test-runner/SKILL.md
+++ b/.claude/skills/ci-test-runner/SKILL.md
@@ -19,7 +19,7 @@ Do not narrate plans — act.
 | `./Scripts/ci/tap-element.sh IDENTIFIER_OR_LABEL` | Find element by accessibility ID or label and tap it (one call) |
 | `./Scripts/ci/wda-curl.sh METHOD PATH [BODY]` | Raw WDA HTTP calls (for actions, typing, scrolling — see patterns below) |
 | `./Scripts/ci/wp-api.sh PURPOSE METHOD PATH [BODY]` | REST API with purpose `setup`, `verification`, or `cleanup` |
-| `./Scripts/ci/take-ai-test-screenshot.sh LABEL` | Screenshot (use only on failure) |
+| `./Scripts/ci/take-ai-test-screenshot.sh LABEL` | Screenshot — **only before recording a failure** (max 3 per test) |
 | `./Scripts/ci/record-ai-test-result.sh STATUS REASON [SCREENSHOT]` | Record final result — call exactly once |
 | `sleep N` | Wait N seconds |
 
@@ -148,7 +148,10 @@ launch arguments.
 - **Act, don't narrate.** Every response must contain tool calls.
 - **Use `tap-element.sh`** whenever you know the element's identifier or label.
   Fall back to coordinate taps only when there's no usable ID/label.
-- **Screenshots only on failure.** Do not screenshot during normal flow.
+- **NEVER take screenshots to inspect the UI.** Use the accessibility tree
+  instead — it is faster and costs no extra turn, since you already fetch it
+  after every action. Only call `take-ai-test-screenshot.sh` right before
+  `record-ai-test-result.sh fail` as evidence of the failure. Maximum 3 per test.
 - **Do not undo to recover from mistakes.** Move forward or fail the test.
   Only use undo/redo if the test case specifically asks for it.
 - **Do not skip verification or cleanup** if the test case declares them.
diff --git a/Scripts/ci/take-ai-test-screenshot.sh b/Scripts/ci/take-ai-test-screenshot.sh
index e6935b374c28..272746db687f 100755
--- a/Scripts/ci/take-ai-test-screenshot.sh
+++ b/Scripts/ci/take-ai-test-screenshot.sh
@@ -2,6 +2,9 @@
 # Capture a screenshot for the current AI-driven test case and print the
 # relative path that should be stored in the result metadata.
 #
+# A hard cap of 3 screenshots per test is enforced to prevent wasting
+# turns on unnecessary screenshots during normal flow.
+#
 # Usage: take-ai-test-screenshot.sh <label>
 set -euo pipefail
 
@@ -16,6 +19,14 @@ LABEL="${1:?Usage: take-ai-test-screenshot.sh <label>}"
 : "${AI_TEST_SCREENSHOTS_DIR:?AI_TEST_SCREENSHOTS_DIR is required}"
 : "${AI_TEST_SLUG:?AI_TEST_SLUG is required}"
 
+MAX_SCREENSHOTS=3
+# "|| true": before the directory exists find fails; count 0 instead of aborting (pipefail + set -e).
+existing_count="$(find "$AI_TEST_SCREENSHOTS_DIR" -maxdepth 1 -name "${AI_TEST_SLUG}-*" -type f 2>/dev/null | wc -l | tr -d ' ' || true)"
+if [[ "$existing_count" -ge "$MAX_SCREENSHOTS" ]]; then
+  log_ai_test_progress "Screenshot skipped (limit of ${MAX_SCREENSHOTS} reached)"
+  echo "Screenshot limit reached (${MAX_SCREENSHOTS} per test). Save turns for actions instead."
+  exit 0
+fi
 safe_label="$(echo "$LABEL" | tr -cs '[:alnum:]_-' '_')"
 mkdir -p "$AI_TEST_SCREENSHOTS_DIR"
 absolute_path="$(mktemp "${AI_TEST_SCREENSHOTS_DIR}/${AI_TEST_SLUG}-${safe_label}-XXXX.png")"

From ea2deacef6b6787e941565d7305cf07c838219f7 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Wed, 25 Mar 2026 21:33:44 +0100
Subject: [PATCH 20/23] Trigger CI


From 7f03ef87ed31e6312b2832dffdde14f029fee7b9 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Fri, 27 Mar 2026 13:41:32 +0100
Subject: [PATCH 21/23] Trigger CI


From 82b9adf4c76c00871664ca44f980f200a6dfd793 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Mon, 30 Mar 2026 21:51:16 +0200
Subject: [PATCH 22/23] Test AI E2E with Claude Opus 4.6

Switch default model from Sonnet to Opus to compare turn efficiency
and pass rate on the same test suite.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .buildkite/commands/run-ai-e2e-tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index 95da3553e40f..7244e3edd0e3 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -87,7 +87,7 @@ export SIMULATOR_NAME="${SIMULATOR_NAME:-iPhone 16}"
 WDA_PORT="${WDA_PORT:-8100}"
 CLAUDE_MAX_TURNS="${CLAUDE_MAX_TURNS:-100}"
 TEST_DIR="${TEST_DIR:-Tests/AgentTests/ui-tests}"
-CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-6}"
+CLAUDE_MODEL="${CLAUDE_MODEL:-claude-opus-4-6}"
 CLAUDE_CODE_EXPECTED_VERSION="${CLAUDE_CODE_EXPECTED_VERSION:-2.1.84}"
 CLAUDE_CODE_NPM_SPEC="${CLAUDE_CODE_NPM_SPEC:-@anthropic-ai/claude-code@${CLAUDE_CODE_EXPECTED_VERSION}}"
 WDA_START_TIMEOUT="${WDA_START_TIMEOUT:-120}"

From 0eab4bbdb04e9be4606e03a7415db081bfab0b21 Mon Sep 17 00:00:00 2001
From: Ian Maia <ian.maia@automattic.com>
Date: Tue, 31 Mar 2026 14:55:14 +0200
Subject: [PATCH 23/23] Revert "Test AI E2E with Claude Opus 4.6"

This reverts commit 82b9adf4c76c00871664ca44f980f200a6dfd793.
---
 .buildkite/commands/run-ai-e2e-tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/commands/run-ai-e2e-tests.sh b/.buildkite/commands/run-ai-e2e-tests.sh
index 7244e3edd0e3..95da3553e40f 100755
--- a/.buildkite/commands/run-ai-e2e-tests.sh
+++ b/.buildkite/commands/run-ai-e2e-tests.sh
@@ -87,7 +87,7 @@ export SIMULATOR_NAME="${SIMULATOR_NAME:-iPhone 16}"
 WDA_PORT="${WDA_PORT:-8100}"
 CLAUDE_MAX_TURNS="${CLAUDE_MAX_TURNS:-100}"
 TEST_DIR="${TEST_DIR:-Tests/AgentTests/ui-tests}"
-CLAUDE_MODEL="${CLAUDE_MODEL:-claude-opus-4-6}"
+CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-6}"
 CLAUDE_CODE_EXPECTED_VERSION="${CLAUDE_CODE_EXPECTED_VERSION:-2.1.84}"
 CLAUDE_CODE_NPM_SPEC="${CLAUDE_CODE_NPM_SPEC:-@anthropic-ai/claude-code@${CLAUDE_CODE_EXPECTED_VERSION}}"
 WDA_START_TIMEOUT="${WDA_START_TIMEOUT:-120}"