-
Notifications
You must be signed in to change notification settings - Fork 3
472 lines (414 loc) · 19.8 KB
/
claude-code-test.yml
File metadata and controls
472 lines (414 loc) · 19.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
name: Claude Code Integration Test

on:
  # Manual runs, e.g. while debugging the workflow itself.
  workflow_dispatch:
    inputs:
      debug:
        type: boolean
        description: Enable debug logging
        required: false
        default: 'false'

  # Pull requests targeting main: the workflow is triggered so its checks are
  # reported on the PR, but most steps in the jobs below are skipped for this
  # event.
  pull_request:
    branches:
      - main

  # Merge-queue entries targeting main: the validation runs here, before the
  # merge lands.
  merge_group:
    branches:
      - main

# One active run per workflow and PR head branch (or ref, when there is no
# head branch); a newer run cancels the one already in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
  cancel-in-progress: true

# Default token permissions: read-only access to repository contents.
permissions:
  contents: read

jobs:
# Job 1: validate the fruits fixture and the plugin layout (no API key needed).
# The job is scheduled for every trigger so that its check name is reported on
# pull requests (needed for GitHub's merge queue); on pull_request events every
# step except the first is skipped.
validate-generation:
  runs-on: ubuntu-latest
  steps:
    # Pull requests only get this placeholder step; the real checks run for
    # merge_group and workflow_dispatch events.
    - name: Skip on PR
      if: github.event_name == 'pull_request'
      run: echo "Validation will run in merge queue. Passing for PR."

    - if: github.event_name != 'pull_request'
      uses: actions/checkout@v4

    - name: Install uv
      if: github.event_name != 'pull_request'
      uses: astral-sh/setup-uv@v4
      with:
        enable-cache: true
        version: latest

    - name: Set up Python
      if: github.event_name != 'pull_request'
      uses: actions/setup-python@v5
      with:
        python-version: "3.11"

    - name: Install dependencies
      if: github.event_name != 'pull_request'
      run: uv sync --extra dev

    - name: Validate fruits fixture parses and install generates correct structure
      if: github.event_name != 'pull_request'
      run: |
        # Parse the fruits fixture with deepwork's own parser and assert on
        # the resulting job definition.
        uv run python -c "
        from pathlib import Path
        from deepwork.jobs.parser import parse_job_definition

        job = parse_job_definition(Path('tests/fixtures/jobs/fruits'))
        assert job.name == 'fruits'
        assert job.summary is not None

        # Shared step arguments (input/output definitions)
        arg_names = [a.name for a in job.step_arguments]
        for expected in ('raw_items', 'identified_fruits', 'classified_fruits'):
            assert expected in arg_names

        # The full workflow consists of identify followed by classify
        assert 'full' in job.workflows
        wf = job.workflows['full']
        assert wf.step_names == ['identify', 'classify']

        # Each step reads one argument and writes the next one
        for step_name, needed_input, produced_output in (
            ('identify', 'raw_items', 'identified_fruits'),
            ('classify', 'identified_fruits', 'classified_fruits'),
        ):
            step = wf.get_step(step_name)
            assert step is not None
            assert needed_input in step.inputs
            assert produced_output in step.outputs

        # Call the validators explicitly; none of them may raise
        job.validate_unique_step_names()
        job.validate_argument_refs()
        job.validate_sub_workflows()
        job.validate_step_exclusivity()

        print('All fruits fixture validations passed!')
        "

    - name: Validate plugin structure and skill content
      if: github.event_name != 'pull_request'
      run: |
        # Every file the plugin must ship; stop at the first one missing.
        echo "Checking plugin structure..."
        for required in \
          plugins/claude/.claude-plugin/plugin.json \
          plugins/claude/skills/deepwork/SKILL.md \
          plugins/claude/.mcp.json \
          plugins/claude/hooks/hooks.json
        do
          if [ ! -f "$required" ]; then
            echo "Missing $(basename "$required")"
            exit 1
          fi
        done

        # The skill text must mention deepwork (case-insensitive).
        grep -qi "deepwork" plugins/claude/skills/deepwork/SKILL.md

        # The MCP config must mention both "deepwork" and "serve".
        grep -q "deepwork" plugins/claude/.mcp.json
        grep -q "serve" plugins/claude/.mcp.json

        echo "Plugin structure validated successfully!"
# Job 2: end-to-end test that drives the Claude Code CLI.
# It is scheduled for every trigger so that its check name is reported on pull
# requests (needed for GitHub's merge queue); its first step decides whether
# the remaining steps do any real work.
claude-code-e2e:
  needs: validate-generation
  runs-on: ubuntu-latest
  env:
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  steps:
# Decide whether the full e2e suite runs. Publishes the step output `run`
# (true|false), which the following steps are gated on.
# - merge_group / workflow_dispatch: always run.
# - pull_request: run only when the PR touches this workflow file, so CI fixes
#   can be iterated on from the PR itself.
- name: Determine if tests should run
  id: should-run
  env:
    GH_TOKEN: ${{ github.token }}
    # Context values are passed through the environment rather than being
    # interpolated into the script text.
    EVENT_NAME: ${{ github.event_name }}
    REPO: ${{ github.repository }}
    PR_NUMBER: ${{ github.event.pull_request.number }}
  run: |
    if [ "$EVENT_NAME" != "pull_request" ]; then
      echo "run=true" >> "$GITHUB_OUTPUT"
      exit 0
    fi

    # --paginate: the files endpoint is paged, so without it a change to this
    # workflow file can be missed in a PR that touches many files.
    # A failed API call is reported as a warning and treated as "workflow
    # file not changed" instead of being discarded silently.
    if ! FILES=$(gh api --paginate "repos/$REPO/pulls/$PR_NUMBER/files" --jq '.[].filename'); then
      echo "::warning::Could not list files changed in PR #$PR_NUMBER; treating the workflow file as unchanged"
      FILES=""
    fi

    # -x -F: exact, literal match of one whole line (one filename per line).
    if grep -qxF '.github/workflows/claude-code-test.yml' <<<"$FILES"; then
      echo "run=true" >> "$GITHUB_OUTPUT"
      echo "Workflow file changed in PR - running e2e tests"
    else
      echo "run=false" >> "$GITHUB_OUTPUT"
      echo "E2E tests will run in merge queue. Passing for PR."
    fi
- if: steps.should-run.outputs.run == 'true'
  uses: actions/checkout@v4

# Publishes the step output `has_key` (true|false). When the secret is empty
# a warning annotation is emitted and has_key is false, rather than the step
# failing.
- name: Check for API key
  id: check-key
  if: steps.should-run.outputs.run == 'true'
  run: |
    if [ -n "$ANTHROPIC_API_KEY" ]; then
      echo "has_key=true" >> $GITHUB_OUTPUT
    else
      echo "has_key=false" >> $GITHUB_OUTPUT
      echo "::warning::ANTHROPIC_API_KEY not set, skipping Claude Code e2e test"
    fi
# Toolchain for the e2e run; every step is skipped unless an API key is set.

# Node.js provides npm, which installs the Claude Code CLI below.
- name: Install Node.js (for Claude Code CLI)
  if: steps.check-key.outputs.has_key == 'true'
  uses: actions/setup-node@v4
  with:
    node-version: '20'

- name: Install Claude Code CLI
  if: steps.check-key.outputs.has_key == 'true'
  run: npm install -g @anthropic-ai/claude-code

- name: Install uv
  if: steps.check-key.outputs.has_key == 'true'
  uses: astral-sh/setup-uv@v4
  with:
    enable-cache: true
    version: latest

- name: Set up Python
  if: steps.check-key.outputs.has_key == 'true'
  uses: actions/setup-python@v5
  with:
    python-version: "3.11"

- name: Install deepwork
  if: steps.check-key.outputs.has_key == 'true'
  run: |
    uv sync

    # Expose the uv virtualenv's bin directory on PATH for all later steps.
    #
    # The MCP server is launched through the bare "deepwork" command, which
    # only exists inside the uv virtualenv; without this entry the command
    # would not be found.
    echo "$(pwd)/.venv/bin" >> $GITHUB_PATH
- name: Set up fresh test project
  if: steps.check-key.outputs.has_key == 'true'
  run: |
    # Start from a new git repository with NO pre-existing job definitions.
    mkdir -p test_project/.claude
    git -C test_project init
    git -C test_project config user.email "test@test.com"
    git -C test_project config user.name "Test"
    echo "# CI Test Project - DeepWork E2E Test" > test_project/README.md
    git -C test_project add . && git -C test_project commit -m "init"

    # Skills, hooks and MCP server config come from the plugin (--plugin-dir).
    # The project-level .mcp.json written here overrides the plugin's MCP
    # config so the server starts through the bare `deepwork` command (the
    # plugin uses `uvx`, which may not resolve the local venv install).
    # The server is named "deepwork-dev" to match the reviewer agent's tool
    # patterns (mcp__deepwork-dev__*) - see CLAUDE.md MCP tool naming.
    #
    # settings.json lists every permission the test run needs.
    python3 - <<'PY'
    import json
    from pathlib import Path

    project = Path('test_project')

    mcp_config = {
        'mcpServers': {
            'deepwork-dev': {
                'command': 'deepwork',
                'args': ['serve', '--path', '.', '--platform', 'claude'],
            },
        },
    }
    (project / '.mcp.json').write_text(json.dumps(mcp_config, indent=2))

    allowed_tools = [
        'Bash(*)',
        'Read(./**)',
        'Edit(./**)',
        'Write(./**)',
        'Skill(*)',
        'mcp__deepwork-dev__get_workflows',
        'mcp__deepwork-dev__start_workflow',
        'mcp__deepwork-dev__finished_step',
        'mcp__deepwork-dev__abort_workflow',
        'mcp__deepwork-dev__go_to_step',
        'mcp__deepwork-dev__mark_review_as_passed',
    ]
    claude_settings = {'permissions': {'allow': allowed_tools}}
    (project / '.claude' / 'settings.json').write_text(
        json.dumps(claude_settings, indent=2)
    )
    PY

    echo "Fresh test project setup complete"
    echo "MCP config:"
    cat test_project/.mcp.json
# STEP 1: Use /deepwork to CREATE the fruits job via MCP workflow
#
# Invokes the Claude CLI non-interactively (--print) with the /deepwork skill
# and a fixed prompt fed through a heredoc. Per the prompt, Claude is expected
# to use the deepwork_jobs/new_job workflow (define → implement → test →
# iterate) but to stop once the "define" and "implement" steps are done.
# The original note states that the workflow's quality gates spawn Claude
# subprocesses, which is why the timeout is generous.
#
# The step fails unless .deepwork/jobs/fruits/job.yml exists afterwards and
# contains an `instructions:` key.
- name: Create job with /deepwork
if: steps.check-key.outputs.has_key == 'true'
working-directory: test_project
timeout-minutes: 10
run: |
echo "=== Running /deepwork to create fruits job ==="
# Output directory named in the prompt below and checked by the later
# Run Workflow step.
mkdir fruits
# --debug and --output-format stream-json are enabled for diagnosing failures.
# The stream is tee'd to ../claude-create-job.jsonl (relative to test_project),
# which the failure-dump and artifact-upload steps read.
# pipefail keeps a failing `claude` exit status from being masked by `tee`.
set -o pipefail
claude --print --verbose --output-format stream-json --max-turns 25 --debug --model claude-sonnet-4-6 --dangerously-skip-permissions --plugin-dir "$GITHUB_WORKSPACE/plugins/claude" <<'PROMPT_EOF' | tee ../claude-create-job.jsonl
/deepwork I want to create a simple job called "fruits" for identifying and classifying fruits.
Here are the EXACT specifications.
Intent: A simple workflow that takes a list of mixed items, identifies which are fruits, then classifies them by category. Designed for CI testing.
Steps:
1. Step: identify
Name: Identify Fruits
Description: Filter a list of items to include only the fruits
**CRITICAL**: The output MUST be stored in `fruits/identified_fruits.md`.
2. Step: classify
Name: Classify Fruits
Description: Organize identified fruits into categories (citrus, tropical, berries, etc.).
**CRITICAL**: must put the classified fruit list in `./fruits/classified_fruits.md`.
**Key Instructions:**
- NEVER use AskUserQuestion — you already have all the information you need above.
- You MUST complete all tool calls needed to create the files. Do not stop early.
- Do not ask questions - just make the job.
- Rules are explicitly not desired. Tell the review agents that.
- Do not give long commentary of what you did - just make the job with no commentary.
- NEVER start the "repair" or "learn" workflows. Only use "new_job". If a quality review fails, fix the issues in the files and resubmit — do not switch workflows.
- IMPORTANT: Once the job.yml and step instruction files have been created (i.e. after the "define" and "implement" steps are done), STOP. Do NOT continue into the "test" or "iterate" steps. Abort the workflow at that point. We only need the job definition files created, not the full workflow run.
PROMPT_EOF
# Fail the step unless the job definition file exists; on failure, list the
# jobs directory to help diagnose where (if anywhere) files were written.
echo "=== Checking job.yml was created ==="
if [ -f ".deepwork/jobs/fruits/job.yml" ]; then
echo "SUCCESS: job.yml created"
cat .deepwork/jobs/fruits/job.yml
else
echo "ERROR: job.yml was not created"
echo "Contents of .deepwork/jobs/:"
ls -la .deepwork/jobs/ || echo "No jobs directory"
exit 1
fi
# Fail the step unless job.yml contains at least one `instructions:` key,
# i.e. the step instructions are inlined in job.yml.
echo "=== Checking step instructions are inlined ==="
if grep -q 'instructions:' .deepwork/jobs/fruits/job.yml; then
echo "SUCCESS: Step instructions are inlined in job.yml"
else
echo "ERROR: No inline step instructions found in job.yml"
exit 1
fi
echo "=== Job creation complete ==="
# Execute the fruits workflow through the /deepwork MCP entry point and check
# that both output files were written to test_project/fruits/.
- name: Run Workflow
  if: steps.check-key.outputs.has_key == 'true'
  working-directory: test_project
  timeout-minutes: 6
  run: |
    echo "=== Running fruits workflow with test input via /deepwork ==="
    # pipefail keeps a failing `claude` exit status from being masked by `tee`;
    # the stream is captured to ../claude-run-workflow.jsonl for diagnostics.
    set -o pipefail
    claude --print --verbose --output-format stream-json --max-turns 25 --debug --model claude-sonnet-4-6 --dangerously-skip-permissions --plugin-dir "$GITHUB_WORKSPACE/plugins/claude" <<'PROMPT_EOF' | tee ../claude-run-workflow.jsonl
    /deepwork Run the fruits full workflow. Process the list to the file and don't give any extra commentary or text output.
    NEVER use AskUserQuestion — you already have all the information you need.
    You MUST complete all tool calls needed. Do not stop early.
    CRITICAL: All output files MUST be written relative to the current working directory (the project root), NOT inside .deepwork/jobs/. For example, write to ./fruits/identified_fruits.md, NOT .deepwork/jobs/fruits/identified_fruits.md.
    raw_items: apple, car, banana, chair, orange, table, mango, laptop, grape, bicycle
    PROMPT_EOF
    echo "=== Workflow finished - looking for output file ==="
    # Both outputs must exist; print each one that does.
    if [ -f "fruits/identified_fruits.md" ]; then
      echo "SUCCESS: identified_fruits.md created"
      echo "--- Output ---"
      cat fruits/identified_fruits.md
    else
      echo "ERROR: identified_fruits.md was not created"
      exit 1
    fi
    if [ -f "fruits/classified_fruits.md" ]; then
      echo "SUCCESS: classified_fruits.md created"
      echo "--- Output ---"
      cat fruits/classified_fruits.md
    else
      echo "ERROR: classified_fruits.md was not created"
      exit 1
    fi

# Dump diagnostics when an earlier step failed or timed out: the captured
# stream-json output of both Claude invocations, the Claude debug log, the
# MCP/settings config, and any DeepWork session state.
#
# This step is listed AFTER "Run Workflow" on purpose. Steps are evaluated in
# order, so a failure() step placed before "Run Workflow" has already been
# skipped by the time that step fails and could never print
# claude-run-workflow.jsonl. When "Create job" fails, "Run Workflow" is
# skipped (its condition carries the implicit success() check) and this step
# still runs.
- name: Dump Claude debug log on failure
  if: failure() && steps.check-key.outputs.has_key == 'true'
  working-directory: test_project
  run: |
    echo "=== Claude stream-json output (create job) ==="
    if [ -f "../claude-create-job.jsonl" ]; then
      echo "--- Last 100 lines ---"
      tail -100 ../claude-create-job.jsonl
    else
      echo "No stream-json output captured for create job step"
    fi
    echo ""
    echo "=== Claude stream-json output (run workflow) ==="
    if [ -f "../claude-run-workflow.jsonl" ]; then
      echo "--- Last 100 lines ---"
      tail -100 ../claude-run-workflow.jsonl
    else
      echo "No stream-json output captured for run workflow step"
    fi
    echo ""
    echo "=== Claude debug log ==="
    # Looks for the debug log at ~/.claude/debug.log and falls back to
    # listing any *.log file under ~/.claude.
    if [ -f "$HOME/.claude/debug.log" ]; then
      echo "--- Last 200 lines of debug.log ---"
      tail -200 "$HOME/.claude/debug.log"
    else
      echo "No debug.log found at ~/.claude/debug.log"
      echo "Searching for debug logs..."
      find "$HOME/.claude" -name "*.log" -type f 2>/dev/null || echo "No log files found"
    fi
    echo ""
    echo "=== MCP server config ==="
    cat .mcp.json 2>/dev/null || echo "No .mcp.json found"
    echo ""
    echo "=== Settings.json ==="
    cat .claude/settings.json 2>/dev/null || echo "No settings.json found"
    echo ""
    echo "=== DeepWork session state ==="
    ls -la .deepwork/tmp/ 2>/dev/null || echo "No tmp directory"
    # Use if/fi rather than `[ -f ] && ...`: when the glob matches nothing the
    # test is the last command executed, and its non-zero status would become
    # the exit status of the whole script, failing this diagnostics step.
    for f in .deepwork/tmp/session_*.json; do
      if [ -f "$f" ]; then
        echo "--- $f ---"
        cat "$f"
      fi
    done
# Validate the content of the two files produced by the workflow run.
- name: Validate Workflow Output
  if: steps.check-key.outputs.has_key == 'true'
  working-directory: test_project/fruits
  run: |
    echo "=== Validating complete workflow ==="

    # Each fruit from the test input must appear (case-insensitively) in
    # identified_fruits.md; stop at the first one that is missing.
    echo "Checking identified_fruits.md..."
    for fruit in apple banana orange mango grape; do
      if ! grep -qi "$fruit" identified_fruits.md; then
        echo "Missing: $fruit"
        exit 1
      fi
    done
    echo " ✓ All expected fruits found in identified_fruits.md"

    # classified_fruits.md must mention at least one known category.
    echo "Checking classified_fruits.md..."
    if ! grep -qi -e citrus -e tropical -e pome -e berr classified_fruits.md; then
      echo "Missing fruit categories"
      exit 1
    fi
    echo " ✓ Fruit categories found in classified_fruits.md"

    echo ""
    echo "=========================================="
    echo " ALL E2E TESTS PASSED SUCCESSFULLY!"
    echo "=========================================="
    echo ""
    echo "Workflow tested: /deepwork fruits full - Executed full fruits workflow (identify + classify)"
    echo ""
# Print the DeepWork status files (job manifest and per-session files) for
# debugging. Runs even when earlier steps failed, so it must never fail the
# job on its own: missing directories or files are only reported.
- name: Display status files
  if: steps.check-key.outputs.has_key == 'true' && always()
  working-directory: test_project
  run: |
    echo "=== Status Files ==="
    STATUS_DIR=".deepwork/tmp/status/v1"
    if [ -d "$STATUS_DIR" ]; then
      echo "--- job_manifest.yml ---"
      cat "$STATUS_DIR/job_manifest.yml" 2>/dev/null || echo "(not found)"
      echo ""
      found_session=false
      if [ -d "$STATUS_DIR/sessions" ]; then
        for f in "$STATUS_DIR/sessions"/*.yml; do
          # An unmatched glob stays as the literal pattern; skip it so `cat`
          # does not fail the step when the directory holds no *.yml files.
          [ -f "$f" ] || continue
          found_session=true
          echo "--- $(basename "$f") ---"
          cat "$f"
          echo ""
        done
      fi
      if [ "$found_session" = false ]; then
        echo "(no session status files)"
      fi
    else
      echo "(status directory not found)"
    fi
# Collect the generated job definition, status files, workflow outputs and the
# captured Claude stream-json logs. Runs even when earlier steps failed.
- name: Upload test artifacts
  if: steps.check-key.outputs.has_key == 'true' && always()
  uses: actions/upload-artifact@v4
  with:
    name: claude-code-e2e-outputs
    retention-days: 7
    path: |
      test_project/.deepwork/jobs/fruits/
      test_project/.deepwork/tmp/status/
      test_project/.claude/skills/deepwork/
      test_project/fruits/identified_fruits.md
      test_project/fruits/classified_fruits.md
      claude-create-job.jsonl
      claude-run-workflow.jsonl