deepwork/.github/workflows/claude-code-test.yml at main · Unsupervisedcom/deepwork · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
name: Claude Code Integration Test

# Triggers for this workflow. Both jobs are scheduled for every trigger so
# that their check names are always reported; the individual steps decide
# whether any real work is done.
on:
  # Manual trigger for testing
  workflow_dispatch:
    inputs:
      # NOTE(review): nothing in the visible part of this workflow reads
      # `inputs.debug` - confirm whether it is still needed.
      debug:
        description: 'Enable debug logging'
        required: false
        # A boolean input takes a real boolean default, not the string 'false'.
        default: false
        type: boolean
  # Run on all PRs so the checks are reported. validate-generation skips its
  # real steps on PRs; claude-code-e2e only does real work on a PR when the PR
  # changes this workflow file.
  pull_request:
    branches: [main]
  # Run in the merge queue to validate before merging
  merge_group:
    branches: [main]

# One live run per workflow and PR head branch (falling back to the ref when
# no head branch is set); a newer run in the same group cancels the one that
# is still in progress.
concurrency:
  cancel-in-progress: true
  group: '${{ github.workflow }}-${{ github.head_ref || github.ref }}'

# Minimal permissions for this workflow.
# - contents: read      -> checkout
# - pull-requests: read -> the claude-code-e2e job lists the PR's changed files
#   through the REST API using github.token. Because that lookup treats an API
#   error as "workflow file not changed", a token without this scope would
#   make the PR-time e2e run silently never trigger on a non-public repo.
permissions:
  contents: read
  pull-requests: read

jobs:
  # Job 1: validate the fruits fixture and the plugin layout (no API key needed).
  # The job is scheduled for every event so that its check name always exists
  # (GitHub's merge queue needs that). On pull_request only the first step
  # runs; every other step is gated on `github.event_name != 'pull_request'`.
  validate-generation:
    runs-on: ubuntu-latest
    steps:
      # PR runs stop here with a green check; the real validation happens in
      # the merge queue or on a manual dispatch.
      - name: Skip on PR
        if: github.event_name == 'pull_request'
        run: echo "Validation will run in merge queue. Passing for PR."

      - if: github.event_name != 'pull_request'
        uses: actions/checkout@v4

      # uv is the tool used below to install dependencies and run Python.
      - name: Install uv
        if: github.event_name != 'pull_request'
        uses: astral-sh/setup-uv@v4
        with:
          enable-cache: true
          version: latest

      # The version stays quoted so YAML cannot read it as a float.
      - name: Set up Python
        if: github.event_name != 'pull_request'
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        if: github.event_name != 'pull_request'
        run: uv sync --extra dev

      # Runs an inline Python script (via `uv run`) that loads
      # tests/fixtures/jobs/fruits with deepwork's parse_job_definition and
      # asserts: the job name and summary, the three shared step arguments,
      # the step order of the 'full' workflow, and the inputs/outputs of the
      # identify and classify steps. It then calls the four validate_* methods
      # again to confirm they do not raise.
      #
      # The script is passed as a shell double-quoted string, so it must not
      # contain double quotes, `$` or backticks; use single-quoted Python
      # strings only.
      #
      # NOTE(review): despite the step name, nothing here runs an install;
      # only the parser and the validators are exercised.
      - name: Validate fruits fixture parses and install generates correct structure
        if: github.event_name != 'pull_request'
        run: |
          # Verify the fruits fixture parses correctly via deepwork's parser
          uv run python -c "
          from pathlib import Path
          from deepwork.jobs.parser import parse_job_definition

          job = parse_job_definition(Path('tests/fixtures/jobs/fruits'))

          assert job.name == 'fruits'
          assert job.summary is not None

          # Step arguments (shared input/output definitions)
          arg_names = [a.name for a in job.step_arguments]
          assert 'raw_items' in arg_names
          assert 'identified_fruits' in arg_names
          assert 'classified_fruits' in arg_names

          # Workflow definition
          assert 'full' in job.workflows
          wf = job.workflows['full']
          assert wf.step_names == ['identify', 'classify']

          # Identify step: raw_items input -> identified_fruits output
          identify = wf.get_step('identify')
          assert identify is not None
          assert 'raw_items' in identify.inputs
          assert 'identified_fruits' in identify.outputs

          # Classify step: identified_fruits input -> classified_fruits output
          classify = wf.get_step('classify')
          assert classify is not None
          assert 'identified_fruits' in classify.inputs
          assert 'classified_fruits' in classify.outputs

          # Validations pass (parse_job_definition already runs these, but verify they don't raise)
          job.validate_unique_step_names()
          job.validate_argument_refs()
          job.validate_sub_workflows()
          job.validate_step_exclusivity()

          print('All fruits fixture validations passed!')
          "

      # Sanity-check the Claude plugin under plugins/claude: the four files the
      # plugin must ship exist, the skill text mentions deepwork, and the MCP
      # config mentions both "deepwork" and "serve".
      # Every failure prints a message before exiting, so a red run says which
      # check tripped instead of stopping silently on a bare grep.
      - name: Validate plugin structure and skill content
        if: github.event_name != 'pull_request'
        run: |
          # Verify the plugin provides the required files
          echo "Checking plugin structure..."

          plugin_dir="plugins/claude"
          missing=0
          for required in \
            "$plugin_dir/.claude-plugin/plugin.json" \
            "$plugin_dir/skills/deepwork/SKILL.md" \
            "$plugin_dir/.mcp.json" \
            "$plugin_dir/hooks/hooks.json"
          do
            if [ ! -f "$required" ]; then
              echo "Missing $(basename "$required")"
              missing=1
            fi
          done
          # Report every missing file before failing, not just the first one.
          if [ "$missing" -ne 0 ]; then
            exit 1
          fi

          # Verify the skill references deepwork
          if ! grep -qi "deepwork" "$plugin_dir/skills/deepwork/SKILL.md"; then
            echo "::error::SKILL.md does not mention deepwork"
            exit 1
          fi

          # Verify the MCP config runs deepwork serve
          for word in deepwork serve; do
            if ! grep -q "$word" "$plugin_dir/.mcp.json"; then
              echo "::error::.mcp.json does not mention '$word'"
              exit 1
            fi
          done

          echo "Plugin structure validated successfully!"

  # Job 2: Full end-to-end test with Claude Code
  # Exercises the complete flow: create the fruits job with /deepwork, run its
  # workflow, then validate the files it produced.
  # Runs on all events so the check name exists for PRs (needed for GitHub's
  # merge queue); on PRs the real work only happens when this workflow file
  # itself was changed.
  claude-code-e2e:
    runs-on: ubuntu-latest
    needs: validate-generation
    env:
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    steps:
      # Determine whether to run the full e2e test suite.
      # Always runs in merge_group and workflow_dispatch.
      # For PRs, only runs if the workflow file itself was changed (so we can iterate on CI fixes).
      # Sets the step output `run` to 'true' or 'false'.
      - name: Determine if tests should run
        id: should-run
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          if [ "${{ github.event_name }}" != "pull_request" ]; then
            echo "run=true" >> "$GITHUB_OUTPUT"
          else
            # --paginate: the files endpoint returns 30 entries per page by
            # default, so without it a large PR could hide the workflow file
            # on a later page.
            # The lookup stays best-effort (a failed call means "not changed"),
            # but the failure is surfaced as a warning instead of being hidden.
            if ! FILES=$(gh api --paginate "repos/${{ github.repository }}/pulls/${{ github.event.pull_request.number }}/files" --jq '.[].filename'); then
              echo "::warning::Could not list PR files; assuming the workflow file is unchanged"
              FILES=""
            fi
            # -x/-F: match the whole line literally.
            if printf '%s\n' "$FILES" | grep -qxF '.github/workflows/claude-code-test.yml'; then
              echo "run=true" >> "$GITHUB_OUTPUT"
              echo "Workflow file changed in PR - running e2e tests"
            else
              echo "run=false" >> "$GITHUB_OUTPUT"
              echo "E2E tests will run in merge queue. Passing for PR."
            fi
          fi

      - if: steps.should-run.outputs.run == 'true'
        uses: actions/checkout@v4

      # Publishes `has_key` ('true' / 'false'). Every later step is gated on
      # has_key == 'true', so they are skipped both when the secret is missing
      # and when this step itself is skipped (its output is then empty).
      - name: Check for API key
        id: check-key
        if: steps.should-run.outputs.run == 'true'
        run: |
          if [ -n "$ANTHROPIC_API_KEY" ]; then
            echo "has_key=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          echo "has_key=false" >> "$GITHUB_OUTPUT"
          echo "::warning::ANTHROPIC_API_KEY not set, skipping Claude Code e2e test"

      # Toolchain for the e2e run: Node.js and the Claude Code CLI, then uv,
      # Python and the deepwork package itself.
      - name: Install Node.js (for Claude Code CLI)
        if: steps.check-key.outputs.has_key == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      - name: Install Claude Code CLI
        if: steps.check-key.outputs.has_key == 'true'
        run: npm install -g @anthropic-ai/claude-code

      - name: Install uv
        if: steps.check-key.outputs.has_key == 'true'
        uses: astral-sh/setup-uv@v4
        with:
          enable-cache: true
          version: latest

      - name: Set up Python
        if: steps.check-key.outputs.has_key == 'true'
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install deepwork
        if: steps.check-key.outputs.has_key == 'true'
        run: |
          uv sync

          # Expose the uv virtualenv's bin directory on PATH for all later
          # steps. The test project's MCP config starts the server with the
          # bare "deepwork" command, which only exists inside that virtualenv.
          echo "$PWD/.venv/bin" >> "$GITHUB_PATH"

      # Builds test_project/: a one-commit git repository plus the two config
      # files Claude reads (.mcp.json and .claude/settings.json).
      - name: Set up fresh test project
        if: steps.check-key.outputs.has_key == 'true'
        run: |
          # Create a fresh project with NO pre-existing job definitions
          mkdir -p test_project/.claude

          git -C test_project init
          git -C test_project config user.email "test@test.com"
          git -C test_project config user.name "Test"
          echo "# CI Test Project - DeepWork E2E Test" > test_project/README.md
          git -C test_project add . && git -C test_project commit -m "init"

          # The plugin (--plugin-dir) provides skills, hooks, and MCP server config.
          # .mcp.json overrides the plugin's MCP config to use the bare `deepwork`
          # command (the plugin uses `uvx` which may not resolve the local venv
          # install). The server is named "deepwork-dev" to match the reviewer
          # agent's tool patterns (mcp__deepwork-dev__*) — see CLAUDE.md MCP tool
          # naming. settings.json grants all required permissions.
          python3 -c "
          import json

          server = {
              'command': 'deepwork',
              'args': ['serve', '--path', '.', '--platform', 'claude'],
          }
          tool_names = [
              'get_workflows', 'start_workflow', 'finished_step',
              'abort_workflow', 'go_to_step', 'mark_review_as_passed',
          ]
          allowed = ['Bash(*)', 'Read(./**)', 'Edit(./**)', 'Write(./**)', 'Skill(*)']
          allowed += ['mcp__deepwork-dev__' + tool for tool in tool_names]

          documents = {
              'test_project/.mcp.json': {'mcpServers': {'deepwork-dev': server}},
              'test_project/.claude/settings.json': {'permissions': {'allow': allowed}},
          }
          for path, payload in documents.items():
              with open(path, 'w') as f:
                  json.dump(payload, f, indent=2)
          "

          echo "Fresh test project setup complete"
          echo "MCP config:"
          cat test_project/.mcp.json

      # STEP 1: Use /deepwork to CREATE the fruits job via MCP workflow
      #
      # This invokes Claude with the /deepwork skill, which uses MCP tools to
      # walk through the deepwork_jobs/new_job workflow (define → implement →
      # test → iterate). The workflow includes quality gates that spawn Claude
      # subprocesses, so it needs a generous timeout.
      #
      # Notes on the script:
      # - `mkdir fruits` pre-creates the directory the prompt names as the
      #   output location.
      # - `set -o pipefail` makes the step fail when `claude` fails even though
      #   its output is piped through `tee`.
      # - The heredoc delimiter is quoted ('PROMPT_EOF'), so the shell passes
      #   the prompt through verbatim; the backticks in it are not executed.
      # - After Claude returns, the step fails unless
      #   .deepwork/jobs/fruits/job.yml exists and contains `instructions:`.
      - name: Create job with /deepwork
        if: steps.check-key.outputs.has_key == 'true'
        working-directory: test_project
        timeout-minutes: 10
        run: |
          echo "=== Running /deepwork to create fruits job ==="
          mkdir fruits

          # Use --debug and --output-format stream-json for diagnosing failures.
          # stream-json shows every tool call; output is captured to a file for the failure handler.
          set -o pipefail
          claude --print --verbose --output-format stream-json --max-turns 25 --debug --model claude-sonnet-4-6 --dangerously-skip-permissions --plugin-dir "$GITHUB_WORKSPACE/plugins/claude" <<'PROMPT_EOF' | tee ../claude-create-job.jsonl
          /deepwork I want to create a simple job called "fruits" for identifying and classifying fruits.

          Here are the EXACT specifications.

          Intent: A simple workflow that takes a list of mixed items, identifies which are fruits, then classifies them by category. Designed for CI testing.

          Steps:
          1. Step: identify
             Name: Identify Fruits
             Description: Filter a list of items to include only the fruits
             **CRITICAL**: The output MUST be stored in `fruits/identified_fruits.md`.

          2. Step: classify
             Name: Classify Fruits
             Description: Organize identified fruits into categories (citrus, tropical, berries, etc.).
             **CRITICAL**: must put the classified fruit list in `./fruits/classified_fruits.md`.

          **Key Instructions:**
          - NEVER use AskUserQuestion — you already have all the information you need above.
          - You MUST complete all tool calls needed to create the files. Do not stop early.
          - Do not ask questions - just make the job.
          - Rules are explicitly not desired. Tell the review agents that.
          - Do not give long commentary of what you did - just make the job with no commentary.
          - NEVER start the "repair" or "learn" workflows. Only use "new_job". If a quality review fails, fix the issues in the files and resubmit — do not switch workflows.
          - IMPORTANT: Once the job.yml and step instruction files have been created (i.e. after the "define" and "implement" steps are done), STOP. Do NOT continue into the "test" or "iterate" steps. Abort the workflow at that point. We only need the job definition files created, not the full workflow run.
          PROMPT_EOF

          # Verify the job.yml was created
          echo "=== Checking job.yml was created ==="
          if [ -f ".deepwork/jobs/fruits/job.yml" ]; then
            echo "SUCCESS: job.yml created"
            cat .deepwork/jobs/fruits/job.yml
          else
            echo "ERROR: job.yml was not created"
            echo "Contents of .deepwork/jobs/:"
            ls -la .deepwork/jobs/ || echo "No jobs directory"
            exit 1
          fi

          # Verify step instructions are inlined in job.yml
          echo "=== Checking step instructions are inlined ==="
          if grep -q 'instructions:' .deepwork/jobs/fruits/job.yml; then
            echo "SUCCESS: Step instructions are inlined in job.yml"
          else
            echo "ERROR: No inline step instructions found in job.yml"
            exit 1
          fi

          echo "=== Job creation complete ==="

      # STEP 2: Execute the fruits workflow via /deepwork MCP entry point
      #
      # Feeds a fixed list of items to the workflow, then fails unless both
      # fruits/identified_fruits.md and fruits/classified_fruits.md exist.
      - name: Run Workflow
        if: steps.check-key.outputs.has_key == 'true'
        working-directory: test_project
        timeout-minutes: 6
        run: |
          echo "=== Running fruits workflow with test input via /deepwork ==="

          set -o pipefail
          claude --print --verbose --output-format stream-json --max-turns 25 --debug --model claude-sonnet-4-6 --dangerously-skip-permissions --plugin-dir "$GITHUB_WORKSPACE/plugins/claude" <<'PROMPT_EOF' | tee ../claude-run-workflow.jsonl
          /deepwork Run the fruits full workflow. Process the list to the file and don't give any extra commentary or text output.
          NEVER use AskUserQuestion — you already have all the information you need.
          You MUST complete all tool calls needed. Do not stop early.
          CRITICAL: All output files MUST be written relative to the current working directory (the project root), NOT inside .deepwork/jobs/. For example, write to ./fruits/identified_fruits.md, NOT .deepwork/jobs/fruits/identified_fruits.md.
          raw_items: apple, car, banana, chair, orange, table, mango, laptop, grape, bicycle
          PROMPT_EOF

          echo "=== Workflow finished - looking for output file ==="

          # Verify both outputs were created
          if [ -f "fruits/identified_fruits.md" ]; then
            echo "SUCCESS: identified_fruits.md created"
            echo "--- Output ---"
            cat fruits/identified_fruits.md
          else
            echo "ERROR: identified_fruits.md was not created"
            exit 1
          fi

          if [ -f "fruits/classified_fruits.md" ]; then
            echo "SUCCESS: classified_fruits.md created"
            echo "--- Output ---"
            cat fruits/classified_fruits.md
          else
            echo "ERROR: classified_fruits.md was not created"
            exit 1
          fi

      # Dump Claude debug output if job creation or the workflow run failed or
      # timed out. This captures MCP server communication, tool calls, and
      # error details.
      #
      # This step must come AFTER both Claude steps: steps run in order, so a
      # failure() step only reacts to failures in the steps listed before it.
      # Placed ahead of "Run Workflow" it could never print
      # claude-run-workflow.jsonl.
      - name: Dump Claude debug log on failure
        if: failure() && steps.check-key.outputs.has_key == 'true'
        working-directory: test_project
        run: |
          echo "=== Claude stream-json output (create job) ==="
          if [ -f "../claude-create-job.jsonl" ]; then
            echo "--- Last 100 lines ---"
            tail -100 ../claude-create-job.jsonl
          else
            echo "No stream-json output captured for create job step"
          fi
          echo ""
          echo "=== Claude stream-json output (run workflow) ==="
          if [ -f "../claude-run-workflow.jsonl" ]; then
            echo "--- Last 100 lines ---"
            tail -100 ../claude-run-workflow.jsonl
          else
            echo "No stream-json output captured for run workflow step"
          fi
          echo ""
          echo "=== Claude debug log ==="
          # Claude --debug writes to ~/.claude/debug.log
          if [ -f "$HOME/.claude/debug.log" ]; then
            echo "--- Last 200 lines of debug.log ---"
            tail -200 "$HOME/.claude/debug.log"
          else
            echo "No debug.log found at ~/.claude/debug.log"
            echo "Searching for debug logs..."
            find "$HOME/.claude" -name "*.log" -type f 2>/dev/null || echo "No log files found"
          fi
          echo ""
          echo "=== MCP server config ==="
          cat .mcp.json 2>/dev/null || echo "No .mcp.json found"
          echo ""
          echo "=== Settings.json ==="
          cat .claude/settings.json 2>/dev/null || echo "No settings.json found"
          echo ""
          echo "=== DeepWork session state ==="
          ls -la .deepwork/tmp/ 2>/dev/null || echo "No tmp directory"
          for f in .deepwork/tmp/session_*.json; do
            # An unmatched glob stays literal. Skip it with `continue` so the
            # loop (the last command of the script) does not end non-zero and
            # add a second, misleading failure to the run.
            [ -f "$f" ] || continue
            echo "--- $f ---"
            cat "$f"
          done

      # STEP 3: Validate the complete workflow output
      #
      # Case-insensitive content checks on the two generated files: all five
      # fruits from the input list must be present, and at least one of the
      # category words must appear.
      - name: Validate Workflow Output
        if: steps.check-key.outputs.has_key == 'true'
        working-directory: test_project/fruits
        run: |
          echo "=== Validating complete workflow ==="

          # Check identified_fruits.md contains expected fruits
          echo "Checking identified_fruits.md..."
          grep -qi "apple" identified_fruits.md || (echo "Missing: apple" && exit 1)
          grep -qi "banana" identified_fruits.md || (echo "Missing: banana" && exit 1)
          grep -qi "orange" identified_fruits.md || (echo "Missing: orange" && exit 1)
          grep -qi "mango" identified_fruits.md || (echo "Missing: mango" && exit 1)
          grep -qi "grape" identified_fruits.md || (echo "Missing: grape" && exit 1)
          echo "  ✓ All expected fruits found in identified_fruits.md"

          # Check classified_fruits.md has expected structure
          echo "Checking classified_fruits.md..."
          grep -qi "citrus\|tropical\|pome\|berr" classified_fruits.md || (echo "Missing fruit categories" && exit 1)
          echo "  ✓ Fruit categories found in classified_fruits.md"

          echo ""
          echo "=========================================="
          echo "  ALL E2E TESTS PASSED SUCCESSFULLY!"
          echo "=========================================="
          echo ""
          echo "Workflow tested: /deepwork fruits full - Executed full fruits workflow (identify + classify)"
          echo ""

      # Print the DeepWork status files (job manifest plus one file per
      # session) for debugging. Runs even after a failure (always()), so the
      # script must not fail on its own when files are absent.
      - name: Display status files
        if: steps.check-key.outputs.has_key == 'true' && always()
        working-directory: test_project
        run: |
          echo "=== Status Files ==="
          STATUS_DIR=".deepwork/tmp/status/v1"
          if [ -d "$STATUS_DIR" ]; then
            echo "--- job_manifest.yml ---"
            cat "$STATUS_DIR/job_manifest.yml" 2>/dev/null || echo "(not found)"
            echo ""
            if [ -d "$STATUS_DIR/sessions" ]; then
              found_session=false
              for f in "$STATUS_DIR/sessions"/*.yml; do
                # With no match the glob stays literal; `cat` on it would fail
                # and, under `bash -e`, turn this diagnostic step red.
                [ -e "$f" ] || continue
                found_session=true
                echo "--- $(basename "$f") ---"
                cat "$f"
                echo ""
              done
              if [ "$found_session" = false ]; then
                echo "(no session status files)"
              fi
            else
              echo "(no session status files)"
            fi
          else
            echo "(status directory not found)"
          fi

      # Keep the generated job definition, status files, workflow outputs and
      # both Claude stream-json transcripts for a week, whether or not the
      # earlier steps passed.
      # NOTE(review): several paths sit under dot-directories (.deepwork,
      # .claude). upload-artifact v4.4+ skips hidden files by default —
      # confirm these are actually captured, or set `include-hidden-files`.
      - name: Upload test artifacts
        if: steps.check-key.outputs.has_key == 'true' && always()
        uses: actions/upload-artifact@v4
        with:
          name: claude-code-e2e-outputs
          retention-days: 7
          # Same as the action's default: a missing path only produces a warning.
          if-no-files-found: warn
          path: |
            test_project/.deepwork/jobs/fruits/
            test_project/.deepwork/tmp/status/
            test_project/.claude/skills/deepwork/
            test_project/fruits/identified_fruits.md
            test_project/fruits/classified_fruits.md
            claude-create-job.jsonl
            claude-run-workflow.jsonl