From 9d820996148248ff541ff57e7214ca8a9a191e71 Mon Sep 17 00:00:00 2001
From: Tao Ma <51425734+tmatup@users.noreply.github.com>
Date: Mon, 4 May 2026 15:53:15 -0700
Subject: [PATCH] test(hitl): migrate 4 HITL smoke tasks to canonical pattern
 vocabulary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace 11 brittle `json_check.contains` substring assertions across
four HITL smoke tasks (smoke_01_explicit, smoke_02_approval_gate,
smoke_04_writeback, smoke_05_compliance) with strict canonical-name
checks aligned with `references/hitl-patterns.md`.

Each task's prompt now lists the same 6 canonical pattern machine
names and requires the agent to emit one verbatim. Each task's
`pattern` criterion uses `equals` (single canonical answer) or
`regex` (multiple defensible canonical answers per the skill doc):

- smoke_01_explicit: `equals approval-gate` ->
  `regex ^(approval-gate|write-back-validation)$` — the skill doc
  lists both as applicable to the "approve before write" scenario.
- smoke_02_approval_gate: `equals approval-gate` — single canonical
  fit, manager-must-approve language.
- smoke_04_writeback:
  `regex ^(write-back-validation|data-enrichment|agentic-output-review)$`
  — the AI-enriches-then-writes-to-SAP scenario matches three patterns
  in the doc.
- smoke_05_compliance: `regex ^(compliance-checkpoint|approval-gate)$`
  — GDPR scenario; "regulatory sign-off" steers toward
  compliance-checkpoint, but "sign off" alone is also an approval-gate
  signal phrase. Prompt also strengthened with explicit "regulatory
  compliance" framing and a GDPR Article 17 reference.

Companion to UiPath/skills#555 (smoke_03_escalation), which fixed the
same anti-pattern in a separate, focused PR.

Verified: all four tasks pass 3/3 reps each (12/12 SUCCESS at score
1.0) using local coder_eval against this branch.

This PR plus #555 unblocks the matching coder_eval validator
(UiPath/coder_eval#212), which hard-fails any task YAML that uses
`json_check.contains` with a literal shorter than 8 characters.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../smoke_01_explicit.yaml                    | 38 +++++++++-----
 .../smoke_02_approval_gate.yaml               | 25 ++++++++--
 .../smoke_04_writeback.yaml                   | 40 ++++++++++-----
 .../smoke_05_compliance.yaml                  | 49 ++++++++++++-------
 4 files changed, 104 insertions(+), 48 deletions(-)
diff --git a/tests/tasks/uipath-human-in-the-loop/smoke_01_explicit.yaml b/tests/tasks/uipath-human-in-the-loop/smoke_01_explicit.yaml
index f81874f8f..7449e1c92 100644
--- a/tests/tasks/uipath-human-in-the-loop/smoke_01_explicit.yaml
+++ b/tests/tasks/uipath-human-in-the-loop/smoke_01_explicit.yaml
@@ -13,10 +13,15 @@ initial_prompt: |
   I have a UiPath Flow. Add a Human-in-the-Loop node before the final data
   write step so a manager can review and approve the data before it is posted.
 
-  Write a recommendation.json file with:
+  Recommend whether HITL is needed and identify which canonical HITL
+  pattern from the `uipath-human-in-the-loop` skill applies. The skill's
+  `references/hitl-patterns.md` enumerates six canonical patterns; pick
+  exactly one and emit its machine name verbatim.
+
+  Write a `recommendation.json` file with this exact shape:
   {
     "hitl_needed": <true or false>,
-    "pattern": "<which business pattern applies>",
+    "pattern": "<one of the canonical pattern names listed below>",
     "proposed_schema": {
       "inputs": ["<field names the human will see>"],
       "outputs": ["<field names the human fills in>"],
@@ -24,6 +29,16 @@ initial_prompt: |
     }
   }
 
+  Use EXACTLY one of these machine names for `pattern` (lowercase,
+  hyphenated, no extra adjectives or prefixes — names mirror the
+  section titles in `references/hitl-patterns.md`):
+    - approval-gate
+    - exception-escalation
+    - data-enrichment
+    - compliance-checkpoint
+    - write-back-validation
+    - agentic-output-review
+
 success_criteria:
   - type: file_exists
     description: "Agent wrote a recommendation.json"
@@ -40,20 +55,19 @@ success_criteria:
     pass_threshold: 1.0
 
   - type: json_check
-    description: "Agent named an approval or write-back pattern (case-tolerant)"
+    description: >
+      Agent picked a canonical HITL pattern that fits an "approve before
+      writing" scenario. Either approval-gate (manager approves the
+      artifact) or write-back-validation (HITL gates a write to a
+      system of record) is documented in `references/hitl-patterns.md`
+      as applicable here; both are accepted.
     path: "recommendation.json"
     assertions:
       - expression: "pattern"
-        operator: contains
-        expected: "pprov"
-      - expression: "pattern"
-        operator: contains
-        expected: "rite"
-      - expression: "pattern"
-        operator: contains
-        expected: "alid"
+        operator: regex
+        expected: "^(approval-gate|write-back-validation)$"
     weight: 1.5
-    pass_threshold: 0.33
+    pass_threshold: 1.0
 
   - type: file_contains
     description: "Agent proposed a schema with outcomes"
diff --git a/tests/tasks/uipath-human-in-the-loop/smoke_02_approval_gate.yaml b/tests/tasks/uipath-human-in-the-loop/smoke_02_approval_gate.yaml
index bb9d943b2..e5f3b9aef 100644
--- a/tests/tasks/uipath-human-in-the-loop/smoke_02_approval_gate.yaml
+++ b/tests/tasks/uipath-human-in-the-loop/smoke_02_approval_gate.yaml
@@ -14,10 +14,15 @@ initial_prompt: |
   finance — but a manager must approve each expense report before the email
   is sent.
 
-  Write a recommendation.json file with:
+  Recommend whether HITL is needed and identify which canonical HITL
+  pattern from the `uipath-human-in-the-loop` skill applies. The skill's
+  `references/hitl-patterns.md` enumerates six canonical patterns; pick
+  exactly one and emit its machine name verbatim.
+
+  Write a `recommendation.json` file with this exact shape:
   {
     "hitl_needed": <true or false>,
-    "pattern": "<which business pattern applies>",
+    "pattern": "<one of the canonical pattern names listed below>",
     "proposed_schema": {
       "inputs": ["<field names>"],
       "outputs": ["<field names>"],
@@ -25,6 +30,16 @@ initial_prompt: |
     }
   }
 
+  Use EXACTLY one of these machine names for `pattern` (lowercase,
+  hyphenated, no extra adjectives or prefixes — names mirror the
+  section titles in `references/hitl-patterns.md`):
+    - approval-gate
+    - exception-escalation
+    - data-enrichment
+    - compliance-checkpoint
+    - write-back-validation
+    - agentic-output-review
+
 success_criteria:
   - type: file_exists
     description: "Agent wrote a recommendation.json"
@@ -41,11 +56,11 @@ success_criteria:
     pass_threshold: 1.0
 
   - type: json_check
-    description: "Agent identified an approval gate pattern (case-tolerant)"
+    description: "Agent identified the canonical approval-gate pattern"
     path: "recommendation.json"
     assertions:
       - expression: "pattern"
-        operator: contains
-        expected: "pprov"
+        operator: equals
+        expected: "approval-gate"
     weight: 1.5
     pass_threshold: 1.0
diff --git a/tests/tasks/uipath-human-in-the-loop/smoke_04_writeback.yaml b/tests/tasks/uipath-human-in-the-loop/smoke_04_writeback.yaml
index 5a47dbb7d..9760b3beb 100644
--- a/tests/tasks/uipath-human-in-the-loop/smoke_04_writeback.yaml
+++ b/tests/tasks/uipath-human-in-the-loop/smoke_04_writeback.yaml
@@ -14,15 +14,29 @@ initial_prompt: |
   enriches the missing vendor and cost-center fields using company data, and
   writes the corrected records back to SAP.
 
-  Analyze whether this flow needs any human checkpoints. Write a
-  recommendation.json file with:
+  Analyze whether this flow needs any human checkpoints. If yes, identify
+  which canonical HITL pattern from the `uipath-human-in-the-loop` skill
+  applies. The skill's `references/hitl-patterns.md` enumerates six
+  canonical patterns; pick exactly one and emit its machine name verbatim.
+
+  Write a `recommendation.json` file with this exact shape:
   {
     "hitl_needed": <true or false>,
-    "pattern": "<pattern name if applicable>",
+    "pattern": "<one of the canonical pattern names listed below>",
     "reason": "<why HITL is or is not needed>",
     "proposed_insertion_point": "<where in the flow>"
   }
 
+  Use EXACTLY one of these machine names for `pattern` (lowercase,
+  hyphenated, no extra adjectives or prefixes — names mirror the
+  section titles in `references/hitl-patterns.md`):
+    - approval-gate
+    - exception-escalation
+    - data-enrichment
+    - compliance-checkpoint
+    - write-back-validation
+    - agentic-output-review
+
 success_criteria:
   - type: file_exists
     description: "Agent wrote a recommendation.json"
@@ -39,17 +53,17 @@ success_criteria:
     pass_threshold: 1.0
 
   - type: json_check
-    description: "Agent named a write-back / validation / enrichment pattern (any one, case-tolerant)"
+    description: >
+      Agent picked a canonical HITL pattern that fits an "AI enriches data,
+      writes it back to a system of record" scenario. Write-back-validation
+      (HITL before write to system of record), data-enrichment (HITL fills
+      / validates incomplete data), and agentic-output-review (human
+      verifies AI output before downstream use) are all documented in
+      `references/hitl-patterns.md` as applicable here.
     path: "recommendation.json"
     assertions:
       - expression: "pattern"
-        operator: contains
-        expected: "rite"
-      - expression: "pattern"
-        operator: contains
-        expected: "alid"
-      - expression: "pattern"
-        operator: contains
-        expected: "nrich"
+        operator: regex
+        expected: "^(write-back-validation|data-enrichment|agentic-output-review)$"
     weight: 1.5
-    pass_threshold: 0.33
+    pass_threshold: 1.0
diff --git a/tests/tasks/uipath-human-in-the-loop/smoke_05_compliance.yaml b/tests/tasks/uipath-human-in-the-loop/smoke_05_compliance.yaml
index 0b6e430a3..eeb8e5919 100644
--- a/tests/tasks/uipath-human-in-the-loop/smoke_05_compliance.yaml
+++ b/tests/tasks/uipath-human-in-the-loop/smoke_05_compliance.yaml
@@ -9,14 +9,21 @@ sandbox:
   python: {}
 
 initial_prompt: |
-  Automate GDPR data deletion requests. Each request requires documented
-  sign-off from our data privacy officer before the deletion actually runs —
-  we need an audit trail for every decision.
+  Automate GDPR data deletion requests. These are subject to regulatory
+  compliance requirements: each request needs documented regulatory
+  sign-off from our data privacy officer before the deletion runs, plus
+  an audit trail for every decision (required for compliance with GDPR
+  Article 17 — Right to Erasure).
 
-  Write a recommendation.json file with:
+  Recommend whether HITL is needed and identify which canonical HITL
+  pattern from the `uipath-human-in-the-loop` skill applies. The skill's
+  `references/hitl-patterns.md` enumerates six canonical patterns; pick
+  exactly one and emit its machine name verbatim.
+
+  Write a `recommendation.json` file with this exact shape:
   {
     "hitl_needed": <true or false>,
-    "pattern": "<which business pattern applies>",
+    "pattern": "<one of the canonical pattern names listed below>",
     "proposed_schema": {
       "inputs": ["<what the privacy officer will see>"],
       "outputs": ["<what they fill in>"],
@@ -24,6 +31,16 @@ initial_prompt: |
     }
   }
 
+  Use EXACTLY one of these machine names for `pattern` (lowercase,
+  hyphenated, no extra adjectives or prefixes — names mirror the
+  section titles in `references/hitl-patterns.md`):
+    - approval-gate
+    - exception-escalation
+    - data-enrichment
+    - compliance-checkpoint
+    - write-back-validation
+    - agentic-output-review
+
 success_criteria:
   - type: file_exists
     description: "Agent wrote a recommendation.json"
@@ -40,20 +57,16 @@ success_criteria:
     pass_threshold: 1.0
 
   - type: json_check
-    description: "Agent identified a compliance / audit / sign-off / approval pattern (any one, case-tolerant)"
+    description: >
+      Agent picked a canonical HITL pattern that fits a regulatory-sign-off
+      scenario. Compliance-checkpoint is the textbook fit (the doc lists
+      "regulatory sign-off" and "GDPR consent flows" as its examples), but
+      approval-gate is also defensible since "sign off" is a shared signal
+      phrase. Both are accepted.
     path: "recommendation.json"
     assertions:
       - expression: "pattern"
-        operator: contains
-        expected: "udit"
-      - expression: "pattern"
-        operator: contains
-        expected: "omplianc"
-      - expression: "pattern"
-        operator: contains
-        expected: "ign"
-      - expression: "pattern"
-        operator: contains
-        expected: "pprov"
+        operator: regex
+        expected: "^(compliance-checkpoint|approval-gate)$"
     weight: 1.5
-    pass_threshold: 0.25
+    pass_threshold: 1.0