MobileReality · gitsad · May 21, 2026 · May 14, 2026 · May 14, 2026 · May 14, 2026
diff --git a/.changeset/clear-windows-see.md b/.changeset/clear-windows-see.md
@@ -0,0 +1,8 @@
+---
+"@mobile-reality/mdma-validator": minor
+"@mobile-reality/mdma-spec": minor
+"@mobile-reality/mdma-prompt-pack": patch
+"@mobile-reality/mdma-demo": patch
+---
+
+Split validator into per-block validate() and multi-message validateConversation(); make form.onSubmit required and rewrite action-label fields as opaque labels (drop the action-references rule); add many model-specific fixer/author/agent-tool prompt variants (gpt-5.x family, Claude opus/sonnet/haiku, Gemini 2.5/3, Grok); promote the conversation-judge prompt out of mdma-fixer/ and rename its export to MDMA_CONVERSATION_JUDGE.
diff --git a/.changeset/clever-lines-trade.md b/.changeset/clever-lines-trade.md
@@ -0,0 +1,7 @@
+---
+"@mobile-reality/mdma-attachables-core": patch
+"@mobile-reality/mdma-parser": patch
+"@mobile-reality/mdma-cli": patch
+---
+
+Tests update
diff --git a/README.md b/README.md
@@ -74,30 +74,30 @@ Each cell shows the pass rate of the model-specialized MDMA_AUTHOR prompt varian
 | :--- | :---: | :---: | :---: | :---: |
 | **OpenAI** | | | | |
 | `gpt-5.5` | ✅ | ✅ | ✅ | ✅ |
-| `gpt-5.4` | ✅ | ✅ | ✅ | ✅ |
+| `gpt-5.4` | ✅ | ✅ † | ✅ † | ✅ † |
 | `gpt-5.4-mini` | ✅ | ✅ | ✅ \* | ✅ \* |
 | `gpt-5.4-nano` | ✅ | ✅ | ✅ \* | ✅ \* |
 | `gpt-5.2` | ✅ | ✅ | ✅ | ✅ |
 | `gpt-5.1` | ✅ | ✅ | ✅ | ✅ |
 | `gpt-5` \[i] | ✅ | ✅ | ✅ | ✅ |
 | `gpt-5-mini` \[i] | ✅ | ✅ | ✅ \* | ✅ \* |
-| `gpt-5-nano` \[i] | ✅ | ✅ | ✅ \* | ✅ \* |
+| `gpt-5-nano` \[i] | ✅ | ✅ | 🟡 \* | 🟡 \* |
 | `gpt-4.1` | ✅ | ✅ | ✅ | ✅ |
 | `gpt-4.1-mini` | ✅ | ✅ | ✅ \* | ✅ \* |
-| `gpt-4.1-nano` | 🟡 | ✅ | ✅ \* | ✅ \* |
+| `gpt-4.1-nano` | ✅ | ✅ | ✅ \* | 🟡 \* |
 | **Anthropic** | | | | |
 | `claude-opus-4.7` | ✅ | ✅ | ✅ | ✅ |
 | `claude-opus-4.6` | ✅ | ✅ | ✅ | ✅ |
 | `claude-sonnet-4.6` | ✅ | ✅ | ✅ | ✅ |
 | `claude-haiku-4.5` | ✅ | ✅ | ✅ \* | ✅ \* |
 | **Google** | | | | |
-| `gemini-3.1-pro-preview` | ✅ | ✅ | ✅ | ✅ |
+| `gemini-3.1-pro-preview` | ✅ | ✅ | ✅ | 🟡 ‡ |
 | `gemini-3.1-pro-preview-customtools` | ✅ | ✅ | ✅ | ✅ |
 | `gemini-3.1-flash-lite-preview` | ✅ | ✅ | ✅ \* | ✅ \* |
 | `gemini-3-flash-preview` | ✅ | ✅ | ✅ \* | ✅ \* |
 | `gemini-2.5-pro` | ✅ | ✅ | ✅ | ✅ |
 | `gemini-2.5-flash` | ✅ | ✅ | ✅ \* | ✅ \* |
-| `gemini-2.5-flash-lite` | 🟡 | ✅ | ✅ \* | ✅ \* |
+| `gemini-2.5-flash-lite` | ✅ | ✅ | ✅ \* | ✅ \* |
 | **xAI** | | | | |
 | `grok-4.3` \[i] | 🟡 | 🔴 | 🔴 | 🔴 |
 | `grok-4.20` | ✅ | ✅ | ✅ | ✅ |
@@ -115,11 +115,57 @@ Each cell shows the pass rate of the model-specialized MDMA_AUTHOR prompt varian
 
 > **Don't see your model?** Add a prompt variant under `packages/prompt-pack/src/prompts/mdma-author/<vendor>/` and open a PR — we'll run the eval suite and add it to this table.
 
+† **gpt-5.4 intermittent duplication bug** — `gpt-5.4` passes one-shot evals reliably but shows a non-deterministic output duplication in multi-turn, custom-prompt, and flow evals (~7–15% of runs). The model generates a complete, correct response and then immediately re-emits the entire output verbatim, causing `[duplicate-ids]` validation errors. This is a known model-level issue unrelated to the prompt variant. See the [OpenAI community thread](https://community.openai.com/t/seeing-intermittent-duplicate-strings-in-gpt-5-4-responses/1376651) for details. If this affects your use case, prefer `gpt-5.5` or `gpt-5.2`.
+
+‡ **gemini-3.1-pro-preview stochastic preamble loop** — on ~7–15% of flow-eval runs, the model emits its chain-of-thought as visible Markdown prose (e.g. `**Investigating Production Errors**` repeated 3–5 times) instead of opening a ```` ```mdma ```` block, producing either `[yaml-correctness: outside fenced block]` or `[duplicate-ids]` errors. Per Google's official Gemini 3 prompting guide, this is a model-level behavior driven by temperature/sampling — prompt-level fixes only change which test cases hit the loop rather than eliminating it. If deterministic flow output matters, prefer `gemini-2.5-pro` for production multi-step flows.
+
 \* Smaller / lower-tier models from any lab (OpenAI mini · nano, Anthropic Haiku, Google Gemini Flash, etc.) pass our eval suites, which exercise short, structured test cases. In longer real-world conversations they tend to hallucinate, forget earlier turns, or drift from the spec. For production use that involves multi-turn dialogue or stateful flows, prefer the flagship-tier model from the same family.
 
 \[i] Noticeably slow response times — single-turn responses commonly take tens of seconds and full eval runs measure in minutes.
 
 
+## MDMA_FIXER prompt matrix
+
+Each cell shows the pass rate of the model-specialized MDMA_FIXER prompt variant on the single-block fixer eval (15 tests covering structural fixes, bindings, PII, forms, tables/charts, approvals). The fixer is what powers automatic repair of LLM output that fails `validate()` — every supported model lands at ✅ via model-tailored inline guards (no-leading-separator, preserve-input-structure, table-key-direction, replace-all-placeholders, fix-all-listed-errors, etc.).
+
+✅ 100% on the single-block fixer eval (15/15).
+
+
+| Variant | single-block fixer | notes |
+| :--- | :---: | :--- |
+| **OpenAI** | | |
+| `gpt-5.5` | ✅ | |
+| `gpt-5.4` | ✅ | |
+| `gpt-5.4-mini` | ✅ | |
+| `gpt-5.4-nano` | ✅ | |
+| `gpt-5.2` | ✅ | |
+| `gpt-5.1` | ✅ | |
+| `gpt-5` | ✅ | |
+| `gpt-5-mini` | ✅ | |
+| `gpt-5-nano` | ✅ | |
+| `gpt-4.1` | ✅ | |
+| `gpt-4.1-mini` | ✅ | |
+| `gpt-4.1-nano` | ✅ | |
+| **Anthropic** | | |
+| `claude-opus-4.7` | ✅ | |
+| `claude-opus-4.6` | ✅ | |
+| `claude-sonnet` | ✅ | catch-all variant — matches `claude-sonnet-4-5`, `claude-sonnet-4-6`, etc. |
+| `claude-haiku` | ✅ | |
+| **Google** | | |
+| `gemini-3.1-pro-preview` | ✅ ‡ | requires OpenRouter `reasoning.exclude: true` (already wired in `evals/promptfooconfig.fixer.js`) |
+| `gemini-3.1-pro-preview-customtools` | ✅ ‡ | same `reasoning.exclude` requirement |
+| `gemini-3.1-flash-lite-preview` | ✅ | |
+| `gemini-3-flash-preview` | ✅ | |
+| `gemini-2.5-pro` | ✅ ‡ | same `reasoning.exclude` requirement |
+| `gemini-2.5-flash` | ✅ | |
+| `gemini-2.5-flash-lite` | ✅ | |
+| **xAI** | | |
+| `grok-4.3` | ✅ ‡ | minimal prompt + `reasoning.exclude: true` — extra framing regresses Grok 4.3 |
+| `grok-4.20` | ✅ | |
+
+‡ **Reasoning-token leak suppression** (applies to this fixer matrix only; unrelated to the ‡ note under the MDMA_AUTHOR matrix) — for reasoning-flavoured Gemini Pro variants and Grok 4.3, the fixer would otherwise see visible "Thinking: **Topic**" prose prepended to every response. The eval config sets `passthrough.reasoning.exclude: true` (and the demo's `usePreviewValidation` does the same per-provider) to strip reasoning tokens from the response body at the API layer rather than at the prompt layer.
+
+
 ## Components
 
 9 built-in component types, all rendered out of the box by `@mobile-reality/mdma-renderer-react`:
@@ -585,7 +631,8 @@ pnpm eval:view
 - [x] Multi-model eval coverage (Claude, GPT, Gemini, Grok)
 - [x] Prompt tuning toolkit — test and compare custom prompts
 - [x] Agent-friendly SDK — let AI agent generate your MDMA
-- [ ] Validator evals
+- [x] Validator tests & Fixer evals
+- [ ] Integrations
 - [ ] Webhook execution engine (real HTTP calls in production environments)
 
 ### v1.0 — Production Ready

diff --git a/demo/src/AgentChatView.tsx b/demo/src/AgentChatView.tsx
@@ -19,7 +19,7 @@ export function AgentChatView() {
     stop,
     clear,
     inputRef,
-  } = useAgent();
+  } = useAgent({ useAuthorSubAgent: true });
 
   const { events, isOpen: logOpen, setIsOpen: setLogOpen, clearEvents } = useAgentActionLog(turns);
 

diff --git a/demo/src/App.tsx b/demo/src/App.tsx
@@ -5,6 +5,7 @@ import { ChatView } from './ChatView.js';
 import { CustomChatView } from './CustomChatView.js';
 import { DocsView } from './DocsView.js';
 import { HomeView } from './HomeView.js';
+import { PreviewView } from './PreviewView.js';
 import { ValidatorView } from './ValidatorView.js';
 
 // ── Routing ──────────────────────────────────────────────────────────────────
@@ -25,7 +26,7 @@ function navigate(to: string) {
 
 // ── Nav config ───────────────────────────────────────────────────────────────
 
-type Route = '/' | '/chat' | '/author' | '/custom' | '/validator' | '/docs';
+type Route = '/' | '/chat' | '/preview' | '/author' | '/custom' | '/validator' | '/docs';
 
 interface NavItem {
   path: Route;
@@ -41,7 +42,10 @@ interface NavGroup {
 const NAV_GROUPS: NavGroup[] = [
   {
     label: 'Agentic',
-    items: [{ path: '/chat', label: 'Agent Chat', icon: '⚡' }],
+    items: [
+      { path: '/preview', label: 'Preview', icon: '🛡️' },
+      { path: '/chat', label: 'Agent Chat', icon: '⚡' },
+    ],
   },
   {
     label: 'Completions',
@@ -184,6 +188,8 @@ export function App() {
         <CustomChatView />
       ) : route === '/author' ? (
         <ChatView />
+      ) : route === '/preview' ? (
+        <PreviewView />
       ) : (
         <AgentChatView />
       )}

diff --git a/demo/src/HomeView.tsx b/demo/src/HomeView.tsx
@@ -11,6 +11,13 @@ const SECTIONS = [
     label: 'Agentic',
     description: 'Agent with tool use',
     items: [
+      {
+        path: '/preview',
+        label: 'Preview',
+        icon: '🛡️',
+        description:
+          'Multi-step flow demo (insurance claim) — chat on the left, live MDMA preview with auto-validation and fixer on the right.',
+      },
       {
         path: '/chat',
         label: 'Agent Chat',

diff --git a/demo/src/PreviewView.tsx b/demo/src/PreviewView.tsx
@@ -0,0 +1,126 @@
+import { useRef, useEffect, useCallback, useState } from 'react';
+import { useAgent } from './agent/use-agent.js';
+import { AgentMessage } from './agent/AgentMessage.js';
+import { AgentSettings } from './agent/AgentSettings.js';
+import { ChatInput } from './chat/ChatInput.js';
+import { BackendLogDrawer } from './preview/BackendLogPane.js';
+import { PreviewPanel } from './preview/PreviewPanel.js';
+import { clearSubmissionLog } from './preview/insurance-backend.js';
+import { INSURANCE_FLOW_PROMPT } from './preview/insurance-flow-prompt.js';
+import { useInsuranceFlow } from './preview/use-insurance-flow.js';
+import { usePreviewValidation } from './preview/use-preview-validation.js';
+
+function countToolUseBlocks(turns: ReturnType<typeof useAgent>['turns']): number {
+  let count = 0;
+  for (const turn of turns) {
+    if (turn.role !== 'assistant') continue;
+    for (const block of turn.blocks) if (block.type === 'tool_use') count++;
+  }
+  return count;
+}
+
+export function PreviewView() {
+  const {
+    turns,
+    isGenerating,
+    error,
+    input,
+    setInput,
+    config,
+    updateConfig,
+    send,
+    sendHidden,
+    stop,
+    clear,
+    inputRef,
+  } = useAgent({ flowPrompt: INSURANCE_FLOW_PROMPT, useAuthorSubAgent: true });
+
+  const [selectedBlockId, setSelectedBlockId] = useState<string | null>(null);
+
+  const previewState = usePreviewValidation({
+    turns,
+    selectedBlockId,
+    agentConfig: config,
+  });
+
+  const insuranceFlow = useInsuranceFlow({
+    currentStore: previewState.store,
+    sendHidden,
+    isGenerating,
+  });
+
+  // Snap back to the latest step whenever a new tool_use block appears so
+  // the user doesn't get stuck viewing the previous step.
+  const prevToolUseCountRef = useRef(0);
+  useEffect(() => {
+    const count = countToolUseBlocks(turns);
+    if (count > prevToolUseCountRef.current) setSelectedBlockId(null);
+    prevToolUseCountRef.current = count;
+  }, [turns]);
+
+  const chatEndRef = useRef<HTMLDivElement>(null);
+  const prevCountRef = useRef(turns.length);
+  useEffect(() => {
+    if (turns.length > prevCountRef.current) {
+      chatEndRef.current?.scrollIntoView({ behavior: 'smooth' });
+    }
+    prevCountRef.current = turns.length;
+  }, [turns]);
+
+  const handleClear = useCallback(() => {
+    clear();
+    setSelectedBlockId(null);
+    clearSubmissionLog();
+    insuranceFlow.reset();
+  }, [clear, insuranceFlow]);
+
+  return (
+    <div className="preview-layout">
+      <div className="preview-chat">
+        <AgentSettings config={config} onUpdate={updateConfig} />
+
+        <div className="chat-messages">
+          {turns.length === 0 && (
+            <div className="chat-empty">
+              <p className="chat-empty-title">Insurance Claim Demo</p>
+              <p className="chat-empty-hint">
+                Ask the agent to start a new insurance claim. It will walk you through name &amp;
+                birthday, claim details, bank account, and a final confirmation — each step rendered
+                live in the preview pane on the right.
+              </p>
+            </div>
+          )}
+
+          {turns.map((turn) => (
+            <AgentMessage
+              key={turn.id}
+              turn={turn}
+              compactToolUse
+              activeToolUseId={previewState.blockId}
+              onSelectToolUse={setSelectedBlockId}
+            />
+          ))}
+
+          {error && <div className="chat-error">{error}</div>}
+
+          <div ref={chatEndRef} />
+        </div>
+
+        <ChatInput
+          value={input}
+          onChange={setInput}
+          onSend={send}
+          onStop={stop}
+          onClear={handleClear}
+          isGenerating={isGenerating}
+          hasMessages={turns.length > 0}
+          inputRef={inputRef}
+        />
+      </div>
+
+      <PreviewPanel state={previewState} />
+
+      <BackendLogDrawer />
+    </div>
+  );
+}
diff --git a/demo/src/ValidatorView.tsx b/demo/src/ValidatorView.tsx
@@ -5,7 +5,7 @@ import { ChatMessage } from './chat/ChatMessage.js';
 import { ChatInput } from './chat/ChatInput.js';
 import { ChatActionLog } from './chat/ChatActionLog.js';
 import { useChatActionLog } from './chat/use-chat-action-log.js';
-import { validateFlow, type FlowStepDefinition } from '@mobile-reality/mdma-validator';
+import { validateConversation, type ConversationStep } from '@mobile-reality/mdma-validator';
 import { customizations } from './custom-components.js';
 import { VALIDATOR_PROMPT_VARIANTS, FLOW_STEPS } from './validator-prompts.js';
 import { ValidationPanel } from './validator/ValidationPanel.js';
@@ -72,14 +72,14 @@ function ValidatorChatInner({ promptKey }: { promptKey: string }) {
   });
 
   // Flow validation
-  const flowSteps = FLOW_STEPS[promptKey] as FlowStepDefinition[] | undefined;
+  const flowSteps = FLOW_STEPS[promptKey] as ConversationStep[] | undefined;
   const flowResult = useMemo(() => {
     if (!flowSteps) return null;
     const assistantContents = messages
       .filter((m) => m.role === 'assistant' && m.content)
       .map((m) => m.content);
     if (assistantContents.length === 0) return null;
-    return validateFlow(assistantContents, { steps: flowSteps });
+    return validateConversation(assistantContents, { steps: flowSteps });
   }, [flowSteps, messages]);
 
   const flowComplete =