mreichhoff · mreichhoff · Feb 16, 2026 · Feb 16, 2026 · Feb 16, 2026 · Feb 16, 2026
diff --git a/.github/workflows/eval-functions.yml b/.github/workflows/eval-functions.yml
@@ -111,33 +111,33 @@ jobs:
             --batchSize 10 \
             --output eval-results/explain-chinese-results.json || { echo "⚠️ explainText evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }
 
-          echo "Running explainEnglish evaluation..."
-          genkit eval:flow explainEnglish \
-            --input datasets/explain-english.json \
-            --evaluators=custom/chineseTextPresent,custom/validPinyinFormat,custom/outputStructureValid,custom/grammarExplanationQuality \
-            --batchSize 10 \
-            --output eval-results/explain-english-results.json || { echo "⚠️ explainEnglish evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }
-
-          echo "Running generateChineseSentences evaluation..."
-          genkit eval:flow generateChineseSentences \
-            --input datasets/generate-chinese-sentences.json \
-            --evaluators=custom/chineseTextPresent,custom/validPinyinFormat,custom/outputStructureValid,custom/sentenceGenerationQuality \
-            --batchSize 10 \
-            --output eval-results/generate-sentences-results.json || { echo "⚠️ generateChineseSentences evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }
-
-          echo "Running analyzeCollocation evaluation..."
-          genkit eval:flow analyzeCollocation \
-            --input datasets/analyze-collocation.json \
-            --evaluators=custom/chineseTextPresent,custom/englishTranslationPresent,custom/outputStructureValid \
-            --batchSize 10 \
-            --output eval-results/collocation-results.json || { echo "⚠️ analyzeCollocation evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }
-
-          echo "Running explainWordInContext evaluation..."
-          genkit eval:flow explainWordInContext \
-            --input datasets/explain-word-in-context.json \
-            --evaluators=custom/chineseTextPresent,custom/englishTranslationPresent,custom/outputStructureValid \
-            --batchSize 10 \
-            --output eval-results/word-context-results.json || { echo "⚠️ explainWordInContext evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }
+          # echo "Running explainEnglish evaluation..."
+          # genkit eval:flow explainEnglish \
+          #   --input datasets/explain-english.json \
+          #   --evaluators=custom/chineseTextPresent,custom/validPinyinFormat,custom/outputStructureValid,custom/grammarExplanationQuality \
+          #   --batchSize 10 \
+          #   --output eval-results/explain-english-results.json || { echo "⚠️ explainEnglish evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }
+
+          # echo "Running generateChineseSentences evaluation..."
+          # genkit eval:flow generateChineseSentences \
+          #   --input datasets/generate-chinese-sentences.json \
+          #   --evaluators=custom/chineseTextPresent,custom/validPinyinFormat,custom/outputStructureValid,custom/sentenceGenerationQuality \
+          #   --batchSize 10 \
+          #   --output eval-results/generate-sentences-results.json || { echo "⚠️ generateChineseSentences evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }
+
+          # echo "Running analyzeCollocation evaluation..."
+          # genkit eval:flow analyzeCollocation \
+          #   --input datasets/analyze-collocation.json \
+          #   --evaluators=custom/chineseTextPresent,custom/englishTranslationPresent,custom/outputStructureValid \
+          #   --batchSize 10 \
+          #   --output eval-results/collocation-results.json || { echo "⚠️ analyzeCollocation evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }
+
+          # echo "Running explainWordInContext evaluation..."
+          # genkit eval:flow explainWordInContext \
+          #   --input datasets/explain-word-in-context.json \
+          #   --evaluators=custom/chineseTextPresent,custom/englishTranslationPresent,custom/outputStructureValid \
+          #   --batchSize 10 \
+          #   --output eval-results/word-context-results.json || { echo "⚠️ explainWordInContext evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }
 
           if [ $EVAL_ERRORS -gt 0 ]; then
             echo "⚠️ $EVAL_ERRORS evaluation(s) had errors - check results for details"

diff --git a/functions/datasets/explain-chinese.json b/functions/datasets/explain-chinese.json
@@ -703,5 +703,145 @@
             ],
             "expectedTranslation": "I would rather go hungry than eat that"
         }
+    },
+    {
+        "input": "他跑的很快",
+        "reference": {
+            "expectedGrammarPoints": [
+                "得 degree complement"
+            ],
+            "expectedTranslation": "He runs very fast",
+            "expectedError": {
+                "type": "wrong particle",
+                "description": "Should use 得 instead of 的 for degree complements after verbs",
+                "correction": "他跑得很快"
+            }
+        }
+    },
+    {
+        "input": "天气是很好",
+        "reference": {
+            "expectedGrammarPoints": [
+                "Adjectival predicate"
+            ],
+            "expectedTranslation": "The weather is very good",
+            "expectedError": {
+                "type": "unnecessary 是",
+                "description": "是 should not be used before adjectives in Chinese adjectival predicate sentences",
+                "correction": "天气很好"
+            }
+        }
+    },
+    {
+        "input": "我买了一个书",
+        "reference": {
+            "expectedGrammarPoints": [
+                "Measure words"
+            ],
+            "expectedTranslation": "I bought a book",
+            "expectedError": {
+                "type": "wrong measure word",
+                "description": "书 (book) requires measure word 本, not 个",
+                "correction": "我买了一本书"
+            }
+        }
+    },
+    {
+        "input": "我昨天不去学校",
+        "reference": {
+            "expectedGrammarPoints": [
+                "Negation of past actions"
+            ],
+            "expectedTranslation": "I didn't go to school yesterday",
+            "expectedError": {
+                "type": "wrong negation",
+                "description": "Past actions should be negated with 没(有), not 不",
+                "correction": "我昨天没去学校"
+            }
+        }
+    },
+    {
+        "input": "他很高兴地笑",
+        "reference": {
+            "expectedGrammarPoints": [
+                "地 adverbial particle"
+            ],
+            "expectedTranslation": "He laughed happily",
+            "expectedError": {
+                "type": "unnatural phrasing",
+                "description": "While grammatically acceptable, native speakers would more naturally say 他高兴地笑了 or 他开心地笑",
+                "correction": "他高兴地笑了"
+            }
+        }
+    },
+    {
+        "input": "在见",
+        "reference": {
+            "expectedGrammarPoints": [
+                "Common farewell expression"
+            ],
+            "expectedTranslation": "Goodbye / See you again",
+            "expectedError": {
+                "type": "typo/homophone error",
+                "description": "在 (at/in) is wrong; should be 再 (again)",
+                "correction": "再见"
+            }
+        }
+    },
+    {
+        "input": "我想买那个红的衣服",
+        "reference": {
+            "expectedGrammarPoints": [
+                "的 with adjectives modifying nouns"
+            ],
+            "expectedTranslation": "I want to buy that red piece of clothing",
+            "expectedError": {
+                "type": "unnecessary 的",
+                "description": "Single-syllable adjectives like 红 directly modify nouns without 的: 红衣服. 衣服 also takes the measure word 件 rather than 个",
+                "correction": "我想买那件红衣服"
+            }
+        }
+    },
+    {
+        "input": "我吃饭了已经",
+        "reference": {
+            "expectedGrammarPoints": [
+                "Word order with 已经"
+            ],
+            "expectedTranslation": "I have already eaten",
+            "expectedError": {
+                "type": "word order error",
+                "description": "已经 should come before the verb, not at the end of the sentence",
+                "correction": "我已经吃饭了"
+            }
+        }
+    },
+    {
+        "input": "她的很漂亮",
+        "reference": {
+            "expectedGrammarPoints": [
+                "Adjective predicates"
+            ],
+            "expectedTranslation": "She is very beautiful",
+            "expectedError": {
+                "type": "misplaced 的",
+                "description": "的 should not be placed after a pronoun when followed by an adjective predicate; 的 makes it possessive",
+                "correction": "她很漂亮"
+            }
+        }
+    },
+    {
+        "input": "我给你打电话明天",
+        "reference": {
+            "expectedGrammarPoints": [
+                "Time word placement"
+            ],
+            "expectedTranslation": "I will call you tomorrow",
+            "expectedError": {
+                "type": "word order error",
+                "description": "Time words like 明天 should come before the verb phrase or at the beginning of the sentence",
+                "correction": "我明天给你打电话"
+            }
+        }
     }
 ]
diff --git a/functions/prompts/explain-chinese.prompt b/functions/prompts/explain-chinese.prompt
@@ -6,4 +6,14 @@ input:
 output:
   schema: ChineseExplanationSchema
 ---
+{{role "system"}}
+You are an expert Chinese language tutor. You have thoroughly studied the Chinese Grammar Wiki and HSK Standard Course textbooks, and you use their terminology and teaching approaches.
+
+Prioritize accuracy over comprehensiveness — only explain what you are confident about.
+When providing pinyin, ensure it exactly matches every character in the input.
+
+{{role "user"}}
 Explain the Chinese text {{text}}.
+
+In your explanation:
+* If you detect any typos, grammatical errors, or unnatural phrasing in the text, explain what is wrong and how to fix it.
diff --git a/functions/src/genkit-eval.ts b/functions/src/genkit-eval.ts
@@ -177,14 +177,33 @@ export const grammarExplanationQualityEvaluator = ai.defineEvaluator(
         const output = typeof datapoint.output === 'string' ?
             datapoint.output :
             JSON.stringify(datapoint.output);
+
+        // Check if reference includes expected error information
+        const reference = datapoint.reference as {
+            expectedError?: {
+                type: string;
+                description: string;
+                correction: string;
+            };
+        } | undefined;
+
+        const hasExpectedError = reference?.expectedError != null;
+        const errorContext = hasExpectedError
+            ? `\n\nIMPORTANT: The input text contains an intentional error that the tool should identify:
+- Error type: ${reference!.expectedError!.type}
+- What's wrong: ${reference!.expectedError!.description}
+- Correct form: ${reference!.expectedError!.correction}
+
+The tool MUST identify and explain this error to receive a high score. If the output does not mention this error, give a score of 1 or 2.`
+            : '';
 
         const { output: evalResult } = await ai.generate({
             model: vertexAI.model('gemini-3-pro-preview'),
             prompt: `You are evaluating a Chinese language learning tool's output quality.
 
 Input (Chinese text to explain): ${input}
 
-Output (explanation provided): ${output}
+Output (explanation provided): ${output}${errorContext}
 
 Evaluate the quality of this explanation on a scale of 1-5:
 1 = Poor: Incorrect, confusing, or unhelpful
@@ -197,7 +216,8 @@ Consider:
 - Is the translation accurate?
 - Are grammar explanations clear and correct?
 - Is the pinyin accurate?
-- Would this help a learner understand the text?`,
+- Would this help a learner understand the text?
+- If the input contains errors, does the output identify and explain them?`,
             output: {
                 schema: z.object({
                     score: z.number().min(1).max(5).describe('Quality score from 1-5'),