diff --git a/.github/workflows/eval-functions.yml b/.github/workflows/eval-functions.yml index 796b66c..d69075e 100644 --- a/.github/workflows/eval-functions.yml +++ b/.github/workflows/eval-functions.yml @@ -111,33 +111,33 @@ jobs: --batchSize 10 \ --output eval-results/explain-chinese-results.json || { echo "⚠️ explainText evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); } - echo "Running explainEnglish evaluation..." - genkit eval:flow explainEnglish \ - --input datasets/explain-english.json \ - --evaluators=custom/chineseTextPresent,custom/validPinyinFormat,custom/outputStructureValid,custom/grammarExplanationQuality \ - --batchSize 10 \ - --output eval-results/explain-english-results.json || { echo "⚠️ explainEnglish evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); } - - echo "Running generateChineseSentences evaluation..." - genkit eval:flow generateChineseSentences \ - --input datasets/generate-chinese-sentences.json \ - --evaluators=custom/chineseTextPresent,custom/validPinyinFormat,custom/outputStructureValid,custom/sentenceGenerationQuality \ - --batchSize 10 \ - --output eval-results/generate-sentences-results.json || { echo "⚠️ generateChineseSentences evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); } - - echo "Running analyzeCollocation evaluation..." - genkit eval:flow analyzeCollocation \ - --input datasets/analyze-collocation.json \ - --evaluators=custom/chineseTextPresent,custom/englishTranslationPresent,custom/outputStructureValid \ - --batchSize 10 \ - --output eval-results/collocation-results.json || { echo "⚠️ analyzeCollocation evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); } - - echo "Running explainWordInContext evaluation..." 
- genkit eval:flow explainWordInContext \ - --input datasets/explain-word-in-context.json \ - --evaluators=custom/chineseTextPresent,custom/englishTranslationPresent,custom/outputStructureValid \ - --batchSize 10 \ - --output eval-results/word-context-results.json || { echo "⚠️ explainWordInContext evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); } + # echo "Running explainEnglish evaluation..." + # genkit eval:flow explainEnglish \ + # --input datasets/explain-english.json \ + # --evaluators=custom/chineseTextPresent,custom/validPinyinFormat,custom/outputStructureValid,custom/grammarExplanationQuality \ + # --batchSize 10 \ + # --output eval-results/explain-english-results.json || { echo "⚠️ explainEnglish evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); } + + # echo "Running generateChineseSentences evaluation..." + # genkit eval:flow generateChineseSentences \ + # --input datasets/generate-chinese-sentences.json \ + # --evaluators=custom/chineseTextPresent,custom/validPinyinFormat,custom/outputStructureValid,custom/sentenceGenerationQuality \ + # --batchSize 10 \ + # --output eval-results/generate-sentences-results.json || { echo "⚠️ generateChineseSentences evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); } + + # echo "Running analyzeCollocation evaluation..." + # genkit eval:flow analyzeCollocation \ + # --input datasets/analyze-collocation.json \ + # --evaluators=custom/chineseTextPresent,custom/englishTranslationPresent,custom/outputStructureValid \ + # --batchSize 10 \ + # --output eval-results/collocation-results.json || { echo "⚠️ analyzeCollocation evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); } + + # echo "Running explainWordInContext evaluation..." 
+ # genkit eval:flow explainWordInContext \ + # --input datasets/explain-word-in-context.json \ + # --evaluators=custom/chineseTextPresent,custom/englishTranslationPresent,custom/outputStructureValid \ + # --batchSize 10 \ + # --output eval-results/word-context-results.json || { echo "⚠️ explainWordInContext evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); } if [ $EVAL_ERRORS -gt 0 ]; then echo "⚠️ $EVAL_ERRORS evaluation(s) had errors - check results for details" diff --git a/functions/datasets/explain-chinese.json b/functions/datasets/explain-chinese.json index 0ec20a4..5a6d80b 100644 --- a/functions/datasets/explain-chinese.json +++ b/functions/datasets/explain-chinese.json @@ -703,5 +703,145 @@ ], "expectedTranslation": "I would rather go hungry than eat that" } + }, + { + "input": "他跑的很快", + "reference": { + "expectedGrammarPoints": [ + "得 degree complement" + ], + "expectedTranslation": "He runs very fast", + "expectedError": { + "type": "wrong particle", + "description": "Should use 得 instead of 的 for degree complements after verbs", + "correction": "他跑得很快" + } + } + }, + { + "input": "天气是很好", + "reference": { + "expectedGrammarPoints": [ + "Adjectival predicate" + ], + "expectedTranslation": "The weather is very good", + "expectedError": { + "type": "unnecessary 是", + "description": "是 should not be used before adjectives in Chinese adjectival predicate sentences", + "correction": "天气很好" + } + } + }, + { + "input": "我买了一个书", + "reference": { + "expectedGrammarPoints": [ + "Measure words" + ], + "expectedTranslation": "I bought a book", + "expectedError": { + "type": "wrong measure word", + "description": "书 (book) requires measure word 本, not 个", + "correction": "我买了一本书" + } + } + }, + { + "input": "我昨天不去学校", + "reference": { + "expectedGrammarPoints": [ + "Negation of past actions" + ], + "expectedTranslation": "I didn't go to school yesterday", + "expectedError": { + "type": "wrong negation", + "description": "Past actions should be negated with 
没(有), not 不", + "correction": "我昨天没去学校" + } + } + }, + { + "input": "他很高兴地笑", + "reference": { + "expectedGrammarPoints": [ + "地 adverbial particle" + ], + "expectedTranslation": "He laughed happily", + "expectedError": { + "type": "unnatural phrasing", + "description": "While grammatically acceptable, native speakers would more naturally say 他高兴地笑了 or 他开心地笑", + "correction": "他高兴地笑了" + } + } + }, + { + "input": "在见", + "reference": { + "expectedGrammarPoints": [ + "Common farewell expression" + ], + "expectedTranslation": "Goodbye / See you again", + "expectedError": { + "type": "typo/homophone error", + "description": "在 (at/in) is wrong; should be 再 (again)", + "correction": "再见" + } + } + }, + { + "input": "我想买那个红的衣服", + "reference": { + "expectedGrammarPoints": [ + "的 with adjectives modifying nouns" + ], + "expectedTranslation": "I want to buy that red piece of clothing", + "expectedError": { + "type": "unnecessary 的", + "description": "Single-syllable adjectives like 红 directly modify nouns without 的: 红衣服; the correction also replaces 个 with 件, the measure word for 衣服", + "correction": "我想买那件红衣服" + } + } + }, + { + "input": "我吃饭了已经", + "reference": { + "expectedGrammarPoints": [ + "Word order with 已经" + ], + "expectedTranslation": "I have already eaten", + "expectedError": { + "type": "word order error", + "description": "已经 should come before the verb, not at the end of the sentence", + "correction": "我已经吃饭了" + } + } + }, + { + "input": "她的很漂亮", + "reference": { + "expectedGrammarPoints": [ + "Adjective predicates" + ], + "expectedTranslation": "She is very beautiful", + "expectedError": { + "type": "misplaced 的", + "description": "的 should not be placed after a pronoun when followed by an adjective predicate; 的 makes it possessive", + "correction": "她很漂亮" + } + } + }, + { + "input": "我给你打电话明天", + "reference": { + "expectedGrammarPoints": [ + "Time word placement" + ], + "expectedTranslation": "I will call you tomorrow", + "expectedError": { + "type": "word order error", + "description": "Time words like 明天 should come 
before the verb phrase or at the beginning of the sentence", + "correction": "我明天给你打电话" + } + } } ] \ No newline at end of file diff --git a/functions/prompts/explain-chinese.prompt b/functions/prompts/explain-chinese.prompt index 36ba166..cb8a09b 100644 --- a/functions/prompts/explain-chinese.prompt +++ b/functions/prompts/explain-chinese.prompt @@ -6,4 +6,14 @@ input: output: schema: ChineseExplanationSchema --- +{{role "system"}} +You are an expert Chinese language tutor. You have thoroughly studied the Chinese Grammar Wiki and HSK Standard Course textbooks, and you use their terminology and teaching approaches. + +Prioritize accuracy over comprehensiveness — only explain what you are confident about. +When providing pinyin, ensure it exactly matches every character in the input. + +{{role "user"}} Explain the Chinese text {{text}}. + +In your explanation: +* if there are any typos, grammatical errors, or unnatural phrasing you detect, please explain what's wrong and how to fix it. diff --git a/functions/src/genkit-eval.ts b/functions/src/genkit-eval.ts index 80a2f91..b167a56 100644 --- a/functions/src/genkit-eval.ts +++ b/functions/src/genkit-eval.ts @@ -177,6 +177,25 @@ export const grammarExplanationQualityEvaluator = ai.defineEvaluator( const output = typeof datapoint.output === 'string' ? datapoint.output : JSON.stringify(datapoint.output); + + // Check if reference includes expected error information + const reference = datapoint.reference as { + expectedError?: { + type: string; + description: string; + correction: string; + }; + } | undefined; + + const hasExpectedError = reference?.expectedError != null; + const errorContext = hasExpectedError + ? 
`\n\nIMPORTANT: The input text contains an intentional error that the tool should identify: +- Error type: ${reference!.expectedError!.type} +- What's wrong: ${reference!.expectedError!.description} +- Correct form: ${reference!.expectedError!.correction} + +The tool MUST identify and explain this error to receive a high score. If the output does not mention this error, give a score of 1 or 2.` + : ''; const { output: evalResult } = await ai.generate({ model: vertexAI.model('gemini-3-pro-preview'), @@ -184,7 +203,7 @@ export const grammarExplanationQualityEvaluator = ai.defineEvaluator( Input (Chinese text to explain): ${input} -Output (explanation provided): ${output} +Output (explanation provided): ${output}${errorContext} Evaluate the quality of this explanation on a scale of 1-5: 1 = Poor: Incorrect, confusing, or unhelpful @@ -197,7 +216,8 @@ Consider: - Is the translation accurate? - Are grammar explanations clear and correct? - Is the pinyin accurate? -- Would this help a learner understand the text?`, +- Would this help a learner understand the text? +- If the input contains errors, does the output identify and explain them?`, output: { schema: z.object({ score: z.number().min(1).max(5).describe('Quality score from 1-5'),