Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 27 additions & 27 deletions .github/workflows/eval-functions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -111,33 +111,33 @@ jobs:
--batchSize 10 \
--output eval-results/explain-chinese-results.json || { echo "⚠️ explainText evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }

echo "Running explainEnglish evaluation..."
genkit eval:flow explainEnglish \
--input datasets/explain-english.json \
--evaluators=custom/chineseTextPresent,custom/validPinyinFormat,custom/outputStructureValid,custom/grammarExplanationQuality \
--batchSize 10 \
--output eval-results/explain-english-results.json || { echo "⚠️ explainEnglish evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }

echo "Running generateChineseSentences evaluation..."
genkit eval:flow generateChineseSentences \
--input datasets/generate-chinese-sentences.json \
--evaluators=custom/chineseTextPresent,custom/validPinyinFormat,custom/outputStructureValid,custom/sentenceGenerationQuality \
--batchSize 10 \
--output eval-results/generate-sentences-results.json || { echo "⚠️ generateChineseSentences evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }

echo "Running analyzeCollocation evaluation..."
genkit eval:flow analyzeCollocation \
--input datasets/analyze-collocation.json \
--evaluators=custom/chineseTextPresent,custom/englishTranslationPresent,custom/outputStructureValid \
--batchSize 10 \
--output eval-results/collocation-results.json || { echo "⚠️ analyzeCollocation evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }

echo "Running explainWordInContext evaluation..."
genkit eval:flow explainWordInContext \
--input datasets/explain-word-in-context.json \
--evaluators=custom/chineseTextPresent,custom/englishTranslationPresent,custom/outputStructureValid \
--batchSize 10 \
--output eval-results/word-context-results.json || { echo "⚠️ explainWordInContext evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }
# echo "Running explainEnglish evaluation..."
# genkit eval:flow explainEnglish \
# --input datasets/explain-english.json \
# --evaluators=custom/chineseTextPresent,custom/validPinyinFormat,custom/outputStructureValid,custom/grammarExplanationQuality \
# --batchSize 10 \
# --output eval-results/explain-english-results.json || { echo "⚠️ explainEnglish evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }

# echo "Running generateChineseSentences evaluation..."
# genkit eval:flow generateChineseSentences \
# --input datasets/generate-chinese-sentences.json \
# --evaluators=custom/chineseTextPresent,custom/validPinyinFormat,custom/outputStructureValid,custom/sentenceGenerationQuality \
# --batchSize 10 \
# --output eval-results/generate-sentences-results.json || { echo "⚠️ generateChineseSentences evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }

# echo "Running analyzeCollocation evaluation..."
# genkit eval:flow analyzeCollocation \
# --input datasets/analyze-collocation.json \
# --evaluators=custom/chineseTextPresent,custom/englishTranslationPresent,custom/outputStructureValid \
# --batchSize 10 \
# --output eval-results/collocation-results.json || { echo "⚠️ analyzeCollocation evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }

# echo "Running explainWordInContext evaluation..."
# genkit eval:flow explainWordInContext \
# --input datasets/explain-word-in-context.json \
# --evaluators=custom/chineseTextPresent,custom/englishTranslationPresent,custom/outputStructureValid \
# --batchSize 10 \
# --output eval-results/word-context-results.json || { echo "⚠️ explainWordInContext evaluation had errors"; EVAL_ERRORS=$((EVAL_ERRORS+1)); }

if [ $EVAL_ERRORS -gt 0 ]; then
echo "⚠️ $EVAL_ERRORS evaluation(s) had errors - check results for details"
Expand Down
140 changes: 140 additions & 0 deletions functions/datasets/explain-chinese.json
Original file line number Diff line number Diff line change
Expand Up @@ -703,5 +703,145 @@
],
"expectedTranslation": "I would rather go hungry than eat that"
}
},
{
"input": "他跑的很快",
"reference": {
"expectedGrammarPoints": [
"得 degree complement"
],
"expectedTranslation": "He runs very fast",
"expectedError": {
"type": "wrong particle",
"description": "Should use 得 instead of 的 for degree complements after verbs",
"correction": "他跑得很快"
}
}
},
{
"input": "天气是很好",
"reference": {
"expectedGrammarPoints": [
"Adjectival predicate"
],
"expectedTranslation": "The weather is very good",
"expectedError": {
"type": "unnecessary 是",
"description": "是 should not be used before adjectives in Chinese adjectival predicate sentences",
"correction": "天气很好"
}
}
},
{
"input": "我买了一个书",
"reference": {
"expectedGrammarPoints": [
"Measure words"
],
"expectedTranslation": "I bought a book",
"expectedError": {
"type": "wrong measure word",
"description": "书 (book) requires measure word 本, not 个",
"correction": "我买了一本书"
}
}
},
{
"input": "我昨天不去学校",
"reference": {
"expectedGrammarPoints": [
"Negation of past actions"
],
"expectedTranslation": "I didn't go to school yesterday",
"expectedError": {
"type": "wrong negation",
"description": "Past actions should be negated with 没(有), not 不",
"correction": "我昨天没去学校"
}
}
},
{
"input": "他很高兴地笑",
"reference": {
"expectedGrammarPoints": [
"地 adverbial particle"
],
"expectedTranslation": "He laughed happily",
"expectedError": {
"type": "unnatural phrasing",
"description": "While grammatically acceptable, native speakers would more naturally say 他高兴地笑了 or 他开心地笑",
"correction": "他高兴地笑了"
}
}
},
{
"input": "在见",
"reference": {
"expectedGrammarPoints": [
"Common farewell expression"
],
"expectedTranslation": "Goodbye / See you again",
"expectedError": {
"type": "typo/homophone error",
"description": "在 (at/in) is wrong; should be 再 (again)",
"correction": "再见"
}
}
},
{
"input": "我想买那个红的衣服",
"reference": {
"expectedGrammarPoints": [
"的 with adjectives modifying nouns"
],
"expectedTranslation": "I want to buy that red piece of clothing",
"expectedError": {
"type": "unnecessary 的",
"description": "Single-syllable adjectives like 红 directly modify nouns without 的: 红衣服. In addition, 衣服 takes the measure word 件 rather than 个: 那件红衣服",
"correction": "我想买那件红衣服"
}
}
},
{
"input": "我吃饭了已经",
"reference": {
"expectedGrammarPoints": [
"Word order with 已经"
],
"expectedTranslation": "I have already eaten",
"expectedError": {
"type": "word order error",
"description": "已经 should come before the verb, not at the end of the sentence",
"correction": "我已经吃饭了"
}
}
},
{
"input": "她的很漂亮",
"reference": {
"expectedGrammarPoints": [
"Adjective predicates"
],
"expectedTranslation": "She is very beautiful",
"expectedError": {
"type": "misplaced 的",
"description": "的 should not be placed after a pronoun when followed by an adjective predicate; 的 makes it possessive",
"correction": "她很漂亮"
}
}
},
{
"input": "我给你打电话明天",
"reference": {
"expectedGrammarPoints": [
"Time word placement"
],
"expectedTranslation": "I will call you tomorrow",
"expectedError": {
"type": "word order error",
"description": "Time words like 明天 should come before the verb phrase or at the beginning of the sentence",
"correction": "我明天给你打电话"
}
}
}
]
10 changes: 10 additions & 0 deletions functions/prompts/explain-chinese.prompt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,14 @@ input:
output:
schema: ChineseExplanationSchema
---
{{role "system"}}
You are an expert Chinese language tutor. You have thoroughly studied the Chinese Grammar Wiki and HSK Standard Course textbooks, and you use their terminology and teaching approaches.

Prioritize accuracy over comprehensiveness — only explain what you are confident about.
When providing pinyin, ensure it exactly matches every character in the input.

{{role "user"}}
Explain the Chinese text {{text}}.

In your explanation:
* if you detect any typos, grammatical errors, or unnatural phrasing, please explain what's wrong and how to fix it.
24 changes: 22 additions & 2 deletions functions/src/genkit-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -177,14 +177,33 @@ export const grammarExplanationQualityEvaluator = ai.defineEvaluator(
const output = typeof datapoint.output === 'string' ?
datapoint.output :
JSON.stringify(datapoint.output);

// Check if reference includes expected error information
const reference = datapoint.reference as {
expectedError?: {
type: string;
description: string;
correction: string;
};
} | undefined;

const hasExpectedError = reference?.expectedError != null;
const errorContext = hasExpectedError
? `\n\nIMPORTANT: The input text contains an intentional error that the tool should identify:
- Error type: ${reference!.expectedError!.type}
- What's wrong: ${reference!.expectedError!.description}
- Correct form: ${reference!.expectedError!.correction}

The tool MUST identify and explain this error to receive a high score. If the output does not mention this error, give a score of 1 or 2.`
: '';

const { output: evalResult } = await ai.generate({
model: vertexAI.model('gemini-3-pro-preview'),
prompt: `You are evaluating a Chinese language learning tool's output quality.

Input (Chinese text to explain): ${input}

Output (explanation provided): ${output}
Output (explanation provided): ${output}${errorContext}

Evaluate the quality of this explanation on a scale of 1-5:
1 = Poor: Incorrect, confusing, or unhelpful
Expand All @@ -197,7 +216,8 @@ Consider:
- Is the translation accurate?
- Are grammar explanations clear and correct?
- Is the pinyin accurate?
- Would this help a learner understand the text?`,
- Would this help a learner understand the text?
- If the input contains errors, does the output identify and explain them?`,
output: {
schema: z.object({
score: z.number().min(1).max(5).describe('Quality score from 1-5'),
Expand Down