|
| 1 | +from dingo.io.input import RequiredField |
| 2 | +from dingo.model import Model |
| 3 | +from dingo.model.llm.text_quality.base_text_quality import BaseTextQuality |
| 4 | + |
| 5 | + |
@Model.llm_register("LLMTextEquation")
class LLMTextEquation(BaseTextQuality):
    """LLM-based quality check for a single extracted equation element.

    The class only supplies metadata, the required input field and the
    prompt; response parsing is inherited from ``BaseTextQuality``.
    """

    # Metadata for documentation generation.
    # metric_name and description identify this metric (they previously
    # duplicated the generic text-quality metric's values).
    # NOTE(review): the paper, example and evaluation references below look
    # carried over from the text-quality metric -- confirm they apply here.
    _metric_info = {
        "category": "Pretrain Text Quality Assessment Metrics",
        "metric_name": "LLMTextEquation",
        "description": "LLM-based quality check for extracted equation elements, detecting LaTeX syntax errors, OCR/recognition errors, and semantically incomplete or invalid formulas",
        "paper_title": "WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages",
        "paper_url": "https://arxiv.org/abs/2501.14506",
        "paper_authors": "Yu et al., 2025",
        "examples": "examples/llm_and_rule/llm_local.py",
        "evaluation_results": "docs/eval/prompt/redpajama_data_evaluated_by_prompt.md",
    }

    # Only the content field of the input record is required.
    _required_fields = [RequiredField.CONTENT]

    # Raw string so the LaTeX backslashes (\frac, \begin, ...) reach the
    # model verbatim instead of being read as Python escape sequences.
    # NOTE(review): the secondary-type list defines `none`, while the output
    # rules ask for name "None" on a pass -- confirm which spelling the
    # downstream parsing expects.
    prompt = r"""
你是一个专业的数学、化学等学科的公式质检员。我会给你一个从文档中提取的 equation 类型元素(JSON 格式),请对其 text 字段进行质量检测。

## 检测维度

1. **语法问题**
   - LaTeX 命令拼写错误(如 \frace 代替 \frac)
   - 括号未正确配对闭合({}、[]、())
   - 环境标签不匹配(如 \begin{} 与 \end{} 不对应)

2. **识别问题**
   - 疑似 OCR 识别错误(如字母与符号混淆:x 与 ×、- 与 −、l 与 1、O 与 0 等)
   - 公式内容明显残缺或截断
   - 出现乱码或无意义字符

3. **语义问题**
   - 公式结构不完整,无法表达完整的数学含义
   - 运算符或符号使用明显不合数学规范

## 一级错误类型(type)

- `syntax`:语法问题
- `recognition`:识别问题
- `semantic`:语义问题

## 二级错误类型(name)

- `command_error`:LaTeX 命令拼写错误
- `bracket_mismatch`:括号未正确配对
- `env_mismatch`:环境标签不匹配
- `ocr_error`:OCR 字符识别错误
- `truncated_content`:公式残缺或截断
- `garbled_text`:乱码或无意义字符
- `incomplete_expression`:公式结构不完整
- `invalid_notation`:数学符号使用不规范
- `none`:无问题

## Output Format

Return JSON only: {"score": 0/1, "type": "", "name": "", "reason": ""}

score 类型必须为int;
score 为 1 表示通过,type 填 "Good",name 填 "None",reason 说明公式正常的依据;
score 为 0 表示不通过,type 和 name 填对应的错误类型,reason 说明判断依据并指出具体的问题位置或内容。

## Input content to evaluate:

"""
    # process_response method is now inherited from BaseTextQuality
0 commit comments