diff --git a/.claude/skills/dingo-verify/scripts/fact_check.py b/.claude/skills/dingo-verify/scripts/fact_check.py index 23748206..71dd158d 100644 --- a/.claude/skills/dingo-verify/scripts/fact_check.py +++ b/.claude/skills/dingo-verify/scripts/fact_check.py @@ -169,13 +169,11 @@ def build_config( "key": api_key, "model": model, "api_url": api_url, - "parameters": { - "temperature": 0, - "agent_config": { - "max_concurrent_claims": max_concurrent, - "max_iterations": 50, - "tools": tools_config, - } + "temperature": 0, + "agent_config": { + "max_concurrent_claims": max_concurrent, + "max_iterations": 50, + "tools": tools_config, } } }] diff --git a/clawhub/scripts/fact_check.py b/clawhub/scripts/fact_check.py index b2865496..a4930f14 100644 --- a/clawhub/scripts/fact_check.py +++ b/clawhub/scripts/fact_check.py @@ -164,13 +164,11 @@ def build_config( "key": api_key, "model": model, "api_url": api_url, - "parameters": { - "temperature": 0, - "agent_config": { - "max_concurrent_claims": max_concurrent, - "max_iterations": 50, - "tools": tools_config, - } + "temperature": 0, + "agent_config": { + "max_concurrent_claims": max_concurrent, + "max_iterations": 50, + "tools": tools_config, } } }] diff --git a/dingo/config/input_args.py b/dingo/config/input_args.py index df1007a4..de589a69 100644 --- a/dingo/config/input_args.py +++ b/dingo/config/input_args.py @@ -102,10 +102,11 @@ class EmbeddingConfigArgs(BaseModel): class EvaluatorLLMArgs(BaseModel): + model_config = {"extra": "allow"} + model: Optional[str] = None key: Optional[str] = None api_url: Optional[str] = None - parameters: Optional[dict] = None embedding_config: Optional[EmbeddingConfigArgs] = None diff --git a/dingo/model/llm/agent/agent_article_fact_checker.py b/dingo/model/llm/agent/agent_article_fact_checker.py index 0a829073..b72b9787 100644 --- a/dingo/model/llm/agent/agent_article_fact_checker.py +++ b/dingo/model/llm/agent/agent_article_fact_checker.py @@ -343,21 +343,21 @@ class 
ArticleFactChecker(BaseAgent): "config": { "key": "your-openai-api-key", "model": "gpt-4o-mini", - "parameters": { - "agent_config": { - "max_iterations": 10, - "tools": { - "claims_extractor": { - "api_key": "your-openai-api-key", - "max_claims": 50, - "claim_types": ["factual", "institutional", "statistical", "attribution"] - }, - "tavily_search": { - "api_key": "your-tavily-api-key", - "max_results": 5 - }, - "arxiv_search": {"max_results": 5} - } + "agent_config": { + "max_iterations": 10, + "overall_timeout": 900, + "max_concurrent_claims": 5, + "tools": { + "claims_extractor": { + "api_key": "your-openai-api-key", + "max_claims": 50, + "claim_types": ["factual", "institutional", "statistical", "attribution"] + }, + "tavily_search": { + "api_key": "your-tavily-api-key", + "max_results": 5 + }, + "arxiv_search": {"max_results": 5} } } } @@ -372,6 +372,9 @@ class ArticleFactChecker(BaseAgent): ] max_iterations = 10 # Allow more iterations for comprehensive checking max_concurrent_claims = 5 # Default parallel claim verification slots + overall_timeout = 900 # 15-minute wall-clock timeout for entire evaluation + _MIN_OVERALL_TIMEOUT = 30 # Floor: 30 seconds + _MAX_OVERALL_TIMEOUT = 7200 # Ceiling: 2 hours _required_fields = [RequiredField.CONTENT] # Article text @@ -394,8 +397,8 @@ def _get_output_dir(cls) -> Optional[str]: Returns: Output directory path (created if needed), or None if saving is disabled. 
""" - params = cls.dynamic_config.parameters or {} - agent_cfg = params.get('agent_config') or {} + extra_params = cls.dynamic_config.model_extra + agent_cfg = extra_params.get('agent_config') or {} explicit_path = agent_cfg.get('output_path') if explicit_path: @@ -816,24 +819,42 @@ def eval(cls, input_data: Data) -> EvalDetail: output_dir = cls._get_output_dir() if cls.dynamic_config: - if cls.dynamic_config.parameters is None: - cls.dynamic_config.parameters = {} - cls.dynamic_config.parameters.setdefault("temperature", 0) + if 'temperature' not in cls.dynamic_config.model_extra: + cls.dynamic_config.temperature = 0 if output_dir and input_data.content: cls._save_article_content(output_dir, input_data.content) + timeout = cls._get_overall_timeout() + + async def _run_with_timeout() -> EvalDetail: + return await asyncio.wait_for( + cls._async_eval(input_data, start_time, output_dir), + timeout=timeout, + ) + try: - return asyncio.run(cls._async_eval(input_data, start_time, output_dir)) + return asyncio.run(_run_with_timeout()) + except asyncio.TimeoutError: + elapsed = time.time() - start_time + log.warning(f"ArticleFactChecker: overall timeout exceeded ({elapsed:.1f}s / {timeout:.0f}s limit)") + return cls._create_overall_timeout_result(elapsed, timeout) except RuntimeError as e: # Fallback when called inside an already-running event loop (e.g. 
Jupyter, tests) if "cannot run" in str(e).lower() or "already running" in str(e).lower(): import concurrent.futures with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: - future = pool.submit( - lambda: asyncio.run(cls._async_eval(input_data, start_time, output_dir)) - ) - return future.result() + future = pool.submit(lambda: asyncio.run(_run_with_timeout())) + try: + # Extra margin so asyncio.wait_for fires before this outer timeout + return future.result(timeout=timeout + 30) + except (asyncio.TimeoutError, concurrent.futures.TimeoutError): + elapsed = time.time() - start_time + log.warning( + f"ArticleFactChecker: overall timeout exceeded " + f"({elapsed:.1f}s / {timeout:.0f}s limit, fallback path)" + ) + return cls._create_overall_timeout_result(elapsed, timeout) raise # --- Two-Phase Async Architecture Methods --- @@ -922,8 +943,8 @@ async def _async_extract_claims(cls, input_data: Data) -> List[Dict]: """ from dingo.model.llm.agent.tools.claims_extractor import ClaimsExtractor, ClaimsExtractorConfig - params = cls.dynamic_config.parameters or {} - agent_cfg = params.get('agent_config') or {} + extra_params = cls.dynamic_config.model_extra + agent_cfg = extra_params.get('agent_config') or {} extractor_cfg = agent_cfg.get('tools', {}).get('claims_extractor', {}) config_kwargs: Dict[str, Any] = { @@ -1019,10 +1040,30 @@ async def _async_verify_single_claim( @classmethod def _get_max_concurrent_claims(cls) -> int: """Read max_concurrent_claims from agent_config or use class default.""" - params = cls.dynamic_config.parameters or {} - agent_cfg = params.get('agent_config') or {} + extra_params = cls.dynamic_config.model_extra + agent_cfg = extra_params.get('agent_config') or {} return agent_cfg.get('max_concurrent_claims', cls.max_concurrent_claims) + @classmethod + def _get_overall_timeout(cls) -> float: + """Read overall_timeout from agent_config or use class default (900s). + + Returns: + Positive timeout in seconds, clamped to [30, 7200]. 
+ """ + extra_params = cls.dynamic_config.model_extra + agent_cfg = extra_params.get('agent_config') or {} + raw = agent_cfg.get('overall_timeout', cls.overall_timeout) + try: + timeout = float(raw) + except (TypeError, ValueError): + log.warning(f"Invalid overall_timeout={raw!r}, using default {cls.overall_timeout}s") + return float(cls.overall_timeout) + clamped = max(cls._MIN_OVERALL_TIMEOUT, min(timeout, cls._MAX_OVERALL_TIMEOUT)) + if clamped != timeout: + log.warning(f"overall_timeout={timeout} out of range, clamped to {clamped}s") + return float(clamped) + @classmethod def _parse_claim_json_robust(cls, output: Optional[str]) -> Dict[str, Any]: """ @@ -1795,6 +1836,38 @@ def _create_error_result(cls, error_message: str) -> EvalDetail: ] return result + @classmethod + def _create_overall_timeout_result(cls, elapsed: float, timeout: float) -> EvalDetail: + """ + Create error result when overall wall-clock timeout is exceeded. + + Args: + elapsed: Actual elapsed time in seconds + timeout: Configured timeout limit in seconds + + Returns: + EvalDetail with timeout error status + """ + minutes, seconds = divmod(int(timeout), 60) + limit_str = f"{minutes}m{seconds}s" if minutes else f"{int(timeout)}s" + result = EvalDetail(metric=cls.__name__) + result.status = True + result.label = [f"{QualityLabel.QUALITY_BAD_PREFIX}AGENT_OVERALL_TIMEOUT"] + result.reason = [ + "Article Fact-Checking Failed: Overall Timeout Exceeded", + "=" * 70, + f"Execution exceeded the {int(timeout)}s ({limit_str}) wall-clock limit.", + f"Elapsed time: {elapsed:.1f}s", + "", + "Recommendations:", + f" 1. Increase overall_timeout (current: {int(timeout)}s) in agent_config", + " 2. Reduce max_claims in claims_extractor config (e.g., 50 -> 20)", + " 3. Use a faster model (e.g., gpt-4o-mini instead of gpt-4o)", + " 4. Reduce max_concurrent_claims to lower API rate-limit pressure", + " 5. 
Split long articles into shorter sections", + ] + return result + @classmethod def plan_execution(cls, input_data: Data) -> List[Dict[str, Any]]: """ diff --git a/dingo/model/llm/agent/agent_fact_check.py b/dingo/model/llm/agent/agent_fact_check.py index 190e105d..5246d7a1 100644 --- a/dingo/model/llm/agent/agent_fact_check.py +++ b/dingo/model/llm/agent/agent_fact_check.py @@ -70,15 +70,13 @@ class AgentFactCheck(BaseAgent): "key": "your-openai-api-key", "api_url": "https://api.openai.com/v1", "model": "gpt-4.1-mini-2025-04-14", - "parameters": { - "agent_config": { - "max_iterations": 5, - "tools": { - "tavily_search": { - "api_key": "your-tavily-api-key", - "max_results": 5, - "search_depth": "advanced" - } + "agent_config": { + "max_iterations": 5, + "tools": { + "tavily_search": { + "api_key": "your-tavily-api-key", + "max_results": 5, + "search_depth": "advanced" } } } diff --git a/dingo/model/llm/agent/agent_hallucination.py b/dingo/model/llm/agent/agent_hallucination.py index 0e39c48c..fc22ba56 100644 --- a/dingo/model/llm/agent/agent_hallucination.py +++ b/dingo/model/llm/agent/agent_hallucination.py @@ -82,15 +82,13 @@ class AgentHallucination(BaseAgent): "key": "your-openai-api-key", "api_url": "https://api.openai.com/v1", "model": "gpt-4.1-mini-2025-04-14", - "parameters": { - "agent_config": { - "max_iterations": 3, - "tools": { - "tavily_search": { - "api_key": "your-tavily-api-key", - "max_results": 5, - "search_depth": "advanced" - } + "agent_config": { + "max_iterations": 3, + "tools": { + "tavily_search": { + "api_key": "your-tavily-api-key", + "max_results": 5, + "search_depth": "advanced" } } } diff --git a/dingo/model/llm/agent/agent_wrapper.py b/dingo/model/llm/agent/agent_wrapper.py index 4240c1ef..8c20247d 100644 --- a/dingo/model/llm/agent/agent_wrapper.py +++ b/dingo/model/llm/agent/agent_wrapper.py @@ -327,22 +327,22 @@ def get_openai_llm_from_dingo_config(dynamic_config): ) # Extract parameters - params = dynamic_config.parameters or {} 
+ extra_params = dynamic_config.model_extra # Create ChatOpenAI instance llm = ChatOpenAI( api_key=dynamic_config.key, base_url=dynamic_config.api_url, model=dynamic_config.model or "gpt-4.1-mini", - temperature=params.get("temperature", 0.3), - max_tokens=params.get("max_tokens", 4096), - top_p=params.get("top_p", 1.0), - timeout=params.get("timeout", 30) + temperature=extra_params.get("temperature", 0.3), + max_tokens=extra_params.get("max_tokens", 4096), + top_p=extra_params.get("top_p", 1.0), + timeout=extra_params.get("timeout", 30) ) log.debug( f"Created ChatOpenAI: model={dynamic_config.model}, " - f"temp={params.get('temperature', 0.3)}" + f"temp={extra_params.get('temperature', 0.3)}" ) return llm diff --git a/dingo/model/llm/agent/base_agent.py b/dingo/model/llm/agent/base_agent.py index 3832cefe..d3db23d2 100644 --- a/dingo/model/llm/agent/base_agent.py +++ b/dingo/model/llm/agent/base_agent.py @@ -146,7 +146,7 @@ def get_tool_config(cls, tool_name: str) -> Dict[str, Any]: Extract tool configuration from agent's dynamic_config. 
Configuration is expected in: - dynamic_config.parameters.agent_config.tools.{tool_name} + dynamic_config.agent_config.tools.{tool_name} Args: tool_name: Name of the tool @@ -154,8 +154,8 @@ def get_tool_config(cls, tool_name: str) -> Dict[str, Any]: Returns: Dict of configuration values for the tool """ - params = cls.dynamic_config.parameters or {} - agent_config = params.get('agent_config', {}) + extra_params = cls.dynamic_config.model_extra + agent_config = extra_params.get('agent_config', {}) tools_config = agent_config.get('tools', {}) return tools_config.get(tool_name, {}) @@ -184,8 +184,8 @@ def get_max_iterations(cls) -> int: Returns: Maximum number of iterations allowed """ - params = cls.dynamic_config.parameters or {} - agent_config = params.get('agent_config', {}) + extra_params = cls.dynamic_config.model_extra + agent_config = extra_params.get('agent_config', {}) return agent_config.get('max_iterations', cls.max_iterations) @classmethod diff --git a/dingo/model/llm/base_openai.py b/dingo/model/llm/base_openai.py index b6fbcd52..c3911699 100644 --- a/dingo/model/llm/base_openai.py +++ b/dingo/model/llm/base_openai.py @@ -82,22 +82,18 @@ def send_messages(cls, messages: List): else: model_name = cls.client.models.list().data[0].id - params = cls.dynamic_config.parameters - cls.validate_config(params) + extra_params = cls.dynamic_config.model_extra + cls.validate_config(extra_params) completions = cls.client.chat.completions.create( model=model_name, messages=messages, - temperature=params.get("temperature", 0.3) if params else 0.3, - top_p=params.get("top_p", 1) if params else 1, - max_tokens=params.get("max_tokens", 4000) if params else 4000, - presence_penalty=params.get("presence_penalty", 0) if params else 0, - frequency_penalty=params.get("frequency_penalty", 0) if params else 0, + **extra_params, ) if completions.choices[0].finish_reason == "length": raise ExceedMaxTokens( - f"Exceed max tokens: {params.get('max_tokens', 4000) if params else 
4000}" + f"Exceed max tokens: {extra_params.get('max_tokens', 4000)}" ) return str(completions.choices[0].message.content) diff --git a/dingo/model/llm/compare/llm_html_extract_compare_v2.py b/dingo/model/llm/compare/llm_html_extract_compare_v2.py index 65290d1f..a54c3fdb 100644 --- a/dingo/model/llm/compare/llm_html_extract_compare_v2.py +++ b/dingo/model/llm/compare/llm_html_extract_compare_v2.py @@ -25,9 +25,17 @@ class LLMHtmlExtractCompareV2(BaseOpenAI): 输入数据要求: - input_data.prompt: 工具A提取的文本 - input_data.content: 工具B提取的文本 - - input_data.raw_data.get("language", "en"): 语言类型 ("zh" 或 "en") + - language: 可选,来自 input_data.language 或 raw_data["language"],缺省为 "en"("zh" / "en") """ + _metric_info = { + 'category': 'Pretrain Text Quality Assessment Metrics', + 'metric_name': 'LLMHtmlExtractCompareV2', + 'description': 'Compares two HTML main-content extraction tools by computing text diffs and using LLM to judge which preserves more core information', + 'paper_title': '', + 'paper_url': '', + } + _required_fields = [RequiredField.CONTENT, RequiredField.PROMPT] prompt = { "content_en": r"""Please compare the following two texts, each extracted from the same webpage using different HTML parsing methods. Your task is to determine whether there is a difference in the core informational content between them. 
@@ -174,7 +182,8 @@ def build_messages(cls, input_data: Data) -> List: text_tool_b = input_data.content # 获取配置参数 - language = input_data.raw_data.get("language", "en") + raw_data = getattr(input_data, 'raw_data', {}) or {} + language = raw_data.get("language", getattr(input_data, 'language', "en")) # 计算文本差异 diff_result = cls.extract_text_diff(text_tool_a, text_tool_b) diff --git a/dingo/model/llm/instruction_quality/llm_instruction_clarity.py b/dingo/model/llm/instruction_quality/llm_instruction_clarity.py index 9c73a35d..526611d8 100644 --- a/dingo/model/llm/instruction_quality/llm_instruction_clarity.py +++ b/dingo/model/llm/instruction_quality/llm_instruction_clarity.py @@ -283,8 +283,8 @@ def process_response(cls, response: str) -> EvalDetail: # 判断是否通过(默认阈值 6.0) threshold = 6.0 - if hasattr(cls, 'dynamic_config') and cls.dynamic_config.parameters: - threshold = cls.dynamic_config.parameters.get('threshold', 6.0) + if hasattr(cls, 'dynamic_config'): + threshold = cls.dynamic_config.model_extra.get('threshold', 6.0) if score >= threshold: result.status = False diff --git a/dingo/model/llm/instruction_quality/llm_task_difficulty.py b/dingo/model/llm/instruction_quality/llm_task_difficulty.py index e3fe1db0..9c676396 100644 --- a/dingo/model/llm/instruction_quality/llm_task_difficulty.py +++ b/dingo/model/llm/instruction_quality/llm_task_difficulty.py @@ -321,14 +321,14 @@ def process_response(cls, response: str) -> EvalDetail: # 难度评估没有"通过/不通过"的概念,只是描述性的 # 但为了兼容框架,我们设置一个合理的默认行为 - # 可以通过 parameters 配置 min_difficulty 和 max_difficulty + # 可以通过 config 中的 min_difficulty 和 max_difficulty 配置难度范围 result.status = False # 默认不标记为问题 result.label = [f"TASK_DIFFICULTY.{difficulty_level.upper()}"] # 如果配置了难度范围要求,进行检查 - if hasattr(cls, 'dynamic_config') and cls.dynamic_config.parameters: - min_difficulty = cls.dynamic_config.parameters.get('min_difficulty', 0) - max_difficulty = cls.dynamic_config.parameters.get('max_difficulty', 10) + if hasattr(cls, 'dynamic_config'): + 
min_difficulty = cls.dynamic_config.model_extra.get('min_difficulty', 0) + max_difficulty = cls.dynamic_config.model_extra.get('max_difficulty', 10) if difficulty_score < min_difficulty: result.status = True diff --git a/dingo/model/llm/rag/llm_rag_answer_relevancy.py b/dingo/model/llm/rag/llm_rag_answer_relevancy.py index ec0e0cda..199187cd 100644 --- a/dingo/model/llm/rag/llm_rag_answer_relevancy.py +++ b/dingo/model/llm/rag/llm_rag_answer_relevancy.py @@ -242,14 +242,8 @@ def eval(cls, input_data: Data) -> EvalDetail: try: # 增加温度参数以提高问题生成的随机性 - if hasattr(cls, 'dynamic_config') and cls.dynamic_config.parameters: - if 'temperature' not in cls.dynamic_config.parameters: - cls.dynamic_config.parameters['temperature'] = 0.7 - else: - # 如果没有parameters,创建一个包含temperature的parameters - current_params = cls.dynamic_config.parameters or {} - current_params['temperature'] = 0.7 - cls.dynamic_config.parameters = current_params + if hasattr(cls, 'dynamic_config') and 'temperature' not in cls.dynamic_config.model_extra: + cls.dynamic_config.temperature = 0.7 # 生成多个相关问题 generated_questions = cls.generate_multiple_questions(input_data, cls.strictness) @@ -263,10 +257,9 @@ def eval(cls, input_data: Data) -> EvalDetail: # 根据分数判断是否通过,默认阈值为5 threshold = 5 - if hasattr(cls, 'dynamic_config') and cls.dynamic_config.parameters: - threshold = cls.dynamic_config.parameters.get('threshold', 5) - # 检查是否有自定义的strictness参数 - cls.strictness = cls.dynamic_config.parameters.get('strictness', 3) + if hasattr(cls, 'dynamic_config'): + threshold = cls.dynamic_config.model_extra.get('threshold', 5) + cls.strictness = cls.dynamic_config.model_extra.get('strictness', 3) # 构建详细的reason文本 all_reasons = [] diff --git a/dingo/model/llm/rag/llm_rag_context_precision.py b/dingo/model/llm/rag/llm_rag_context_precision.py index 50f9b661..9c305c4b 100644 --- a/dingo/model/llm/rag/llm_rag_context_precision.py +++ b/dingo/model/llm/rag/llm_rag_context_precision.py @@ -256,8 +256,8 @@ def process_response(cls, 
responses: List[str]) -> EvalDetail: # 根据分数判断是否通过,默认阈值为5 threshold = 5 - if hasattr(cls, 'dynamic_config') and cls.dynamic_config.parameters: - threshold = cls.dynamic_config.parameters.get('threshold', 5) + if hasattr(cls, 'dynamic_config'): + threshold = cls.dynamic_config.model_extra.get('threshold', 5) if score >= threshold: result.status = False diff --git a/dingo/model/llm/rag/llm_rag_context_recall.py b/dingo/model/llm/rag/llm_rag_context_recall.py index 4ba059cc..8d6d06cc 100644 --- a/dingo/model/llm/rag/llm_rag_context_recall.py +++ b/dingo/model/llm/rag/llm_rag_context_recall.py @@ -215,8 +215,8 @@ def process_response(cls, response: str) -> EvalDetail: # 根据分数判断是否通过,默认阈值为5 threshold = 5 - if hasattr(cls, 'dynamic_config') and cls.dynamic_config.parameters: - threshold = cls.dynamic_config.parameters.get('threshold', 5) + if hasattr(cls, 'dynamic_config'): + threshold = cls.dynamic_config.model_extra.get('threshold', 5) if score >= threshold: result.status = False diff --git a/dingo/model/llm/rag/llm_rag_context_relevancy.py b/dingo/model/llm/rag/llm_rag_context_relevancy.py index ca16e289..94204e8a 100644 --- a/dingo/model/llm/rag/llm_rag_context_relevancy.py +++ b/dingo/model/llm/rag/llm_rag_context_relevancy.py @@ -206,8 +206,8 @@ def process_response(cls, response: str) -> EvalDetail: # 根据分数判断是否通过,默认阈值为5 threshold = 5 - if hasattr(cls, 'dynamic_config') and cls.dynamic_config.parameters: - threshold = cls.dynamic_config.parameters.get('threshold', 5) + if hasattr(cls, 'dynamic_config'): + threshold = cls.dynamic_config.model_extra.get('threshold', 5) if score >= threshold: result.status = False diff --git a/dingo/model/llm/rag/llm_rag_faithfulness.py b/dingo/model/llm/rag/llm_rag_faithfulness.py index 2111e77d..fe763ef0 100644 --- a/dingo/model/llm/rag/llm_rag_faithfulness.py +++ b/dingo/model/llm/rag/llm_rag_faithfulness.py @@ -290,8 +290,8 @@ def process_response(cls, response: str) -> EvalDetail: # 根据分数判断是否通过,默认阈值为5 threshold = 5 - if hasattr(cls, 
'dynamic_config') and cls.dynamic_config.parameters: - threshold = cls.dynamic_config.parameters.get('threshold', 5) + if hasattr(cls, 'dynamic_config'): + threshold = cls.dynamic_config.model_extra.get('threshold', 5) if score >= threshold: result.status = False diff --git a/dingo/model/llm/text_quality/base_text_quality.py b/dingo/model/llm/text_quality/base_text_quality.py index 5f3133ea..713e5e06 100644 --- a/dingo/model/llm/text_quality/base_text_quality.py +++ b/dingo/model/llm/text_quality/base_text_quality.py @@ -47,16 +47,11 @@ def process_response(cls, response: str) -> EvalDetail: response_json = json.loads(response) response_model = ResponseScoreTypeNameReason(**response_json) - # Create EvalDetail with all required fields - # status = False for Good quality (no issues found) - # status = True for Bad quality (issues found) - is_good = response_model.type == "Good" - result = EvalDetail( metric=cls.__name__, - status=not is_good, # True if Bad (issues found), False if Good + status=False if response_model.score == 1 else True, score=response_model.score, - label=["QUALITY_GOOD"] if is_good else [f"{response_model.type}.{response_model.name}"], + label=["QUALITY_GOOD"] if response_model.score == 1 else [f"{response_model.type}.{response_model.name}"], reason=[response_model.reason] ) diff --git a/dingo/model/llm/text_quality/llm_text_equation.py b/dingo/model/llm/text_quality/llm_text_equation.py new file mode 100644 index 00000000..e71220d1 --- /dev/null +++ b/dingo/model/llm/text_quality/llm_text_equation.py @@ -0,0 +1,68 @@ +from dingo.io.input import RequiredField +from dingo.model import Model +from dingo.model.llm.text_quality.base_text_quality import BaseTextQuality + + +@Model.llm_register("LLMTextEquation") +class LLMTextEquation(BaseTextQuality): + # Metadata for documentation generation + _metric_info = { + "category": "Pretrain Text Quality Assessment Metrics", + "metric_name": "LLMTextQualityV5", + "description": "Impact-driven text quality 
evaluation for LLM pretraining, focusing on structural completeness, readability, diversity, and safety with quantitative thresholds", + "paper_title": "WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages", + "paper_url": "https://arxiv.org/abs/2501.14506", + "paper_authors": "Yu et al., 2025", + "examples": "examples/llm_and_rule/llm_local.py", + "evaluation_results": "docs/eval/prompt/redpajama_data_evaluated_by_prompt.md" + } + _required_fields = [RequiredField.CONTENT] + prompt = r""" +你是一个专业的数学、化学等学科的公式质检员。我会给你一个从文档中提取的 equation 类型元素(JSON 格式),请对其 text 字段进行质量检测。 + +## 检测维度 + +1. **语法问题** + - LaTeX 命令拼写错误(如 \frace 代替 \frac) + - 括号未正确配对闭合({}、[]、()) + - 环境标签不匹配(如 \begin{} 与 \end{} 不对应) + +2. **识别问题** + - 疑似 OCR 识别错误(如字母与符号混淆:x 与 ×、- 与 −、l 与 1、O 与 0 等) + - 公式内容明显残缺或截断 + - 出现乱码或无意义字符 + +3. **语义问题** + - 公式结构不完整,无法表达完整的数学含义 + - 运算符或符号使用明显不合数学规范 + +## 一级错误类型(type) + +- `syntax`:语法问题 +- `recognition`:识别问题 +- `semantic`:语义问题 + +## 二级错误类型(name) + +- `command_error`:LaTeX 命令拼写错误 +- `bracket_mismatch`:括号未正确配对 +- `env_mismatch`:环境标签不匹配 +- `ocr_error`:OCR 字符识别错误 +- `truncated_content`:公式残缺或截断 +- `garbled_text`:乱码或无意义字符 +- `incomplete_expression`:公式结构不完整 +- `invalid_notation`:数学符号使用不规范 +- `none`:无问题 + +## Output Format + +Return JSON only: {"score": 0/1, "type": "", "name": "", "reason": ""} + +score 类型必须为int; +score 为 1 表示通过,type 填 "Good",name 填 "None",reason 说明公式正常的依据; +score 为 0 表示不通过,type 和 name 填对应的错误类型,reason 说明判断依据并指出具体的问题位置或内容。 + +## Input content to evaluate: + +""" + # process_response method is now inherited from BaseTextQuality diff --git a/dingo/model/llm/text_quality/llm_text_quality_v5.py b/dingo/model/llm/text_quality/llm_text_quality_v5.py index d0b02992..918ecff9 100644 --- a/dingo/model/llm/text_quality/llm_text_quality_v5.py +++ b/dingo/model/llm/text_quality/llm_text_quality_v5.py @@ -30,7 +30,27 @@ class LLMTextQualityV5(BaseTextQuality): **Impact**: Broken structures prevent models from learning correct formatting patterns. 
**Check for**: -- **Error_Formula**: Mathematical expressions with **unmatched delimiters** or **unclosed environments** +- **Error_Formula**: Mathematical content with **broken syntax** OR **systematically stripped symbols/formulas** + + Two failure modes: + + **(A) Broken LaTeX syntax** — delimiters or environments are present but malformed: + - Delimiters unmatched: $ without closing $ (LaTeX context, not dollar signs) + - Environments unclosed: \\begin{{align}} without \\end{{align}} + - Syntax broken: \\frac{{a}}{{b missing closing }} + - HTML tags unclosed: text without + - Impact: Prevents >50% of mainstream parsers from rendering + + **(B) Stripped mathematical content** — symbols/formulas systematically removed during extraction: + - Orphan hyphens from stripped Greek letters: "κ-solutions" → "-solutions", "ε-net" → "-net" + - Empty positions after connective words: "thus ;" or "the interval ;" where a formula was removed + - Sentences referencing variables/expressions that are absent: "a small number" (number missing), "we have ." (equation missing) + - Systematic loss: multiple occurrences throughout the text, not just one or two typos + - Impact: Mathematical text becomes incoherent; models learn broken academic writing patterns + + Example (BAD — stripped symbols): + "Let be a -solution to the Ricci flow which is -noncollapsed. 
Ancient, in the sense that t ranges on the interval ; Bounded curvature, thus ;" + (Greek letters κ stripped from "κ-solution" and "κ-noncollapsed"; interval expression and inequality after "thus" removed entirely) ⚠️ **Normal patterns (DO NOT flag)**: - Mixing inline ($...$) and display ($$...$$) formulas @@ -38,31 +58,39 @@ class LLMTextQualityV5(BaseTextQuality): - Line breaks with \\\\ in alignment environments - HTML tags: x, 2 for subscripts/superscripts - Mixing LaTeX and HTML in web-extracted content - - ✅ **Only flag when**: - - Delimiters unmatched: $ without closing $ (LaTeX context, not dollar signs) - - Environments unclosed: \\begin{{align}} without \\end{{align}} - - Syntax broken: \\frac{{a}}{{b missing closing }} - - HTML tags unclosed: text without + - Plain-text math without any LaTeX (e.g., "a^2 + b^2 = c^2" without $ delimiters) — this is fine as long as the expressions are actually present ⚠️ **Important**: Distinguish LaTeX $ from dollar signs ($100) - Dollar sign: "$100", "$5.99" (followed by numbers) → NOT LaTeX - LaTeX delimiter: "$x$", "$\\alpha$" (contains math symbols) → IS LaTeX - - Example: "The price is $100 and equation $x=y$ costs $50" has 4 dollar symbols but only 2 are LaTeX delimiters (and they match) - - Example (BAD): "$x^2 + y^2 is broken here $$a = b$$$" + - Example (BAD — broken delimiters): "$x^2 + y^2 is broken here $$a = b$$$" (First LaTeX $ never closes, extra $ at end) - Example (GOOD): "The item costs $100 and satisfies $x^2 + y^2 = z^2$ where price is $50" (Dollar signs for money + proper LaTeX pair) - - Impact: Only flag errors that prevent >50% of mainstream parsers (pdflatex, MathJax, KaTeX, Pandoc, Jupyter) from rendering - **Error_Table**: Table structures that are malformed or unreadable - Example (BAD): Misaligned columns, missing headers, or garbled HTML tags - Impact: Models cannot learn proper table representation - **Error_Code**: Code blocks with formatting corruption - - Example (BAD): Line numbers mixed 
with code, broken syntax highlighting markers - - Impact: Teaches incorrect code structure + **Common corruption patterns**: + - Missing code fence (` ``` `): code appears as plain text without language block + - Lost indentation: Python/YAML code with all indentation stripped (flat lines) + - Broken identifiers: spaces injected into tokens, e.g. `sys .argv`, `pts .append`, `i[ 0]` + - Line numbers mixed with code, broken syntax highlighting markers + - Keywords wrapped in inline backticks instead of a fenced block, e.g. `` `import` sys `` + + Example (BAD — indentation and identifiers destroyed): + ``` + `import` sys + pts = [] + for i in range( 1,len(sys .argv), 2): + pts .append([int(sys .argv[i]), int(sys .argv[i +1])]) + ``` + Correct version would have a code fence, proper indentation, and no spaces inside `sys.argv`. + + - Impact: Teaches incorrect code syntax, broken tokenization patterns, and wrong indentation conventions **Key Question**: "Can the model learn proper formatting from this structure?" @@ -160,10 +188,14 @@ class LLMTextQualityV5(BaseTextQuality): Input: "The eigenstate $\\psi_n$ where n is quantum number and energy E2 = m2c4" Output: {{"score": 1, "type": "Good", "name": "None", "reason": "Normal mix of LaTeX and HTML tags from web content"}} -**Example 2 (Bad - Completeness)**: +**Example 2 (Bad - Completeness, broken delimiters)**: Input: "The formula $x^2 + y^2 is broken here $$a = b$$$" Output: {"score": 0, "type": "Completeness", "name": "Error_Formula", "reason": "Unmatched delimiters: first $ never closes, extra $ at end"} +**Example 2.5 (Bad - Completeness, stripped math)**: +Input: "Definition 1.(-solutions) A -solution is a Ricci flow which is -noncollapsed at every scale. 
Ancient, in the sense that t ranges on the interval ; Bounded curvature, thus ;" +Output: {{"score": 0, "type": "Completeness", "name": "Error_Formula", "reason": "Mathematical symbols systematically stripped: Greek letters removed ('-solutions' instead of 'κ-solutions'), formulas missing after 'the interval' and 'thus'"}} + **Example 3 (Bad - Effectiveness)**: Input: "Theappleisredandtasty�withsomegarbledtext□□" Output: {"score": 0, "type": "Effectiveness", "name": "Error_Garbled_Characters", "reason": "Contains encoding corruption (�, □) and missing spaces (>1% of text)"} diff --git a/dingo/model/llm/text_quality/llm_text_table.py b/dingo/model/llm/text_quality/llm_text_table.py new file mode 100644 index 00000000..17897347 --- /dev/null +++ b/dingo/model/llm/text_quality/llm_text_table.py @@ -0,0 +1,70 @@ +from dingo.io.input import RequiredField +from dingo.model import Model +from dingo.model.llm.text_quality.base_text_quality import BaseTextQuality + + +@Model.llm_register("LLMTextTable") +class LLMTextTable(BaseTextQuality): + # Metadata for documentation generation + _metric_info = { + "category": "Pretrain Text Quality Assessment Metrics", + "metric_name": "LLMTextQualityV5", + "description": "Impact-driven text quality evaluation for LLM pretraining, focusing on structural completeness, readability, diversity, and safety with quantitative thresholds", + "paper_title": "WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages", + "paper_url": "https://arxiv.org/abs/2501.14506", + "paper_authors": "Yu et al., 2025", + "examples": "examples/llm_and_rule/llm_local.py", + "evaluation_results": "docs/eval/prompt/redpajama_data_evaluated_by_prompt.md" + } + _required_fields = [RequiredField.CONTENT] + prompt = r""" +你是一个专业的表格数据质检员。我会给你一段从文档中提取的 HTML 表格(table_body 字段),请判断该表格是否存在质量问题。 + +## 检测维度 + +请从以下维度进行检查: + +1. **结构问题** + - HTML 标签不完整或嵌套错误(
| 未正确闭合) + - 行列结构异常(某行 | 数量与其他行差异过大)
+ - 表格内容全部为空
+
+2. **识别问题**
+ - 存在明显乱码或无意义字符
+ - 疑似 OCR 识别错误(如字母/数字混淆:0与O、1与l、S与5等)
+ - 文字截断或内容残缺
+
+3. **语义问题**
+ - 单元格内容语义不连贯,无法理解表格表达的含义
+ - 行列关系混乱,内容错位
+
+## 一级错误类型(type)
+
+- `structure`:结构问题
+- `recognition`:识别问题
+- `semantic`:语义问题
+
+## 二级错误类型(name)
+
+- `tag_error`:标签不完整或嵌套错误
+- `row_col_mismatch`:行列数量不一致
+- `empty_table`:表格内容为空
+- `garbled_text`:乱码或无意义字符
+- `ocr_error`:OCR 字符识别错误
+- `truncated_content`:文字截断或内容残缺
+- `incoherent_semantics`:语义不连贯
+- `misaligned_content`:内容错位
+- `none`:无问题
+
+## Output Format
+
+Return JSON only: {"score": 0/1, "type": "", "name": "", "reason": ""}
+
+score 类型必须为int;
+score 为 1 表示通过,type 填 "Good",name 填 "None",reason 说明表格正常的依据;
+score 为 0 表示不通过,type 和 name 填对应的错误类型,reason 说明判断依据并指出具体位置或内容。
+
+## Input content to evaluate:
+
+"""
+ # process_response method is now inherited from BaseTextQuality
diff --git a/dingo/model/llm/vlm_layout_quality.py b/dingo/model/llm/vlm_layout_quality.py
index 40627a88..e3a5456d 100644
--- a/dingo/model/llm/vlm_layout_quality.py
+++ b/dingo/model/llm/vlm_layout_quality.py
@@ -201,8 +201,8 @@ def send_messages(cls, messages: List):
else:
model_name = cls.client.models.list().data[0].id
- params = cls.dynamic_config.parameters
- cls.validate_config(params)
+ extra_params = cls.dynamic_config.model_extra
+ cls.validate_config(extra_params)
completions = cls.client.chat.completions.create(
model=model_name,
diff --git a/docs/agent_architecture.md b/docs/agent_architecture.md
index c55d34c6..e563357b 100644
--- a/docs/agent_architecture.md
+++ b/docs/agent_architecture.md
@@ -458,7 +458,7 @@ Aggregation:
├─ name: "AgentFactCheck"
├─ config.key: API key
├─ config.model: "gpt-4"
- └─ config.parameters.agent_config:
+ └─ config.agent_config:
├─ max_iterations: 10
└─ tools:
└─ tavily_search:
@@ -539,7 +539,7 @@ Check if tool in available_tools
ToolRegistry.get(tool_name) → tool_class
↓
configure_tool(tool_name, tool_class)
- ├─ Extract config from dynamic_config.parameters.agent_config.tools.{tool_name}
+ ├─ Extract config from dynamic_config.agent_config.tools.{tool_name}
└─ tool_class.update_config(config_dict)
↓
tool_class.execute(**kwargs)
@@ -560,7 +560,7 @@ Return to agent for processing
3. **Three Patterns**: LangChain-based (declarative), Custom Workflow (imperative), Agent-First + Context (hybrid)
4. **Tool System**: Centralized registry with configuration injection
5. **Execution**: Runs in ThreadPoolExecutor alongside other LLMs
-6. **Configuration**: Nested under `parameters.agent_config` in evaluator config
+6. **Configuration**: `agent_config` is a top-level key in evaluator config (flat structure)
7. **Artifact Saving**: ArticleFactChecker auto-saves intermediate artifacts to a timestamped directory by default; override via `agent_config.output_path`, or disable with `agent_config.save_artifacts=false`
### Implementation Checklist
diff --git a/docs/agent_development_guide.md b/docs/agent_development_guide.md
index da071b7c..6d6f80df 100644
--- a/docs/agent_development_guide.md
+++ b/docs/agent_development_guide.md
@@ -441,8 +441,8 @@ def _get_output_dir(cls) -> Optional[str]:
Get output directory for artifact files (three-priority chain).
Returns output dir path (created if needed), or None if saving disabled.
"""
- params = cls.dynamic_config.parameters or {}
- agent_cfg = params.get('agent_config') or {}
+ extra_params = cls.dynamic_config.model_extra
+ agent_cfg = extra_params.get('agent_config') or {}
explicit_path = agent_cfg.get('output_path')
if explicit_path:
@@ -673,17 +673,15 @@ class MyAgent(BaseAgent):
"key": "openai-api-key",
"api_url": "https://api.openai.com/v1",
"model": "gpt-4",
- "parameters": {
- "agent_config": {
- "max_iterations": 3,
- "tools": {
- "my_tool": {
- "api_key": "tool-api-key",
- "max_results": 5
- }
- }
- }
- }
+ "agent_config": {
+ "max_iterations": 3,
+ "tools": {
+ "my_tool": {
+ "api_key": "tool-api-key",
+ "max_results": 5
+ }
+ }
+ }
}
}
"""
@@ -889,19 +887,17 @@ def eval(cls, input_data: Data) -> EvalDetail:
"key": "openai-api-key",
"api_url": "https://api.openai.com/v1",
"model": "gpt-4-turbo",
- "parameters": {
- "temperature": 0.1,
- "agent_config": {
- "max_iterations": 3,
- "tools": {
- "my_tool": {
- "api_key": "my-tool-api-key",
- "max_results": 10,
- "timeout": 30
- },
- "another_tool": {
- "config_key": "value"
- }
+ "temperature": 0.1,
+ "agent_config": {
+ "max_iterations": 3,
+ "tools": {
+ "my_tool": {
+ "api_key": "my-tool-api-key",
+ "max_results": 10,
+ "timeout": 30
+ },
+ "another_tool": {
+ "config_key": "value"
}
}
}
@@ -919,10 +915,10 @@ def eval(cls, input_data: Data) -> EvalDetail:
def some_method(cls):
# Access LLM configuration
model = cls.dynamic_config.model # "gpt-4-turbo"
- temperature = cls.dynamic_config.parameters.get('temperature', 0)
+ temperature = cls.dynamic_config.model_extra.get('temperature', 0)
# Access agent-specific configuration
- agent_config = cls.dynamic_config.parameters.get('agent_config', {})
+ agent_config = cls.dynamic_config.model_extra.get('agent_config', {})
max_iterations = agent_config.get('max_iterations', 5)
# Get tool configuration
@@ -966,10 +962,8 @@ class MyAgent(BaseAgent):
{
"name": "MyAgent",
"config": {
- "parameters": {
- "agent_config": {
- "max_iterations": 10
- }
+ "agent_config": {
+ "max_iterations": 10
}
}
}
@@ -1259,17 +1253,15 @@ Always include SOURCES with specific URLs when you perform web searches."""
"key": "your-openai-api-key",
"api_url": "https://api.openai.com/v1",
"model": "gpt-4-turbo",
- "parameters": {
- "temperature": 0.1,
- "max_tokens": 16384,
- "agent_config": {
- "max_iterations": 5,
- "tools": {
- "tavily_search": {
- "api_key": "your-tavily-api-key",
- "max_results": 5,
- "search_depth": "advanced"
- }
+ "temperature": 0.1,
+ "max_tokens": 16384,
+ "agent_config": {
+ "max_iterations": 5,
+ "tools": {
+ "tavily_search": {
+ "api_key": "your-tavily-api-key",
+ "max_results": 5,
+ "search_depth": "advanced"
}
}
}
@@ -1597,11 +1589,9 @@ config = {
"key": "openai-key",
"api_url": "https://api.openai.com/v1",
"model": "gpt-4",
- "parameters": {
- "agent_config": {
- "tools": {
- "tavily_search": {"api_key": "tavily-key"}
- }
+ "agent_config": {
+ "tools": {
+ "tavily_search": {"api_key": "tavily-key"}
}
}
}
@@ -1632,7 +1622,7 @@ summary = executor.execute()
**Configuration not working:**
- Check JSON structure matches expected format
-- Verify `parameters.agent_config.tools.{tool_name}` structure
+- Verify `agent_config.tools.{tool_name}` structure
- Use Pydantic validation to catch config errors early
**Tests failing:**
diff --git a/docs/article_fact_checking_guide.md b/docs/article_fact_checking_guide.md
index 518b0ff3..43d04947 100644
--- a/docs/article_fact_checking_guide.md
+++ b/docs/article_fact_checking_guide.md
@@ -81,24 +81,22 @@ config = {
"config": {
"key": os.getenv("OPENAI_API_KEY"),
"model": "deepseek-chat", # or "gpt-4o-mini" for OpenAI
- "parameters": {
- "agent_config": {
- "max_iterations": 15,
- "output_path": "outputs/article_factcheck/", # Optional: save intermediate artifacts
- "tools": {
- "claims_extractor": {
- "api_key": os.getenv("OPENAI_API_KEY"),
- "max_claims": 50,
- "claim_types": [
- "factual", "statistical", "attribution", "institutional",
- "temporal", "comparative", "monetary", "technical"
- ]
- },
- "tavily_search": {
- "api_key": os.getenv("TAVILY_API_KEY")
- },
- "arxiv_search": {"max_results": 5}
- }
+ "agent_config": {
+ "max_iterations": 15,
+ "output_path": "outputs/article_factcheck/", # Optional: save intermediate artifacts
+ "tools": {
+ "claims_extractor": {
+ "api_key": os.getenv("OPENAI_API_KEY"),
+ "max_claims": 50,
+ "claim_types": [
+ "factual", "statistical", "attribution", "institutional",
+ "temporal", "comparative", "monetary", "technical"
+ ]
+ },
+ "tavily_search": {
+ "api_key": os.getenv("TAVILY_API_KEY")
+ },
+ "arxiv_search": {"max_results": 5}
}
}
}
@@ -143,19 +141,17 @@ cat > article_check_config.json << EOF
"config": {
"key": "${OPENAI_API_KEY}",
"model": "deepseek-chat",
- "parameters": {
- "agent_config": {
- "max_iterations": 15,
- "tools": {
- "claims_extractor": {
- "api_key": "${OPENAI_API_KEY}",
- "max_claims": 50
- },
- "tavily_search": {
- "api_key": "${TAVILY_API_KEY}"
- },
- "arxiv_search": {}
- }
+ "agent_config": {
+ "max_iterations": 15,
+ "tools": {
+ "claims_extractor": {
+ "api_key": "${OPENAI_API_KEY}",
+ "max_claims": 50
+ },
+ "tavily_search": {
+ "api_key": "${TAVILY_API_KEY}"
+ },
+ "arxiv_search": {}
}
}
}
diff --git a/docs/config.md b/docs/config.md
index 6d1f1d4c..a5020ee4 100644
--- a/docs/config.md
+++ b/docs/config.md
@@ -104,26 +104,22 @@ HuggingFace 特定配置:
#### EvaluatorLLMArgs 配置 (evaluator.llm_config.[llm_name])
-LLM 配置:
+LLM 配置(支持额外字段,所有额外字段会直接透传给 LLM API):
| Parameter | Type | Default | Required | Description |
|-----------|------|---------|----------|-------------|
| model | str | null | No | 使用的模型名称 |
| key | str | null | No | API 密钥 |
| api_url | str | null | No | API URL |
-| parameters | object | null | No | LLM 调参配置 |
-
-##### LLM Parameters 配置
-
-LLM 调参配置:
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| temperature | number | 1 | 采样温度,0-2之间 |
-| top_p | number | 1 | 核心采样概率 |
-| max_tokens | number | 4000 | 最大生成token数 |
-| presence_penalty | number | 0 | 存在惩罚,-2.0到2.0之间 |
-| frequency_penalty | number | 0 | 频率惩罚,-2.0到2.0之间 |
+| embedding_config | object | null | No | Embedding 模型独立配置(RAG 评估器使用) |
+| temperature | number | 1 | No | 采样温度,0-2之间 |
+| top_p | number | 1 | No | 核心采样概率 |
+| max_tokens | number | 4000 | No | 最大生成token数 |
+| presence_penalty | number | 0 | No | 存在惩罚,-2.0到2.0之间 |
+| frequency_penalty | number | 0 | No | 频率惩罚,-2.0到2.0之间 |
+| agent_config | object | null | No | Agent 专属配置(max_iterations、tools 等) |
+| threshold | number | - | No | 评估通过阈值(各评估器自定义) |
+| *其他字段* | any | - | No | 所有额外字段直接透传给 LLM API |
## 配置文件示例
@@ -181,13 +177,11 @@ LLM 调参配置:
"model": "gpt-3.5-turbo",
"key": "your-api-key",
"api_url": "https://api.openai.com/v1/chat/completions",
- "parameters": {
- "temperature": 1,
- "top_p": 1,
- "max_tokens": 4000,
- "presence_penalty": 0,
- "frequency_penalty": 0
- }
+ "temperature": 1,
+ "top_p": 1,
+ "max_tokens": 4000,
+ "presence_penalty": 0,
+ "frequency_penalty": 0
}
}
}
diff --git a/docs/factcheck_guide.md b/docs/factcheck_guide.md
index 4112707f..7abed067 100644
--- a/docs/factcheck_guide.md
+++ b/docs/factcheck_guide.md
@@ -101,9 +101,7 @@ input_data = {
"model": "deepseek-chat",
"key": "your-api-key",
"api_url": "https://api.deepseek.com/v1",
- "parameters": {
- "temperature": 0.1
- }
+ "temperature": 0.1
}
}
}
diff --git a/docs/factuality_assessment_guide.md b/docs/factuality_assessment_guide.md
index 13680cc2..6f670fcc 100644
--- a/docs/factuality_assessment_guide.md
+++ b/docs/factuality_assessment_guide.md
@@ -59,7 +59,7 @@ LLMFactCheck.dynamic_config = EvaluatorLLMArgs(
key=os.getenv("OPENAI_API_KEY"),
api_url=os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1"),
model=os.getenv("OPENAI_MODEL", "gpt-4o-mini"),
- parameters={"threshold": 5.0}
+ threshold=5.0
)
# Prepare data
@@ -108,7 +108,7 @@ input_data = {
"model": "gpt-4o-mini",
"key": "YOUR_API_KEY",
"api_url": "https://api.openai.com/v1",
- "parameters": {"threshold": 5.0}
+ "threshold": 5.0
}
}
]
@@ -142,7 +142,7 @@ LLMFactCheck.dynamic_config = EvaluatorLLMArgs(
key="YOUR_API_KEY",
api_url="https://api.openai.com/v1",
model="gpt-4o-mini",
- parameters={"threshold": 5.0} # Range: 0.0-10.0
+ threshold=5.0 # Range: 0.0-10.0
)
```
diff --git a/docs/hallucination_detection_guide.md b/docs/hallucination_detection_guide.md
index d6fceea9..4da27e0d 100644
--- a/docs/hallucination_detection_guide.md
+++ b/docs/hallucination_detection_guide.md
@@ -131,7 +131,7 @@ LLMHallucination.dynamic_config = EvaluatorLLMArgs(
key=os.getenv("OPENAI_API_KEY"),
api_url=os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1"),
model=os.getenv("OPENAI_MODEL", "gpt-4o-mini"),
- parameters={"threshold": 0.5}
+ threshold=0.5
)
# Prepare data
@@ -218,7 +218,7 @@ LLMHallucination.dynamic_config = EvaluatorLLMArgs(
key="YOUR_API_KEY",
api_url="https://api.openai.com/v1",
model="gpt-4o-mini",
- parameters={"threshold": 0.5} # Range: 0.0-1.0
+ threshold=0.5 # Range: 0.0-1.0
)
```
diff --git a/docs/html_extract_compare_v2.md b/docs/html_extract_compare_v2.md
index c0d92242..b194b882 100644
--- a/docs/html_extract_compare_v2.md
+++ b/docs/html_extract_compare_v2.md
@@ -62,9 +62,7 @@ data = Data(
data_id="unique_id_001", # 必需:数据的唯一标识符
prompt="工具A提取的文本内容", # 必需
content="工具B提取的文本内容", # 必需
- raw_data={
- "language": "zh", # 可选,默认 "en"
- }
+ language="zh", # 可选,默认 "en";也可放在 raw_data["language"]
)
```
@@ -116,9 +114,7 @@ data = Data(
data_id="test_001",
prompt="工具A提取的内容...",
content="工具B提取的文本内容",
- raw_data={
- "language": "zh"
- }
+ language="zh", # 可选
)
# 执行评估
@@ -131,70 +127,76 @@ print(f"推理: {result.reason[0]}")
### 批量评估(使用 Executor)
-推荐使用 Executor 进行大规模批量评估,支持并发处理和结果保存。
+推荐使用 Executor 进行大规模批量评估,支持并发处理和结果保存。配置需与 `InputArgs` 一致:`evaluator` 为列表,每项包含 `fields`(列名映射到 `Data`)与 `evals`(评估器及 `config`)。
+
+`LLMHtmlExtractCompareV2` 约定:`prompt` = 工具 A 文本,`content` = 工具 B 文本;`language` 可选,缺省为 `"en"`。
```python
+import os
from pathlib import Path
from dingo.config.input_args import InputArgs
from dingo.exec.base import Executor
-# 配置参数
+common_config = {
+ "model": os.getenv("OPENAI_MODEL", "deepseek-chat"),
+ "key": os.getenv("OPENAI_API_KEY"),
+ "api_url": os.getenv("OPENAI_BASE_URL", "https://api.deepseek.com/v1"),
+}
+
input_data = {
- "task_name": "html_extract_compare_evaluation",
+ "task_name": "html_extract_compare_v2_evaluation",
"input_path": str(Path("test/data/html_extract_compare_test.jsonl")),
"output_path": "output/html_extract_compare_evaluation/",
-
- # 数据集配置
"dataset": {
"source": "local",
"format": "jsonl",
- "field": {
- "id": "data_id",
- "content": "content"
- # magic_md 和 language 会自动放入 raw_data
- }
},
-
- # 执行器配置
"executor": {
- "eval_group": "html_extract_compare", # 评估组
- "max_workers": 4, # 并发数
+ "max_workers": 4,
+ "batch_size": 1,
"result_save": {
- "bad": True, # 保存问题样本
- "good": True # 保存正常样本
- }
+ "bad": True, # 保存工具 B 更优的样本(status=True,对应判断 C)
+ "good": True, # 保存工具 A 更好或相当的样本
+ },
},
-
- # LLM 配置
- "evaluator": {
- "llm_config": {
- "LLMHtmlExtractCompareV2": {
- "model": "deepseek-chat",
- "key": "your_api_key",
- "api_url": "https://api.deepseek.com/v1"
- }
+ "evaluator": [
+ {
+ # 将 JSONL 列映射到 Data:prompt=工具A,content=工具B
+ "fields": {
+ "id": "data_id",
+ "prompt": "method1",
+ "content": "method2",
+ "language": "language",
+ },
+ "evals": [
+ {"name": "LLMHtmlExtractCompareV2", "config": common_config},
+ ],
}
- }
+ ],
}
-# 执行评估
input_args = InputArgs(**input_data)
executor = Executor.exec_map["local"](input_args)
result = executor.execute()
-# 查看结果
print(f"总样本数: {result.total}")
print(f"工具B更好: {result.num_bad}")
-print(f"工具A更好或相同: {result.total - result.num_bad}")
+print(f"工具A更好或相同: {result.num_good}")
```
+若你的数据列名为 `content` / `magic_md`,只需将 `fields` 改为 `"prompt": "content", "content": "magic_md"` 等即可。
+
#### JSONL 数据格式
+与仓库内 `test/data/html_extract_compare_test.jsonl` 对齐:每行一条 JSON,至少包含唯一标识、两种提取结果与可选语言。
+
```jsonl
-{"data_id": "001", "content": "工具A文本", "magic_md": "工具B文本", "language": "zh"}
-{"data_id": "002", "content": "Tool A text", "magic_md": "Tool B text", "language": "en"}
+{"data_id": "001", "method1": "工具A提取的Markdown文本...", "method2": "工具B提取的Markdown文本...", "language": "zh"}
+{"data_id": "002", "method1": "Tool A extracted text...", "method2": "Tool B extracted text...", "language": "en"}
```
+`method1` / `method2` 仅为示例列名;实际列名通过 `evaluator[].fields` 中的 `prompt` / `content` 映射指定。
+
## 与 V1 版本的对比
| 特性 | V1 | V2 |
diff --git a/docs/instruction_quality_guide.md b/docs/instruction_quality_guide.md
index eb1b08d1..2caae4dc 100644
--- a/docs/instruction_quality_guide.md
+++ b/docs/instruction_quality_guide.md
@@ -197,7 +197,7 @@ input_data = {
"model": "deepseek-chat",
"key": "your-api-key",
"api_url": "https://api.deepseek.com",
- "parameters": {"threshold": 6.0}
+ "threshold": 6.0
}
}
]
@@ -223,10 +223,8 @@ print(f"清晰指令: {summary.num_good}/{summary.total}")
"model": "deepseek-chat",
"key": "your-api-key",
"api_url": "https://api.deepseek.com",
- "parameters": {
- "min_difficulty": 3.0, # 可选:过滤太简单的
- "max_difficulty": 8.0, # 可选:过滤太难的
- }
+ "min_difficulty": 3.0, # 可选:过滤太简单的
+ "max_difficulty": 8.0, # 可选:过滤太难的
}
}
]
@@ -303,13 +301,13 @@ python examples/custom/evaluate_instruction_quality.py distribution
**问题1: 过多简单指令**
```python
# 设置最低难度阈值
-"parameters": {"min_difficulty": 3.0}
+"min_difficulty": 3.0
```
**问题2: 指令模糊不清**
```python
# 提高清晰度要求
-"parameters": {"threshold": 7.0}
+"threshold": 7.0
```
**问题3: 难度分布不均**
diff --git a/docs/metrics.md b/docs/metrics.md
index bfde1ea7..44e75482 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -20,11 +20,14 @@ This document provides comprehensive information about all quality metrics used
|------|--------|-------------|--------------|-------------------|----------|
| `LLMCodeCompare` | LLMCodeCompare | Compares the effectiveness of two tools in extracting code blocks from HTML to Markdown format by evaluating recognit... | Internal Implementation | N/A | N/A |
| `LLMDatamanAssessment` | LLMDatamanAssessment | Evaluates pre-training data quality using the DataMan methodology (14 standards, 15 domains). Assigns a score (0/1), ... | [DataMan: Data Manager for Pre-training Large Language Models](https://arxiv.org/abs/2502.19363) (Peng et al., 2025) | N/A | N/A |
+| `LLMHtmlExtractCompareV2` | LLMHtmlExtractCompareV2 | Compares two HTML main-content extraction tools by computing text diffs and using LLM to judge which preserves more c... | Internal Implementation | N/A | N/A |
| `LLMMathCompare` | LLMMathCompare | Compares the effectiveness of two tools in extracting mathematical formulas from HTML to Markdown format by evaluatin... | Internal Implementation | N/A | N/A |
| `LLMSecurityPolitics` | LLMSecurityPolitics | Evaluates whether the text contains politics-related content | Internal Implementation | N/A | N/A |
| `LLMTableCompare` | LLMTableCompare | Compares the effectiveness of two tools in extracting tables from HTML to Markdown format by evaluating recognition r... | Internal Implementation | N/A | N/A |
+| `LLMTextEquation` | LLMTextEquation | Impact-driven text quality evaluation for LLM pretraining, focusing on structural completeness, readability, diversit... | [WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages](https://arxiv.org/abs/2501.14506) (Yu et al., 2025) | [📊 See Results](eval/prompt/redpajama_data_evaluated_by_prompt.md) | [📝 View Example](../examples/llm_and_rule/llm_local.py) |
| `LLMTextQualityV4` | LLMTextQualityV4 | Enhanced text quality evaluation covering completeness (formulas, tables, code), effectiveness (garbled text, spacing... | [WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages](https://arxiv.org/abs/2501.14506) (Yu et al., 2025) | [📊 See Results](eval/prompt/redpajama_data_evaluated_by_prompt.md) | N/A |
| `LLMTextQualityV5` | LLMTextQualityV5 | Impact-driven text quality evaluation for LLM pretraining, focusing on structural completeness, readability, diversit... | [WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages](https://arxiv.org/abs/2501.14506) (Yu et al., 2025) | [📊 See Results](eval/prompt/redpajama_data_evaluated_by_prompt.md) | [📝 View Example](../examples/llm_and_rule/llm_local.py) |
+| `LLMTextTable` | LLMTextTable | Impact-driven text quality evaluation for LLM pretraining, focusing on structural completeness, readability, diversit... | [WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages](https://arxiv.org/abs/2501.14506) (Yu et al., 2025) | [📊 See Results](eval/prompt/redpajama_data_evaluated_by_prompt.md) | [📝 View Example](../examples/llm_and_rule/llm_local.py) |
### SFT Data Assessment Metrics
@@ -57,7 +60,7 @@ This document provides comprehensive information about all quality metrics used
| Type | Metric | Description | Paper Source | Evaluation Results | Examples |
|------|--------|-------------|--------------|-------------------|----------|
| `QUALITY_BAD_COMPLETENESS` | RuleLineEndWithEllipsis, RuleLineEndWithTerminal, RuleSentenceNumber, RuleWordNumber | Checks whether the ratio of lines ending with ellipsis is below threshold; Checks whether the ratio of lines ending w... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | N/A |
-| `QUALITY_BAD_EFFECTIVENESS` | RuleAbnormalChar, RuleAbnormalHtml, RuleAlphaWords, RuleAudioDataFormat, RuleCharNumber, RuleColonEnd, RuleContentNull, RuleContentShort, RuleContentShortMultiLan, RuleEnterAndSpace, RuleEnterMore, RuleEnterRatioMore, RuleHtmlEntity, RuleHtmlTag, RuleInvisibleChar, RuleImageDataFormat, RuleLatexSpecialChar, RuleLineJavascriptCount, RuleLoremIpsum, RuleMeanWordLength, RuleNlpDataFormat, RuleSftDataFormat, RuleSpaceMore, RuleSpecialCharacter, RuleStopWord, RuleSymbolWordRatio, RuleVedioDataFormat, RuleOnlyUrl, RuleDoi, RuleIsbn | Detects garbled text and anti-crawling characters by combining special character and invisible character detection; D... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | N/A |
+| `QUALITY_BAD_EFFECTIVENESS` | RuleDoi, RuleIsbn, RuleAbnormalChar, RuleAbnormalHtml, RuleAlphaWords, RuleAudioDataFormat, RuleCharNumber, RuleColonEnd, RuleContentNull, RuleContentShort, RuleContentShortMultiLan, RuleEnterAndSpace, RuleEnterMore, RuleEnterRatioMore, RuleHtmlEntity, RuleHtmlTag, RuleInvisibleChar, RuleImageDataFormat, RuleLatexSpecialChar, RuleLineJavascriptCount, RuleLoremIpsum, RuleMeanWordLength, RuleNlpDataFormat, RuleSftDataFormat, RuleSpaceMore, RuleSpecialCharacter, RuleStopWord, RuleSymbolWordRatio, RuleVedioDataFormat, RuleOnlyUrl | Check whether the string is in the correct format of the doi; Check whether the string is in the correct format of th... | Internal Implementation | N/A | N/A |
| `QUALITY_BAD_FLUENCY` | RuleAbnormalNumber, RuleCharSplit, RuleNoPunc, RuleWordSplit, RuleWordStuck | Checks PDF content for abnormal book page or index numbers that disrupt text flow; Checks PDF content for abnormal ch... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | N/A |
| `QUALITY_BAD_RELEVANCE` | RuleHeadWordAr, RuleHeadWordCs, RuleHeadWordHu, RuleHeadWordKo, RuleHeadWordRu, RuleHeadWordSr, RuleHeadWordTh, RuleHeadWordVi, RulePatternSearch, RuleWatermark | Checks whether Arabic content contains irrelevant tail source information; Checks whether Czech content contains irre... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | N/A |
| `QUALITY_BAD_SECURITY` | RuleIDCard, RuleUnsafeWords, RulePIIDetection | Checks whether content contains ID card information; Checks whether content contains unsafe words; Detects Personal I... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | N/A |
diff --git a/docs/rag_evaluation_metrics.md b/docs/rag_evaluation_metrics.md
index 1c11c5dc..c2fce750 100644
--- a/docs/rag_evaluation_metrics.md
+++ b/docs/rag_evaluation_metrics.md
@@ -86,10 +86,8 @@ llm_config_embedding = {
"api_url": "https://api.openai.com/v1",
"key": "YOUR_API_KEY"
},
- "parameters": {
- "strictness": 3,
- "threshold": 5
- }
+ "strictness": 3,
+ "threshold": 5
}
input_data = {
@@ -170,7 +168,8 @@ summary = executor.execute()
"api_url": "https://api.deepseek.com",
"key": "YOUR_API_KEY"
},
- "parameters": {"strictness": 3, "threshold": 5}
+ "strictness": 3,
+ "threshold": 5
}
```
@@ -186,7 +185,8 @@ summary = executor.execute()
"api_url": "http://localhost:8000/v1", # Local vLLM/Xinference
"key": "dummy-key"
},
- "parameters": {"strictness": 3, "threshold": 5}
+ "strictness": 3,
+ "threshold": 5
}
```
diff --git a/docs/rag_evaluation_metrics_zh.md b/docs/rag_evaluation_metrics_zh.md
index 099addb4..963b02dd 100644
--- a/docs/rag_evaluation_metrics_zh.md
+++ b/docs/rag_evaluation_metrics_zh.md
@@ -123,10 +123,8 @@ input_data = {
"api_url": OPENAI_URL,
"key": OPENAI_KEY
},
- "parameters": {
- "strictness": 3,
- "threshold": 5
- }
+ "strictness": 3,
+ "threshold": 5
}
},
{
@@ -466,7 +464,7 @@ LLMRAGFaithfulness.dynamic_config = EvaluatorLLMArgs(
key="YOUR_API_KEY",
api_url="https://api.openai.com/v1",
model="gpt-4o-mini",
- parameters={"threshold": 7} # 自定义阈值
+ threshold=7 # 自定义阈值
)
# Answer Relevancy 特殊配置(需要 embedding)⭐
@@ -480,10 +478,8 @@ LLMRAGAnswerRelevancy.dynamic_config = EvaluatorLLMArgs(
api_url="https://api.openai.com/v1",
key="YOUR_API_KEY"
),
- parameters={
- "strictness": 3, # 生成问题数量
- "threshold": 5 # 通过阈值
- }
+ strictness=3, # 生成问题数量
+ threshold=5 # 通过阈值
)
```
@@ -499,7 +495,7 @@ LLMRAGAnswerRelevancy.dynamic_config = EvaluatorLLMArgs(
"model": "gpt-4o-mini",
"key": "YOUR_API_KEY",
"api_url": "https://api.openai.com/v1",
- "parameters": {"threshold": 7}
+ "threshold": 7
}
},
{
@@ -513,10 +509,8 @@ LLMRAGAnswerRelevancy.dynamic_config = EvaluatorLLMArgs(
"api_url": "https://api.openai.com/v1",
"key": "YOUR_API_KEY"
},
- "parameters": {
- "strictness": 3,
- "threshold": 5
- }
+ "strictness": 3,
+ "threshold": 5
}
}
]
@@ -528,8 +522,8 @@ LLMRAGAnswerRelevancy.dynamic_config = EvaluatorLLMArgs(
| 参数 | 适用指标 | 默认值 | 说明 |
|------|---------|--------|------|
-| `threshold` | 所有指标 | 5.0 | 通过阈值(0-10),在 `parameters` 中配置 |
-| `strictness` | Answer Relevancy | 3 | 生成问题数量(1-5),在 `parameters` 中配置 |
+| `threshold` | 所有指标 | 5.0 | 通过阈值(0-10),直接在 `config` 中配置 |
+| `strictness` | Answer Relevancy | 3 | 生成问题数量(1-5),直接在 `config` 中配置 |
| `embedding_config` | Answer Relevancy | - | **必需配置**,包含 `model`(模型名)、`api_url`(服务地址)、`key`(API密钥) |
## 📊 指标详细说明
diff --git a/docs/technical/technical_all.md b/docs/technical/technical_all.md
index 45833111..212fd15c 100644
--- a/docs/technical/technical_all.md
+++ b/docs/technical/technical_all.md
@@ -220,9 +220,14 @@ dingo 在使用提示词进行评估任务的时候,必须同时使用场景
+ model
+ key
+ api_url
-+ parameters
++ temperature(直接平铺在配置中)
++ top_p
++ max_tokens
++ presence_penalty
++ frequency_penalty
++ agent_config(Agent 评估器专用,包含 max_iterations、tools 等)
-需要注意的是参数 [parameters](config.md#parameters) ,这个参数会对模型的推理产生影响,可以设置的值包括:
+LLM 调参配置直接平铺在 `config` 对象中(不再嵌套在 `parameters` 字段下),会对模型推理产生影响,可以设置的值包括:
+ temperature
+ top_p
+ max_tokens
diff --git a/examples/agent/agent_article_fact_checking_example.py b/examples/agent/agent_article_fact_checking_example.py
index 45b0ad60..3071f45f 100644
--- a/examples/agent/agent_article_fact_checking_example.py
+++ b/examples/agent/agent_article_fact_checking_example.py
@@ -58,32 +58,30 @@ def main() -> int:
"key": openai_key,
"model": "intern-s1-pro",
"api_url": "https://chat.intern-ai.org.cn/api/v1/",
- "parameters": {
- "timeout": 600,
- "temperature": 0, # deterministic output
- "agent_config": {
- "max_concurrent_claims": 10,
- "max_iterations": 50,
- # Artifacts auto-saved to outputs/article_factcheck_ |