Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dingo/model/llm/compare/llm_html_extract_compare_v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def process_response(cls, response: str) -> EvalDetail:
tmp_type = "EXTRACTION_EQUAL"

result.status = response_model.score != 1
result.label = [f"{tmp_type}.{response_model.name}"]
result.label = [f"{tmp_type}"]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Removing the specific error type (`response_model.name`) from the label reduces the granularity of the evaluation results. It becomes harder to aggregate or filter results by the specific cause of a quality difference (e.g., `Error_Formula` vs. `Error_Table`) without parsing the full JSON reason for every record.

If the goal was to clean up the label (e.g., avoiding the `.None` suffix when the score is 0), consider a conditional approach that preserves the detail when a specific error is identified. Additionally, the f-string `f"{tmp_type}"` is redundant because `tmp_type` is already a string.

Suggested change
result.label = [f"{tmp_type}"]
result.label = [f"{tmp_type}.{response_model.name}" if response_model.name != "None" else tmp_type]

result.reason = [json.dumps(response_json, ensure_ascii=False)]

return result
8 changes: 4 additions & 4 deletions test/scripts/model/llm/test_llm_html_extract_compare_v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def test_score_1_prompt_better(self):
)
result = LLMHtmlExtractCompareV3.process_response(raw)
assert result.metric == "LLMHtmlExtractCompareV3"
assert result.label == ["PROMPT_BETTER.Error_Content_Coverage"]
assert result.label == ["PROMPT_BETTER"]
assert result.status is False
parsed = json.loads(result.reason[0])
assert parsed["score"] == 1
Expand All @@ -86,7 +86,7 @@ def test_score_2_content_better(self):
ensure_ascii=False,
)
result = LLMHtmlExtractCompareV3.process_response(raw)
assert result.label == ["CONTENT_BETTER.Error_Formula"]
assert result.label == ["CONTENT_BETTER"]
assert result.status is True

def test_score_0_extraction_equal(self):
Expand All @@ -95,7 +95,7 @@ def test_score_0_extraction_equal(self):
ensure_ascii=False,
)
result = LLMHtmlExtractCompareV3.process_response(raw)
assert result.label == ["EXTRACTION_EQUAL.None"]
assert result.label == ["EXTRACTION_EQUAL"]
assert result.status is True

def test_json_fenced_with_markdown(self):
Expand All @@ -111,7 +111,7 @@ def test_redacted_thinking_appended_to_reason(self):
'{"score": 2, "name": "Error_Table", "reason": "Brief."}'
)
result = LLMHtmlExtractCompareV3.process_response(body)
assert "CONTENT_BETTER.Error_Table" == result.label[0]
assert "CONTENT_BETTER" == result.label[0]
parsed = json.loads(result.reason[0])
assert "internal" in parsed["reason"]

Expand Down
Loading