diff --git a/tools/diff_report.py b/tools/diff_report.py
new file mode 100644
index 0000000..37016e4
--- /dev/null
+++ b/tools/diff_report.py
@@ -0,0 +1,310 @@
+"""
+QA 기대값 vs 실제값 Diff 리포트 생성기
+
+사용 예시:
+ python diff_report.py --input results/test_result_YYYYMMDD_HHMMSS.json
+ python diff_report.py --input results/test_result.json --output reports/qa_diff_report.html
+
+입력 JSON은 아래 두 형태를 모두 지원합니다.
+ 1) [{...}, {...}]
+ 2) {"results": [{...}, {...}]}
+"""
+
+from __future__ import annotations
+
+import argparse
+import html
+import json
+from collections import Counter
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+# Recognized result statuses mapped to a lowercase class name.
+# normalize_result() uses the keys to validate an incoming status, and
+# build_table_rows() looks the value up as `status_class` for each row.
+STATUS_CLASSES = {
+ "PASS": "pass",
+ "FAIL": "fail",
+ "ERROR": "error",
+}
+
+
+def as_list(value: Any) -> list[str]:
+    """Normalize a string, sequence, or None into a list of display strings.
+
+    None and blank strings yield an empty list. List and tuple items are
+    converted with ``str`` and None entries are dropped. Any other value is
+    wrapped as a single-element list holding its string form.
+    """
+    if isinstance(value, (list, tuple)):
+        return [str(entry) for entry in value if entry is not None]
+    if isinstance(value, str):
+        # A whitespace-only string carries no code, so treat it as empty.
+        return [value] if value.strip() else []
+    return [] if value is None else [str(value)]
+
+
+def safe_text(value: Any, default: str = "—") -> str:
+    """Return *value* as HTML-escaped text, substituting *default* when empty.
+
+    Args:
+        value: Any object; it is converted with ``str`` before escaping.
+        default: Placeholder used when *value* is None or stringifies to "".
+
+    Returns:
+        The escaped text. The placeholder is escaped as well, so the result
+        is safe to embed in HTML regardless of which branch produced it.
+    """
+    text = "" if value is None else str(value)
+    # Escape the fallback too: previously the None path returned it raw while
+    # the empty-string path escaped it.
+    return html.escape(text or default)
+
+
+def short_text(value: Any, length: int = 60) -> str:
+    """Return *value* as escaped text, cut to *length* characters plus "...".
+
+    Text that already fits within *length* is passed through unchanged; None
+    is treated as an empty string. Escaping is delegated to safe_text().
+    """
+    raw = str(value) if value is not None else ""
+    clipped = raw if len(raw) <= length else raw[:length] + "..."
+    return safe_text(clipped)
+
+
+def get_first(row: dict[str, Any], *keys: str, default: Any = None) -> Any:
+    """Return the first non-None value found in *row* under any of *keys*.
+
+    Keys are tried in the order given, which lets callers accept several
+    spellings of the same field. *default* is returned when every key is
+    absent or maps to None.
+    """
+    present = (row[name] for name in keys if row.get(name) is not None)
+    return next(present, default)
+
+
+def normalize_result(row: dict[str, Any]) -> dict[str, Any]:
+    """Convert one raw test result into the standard fields the report uses.
+
+    Each field is read from the first matching key among several accepted
+    spellings. Pass flags missing from the input are derived: actions are
+    compared case-insensitively when both are known, and codes are compared
+    as sets. A status outside STATUS_CLASSES is replaced by PASS or FAIL
+    computed from those two flags.
+    """
+    placeholder = "—"
+
+    raw_expected_codes = get_first(
+        row, "expected_codes_list", "expected_reason_codes", "expected_codes", default=[]
+    )
+    raw_actual_codes = get_first(
+        row, "actual_reason_codes", "reason_codes", "actual_codes", default=[]
+    )
+    expected_codes = as_list(raw_expected_codes)
+    actual_codes = as_list(raw_actual_codes)
+
+    expected_action = get_first(row, "expected_action", "expected", default=placeholder)
+    actual_action = get_first(row, "actual_action", "action", "actual", default=placeholder)
+
+    action_ok = row.get("action_pass")
+    if action_ok is None:
+        # Only derive the flag when both sides are known; otherwise it stays
+        # None and is reported as False below.
+        if expected_action != placeholder and actual_action != placeholder:
+            action_ok = str(expected_action).upper() == str(actual_action).upper()
+
+    codes_ok = row.get("codes_pass")
+    if codes_ok is None:
+        codes_ok = set(expected_codes) == set(actual_codes)
+
+    status = str(row.get("status") or "").upper()
+    if status not in STATUS_CLASSES:
+        status = "FAIL"
+        if action_ok and codes_ok:
+            status = "PASS"
+
+    normalized: dict[str, Any] = {}
+    normalized["tc_id"] = get_first(row, "tc_id", "id", "case_id", default="—")
+    normalized["description"] = get_first(row, "description", "title", "name", default="")
+    normalized["category"] = get_first(row, "category", "type", default="UNCATEGORIZED")
+    normalized["input_text"] = get_first(row, "input_text", "input", "prompt", "text", default="")
+    normalized["expected_action"] = expected_action
+    normalized["actual_action"] = actual_action
+    normalized["expected_codes_list"] = expected_codes
+    normalized["actual_reason_codes"] = actual_codes
+    normalized["action_pass"] = bool(action_ok)
+    normalized["codes_pass"] = bool(codes_ok)
+    normalized["status"] = status
+    normalized["latency_ms"] = get_first(row, "latency_ms", "elapsed_ms", "duration_ms", default="—")
+    normalized["error"] = get_first(row, "error", "message", default="")
+    return normalized
+
+
+def load_results(input_path: Path) -> list[dict[str, Any]]:
+    """Load a result JSON file and return its entries in normalized form.
+
+    The file may contain either a bare list of result objects or an object
+    holding that list under ``results``, ``test_results`` or ``cases``.
+
+    Args:
+        input_path: Path of the UTF-8 encoded JSON file to read.
+
+    Returns:
+        One normalize_result() dict per entry; an empty list for an empty run.
+
+    Raises:
+        FileNotFoundError: If *input_path* does not exist.
+        ValueError: If no list of results can be found, or an entry is not
+            a JSON object.
+    """
+    if not input_path.exists():
+        raise FileNotFoundError(f"입력 파일을 찾을 수 없습니다: {input_path}")
+
+    with input_path.open(encoding="utf-8") as file:
+        data = json.load(file)
+
+    if isinstance(data, dict):
+        candidates = [data.get(key) for key in ("results", "test_results", "cases")]
+        # Keep the original precedence: the first non-empty payload wins.
+        data = next((item for item in candidates if item), None)
+        if data is None:
+            # An explicitly empty list (e.g. {"results": []}) is a valid run
+            # with zero cases, not a malformed file.
+            data = next((item for item in candidates if isinstance(item, list)), None)
+
+    if not isinstance(data, list):
+        raise ValueError("입력 JSON은 리스트이거나 results/test_results/cases 키를 가진 객체여야 합니다.")
+
+    normalized: list[dict[str, Any]] = []
+    for index, item in enumerate(data, start=1):
+        if not isinstance(item, dict):
+            raise ValueError(f"{index}번째 결과가 객체(dict)가 아닙니다.")
+        normalized.append(normalize_result(item))
+    return normalized
+
+
+def build_table_rows(results: list[dict[str, Any]]) -> str:
+    """Render one HTML ``<tr>`` per normalized result for the detail table.
+
+    Args:
+        results: Rows shaped like the output of normalize_result().
+
+    Returns:
+        The rows joined with newlines. Every value is escaped through
+        safe_text()/short_text(); a mismatching action or code set is flagged
+        with a warning sign, and an error message, when present, is shown
+        under the input text.
+    """
+    rows: list[str] = []
+    for row in results:
+        # Unknown statuses get no class rather than failing the whole report.
+        status_class = STATUS_CLASSES.get(row["status"], "")
+        exp_codes = safe_text(", ".join(row["expected_codes_list"]) if row["expected_codes_list"] else "—")
+        act_codes = safe_text(", ".join(row["actual_reason_codes"]) if row["actual_reason_codes"] else "—")
+        action_diff = "" if row["action_pass"] else " ⚠️"
+        codes_diff = "" if row["codes_pass"] else " ⚠️"
+        error_note = (
+            f'<br><small class="error-note">{safe_text(row["error"])}</small>'
+            if row["error"]
+            else ""
+        )
+
+        rows.append(
+            f"""<tr class="{status_class}">
+  <td>{safe_text(row['tc_id'])}</td>
+  <td>{safe_text(row['category'])}</td>
+  <td>{short_text(row['input_text'])}{error_note}</td>
+  <td>{safe_text(row['expected_action'])}</td>
+  <td>{safe_text(row['actual_action'])}{action_diff}</td>
+  <td>{exp_codes}</td>
+  <td>{act_codes}{codes_diff}</td>
+  <td class="status">{safe_text(row['status'])}</td>
+  <td>{safe_text(row['latency_ms'])}</td>
+</tr>"""
+        )
+    return "\n".join(rows)
+
+
+def generate_html_report(results: list[dict[str, Any]], output_path: Path) -> None:
+    """Write a standalone HTML report for *results* to *output_path*.
+
+    The page contains summary counters (total, PASS with pass rate, FAIL,
+    ERROR), a per-category table, a table of the reason codes seen most often
+    in failing cases (omitted when there are none) and the per-case detail
+    table produced by build_table_rows().
+
+    Args:
+        results: Rows shaped like the output of normalize_result().
+        output_path: Destination file; missing parent directories are created.
+    """
+    total = len(results)
+    status_counts = Counter(row["status"] for row in results)
+    passed = status_counts["PASS"]
+    failed = status_counts["FAIL"]
+    errors = status_counts["ERROR"]
+    # Guard the division so an empty run reports 0.0% instead of crashing.
+    pass_rate = passed / total * 100 if total else 0.0
+
+    category_total = Counter(row["category"] for row in results)
+    category_fail = Counter(row["category"] for row in results if row["status"] == "FAIL")
+
+    # The codes that were actually detected are more useful for debugging a
+    # failure; fall back to the expected codes only when nothing was detected.
+    reason_code_counter = Counter(
+        code
+        for row in results
+        if row["status"] == "FAIL"
+        for code in (row["actual_reason_codes"] or row["expected_codes_list"])
+    )
+
+    category_rows = "\n".join(
+        "<tr>"
+        f"<td>{safe_text(category)}</td>"
+        f"<td>{count}</td>"
+        f"<td>{category_fail.get(category, 0)}</td>"
+        f"<td>{count - category_fail.get(category, 0)}</td>"
+        "</tr>"
+        for category, count in sorted(category_total.items())
+    )
+    reason_rows = "\n".join(
+        f"<tr><td>{safe_text(code)}</td><td>{count}</td></tr>"
+        for code, count in reason_code_counter.most_common()
+    )
+    reason_section = (
+        "<h2>실패 다발 reason_code</h2>\n"
+        "<table>\n"
+        "<thead><tr><th>reason_code</th><th>실패 건수</th></tr></thead>\n"
+        f"<tbody>\n{reason_rows}\n</tbody>\n"
+        "</table>"
+        if reason_rows
+        else ""
+    )
+
+    generated_at = datetime.now().strftime("%Y년 %m월 %d일 %H:%M:%S")
+    # Literal CSS braces are doubled because this is an f-string.
+    html_doc = f"""<!DOCTYPE html>
+<html lang="ko">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>QA Diff Report</title>
+<style>
+body {{ font-family: sans-serif; margin: 24px; color: #222; }}
+table {{ border-collapse: collapse; width: 100%; margin-bottom: 24px; }}
+th, td {{ border: 1px solid #ddd; padding: 6px 10px; text-align: left; vertical-align: top; }}
+th {{ background: #f5f5f5; }}
+.summary {{ display: flex; gap: 16px; margin-bottom: 24px; }}
+.card {{ border: 1px solid #ddd; border-radius: 8px; padding: 12px 20px; text-align: center; }}
+.card strong {{ display: block; font-size: 28px; }}
+.card small {{ display: block; color: #666; }}
+tr.pass .status, .card.pass strong {{ color: #1a7f37; }}
+tr.fail .status, .card.fail strong {{ color: #cf222e; }}
+tr.error .status, .card.error strong {{ color: #9a6700; }}
+.error-note {{ color: #cf222e; }}
+</style>
+</head>
+<body>
+<header>
+<h1>QA Diff Report</h1>
+<p>생성 시각: {generated_at} · 전체 {total}건</p>
+</header>
+<section class="summary">
+<div class="card pass">
+  <strong>{passed}</strong>
+  <span>PASS</span>
+  <small>{pass_rate:.1f}%</small>
+</div>
+<div class="card fail">
+  <strong>{failed}</strong>
+  <span>FAIL</span>
+</div>
+<div class="card error">
+  <strong>{errors}</strong>
+  <span>ERROR</span>
+</div>
+</section>
+<section>
+<h2>카테고리별 결과</h2>
+<table>
+<thead><tr><th>카테고리</th><th>전체</th><th>FAIL</th><th>PASS/ERROR</th></tr></thead>
+<tbody>
+{category_rows}
+</tbody>
+</table>
+</section>
+<section>
+{reason_section}
+</section>
+<section>
+<h2>전체 테스트 케이스 상세</h2>
+<table>
+<thead>
+<tr><th>ID</th><th>카테고리</th><th>입력</th><th>기대 Action</th><th>실제 Action</th><th>기대 Codes</th><th>실제 Codes</th><th>결과</th><th>지연(ms)</th></tr>
+</thead>
+<tbody>
+{build_table_rows(results)}
+</tbody>
+</table>
+</section>
+</body>
+</html>
+"""
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(html_doc, encoding="utf-8")
+
+
+def print_diff_summary(results: list[dict[str, Any]]) -> None:
+    """Print an expected-vs-actual summary of every non-PASS case to stdout.
+
+    For each such case the action mismatch, the codes missing from or extra in
+    the actual result, and any error message are listed. A single "all
+    passed" line is printed instead when every case has status PASS.
+    """
+    divider = "=" * 70
+    print("\n" + divider)
+    print("DIFF 요약 (기대값 vs 실제값)")
+    print(divider)
+
+    non_passing = [entry for entry in results if entry["status"] != "PASS"]
+    if not non_passing:
+        print("모든 테스트 케이스가 PASS입니다.")
+        return
+
+    for entry in non_passing:
+        # Fall back to the category when the case has no description.
+        label = entry["description"] or entry["category"]
+        print(f"\n[{entry['tc_id']}] {label}")
+
+        if not entry["action_pass"]:
+            print(f" Action 기대: {entry['expected_action']} -> 실제: {entry['actual_action']}")
+
+        if not entry["codes_pass"]:
+            wanted = entry["expected_codes_list"]
+            got = entry["actual_reason_codes"]
+            absent = [code for code in wanted if code not in got]
+            surplus = [code for code in got if code not in wanted]
+            if absent:
+                print(f" Codes 누락: {absent}")
+            if surplus:
+                print(f" Codes 추가: {surplus}")
+
+        if entry["error"]:
+            print(f" Error: {entry['error']}")
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command line: a required --input and an optional --output."""
+    cli = argparse.ArgumentParser(description="QA Diff 리포트 생성기")
+    option_specs = (
+        ("--input", {"required": True, "help": "qa_runner/evaluation 결과 JSON 파일 경로"}),
+        ("--output", {"default": None, "help": "HTML 리포트 출력 경로"}),
+    )
+    for flag, settings in option_specs:
+        cli.add_argument(flag, **settings)
+    return cli.parse_args()
+
+
+def main() -> None:
+    """Run the CLI: load results, write the HTML report, print the summary."""
+    args = parse_args()
+    input_path = Path(args.input)
+
+    if args.output:
+        output_path = Path(args.output)
+    else:
+        # Default: place "<input stem>_report.html" next to the input file.
+        output_path = input_path.with_name(f"{input_path.stem}_report.html")
+
+    results = load_results(input_path)
+    generate_html_report(results, output_path)
+    print_diff_summary(results)
+    print(f"\nHTML 리포트 저장: {output_path}\n")
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+ main()