"""
QA expected-vs-actual diff report generator.

Usage:
    python diff_report.py --input results/test_result_YYYYMMDD_HHMMSS.json
    python diff_report.py --input results/test_result.json --output reports/qa_diff_report.html

The input JSON may take either of these shapes:
    1) [{...}, {...}]
    2) {"results": [{...}, {...}]}   ("test_results" and "cases" are accepted too)
"""

from __future__ import annotations

import argparse
import html
import json
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Any

# Maps a normalized status to the CSS class used in the HTML report.
STATUS_CLASSES = {
    "PASS": "pass",
    "FAIL": "fail",
    "ERROR": "error",
}

# Keys probed, in this order, when the input JSON is an object wrapping the list.
_RESULT_CONTAINER_KEYS = ("results", "test_results", "cases")


def as_list(value: Any) -> list[str]:
    """Normalize a string / list / tuple / None into a list of display strings.

    ``None`` and blank strings become ``[]``; ``None`` items inside a sequence
    are dropped; any other scalar becomes a one-element list.
    """
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value if item is not None]
    if isinstance(value, str):
        return [value] if value.strip() else []
    return [str(value)]


def safe_text(value: Any, default: str = "—") -> str:
    """Return *value* as HTML-escaped text, or *default* for ``None``/empty.

    Note that a ``None`` value returns *default* unescaped, while an empty
    string returns the escaped *default*.
    """
    if value is None:
        return default
    text = str(value)
    return html.escape(text if text else default)


def short_text(value: Any, length: int = 60) -> str:
    """Return *value* escaped for HTML, truncated to *length* chars plus ``...``."""
    text = "" if value is None else str(value)
    if len(text) <= length:
        return safe_text(text)
    # Truncate before escaping so an entity is never cut in half.
    return safe_text(text[:length] + "...")


def get_first(row: dict[str, Any], *keys: str, default: Any = None) -> Any:
    """Return the value of the first key in *keys* that is present and not None.

    Lets the report accept result JSON written with different field names.
    """
    for key in keys:
        if key in row and row[key] is not None:
            return row[key]
    return default


def normalize_result(row: dict[str, Any]) -> dict[str, Any]:
    """Convert one raw test result into the standard fields the report uses.

    ``action_pass`` / ``codes_pass`` / ``status`` are taken from *row* when
    present; otherwise they are derived: actions are compared
    case-insensitively, reason codes are compared as sets, and a status that
    is missing or not one of ``STATUS_CLASSES`` becomes ``PASS`` only when
    both checks pass.
    """
    expected_codes = as_list(
        get_first(row, "expected_codes_list", "expected_reason_codes", "expected_codes", default=[])
    )
    actual_codes = as_list(
        get_first(row, "actual_reason_codes", "reason_codes", "actual_codes", default=[])
    )

    expected_action = get_first(row, "expected_action", "expected", default="—")
    actual_action = get_first(row, "actual_action", "action", "actual", default="—")

    action_pass = row.get("action_pass")
    # Only derive the verdict when both sides are known ("—" is the missing marker).
    if action_pass is None and expected_action != "—" and actual_action != "—":
        action_pass = str(expected_action).upper() == str(actual_action).upper()

    codes_pass = row.get("codes_pass")
    if codes_pass is None:
        codes_pass = set(expected_codes) == set(actual_codes)

    status = str(row.get("status") or "").upper()
    if status not in STATUS_CLASSES:
        status = "PASS" if action_pass and codes_pass else "FAIL"

    return {
        "tc_id": get_first(row, "tc_id", "id", "case_id", default="—"),
        "description": get_first(row, "description", "title", "name", default=""),
        "category": get_first(row, "category", "type", default="UNCATEGORIZED"),
        "input_text": get_first(row, "input_text", "input", "prompt", "text", default=""),
        "expected_action": expected_action,
        "actual_action": actual_action,
        "expected_codes_list": expected_codes,
        "actual_reason_codes": actual_codes,
        "action_pass": bool(action_pass),
        "codes_pass": bool(codes_pass),
        "status": status,
        "latency_ms": get_first(row, "latency_ms", "elapsed_ms", "duration_ms", default="—"),
        "error": get_first(row, "error", "message", default=""),
    }


def load_results(input_path: Path) -> list[dict[str, Any]]:
    """Load a result JSON file and return its entries in normalized form.

    Raises:
        FileNotFoundError: *input_path* does not exist.
        ValueError: the JSON is neither a list nor an object holding a list
            under one of the supported keys, or an entry is not an object.
            (``json.JSONDecodeError`` is itself a ``ValueError``.)
    """
    if not input_path.exists():
        raise FileNotFoundError(f"입력 파일을 찾을 수 없습니다: {input_path}")

    with input_path.open(encoding="utf-8") as file:
        data = json.load(file)

    if isinstance(data, dict):
        present = [data[key] for key in _RESULT_CONTAINER_KEYS if data.get(key) is not None]
        # Prefer the first non-empty container; otherwise fall back to the first
        # one present so that a legitimately empty run ({"results": []}) is
        # accepted instead of being mistaken for a missing key.
        data = next((item for item in present if item), present[0] if present else None)

    if not isinstance(data, list):
        raise ValueError("입력 JSON은 리스트이거나 results/test_results/cases 키를 가진 객체여야 합니다.")

    normalized: list[dict[str, Any]] = []
    for index, item in enumerate(data, start=1):
        if not isinstance(item, dict):
            raise ValueError(f"{index}번째 결과가 객체(dict)가 아닙니다.")
        normalized.append(normalize_result(item))
    return normalized


def build_table_rows(results: list[dict[str, Any]]) -> str:
    """Render one ``<tr>`` per normalized result for the detail table.

    Every value taken from the results is HTML-escaped. A warning sign is
    appended to the actual action / actual codes cell when that check failed.
    """
    rows: list[str] = []
    for row in results:
        status_class = STATUS_CLASSES.get(row["status"], "")
        exp_codes = safe_text(
            ", ".join(row["expected_codes_list"]) if row["expected_codes_list"] else "—"
        )
        act_codes = safe_text(
            ", ".join(row["actual_reason_codes"]) if row["actual_reason_codes"] else "—"
        )
        action_diff = "" if row["action_pass"] else " ⚠️"
        codes_diff = "" if row["codes_pass"] else " ⚠️"
        error_note = (
            f"<div class='error-note'>{safe_text(row['error'])}</div>" if row["error"] else ""
        )

        rows.append(
            f"""
        <tr class="{status_class}">
          <td>{safe_text(row['tc_id'])}</td>
          <td>{safe_text(row['category'])}</td>
          <td>{short_text(row['input_text'])}{error_note}</td>
          <td>{safe_text(row['expected_action'])}</td>
          <td>{safe_text(row['actual_action'])}{action_diff}</td>
          <td>{exp_codes}</td>
          <td>{act_codes}{codes_diff}</td>
          <td><span class="badge {status_class}">{safe_text(row['status'])}</span></td>
          <td>{safe_text(row['latency_ms'])}</td>
        </tr>"""
        )
    return "\n".join(rows)


def generate_html_report(results: list[dict[str, Any]], output_path: Path) -> None:
    """Write a standalone HTML report for *results* to *output_path*.

    The report holds summary counters, a per-category table, a table of the
    reason codes seen on FAIL rows (omitted when there are none) and the
    detail table. Parent directories of *output_path* are created as needed.
    """
    total = len(results)
    passed = sum(1 for row in results if row["status"] == "PASS")
    failed = sum(1 for row in results if row["status"] == "FAIL")
    errors = sum(1 for row in results if row["status"] == "ERROR")
    pass_rate = passed / total * 100 if total else 0.0

    category_total = Counter(row["category"] for row in results)
    category_fail = Counter(row["category"] for row in results if row["status"] == "FAIL")

    failed_reason_codes: list[str] = []
    for row in results:
        if row["status"] == "FAIL":
            # Counting the codes actually detected is more useful for debugging;
            # fall back to the expected codes only when nothing was detected.
            failed_reason_codes.extend(row["actual_reason_codes"] or row["expected_codes_list"])
    reason_code_counter = Counter(failed_reason_codes)

    category_rows = "\n".join(
        f"<tr><td>{safe_text(category)}</td><td>{count}</td>"
        f"<td>{category_fail.get(category, 0)}</td>"
        f"<td>{count - category_fail.get(category, 0)}</td></tr>"
        # Categories come straight from JSON and may mix types; compare as text.
        for category, count in sorted(category_total.items(), key=lambda item: str(item[0]))
    )
    reason_rows = "\n".join(
        f"<tr><td>{safe_text(code)}</td><td>{count}</td></tr>"
        for code, count in reason_code_counter.most_common()
    )
    reason_section = (
        "<div class='section'><h2>실패 다발 reason_code</h2>"
        "<table><thead><tr><th>reason_code</th><th>실패 건수</th></tr></thead><tbody>"
        f"{reason_rows}</tbody></table></div>"
        if reason_rows
        else ""
    )
    detail_rows = build_table_rows(results)

    generated_at = datetime.now().strftime("%Y년 %m월 %d일 %H:%M:%S")
    html_doc = f"""<!DOCTYPE html>
<html lang="ko">
<head>
<meta charset="utf-8">
<title>QA Diff Report</title>
<style>
  body {{ font-family: sans-serif; margin: 0; background: #f5f6f8; color: #222; }}
  .container {{ max-width: 1280px; margin: 0 auto; padding: 24px; }}
  .meta {{ color: #666; }}
  .cards {{ display: flex; gap: 16px; margin: 16px 0; }}
  .card {{ flex: 1; background: #fff; border-radius: 8px; padding: 16px; text-align: center; }}
  .card .num {{ font-size: 28px; font-weight: bold; }}
  .section {{ background: #fff; border-radius: 8px; padding: 16px; margin: 16px 0; }}
  table {{ width: 100%; border-collapse: collapse; }}
  th, td {{ border-bottom: 1px solid #e3e5e8; padding: 6px 8px; text-align: left; }}
  .badge {{ padding: 2px 8px; border-radius: 4px; color: #fff; }}
  .badge.pass, .card.pass .num {{ background: #2e9e5b; color: #fff; }}
  .badge.fail, .card.fail .num {{ background: #d64545; color: #fff; }}
  .badge.error, .card.error .num {{ background: #d98a1f; color: #fff; }}
  .error-note {{ color: #d64545; font-size: 12px; }}
</style>
</head>
<body>
<div class="container">
  <h1>LLM 보안 프록시 QA Diff Report</h1>
  <p class="meta">생성일시: {generated_at} | 총 {total}개 테스트 케이스</p>

  <div class="cards">
    <div class="card">
      <div class="num">{total}</div>
      <div class="label">전체 케이스</div>
    </div>
    <div class="card pass">
      <div class="num">{passed}</div>
      <div class="label">PASS</div>
      <div class="rate">{pass_rate:.1f}%</div>
    </div>
    <div class="card fail">
      <div class="num">{failed}</div>
      <div class="label">FAIL</div>
    </div>
    <div class="card error">
      <div class="num">{errors}</div>
      <div class="label">ERROR</div>
    </div>
  </div>

  <div class="section">
    <h2>카테고리별 결과</h2>
    <table>
      <thead><tr><th>카테고리</th><th>전체</th><th>FAIL</th><th>PASS/ERROR</th></tr></thead>
      <tbody>
      {category_rows}
      </tbody>
    </table>
  </div>

  {reason_section}

  <div class="section">
    <h2>전체 테스트 케이스 상세</h2>
    <table>
      <thead><tr><th>ID</th><th>카테고리</th><th>입력</th><th>기대 Action</th><th>실제 Action</th><th>기대 Codes</th><th>실제 Codes</th><th>결과</th><th>지연(ms)</th></tr></thead>
      <tbody>
      {detail_rows}
      </tbody>
    </table>
  </div>
</div>
</body>
</html>
"""

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(html_doc, encoding="utf-8")


def print_diff_summary(results: list[dict[str, Any]]) -> None:
    """Print every non-PASS result with its action and reason-code differences."""
    print("\n" + "=" * 70)
    print("DIFF 요약 (기대값 vs 실제값)")
    print("=" * 70)

    has_diff = False
    for row in results:
        if row["status"] == "PASS":
            continue
        has_diff = True
        print(f"\n[{row['tc_id']}] {row['description'] or row['category']}")
        if not row["action_pass"]:
            print(f"  Action  기대: {row['expected_action']} -> 실제: {row['actual_action']}")
        if not row["codes_pass"]:
            expected = row["expected_codes_list"]
            actual = row["actual_reason_codes"]
            expected_set, actual_set = set(expected), set(actual)
            # Iterate the lists (not the sets) so the printed order stays stable.
            missing = [code for code in expected if code not in actual_set]
            extra = [code for code in actual if code not in expected_set]
            if missing:
                print(f"  Codes 누락: {missing}")
            if extra:
                print(f"  Codes 추가: {extra}")
        if row["error"]:
            print(f"  Error: {row['error']}")

    if not has_diff:
        print("모든 테스트 케이스가 PASS입니다.")


def parse_args() -> argparse.Namespace:
    """Parse the command line: ``--input`` (required) and ``--output`` (optional)."""
    parser = argparse.ArgumentParser(description="QA Diff 리포트 생성기")
    parser.add_argument("--input", required=True, help="qa_runner/evaluation 결과 JSON 파일 경로")
    parser.add_argument("--output", default=None, help="HTML 리포트 출력 경로")
    return parser.parse_args()


def main() -> None:
    """CLI entry point: load results, write the HTML report, print the summary.

    Without ``--output`` the report is written next to the input file as
    ``<input stem>_report.html``. A missing or malformed input file ends the
    program with a one-line message instead of a traceback.
    """
    args = parse_args()
    input_path = Path(args.input)
    output_path = (
        Path(args.output) if args.output else input_path.with_name(f"{input_path.stem}_report.html")
    )

    try:
        results = load_results(input_path)
    except (FileNotFoundError, ValueError) as exc:
        # SystemExit with a string prints it to stderr and exits with status 1.
        raise SystemExit(f"오류: {exc}") from exc

    generate_html_report(results, output_path)
    print_diff_summary(results)
    print(f"\nHTML 리포트 저장: {output_path}\n")


if __name__ == "__main__":
    main()