From eb4a0af6de5be3ec83522fc9a6ee2ab48cbc4457 Mon Sep 17 00:00:00 2001 From: youngdong Date: Tue, 19 May 2026 21:17:08 +0900 Subject: [PATCH] fix: align hybrid evaluation with model hits --- evaluation/deepset_official_split_compare.py | 9 ++- evaluation/external_dataset_compare.py | 77 ++++++++++++++++++-- evaluation/external_overlap_analysis.py | 60 +++++++++++++-- 3 files changed, 129 insertions(+), 17 deletions(-) diff --git a/evaluation/deepset_official_split_compare.py b/evaluation/deepset_official_split_compare.py index c1e9611..161bcf9 100644 --- a/evaluation/deepset_official_split_compare.py +++ b/evaluation/deepset_official_split_compare.py @@ -165,8 +165,8 @@ def _render_report(generated_at: str, threshold: float, rows: list[dict[str, Any f"- Generated at: `{generated_at}`", f"- Lightweight threshold: `{threshold:.2f}`", "", - "| Split Policy | Dataset | Model Version | Mode | Precision | Recall | F1 | Accuracy | TP | FP | TN | FN |", - "|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|", + "| Split Policy | Dataset | Model Version | Mode | Precision | Recall | F1 | Accuracy | TP | FP | TN | FN | Safe Guard Cancelled Model Hits | Cancelled TP |", + "|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|", ] for row in rows: lines.append( @@ -181,7 +181,9 @@ def _render_report(generated_at: str, threshold: float, rows: list[dict[str, Any f"| {_fmt(row['tp'])} " f"| {_fmt(row['fp'])} " f"| {_fmt(row['tn'])} " - f"| {_fmt(row['fn'])} |" + f"| {_fmt(row['fn'])} " + f"| {_fmt(row.get('model_hit_cancelled_by_safe_guard_count'))} " + f"| {_fmt(row.get('model_hit_cancelled_by_safe_guard_tp'))} |" ) lines.extend( @@ -193,6 +195,7 @@ def _render_report(generated_at: str, threshold: float, rows: list[dict[str, Any "", "- `custom 70/30 eval` uses the project-generated held-out eval split and the saved `external-tuned` artifact.", "- `official train/test` trains a temporary lightweight model with internal samples plus deepset official train split, then evaluates deepset official test split.", + "- `Hybrid / Full Pipeline` predictions are counted as `rule_predicted OR model_predicted`; safe explanation guard cancellations are reported separately instead of lowering Hybrid TP.", "- If custom split performance is much higher than official test performance, custom split metrics may be easier or inflated by similar examples.", "", ] diff --git a/evaluation/external_dataset_compare.py b/evaluation/external_dataset_compare.py index 61a8fc4..d55fc4a 100644 --- a/evaluation/external_dataset_compare.py +++ b/evaluation/external_dataset_compare.py @@ -85,7 +85,16 @@ class DatasetBundle: note: str = "" -Predictor = Callable[[str], bool] +@dataclass(frozen=True, slots=True) +class PredictionDecision: + predicted: bool + rule_predicted: bool | None = None + model_predicted: bool | None = None + pipeline_predicted: bool | None = None + model_hit_cancelled_by_safe_guard: bool = False + + +Predictor = Callable[[str], bool | PredictionDecision] DATASET_SPECS = ( @@ -182,12 +191,35 @@ def _hybrid_pipeline(classifier: LightweightClassifier, threshold: float) -> Pre model_detector_fail_mode="warn", ) - def predict(text: str) -> bool: + def predict(text: str) -> PredictionDecision: + rule_predicted = _rule_only(text) result = detect_hybrid(text, classifier=classifier, settings=settings) - return any( + model_predicted = ( + _is_model_injection_prediction(result.model_prediction) + if result.model_prediction is not None + else False + ) + pipeline_predicted = any( + detection.detector_type == DetectorType.INJECTION + for detection in result.detections + ) + model_injection_detection = any( detection.detector_type == DetectorType.INJECTION + and detection.detector_name == "llm" for detection in result.detections ) + model_hit_cancelled_by_safe_guard = ( + model_predicted + and not model_injection_detection + and "SAFE_SECURITY_EXPLANATION" in result.reason_codes + ) + return PredictionDecision( + predicted=rule_predicted or model_predicted, + rule_predicted=rule_predicted, + model_predicted=model_predicted, + pipeline_predicted=pipeline_predicted, + model_hit_cancelled_by_safe_guard=model_hit_cancelled_by_safe_guard, + ) return predict @@ -270,12 +302,41 @@ def _metric_result( ) -> dict[str, Any]: tp = fp = fn = tn = 0 latencies: list[float] = [] + decision_diagnostics = { + "rule_predicted_count": 0, + "model_predicted_count": 0, + "hybrid_pipeline_predicted_count": 0, + "model_hit_cancelled_by_safe_guard_count": 0, + "model_hit_cancelled_by_safe_guard_tp": 0, + "hybrid_or_changed_prediction_count": 0, + } + saw_decision_diagnostics = False for sample in dataset.samples: started = time.perf_counter() - predicted = predictor(sample.text) + prediction_result = predictor(sample.text) latencies.append((time.perf_counter() - started) * 1000) + if isinstance(prediction_result, PredictionDecision): + saw_decision_diagnostics = True + predicted = prediction_result.predicted + if prediction_result.rule_predicted: + decision_diagnostics["rule_predicted_count"] += 1 + if prediction_result.model_predicted: + decision_diagnostics["model_predicted_count"] += 1 + if prediction_result.pipeline_predicted: + decision_diagnostics["hybrid_pipeline_predicted_count"] += 1 + if prediction_result.model_hit_cancelled_by_safe_guard: + decision_diagnostics["model_hit_cancelled_by_safe_guard_count"] += 1 + if sample.expected_injection: + decision_diagnostics["model_hit_cancelled_by_safe_guard_tp"] += 1 + if prediction_result.pipeline_predicted is not None and ( + prediction_result.predicted != prediction_result.pipeline_predicted + ): + decision_diagnostics["hybrid_or_changed_prediction_count"] += 1 + else: + predicted = prediction_result + if predicted and sample.expected_injection: tp += 1 elif predicted and not sample.expected_injection: @@ -292,7 +353,7 @@ def _metric_result( f1 = None if precision is None else _safe_div(2 * precision * recall, precision + recall) accuracy = _safe_div(tp + tn, size) - return { + row = { "dataset_name": dataset.spec.name, "model_version": model_version, "mode": mode, @@ -311,6 +372,10 @@ def _metric_result( "dataset_status": dataset.status, "note": dataset.note, } + if saw_decision_diagnostics: + row.update(decision_diagnostics) + row["hybrid_prediction_formula"] = "rule_predicted OR model_predicted" + return row def _na_result( @@ -770,7 +835,7 @@ def _render_markdown( "", "- `Rule Only`는 `backend/app/detection/injection_detector.py`의 규칙·휴리스틱 Prompt Injection 탐지만 사용한다.", "- `Lightweight Model Only`는 `models/lightweight/vectorizer.joblib`와 `models/lightweight/classifier.joblib`가 실제로 로드된 경우에만 측정한다.", - "- `Hybrid / Full Pipeline`은 현재 프로젝트의 다층형 탐지 파이프라인 실행 경로이며, 규칙 탐지와 경량 모델 계층을 함께 사용한다.", + "- `Hybrid / Full Pipeline`은 `rule_predicted OR model_predicted` 기준으로 집계한다. safe explanation guard가 model hit를 취소한 경우에는 JSON 결과의 `model_hit_cancelled_by_safe_guard_count`와 `model_hit_cancelled_by_safe_guard_tp`에 별도로 기록한다.", "- `Lakera/gandalf_ignore_instructions`는 공격 샘플 중심 데이터셋이므로 Precision, F1, FP, TN은 `N/A`로 표시하고 Recall과 Accuracy 중심으로 해석한다.", "- `model_status`가 `enabled`가 아니면 Hybrid 결과는 경량 분류 계층이 빠진 fallback 성격이므로 완전한 Hybrid 성능으로 과장하지 않는다.", "- sklearn artifact 버전 경고가 발생하면 같은 scikit-learn 버전으로 artifact를 재생성한 뒤 결과를 다시 확인한다.", diff --git a/evaluation/external_overlap_analysis.py b/evaluation/external_overlap_analysis.py index 1d6c02a..968755d 100644 --- a/evaluation/external_overlap_analysis.py +++ b/evaluation/external_overlap_analysis.py @@ -36,11 +36,11 @@ OVERLAP_CSV_PATH = Path("reports/external_overlap_analysis_results.csv") -def _hybrid_predicted( +def _hybrid_pipeline_prediction( text: str, classifier: LightweightClassifier, threshold: float, -) -> bool: +) -> tuple[bool, bool]: settings = DetectionSettings( enable_model_detector=True, detection_mode="hybrid", @@ -48,10 +48,22 @@ def _hybrid_predicted( model_detector_fail_mode="warn", ) result = detect_hybrid(text, classifier=classifier, settings=settings) - return any( + pipeline_predicted = any( detection.detector_type == DetectorType.INJECTION for detection in result.detections ) + model_injection_detection = any( + detection.detector_type == DetectorType.INJECTION + and detection.detector_name == "llm" + for detection in result.detections + ) + model_hit_cancelled_by_safe_guard = ( + result.model_prediction is not None + and _is_model_injection_prediction(result.model_prediction) + and not model_injection_detection + and "SAFE_SECURITY_EXPLANATION" in result.reason_codes + ) + return pipeline_predicted, model_hit_cancelled_by_safe_guard def _analyze_dataset( @@ -68,7 +80,12 @@ def _analyze_dataset( rule_predicted = bool(detect_injection(sample.text)) model_prediction = classifier.classify(sample.text) model_predicted = _is_model_injection_prediction(model_prediction) - hybrid_predicted = _hybrid_predicted(sample.text, classifier, threshold) + hybrid_pipeline_predicted, model_hit_cancelled_by_safe_guard = _hybrid_pipeline_prediction( + sample.text, + classifier, + threshold, + ) + hybrid_predicted = rule_predicted or model_predicted expected = bool(sample.expected_injection) sample_rows.append( { @@ -79,6 +96,8 @@ def _analyze_dataset( "rule_predicted": rule_predicted, "model_predicted": model_predicted, "hybrid_predicted": hybrid_predicted, + "hybrid_pipeline_predicted": hybrid_pipeline_predicted, + "model_hit_cancelled_by_safe_guard": model_hit_cancelled_by_safe_guard, "model_label": model_prediction.label, "model_confidence": model_prediction.confidence, } @@ -111,6 +130,19 @@ def _analyze_dataset( for row in sample_rows if row["expected_injection"] and row["hybrid_predicted"] and not row["rule_predicted"] ) + hybrid_pipeline_tp = sum( + 1 + for row in sample_rows + if row["expected_injection"] and row["hybrid_pipeline_predicted"] + ) + model_hit_cancelled_by_safe_guard_count = sum( + 1 for row in sample_rows if row["model_hit_cancelled_by_safe_guard"] + ) + model_hit_cancelled_by_safe_guard_tp = sum( + 1 + for row in sample_rows + if row["expected_injection"] and row["model_hit_cancelled_by_safe_guard"] + ) summary = { "dataset_name": dataset_name, @@ -124,6 +156,9 @@ def _analyze_dataset( "model_only_unique_tp": model_only_unique_tp, "hybrid_tp": hybrid_tp, "hybrid_extra_tp": hybrid_extra_tp, + "hybrid_pipeline_tp": hybrid_pipeline_tp, + "model_hit_cancelled_by_safe_guard_count": model_hit_cancelled_by_safe_guard_count, + "model_hit_cancelled_by_safe_guard_tp": model_hit_cancelled_by_safe_guard_tp, "hybrid_tp_equals_rule_plus_model_unique": hybrid_tp == rule_tp + model_only_unique_tp, "hybrid_tp_equals_rule_plus_hybrid_extra": hybrid_tp == rule_tp + hybrid_extra_tp, } @@ -169,6 +204,9 @@ def _run_analysis( "model_only_unique_tp": None, "hybrid_tp": None, "hybrid_extra_tp": None, + "hybrid_pipeline_tp": None, + "model_hit_cancelled_by_safe_guard_count": None, + "model_hit_cancelled_by_safe_guard_tp": None, "hybrid_tp_equals_rule_plus_model_unique": None, "hybrid_tp_equals_rule_plus_hybrid_extra": None, "dataset_status": dataset.status, @@ -223,8 +261,8 @@ def _render_report( "", "## Summary", "", - "| Dataset | Model Version | Rule TP | Model TP | Both TP | Rule Only TP | Model Only Unique TP | Hybrid TP | Hybrid Extra TP |", - "|---|---|---:|---:|---:|---:|---:|---:|---:|", + "| Dataset | Model Version | Rule TP | Model TP | Both TP | Rule Only TP | Model Only Unique TP | Hybrid TP | Hybrid Extra TP | Pipeline TP | Safe Guard Cancelled Model Hits | Cancelled TP |", + "|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|", ] for row in summaries: lines.append( @@ -236,7 +274,10 @@ def _render_report( f"| {_fmt(row['rule_only_tp'])} " f"| {_fmt(row['model_only_unique_tp'])} " f"| {_fmt(row['hybrid_tp'])} " - f"| {_fmt(row['hybrid_extra_tp'])} |" + f"| {_fmt(row['hybrid_extra_tp'])} " + f"| {_fmt(row['hybrid_pipeline_tp'])} " + f"| {_fmt(row['model_hit_cancelled_by_safe_guard_count'])} " + f"| {_fmt(row['model_hit_cancelled_by_safe_guard_tp'])} |" ) lines.extend( @@ -248,7 +289,7 @@ def _render_report( "", "반대로 external-tuned 모델처럼 `Model Only Unique TP`가 증가하면 Hybrid TP도 Rule TP보다 커진다. 따라서 이 표는 Hybrid 개선 여부를 모델 계층의 독립 기여도로 설명하는 핵심 근거다.", "", - "`Hybrid Extra TP`는 실제 Hybrid 실행 결과가 Rule Only보다 추가로 맞춘 공격 샘플 수다. 이 값이 `Model Only Unique TP`와 다르면, 현재 Hybrid 내부의 model detector heuristic 또는 fallback reason이 순수 lightweight classifier와 다르게 작동했다는 뜻이다.", + "`Hybrid TP`와 `Hybrid Extra TP`는 `rule_predicted OR model_predicted` 기준이다. `Pipeline TP`는 safe explanation guard가 적용된 기존 `detect_hybrid()` 실행 결과이며, guard로 취소된 model hit는 별도 열에 기록한다.", "", "샘플 단위의 `expected_injection`, `rule_predicted`, `model_predicted`, `hybrid_predicted` 값은 JSON 결과 파일의 `sample_predictions`에 저장한다.", "", @@ -292,6 +333,9 @@ def _write_csv(rows: list[dict[str, Any]], path: Path) -> None: "model_only_unique_tp", "hybrid_tp", "hybrid_extra_tp", + "hybrid_pipeline_tp", + "model_hit_cancelled_by_safe_guard_count", + "model_hit_cancelled_by_safe_guard_tp", "hybrid_tp_equals_rule_plus_model_unique", "hybrid_tp_equals_rule_plus_hybrid_extra", "dataset_status",