From eb4a0af6de5be3ec83522fc9a6ee2ab48cbc4457 Mon Sep 17 00:00:00 2001
From: youngdong <jho87190944@gmail.com>
Date: Tue, 19 May 2026 21:17:08 +0900
Subject: [PATCH] fix: align hybrid evaluation with model hits

---
 evaluation/deepset_official_split_compare.py |  9 ++-
 evaluation/external_dataset_compare.py       | 77 ++++++++++++++++++--
 evaluation/external_overlap_analysis.py      | 60 +++++++++++++--
 3 files changed, 129 insertions(+), 17 deletions(-)

diff --git a/evaluation/deepset_official_split_compare.py b/evaluation/deepset_official_split_compare.py
index c1e9611..161bcf9 100644
--- a/evaluation/deepset_official_split_compare.py
+++ b/evaluation/deepset_official_split_compare.py
@@ -165,8 +165,8 @@ def _render_report(generated_at: str, threshold: float, rows: list[dict[str, Any
         f"- Generated at: `{generated_at}`",
         f"- Lightweight threshold: `{threshold:.2f}`",
         "",
-        "| Split Policy | Dataset | Model Version | Mode | Precision | Recall | F1 | Accuracy | TP | FP | TN | FN |",
-        "|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|",
+        "| Split Policy | Dataset | Model Version | Mode | Precision | Recall | F1 | Accuracy | TP | FP | TN | FN | Safe Guard Cancelled Model Hits | Cancelled TP |",
+        "|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|",
     ]
     for row in rows:
         lines.append(
@@ -181,7 +181,9 @@ def _render_report(generated_at: str, threshold: float, rows: list[dict[str, Any
             f"| {_fmt(row['tp'])} "
             f"| {_fmt(row['fp'])} "
             f"| {_fmt(row['tn'])} "
-            f"| {_fmt(row['fn'])} |"
+            f"| {_fmt(row['fn'])} "
+            f"| {_fmt(row.get('model_hit_cancelled_by_safe_guard_count'))} "
+            f"| {_fmt(row.get('model_hit_cancelled_by_safe_guard_tp'))} |"
         )
 
     lines.extend(
@@ -193,6 +195,7 @@ def _render_report(generated_at: str, threshold: float, rows: list[dict[str, Any
             "",
             "- `custom 70/30 eval` uses the project-generated held-out eval split and the saved `external-tuned` artifact.",
             "- `official train/test` trains a temporary lightweight model with internal samples plus deepset official train split, then evaluates deepset official test split.",
+            "- `Hybrid / Full Pipeline` predictions are counted as `rule_predicted OR model_predicted`; safe explanation guard cancellations are reported separately instead of lowering Hybrid TP.",
             "- If custom split performance is much higher than official test performance, custom split metrics may be easier or inflated by similar examples.",
             "",
         ]
diff --git a/evaluation/external_dataset_compare.py b/evaluation/external_dataset_compare.py
index 61a8fc4..d55fc4a 100644
--- a/evaluation/external_dataset_compare.py
+++ b/evaluation/external_dataset_compare.py
@@ -85,7 +85,16 @@ class DatasetBundle:
     note: str = ""
 
 
-Predictor = Callable[[str], bool]
+@dataclass(frozen=True, slots=True)
+class PredictionDecision:  # per-sample verdict plus the individual signals it was derived from
+    predicted: bool  # the value _metric_result scores into TP/FP/TN/FN
+    rule_predicted: bool | None = None  # result of _rule_only(text); None when the predictor does not report it
+    model_predicted: bool | None = None  # _is_model_injection_prediction() on the model prediction; False when there is none
+    pipeline_predicted: bool | None = None  # True when detect_hybrid() returned any DetectorType.INJECTION detection
+    model_hit_cancelled_by_safe_guard: bool = False  # model hit with no "llm" INJECTION detection and SAFE_SECURITY_EXPLANATION in reason_codes
+
+
+Predictor = Callable[[str], bool | PredictionDecision]
 
 
 DATASET_SPECS = (
@@ -182,12 +191,35 @@ def _hybrid_pipeline(classifier: LightweightClassifier, threshold: float) -> Pre
         model_detector_fail_mode="warn",
     )
 
-    def predict(text: str) -> bool:
+    def predict(text: str) -> PredictionDecision:
+        rule_predicted = _rule_only(text)
         result = detect_hybrid(text, classifier=classifier, settings=settings)
-        return any(
+        model_predicted = (
+            _is_model_injection_prediction(result.model_prediction)
+            if result.model_prediction is not None
+            else False
+        )
+        pipeline_predicted = any(
+            detection.detector_type == DetectorType.INJECTION
+            for detection in result.detections
+        )
+        model_injection_detection = any(
             detection.detector_type == DetectorType.INJECTION
+            and detection.detector_name == "llm"
             for detection in result.detections
         )
+        model_hit_cancelled_by_safe_guard = (
+            model_predicted
+            and not model_injection_detection
+            and "SAFE_SECURITY_EXPLANATION" in result.reason_codes
+        )
+        return PredictionDecision(
+            predicted=rule_predicted or model_predicted,
+            rule_predicted=rule_predicted,
+            model_predicted=model_predicted,
+            pipeline_predicted=pipeline_predicted,
+            model_hit_cancelled_by_safe_guard=model_hit_cancelled_by_safe_guard,
+        )
 
     return predict
 
@@ -270,12 +302,41 @@ def _metric_result(
 ) -> dict[str, Any]:
     tp = fp = fn = tn = 0
     latencies: list[float] = []
+    decision_diagnostics = {
+        "rule_predicted_count": 0,
+        "model_predicted_count": 0,
+        "hybrid_pipeline_predicted_count": 0,
+        "model_hit_cancelled_by_safe_guard_count": 0,
+        "model_hit_cancelled_by_safe_guard_tp": 0,
+        "hybrid_or_changed_prediction_count": 0,
+    }
+    saw_decision_diagnostics = False
 
     for sample in dataset.samples:
         started = time.perf_counter()
-        predicted = predictor(sample.text)
+        prediction_result = predictor(sample.text)
         latencies.append((time.perf_counter() - started) * 1000)
 
+        if isinstance(prediction_result, PredictionDecision):
+            saw_decision_diagnostics = True
+            predicted = prediction_result.predicted
+            if prediction_result.rule_predicted:
+                decision_diagnostics["rule_predicted_count"] += 1
+            if prediction_result.model_predicted:
+                decision_diagnostics["model_predicted_count"] += 1
+            if prediction_result.pipeline_predicted:
+                decision_diagnostics["hybrid_pipeline_predicted_count"] += 1
+            if prediction_result.model_hit_cancelled_by_safe_guard:
+                decision_diagnostics["model_hit_cancelled_by_safe_guard_count"] += 1
+                if sample.expected_injection:
+                    decision_diagnostics["model_hit_cancelled_by_safe_guard_tp"] += 1
+            if prediction_result.pipeline_predicted is not None and (
+                prediction_result.predicted != prediction_result.pipeline_predicted
+            ):
+                decision_diagnostics["hybrid_or_changed_prediction_count"] += 1
+        else:
+            predicted = prediction_result
+
         if predicted and sample.expected_injection:
             tp += 1
         elif predicted and not sample.expected_injection:
@@ -292,7 +353,7 @@ def _metric_result(
     f1 = None if precision is None else _safe_div(2 * precision * recall, precision + recall)
     accuracy = _safe_div(tp + tn, size)
 
-    return {
+    row = {
         "dataset_name": dataset.spec.name,
         "model_version": model_version,
         "mode": mode,
@@ -311,6 +372,10 @@ def _metric_result(
         "dataset_status": dataset.status,
         "note": dataset.note,
     }
+    if saw_decision_diagnostics:
+        row.update(decision_diagnostics)
+        row["hybrid_prediction_formula"] = "rule_predicted OR model_predicted"
+    return row
 
 
 def _na_result(
@@ -770,7 +835,7 @@ def _render_markdown(
             "",
             "- `Rule Only`는 `backend/app/detection/injection_detector.py`의 규칙·휴리스틱 Prompt Injection 탐지만 사용한다.",
             "- `Lightweight Model Only`는 `models/lightweight/vectorizer.joblib`와 `models/lightweight/classifier.joblib`가 실제로 로드된 경우에만 측정한다.",
-            "- `Hybrid / Full Pipeline`은 현재 프로젝트의 다층형 탐지 파이프라인 실행 경로이며, 규칙 탐지와 경량 모델 계층을 함께 사용한다.",
+            "- `Hybrid / Full Pipeline`은 `rule_predicted OR model_predicted` 기준으로 집계한다. safe explanation guard가 model hit를 취소한 경우에는 JSON 결과의 `model_hit_cancelled_by_safe_guard_count`와 `model_hit_cancelled_by_safe_guard_tp`에 별도로 기록한다.",
             "- `Lakera/gandalf_ignore_instructions`는 공격 샘플 중심 데이터셋이므로 Precision, F1, FP, TN은 `N/A`로 표시하고 Recall과 Accuracy 중심으로 해석한다.",
             "- `model_status`가 `enabled`가 아니면 Hybrid 결과는 경량 분류 계층이 빠진 fallback 성격이므로 완전한 Hybrid 성능으로 과장하지 않는다.",
             "- sklearn artifact 버전 경고가 발생하면 같은 scikit-learn 버전으로 artifact를 재생성한 뒤 결과를 다시 확인한다.",
diff --git a/evaluation/external_overlap_analysis.py b/evaluation/external_overlap_analysis.py
index 1d6c02a..968755d 100644
--- a/evaluation/external_overlap_analysis.py
+++ b/evaluation/external_overlap_analysis.py
@@ -36,11 +36,11 @@
 OVERLAP_CSV_PATH = Path("reports/external_overlap_analysis_results.csv")
 
 
-def _hybrid_predicted(
+def _hybrid_pipeline_prediction(
     text: str,
     classifier: LightweightClassifier,
     threshold: float,
-) -> bool:
+) -> tuple[bool, bool]:
     settings = DetectionSettings(
         enable_model_detector=True,
         detection_mode="hybrid",
@@ -48,10 +48,22 @@ def _hybrid_predicted(
         model_detector_fail_mode="warn",
     )
     result = detect_hybrid(text, classifier=classifier, settings=settings)
-    return any(
+    pipeline_predicted = any(
         detection.detector_type == DetectorType.INJECTION
         for detection in result.detections
     )
+    model_injection_detection = any(
+        detection.detector_type == DetectorType.INJECTION
+        and detection.detector_name == "llm"
+        for detection in result.detections
+    )
+    model_hit_cancelled_by_safe_guard = (
+        result.model_prediction is not None
+        and _is_model_injection_prediction(result.model_prediction)
+        and not model_injection_detection
+        and "SAFE_SECURITY_EXPLANATION" in result.reason_codes
+    )
+    return pipeline_predicted, model_hit_cancelled_by_safe_guard
 
 
 def _analyze_dataset(
@@ -68,7 +80,12 @@ def _analyze_dataset(
         rule_predicted = bool(detect_injection(sample.text))
         model_prediction = classifier.classify(sample.text)
         model_predicted = _is_model_injection_prediction(model_prediction)
-        hybrid_predicted = _hybrid_predicted(sample.text, classifier, threshold)
+        hybrid_pipeline_predicted, model_hit_cancelled_by_safe_guard = _hybrid_pipeline_prediction(
+            sample.text,
+            classifier,
+            threshold,
+        )
+        hybrid_predicted = rule_predicted or model_predicted
         expected = bool(sample.expected_injection)
         sample_rows.append(
             {
@@ -79,6 +96,8 @@ def _analyze_dataset(
                 "rule_predicted": rule_predicted,
                 "model_predicted": model_predicted,
                 "hybrid_predicted": hybrid_predicted,
+                "hybrid_pipeline_predicted": hybrid_pipeline_predicted,
+                "model_hit_cancelled_by_safe_guard": model_hit_cancelled_by_safe_guard,
                 "model_label": model_prediction.label,
                 "model_confidence": model_prediction.confidence,
             }
@@ -111,6 +130,19 @@ def _analyze_dataset(
         for row in sample_rows
         if row["expected_injection"] and row["hybrid_predicted"] and not row["rule_predicted"]
     )
+    hybrid_pipeline_tp = sum(
+        1
+        for row in sample_rows
+        if row["expected_injection"] and row["hybrid_pipeline_predicted"]
+    )
+    model_hit_cancelled_by_safe_guard_count = sum(
+        1 for row in sample_rows if row["model_hit_cancelled_by_safe_guard"]
+    )
+    model_hit_cancelled_by_safe_guard_tp = sum(
+        1
+        for row in sample_rows
+        if row["expected_injection"] and row["model_hit_cancelled_by_safe_guard"]
+    )
 
     summary = {
         "dataset_name": dataset_name,
@@ -124,6 +156,9 @@ def _analyze_dataset(
         "model_only_unique_tp": model_only_unique_tp,
         "hybrid_tp": hybrid_tp,
         "hybrid_extra_tp": hybrid_extra_tp,
+        "hybrid_pipeline_tp": hybrid_pipeline_tp,
+        "model_hit_cancelled_by_safe_guard_count": model_hit_cancelled_by_safe_guard_count,
+        "model_hit_cancelled_by_safe_guard_tp": model_hit_cancelled_by_safe_guard_tp,
         "hybrid_tp_equals_rule_plus_model_unique": hybrid_tp == rule_tp + model_only_unique_tp,
         "hybrid_tp_equals_rule_plus_hybrid_extra": hybrid_tp == rule_tp + hybrid_extra_tp,
     }
@@ -169,6 +204,9 @@ def _run_analysis(
                     "model_only_unique_tp": None,
                     "hybrid_tp": None,
                     "hybrid_extra_tp": None,
+                    "hybrid_pipeline_tp": None,
+                    "model_hit_cancelled_by_safe_guard_count": None,
+                    "model_hit_cancelled_by_safe_guard_tp": None,
                     "hybrid_tp_equals_rule_plus_model_unique": None,
                     "hybrid_tp_equals_rule_plus_hybrid_extra": None,
                     "dataset_status": dataset.status,
@@ -223,8 +261,8 @@ def _render_report(
         "",
         "## Summary",
         "",
-        "| Dataset | Model Version | Rule TP | Model TP | Both TP | Rule Only TP | Model Only Unique TP | Hybrid TP | Hybrid Extra TP |",
-        "|---|---|---:|---:|---:|---:|---:|---:|---:|",
+        "| Dataset | Model Version | Rule TP | Model TP | Both TP | Rule Only TP | Model Only Unique TP | Hybrid TP | Hybrid Extra TP | Pipeline TP | Safe Guard Cancelled Model Hits | Cancelled TP |",
+        "|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|",
     ]
     for row in summaries:
         lines.append(
@@ -236,7 +274,10 @@ def _render_report(
             f"| {_fmt(row['rule_only_tp'])} "
             f"| {_fmt(row['model_only_unique_tp'])} "
             f"| {_fmt(row['hybrid_tp'])} "
-            f"| {_fmt(row['hybrid_extra_tp'])} |"
+            f"| {_fmt(row['hybrid_extra_tp'])} "
+            f"| {_fmt(row['hybrid_pipeline_tp'])} "
+            f"| {_fmt(row['model_hit_cancelled_by_safe_guard_count'])} "
+            f"| {_fmt(row['model_hit_cancelled_by_safe_guard_tp'])} |"
         )
 
     lines.extend(
@@ -248,7 +289,7 @@ def _render_report(
             "",
             "반대로 external-tuned 모델처럼 `Model Only Unique TP`가 증가하면 Hybrid TP도 Rule TP보다 커진다. 따라서 이 표는 Hybrid 개선 여부를 모델 계층의 독립 기여도로 설명하는 핵심 근거다.",
             "",
-            "`Hybrid Extra TP`는 실제 Hybrid 실행 결과가 Rule Only보다 추가로 맞춘 공격 샘플 수다. 이 값이 `Model Only Unique TP`와 다르면, 현재 Hybrid 내부의 model detector heuristic 또는 fallback reason이 순수 lightweight classifier와 다르게 작동했다는 뜻이다.",
+            "`Hybrid TP`와 `Hybrid Extra TP`는 `rule_predicted OR model_predicted` 기준이다. `Pipeline TP`는 safe explanation guard가 적용된 기존 `detect_hybrid()` 실행 결과이며, guard로 취소된 model hit는 별도 열에 기록한다.",
             "",
             "샘플 단위의 `expected_injection`, `rule_predicted`, `model_predicted`, `hybrid_predicted` 값은 JSON 결과 파일의 `sample_predictions`에 저장한다.",
             "",
@@ -292,6 +333,9 @@ def _write_csv(rows: list[dict[str, Any]], path: Path) -> None:
         "model_only_unique_tp",
         "hybrid_tp",
         "hybrid_extra_tp",
+        "hybrid_pipeline_tp",
+        "model_hit_cancelled_by_safe_guard_count",
+        "model_hit_cancelled_by_safe_guard_tp",
         "hybrid_tp_equals_rule_plus_model_unique",
         "hybrid_tp_equals_rule_plus_hybrid_extra",
         "dataset_status",