treasonking · treasonking · May 19, 2026 · May 19, 2026
diff --git a/evaluation/external_dataset_compare.py b/evaluation/external_dataset_compare.py
@@ -770,17 +770,25 @@ def _render_markdown(
 
     split_summary = _read_json(DEFAULT_EVAL_PATH.parent / "split_summary.json")
     if split_summary:
+        text_hash_by_dataset = split_summary.get("text_hash_overlap_by_dataset", {})
+        deepset_near_duplicates = split_summary.get("deepset_near_duplicate_count_gte_threshold", "N/A")
         lines.extend(
             [
                 "",
                 "## Data Leakage Control",
                 "",
-                "- External datasets were split into train/eval subsets.",
-                "- Eval samples were not used for training.",
+                "- External datasets were split into train/eval subsets with no train/eval id overlap.",
+                "- Normalized text-hash overlap is not zero; treat the custom split metrics as potentially optimistic where exact duplicate text appears across train/eval.",
                 f"- Random seed: `{split_summary.get('random_seed')}`",
                 f"- Train/eval id overlap: `{split_summary.get('train_eval_overlap')}`",
                 f"- Train/eval text-hash overlap: `{split_summary.get('train_eval_text_hash_overlap', 'N/A')}`",
                 f"- Train size: `{split_summary.get('train_size')}`, eval size: `{split_summary.get('eval_size')}`",
+                "",
+                "| Dataset | Exact Text Overlap | Near Duplicate Count >= 0.95 | Interpretation |",
+                "|---|---:|---:|---|",
+                f"| `deepset/prompt-injections` | {text_hash_by_dataset.get('deepset/prompt-injections', 'N/A')} | {deepset_near_duplicates} | No exact normalized text overlap, but near duplicates remain; interpret custom split together with official split results. |",
+                f"| `protectai/prompt-injection-validation` | {text_hash_by_dataset.get('protectai/prompt-injection-validation', 'N/A')} | N/A | Exact train/eval text overlap is a limitation and may inflate held-out metrics. |",
+                f"| `Lakera/gandalf_ignore_instructions` | {text_hash_by_dataset.get('Lakera/gandalf_ignore_instructions', 'N/A')} | N/A | Exact train/eval text overlap is a limitation; this dataset is also positive-only, so precision/F1 are not measured. |",
             ]
         )
 

diff --git a/reports/deepset_official_split_report.md b/reports/deepset_official_split_report.md
@@ -1,21 +1,22 @@
 # Deepset Official Split Comparison
 
-- Generated at: `2026-05-18T22:08:45`
+- Generated at: `2026-05-19T21:34:38`
 - Lightweight threshold: `0.30`
 
-| Split Policy | Dataset | Model Version | Mode | Precision | Recall | F1 | Accuracy | TP | FP | TN | FN |
-|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|
-| custom 70/30 eval | `deepset/prompt-injections` | external-tuned | Rule Only | 1.0000 | 0.0886 | 0.1628 | 0.6382 | 7 | 0 | 120 | 72 |
-| custom 70/30 eval | `deepset/prompt-injections` | external-tuned | Lightweight Model Only | 1.0000 | 0.6076 | 0.7559 | 0.8442 | 48 | 0 | 120 | 31 |
-| custom 70/30 eval | `deepset/prompt-injections` | external-tuned | Hybrid / Full Pipeline | 1.0000 | 0.6329 | 0.7752 | 0.8543 | 50 | 0 | 120 | 29 |
-| official train/test | `deepset/prompt-injections` | deepset-official-train | Rule Only | 1.0000 | 0.0500 | 0.0952 | 0.5086 | 3 | 0 | 56 | 57 |
-| official train/test | `deepset/prompt-injections` | deepset-official-train | Lightweight Model Only | 1.0000 | 0.7833 | 0.8785 | 0.8879 | 47 | 0 | 56 | 13 |
-| official train/test | `deepset/prompt-injections` | deepset-official-train | Hybrid / Full Pipeline | 1.0000 | 0.7667 | 0.8679 | 0.8793 | 46 | 0 | 56 | 14 |
+| Split Policy | Dataset | Model Version | Mode | Precision | Recall | F1 | Accuracy | TP | FP | TN | FN | Safe Guard Cancelled Model Hits | Cancelled TP |
+|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+| custom 70/30 eval | `deepset/prompt-injections` | external-tuned | Rule Only | 1.0000 | 0.0886 | 0.1628 | 0.6382 | 7 | 0 | 120 | 72 | N/A | N/A |
+| custom 70/30 eval | `deepset/prompt-injections` | external-tuned | Lightweight Model Only | 1.0000 | 0.6076 | 0.7559 | 0.8442 | 48 | 0 | 120 | 31 | N/A | N/A |
+| custom 70/30 eval | `deepset/prompt-injections` | external-tuned | Hybrid / Full Pipeline | 1.0000 | 0.6329 | 0.7752 | 0.8543 | 50 | 0 | 120 | 29 | 0 | 0 |
+| official train/test | `deepset/prompt-injections` | deepset-official-train | Rule Only | 1.0000 | 0.0500 | 0.0952 | 0.5086 | 3 | 0 | 56 | 57 | N/A | N/A |
+| official train/test | `deepset/prompt-injections` | deepset-official-train | Lightweight Model Only | 1.0000 | 0.7833 | 0.8785 | 0.8879 | 47 | 0 | 56 | 13 | N/A | N/A |
+| official train/test | `deepset/prompt-injections` | deepset-official-train | Hybrid / Full Pipeline | 1.0000 | 0.7833 | 0.8785 | 0.8879 | 47 | 0 | 56 | 13 | 1 | 1 |
 
 ## Interpretation
 
 Official test split performance did not drop below the custom split result. This supports the conclusion that the deepset improvement is not explained solely by the custom 70/30 split, although the near-duplicate findings mean the result should still be reported with caution.
 
 - `custom 70/30 eval` uses the project-generated held-out eval split and the saved `external-tuned` artifact.
 - `official train/test` trains a temporary lightweight model with internal samples plus deepset official train split, then evaluates deepset official test split.
+- `Hybrid / Full Pipeline` predictions are counted as `rule_predicted OR model_predicted`; safe explanation guard cancellations are reported separately instead of lowering Hybrid TP.
 - If custom split performance is much higher than official test performance, the custom split may be easier, or its metrics may be inflated by eval examples that closely resemble training examples.
diff --git a/reports/deepset_official_split_results.json b/reports/deepset_official_split_results.json
@@ -1,5 +1,5 @@
 {
-  "generated_at": "2026-05-18T22:08:45",
+  "generated_at": "2026-05-19T21:34:38",
   "threshold": 0.3,
   "results": [
     {
@@ -60,7 +60,14 @@
       "model_status": "enabled",
       "dataset_status": "loaded",
       "note": "Loaded from held-out eval split: datasets\\external_splits\\eval_external_prompt_injection.jsonl",
-      "split_policy": "custom 70/30 eval"
+      "split_policy": "custom 70/30 eval",
+      "rule_predicted_count": 7,
+      "model_predicted_count": 48,
+      "hybrid_pipeline_predicted_count": 50,
+      "model_hit_cancelled_by_safe_guard_count": 0,
+      "model_hit_cancelled_by_safe_guard_tp": 0,
+      "hybrid_or_changed_prediction_count": 0,
+      "hybrid_prediction_formula": "rule_predicted OR model_predicted"
     },
     {
       "dataset_name": "deepset/prompt-injections",
@@ -108,19 +115,26 @@
       "mode": "Hybrid / Full Pipeline",
       "size": 116,
       "precision": 1.0,
-      "recall": 0.7666666666666667,
-      "f1": 0.8679245283018869,
-      "accuracy": 0.8793103448275862,
-      "tp": 46,
+      "recall": 0.7833333333333333,
+      "f1": 0.8785046728971964,
+      "accuracy": 0.8879310344827587,
+      "tp": 47,
       "fp": 0,
       "tn": 56,
-      "fn": 14,
+      "fn": 13,
       "positive_only": false,
       "latency_ms_avg": 2.989,
       "model_status": "enabled",
       "dataset_status": "loaded",
       "note": "Loaded from deepset official test split.",
-      "split_policy": "official train/test"
+      "split_policy": "official train/test",
+      "rule_predicted_count": 3,
+      "model_predicted_count": 47,
+      "hybrid_pipeline_predicted_count": 46,
+      "model_hit_cancelled_by_safe_guard_count": 1,
+      "model_hit_cancelled_by_safe_guard_tp": 1,
+      "hybrid_or_changed_prediction_count": 1,
+      "hybrid_prediction_formula": "rule_predicted OR model_predicted"
     }
   ]
-}
+}
diff --git a/reports/external_dataset_compare_report.md b/reports/external_dataset_compare_report.md
@@ -93,13 +93,19 @@
 
 ## Data Leakage Control
 
-- External datasets were split into train/eval subsets.
-- Eval samples were not used for training.
+- External datasets were split into train/eval subsets with no train/eval id overlap.
+- Normalized text-hash overlap is not zero; treat the custom split metrics as potentially optimistic where exact duplicate text appears across train/eval.
 - Random seed: `42`
 - Train/eval id overlap: `0`
 - Train/eval text-hash overlap: `42`
 - Train size: `3421`, eval size: `1468`
 
+| Dataset | Exact Text Overlap | Near Duplicate Count >= 0.95 | Interpretation |
+|---|---:|---:|---|
+| `deepset/prompt-injections` | 0 | 4 | No exact normalized text overlap, but near duplicates remain; interpret custom split together with official split results. |
+| `protectai/prompt-injection-validation` | 41 | N/A | Exact train/eval text overlap is a limitation and may inflate held-out metrics. |
+| `Lakera/gandalf_ignore_instructions` | 1 | N/A | Exact train/eval text overlap is a limitation; this dataset is also positive-only, so precision/F1 are not measured. |
+
 ## Deepset Result Validation Note
 
 `deepset/prompt-injections`의 external-tuned 결과는 held-out eval split 기준으로 크게 개선되었다. 다만 이 평가는 all split을 프로젝트 내부에서 70/30으로 다시 나눈 custom split 기준이므로, 원본 official split 또는 text-hash leakage 검사를 함께 해석해야 한다. 특히 Precision 1.0000, FP 0이 관찰되므로 label mapping, text overlap, near-duplicate 여부를 추가 확인한다.
@@ -128,7 +134,7 @@ external-tuned 모델에서는 held-out eval split 기준으로 Model Only Uniqu
 
 - `Rule Only`는 `backend/app/detection/injection_detector.py`의 규칙·휴리스틱 Prompt Injection 탐지만 사용한다.
 - `Lightweight Model Only`는 `models/lightweight/vectorizer.joblib`와 `models/lightweight/classifier.joblib`가 실제로 로드된 경우에만 측정한다.
-- `Hybrid / Full Pipeline`은 현재 프로젝트의 다층형 탐지 파이프라인 실행 경로이며, 규칙 탐지와 경량 모델 계층을 함께 사용한다.
+- `Hybrid / Full Pipeline`은 `rule_predicted OR model_predicted` 기준으로 집계한다. safe explanation guard가 model hit를 취소한 경우에는 JSON 결과의 `model_hit_cancelled_by_safe_guard_count`와 `model_hit_cancelled_by_safe_guard_tp`에 별도로 기록한다.
 - `Lakera/gandalf_ignore_instructions`는 공격 샘플 중심 데이터셋이므로 Precision, F1, FP, TN은 `N/A`로 표시하고 Recall과 Accuracy 중심으로 해석한다.
 - `model_status`가 `enabled`가 아니면 Hybrid 결과는 경량 분류 계층이 빠진 fallback 성격이므로 완전한 Hybrid 성능으로 과장하지 않는다.
 - sklearn artifact 버전 경고가 발생하면 같은 scikit-learn 버전으로 artifact를 재생성한 뒤 결과를 다시 확인한다.
diff --git a/reports/external_dataset_compare_results.json b/reports/external_dataset_compare_results.json
@@ -139,7 +139,14 @@
       "latency_ms_avg": 4.138,
       "model_status": "enabled",
       "dataset_status": "loaded",
-      "note": "Loaded from held-out eval split: datasets\\external_splits\\eval_external_prompt_injection.jsonl"
+      "note": "Loaded from held-out eval split: datasets\\external_splits\\eval_external_prompt_injection.jsonl",
+      "rule_predicted_count": 7,
+      "model_predicted_count": 48,
+      "hybrid_pipeline_predicted_count": 50,
+      "model_hit_cancelled_by_safe_guard_count": 0,
+      "model_hit_cancelled_by_safe_guard_tp": 0,
+      "hybrid_or_changed_prediction_count": 0,
+      "hybrid_prediction_formula": "rule_predicted OR model_predicted"
     },
     {
       "dataset_name": "protectai/prompt-injection-validation",
@@ -196,7 +203,14 @@
       "latency_ms_avg": 5.268,
       "model_status": "enabled",
       "dataset_status": "loaded",
-      "note": "Loaded from held-out eval split: datasets\\external_splits\\eval_external_prompt_injection.jsonl"
+      "note": "Loaded from held-out eval split: datasets\\external_splits\\eval_external_prompt_injection.jsonl",
+      "rule_predicted_count": 116,
+      "model_predicted_count": 373,
+      "hybrid_pipeline_predicted_count": 391,
+      "model_hit_cancelled_by_safe_guard_count": 0,
+      "model_hit_cancelled_by_safe_guard_tp": 0,
+      "hybrid_or_changed_prediction_count": 0,
+      "hybrid_prediction_formula": "rule_predicted OR model_predicted"
     },
     {
       "dataset_name": "Lakera/gandalf_ignore_instructions",
@@ -253,7 +267,14 @@
       "latency_ms_avg": 3.548,
       "model_status": "enabled",
       "dataset_status": "loaded",
-      "note": "Loaded from held-out eval split: datasets\\external_splits\\eval_external_prompt_injection.jsonl"
+      "note": "Loaded from held-out eval split: datasets\\external_splits\\eval_external_prompt_injection.jsonl",
+      "rule_predicted_count": 129,
+      "model_predicted_count": 296,
+      "hybrid_pipeline_predicted_count": 296,
+      "model_hit_cancelled_by_safe_guard_count": 0,
+      "model_hit_cancelled_by_safe_guard_tp": 0,
+      "hybrid_or_changed_prediction_count": 0,
+      "hybrid_prediction_formula": "rule_predicted OR model_predicted"
     }
   ]
-}
+}
diff --git a/reports/external_overlap_analysis_report.md b/reports/external_overlap_analysis_report.md
@@ -1,25 +1,25 @@
 # External Rule/Model Overlap Analysis
 
-- Generated at: `2026-05-18T22:04:42`
+- Generated at: `2026-05-19T21:34:38`
 - Hugging Face split: `datasets/external_splits/eval_external_prompt_injection.jsonl`
 - Lightweight threshold: `0.30`
 - Model status: `enabled`
 - Model version: `external-tuned`
 
 ## Summary
 
-| Dataset | Model Version | Rule TP | Model TP | Both TP | Rule Only TP | Model Only Unique TP | Hybrid TP | Hybrid Extra TP |
-|---|---|---:|---:|---:|---:|---:|---:|---:|
-| `deepset/prompt-injections` | external-tuned | 7 | 48 | 5 | 2 | 43 | 50 | 43 |
-| `protectai/prompt-injection-validation` | external-tuned | 98 | 371 | 98 | 0 | 273 | 371 | 273 |
-| `Lakera/gandalf_ignore_instructions` | external-tuned | 129 | 296 | 129 | 0 | 167 | 296 | 167 |
+| Dataset | Model Version | Rule TP | Model TP | Both TP | Rule Only TP | Model Only Unique TP | Hybrid TP | Hybrid Extra TP | Pipeline TP | Safe Guard Cancelled Model Hits | Cancelled TP |
+|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+| `deepset/prompt-injections` | external-tuned | 7 | 48 | 5 | 2 | 43 | 50 | 43 | 50 | 0 | 0 |
+| `protectai/prompt-injection-validation` | external-tuned | 98 | 371 | 98 | 0 | 273 | 371 | 273 | 371 | 0 | 0 |
+| `Lakera/gandalf_ignore_instructions` | external-tuned | 129 | 296 | 129 | 0 | 167 | 296 | 167 | 296 | 0 | 0 |
 
 ## Interpretation
 
 Hybrid / Full Pipeline 성능이 Rule Only와 유사하게 나타나는 경우, 주된 이유는 Lightweight Model이 Rule 계층이 놓친 공격 샘플을 거의 추가로 탐지하지 못하기 때문이다.
 
 반대로 external-tuned 모델처럼 `Model Only Unique TP`가 증가하면 Hybrid TP도 Rule TP보다 커진다. 따라서 이 표는 Hybrid 개선 여부를 모델 계층의 독립 기여도로 설명하는 핵심 근거다.
 
-`Hybrid Extra TP`는 실제 Hybrid 실행 결과가 Rule Only보다 추가로 맞춘 공격 샘플 수다. 이 값이 `Model Only Unique TP`와 다르면, 현재 Hybrid 내부의 model detector heuristic 또는 fallback reason이 순수 lightweight classifier와 다르게 작동했다는 뜻이다.
+`Hybrid TP`와 `Hybrid Extra TP`는 `rule_predicted OR model_predicted` 기준이다. `Pipeline TP`는 safe explanation guard가 적용된 기존 `detect_hybrid()` 실행 결과이며, guard로 취소된 model hit는 별도 열에 기록한다.
 
 샘플 단위의 `expected_injection`, `rule_predicted`, `model_predicted`, `hybrid_predicted` 값은 JSON 결과 파일의 `sample_predictions`에 저장한다.
diff --git a/reports/external_overlap_analysis_results.csv b/reports/external_overlap_analysis_results.csv
@@ -1,4 +1,4 @@
-dataset_name,model_version,size,attack_samples,rule_tp,model_tp,both_tp,rule_only_tp,model_only_unique_tp,hybrid_tp,hybrid_extra_tp,hybrid_tp_equals_rule_plus_model_unique,hybrid_tp_equals_rule_plus_hybrid_extra,dataset_status,note
-deepset/prompt-injections,external-tuned,199,79,7,48,5,2,43,50,43,True,True,loaded,Loaded from held-out eval split: datasets\external_splits\eval_external_prompt_injection.jsonl
-protectai/prompt-injection-validation,external-tuned,969,418,98,371,98,0,273,371,273,True,True,loaded,Loaded from held-out eval split: datasets\external_splits\eval_external_prompt_injection.jsonl
-Lakera/gandalf_ignore_instructions,external-tuned,300,300,129,296,129,0,167,296,167,True,True,loaded,Loaded from held-out eval split: datasets\external_splits\eval_external_prompt_injection.jsonl
+dataset_name,model_version,size,attack_samples,rule_tp,model_tp,both_tp,rule_only_tp,model_only_unique_tp,hybrid_tp,hybrid_extra_tp,hybrid_pipeline_tp,model_hit_cancelled_by_safe_guard_count,model_hit_cancelled_by_safe_guard_tp,hybrid_tp_equals_rule_plus_model_unique,hybrid_tp_equals_rule_plus_hybrid_extra,dataset_status,note
+deepset/prompt-injections,external-tuned,199,79,7,48,5,2,43,50,43,50,0,0,True,True,loaded,Loaded from held-out eval split: datasets\external_splits\eval_external_prompt_injection.jsonl
+protectai/prompt-injection-validation,external-tuned,969,418,98,371,98,0,273,371,273,371,0,0,True,True,loaded,Loaded from held-out eval split: datasets\external_splits\eval_external_prompt_injection.jsonl
+Lakera/gandalf_ignore_instructions,external-tuned,300,300,129,296,129,0,167,296,167,296,0,0,True,True,loaded,Loaded from held-out eval split: datasets\external_splits\eval_external_prompt_injection.jsonl
diff --git a/reports/external_overlap_analysis_results.json b/reports/external_overlap_analysis_results.json
@@ -1,5 +1,5 @@
 {
-  "generated_at": "2026-05-18T22:04:42",
+  "generated_at": "2026-05-19T21:34:38",
   "threshold": 0.3,
   "split": "datasets/external_splits/eval_external_prompt_injection.jsonl",
   "classifier_status": {
@@ -32,6 +32,9 @@
       "model_only_unique_tp": 43,
       "hybrid_tp": 50,
       "hybrid_extra_tp": 43,
+      "hybrid_pipeline_tp": 50,
+      "model_hit_cancelled_by_safe_guard_count": 0,
+      "model_hit_cancelled_by_safe_guard_tp": 0,
       "hybrid_tp_equals_rule_plus_model_unique": true,
       "hybrid_tp_equals_rule_plus_hybrid_extra": true,
       "dataset_status": "loaded",
@@ -49,6 +52,9 @@
       "model_only_unique_tp": 273,
       "hybrid_tp": 371,
       "hybrid_extra_tp": 273,
+      "hybrid_pipeline_tp": 371,
+      "model_hit_cancelled_by_safe_guard_count": 0,
+      "model_hit_cancelled_by_safe_guard_tp": 0,
       "hybrid_tp_equals_rule_plus_model_unique": true,
       "hybrid_tp_equals_rule_plus_hybrid_extra": true,
       "dataset_status": "loaded",
@@ -66,6 +72,9 @@
       "model_only_unique_tp": 167,
       "hybrid_tp": 296,
       "hybrid_extra_tp": 167,
+      "hybrid_pipeline_tp": 296,
+      "model_hit_cancelled_by_safe_guard_count": 0,
+      "model_hit_cancelled_by_safe_guard_tp": 0,
       "hybrid_tp_equals_rule_plus_model_unique": true,
       "hybrid_tp_equals_rule_plus_hybrid_extra": true,
       "dataset_status": "loaded",
@@ -16228,4 +16237,4 @@
       }
     ]
   }
-}
+}