diff --git a/evaluation/external_dataset_compare.py b/evaluation/external_dataset_compare.py index d55fc4a..c05e3fa 100644 --- a/evaluation/external_dataset_compare.py +++ b/evaluation/external_dataset_compare.py @@ -770,17 +770,25 @@ def _render_markdown( split_summary = _read_json(DEFAULT_EVAL_PATH.parent / "split_summary.json") if split_summary: + text_hash_by_dataset = split_summary.get("text_hash_overlap_by_dataset", {}) + deepset_near_duplicates = split_summary.get("deepset_near_duplicate_count_gte_threshold", "N/A") lines.extend( [ "", "## Data Leakage Control", "", - "- External datasets were split into train/eval subsets.", - "- Eval samples were not used for training.", + "- External datasets were split into train/eval subsets with no train/eval id overlap.", + "- Normalized text-hash overlap is not zero; treat the custom split metrics as potentially optimistic where exact duplicate text appears across train/eval.", f"- Random seed: `{split_summary.get('random_seed')}`", f"- Train/eval id overlap: `{split_summary.get('train_eval_overlap')}`", f"- Train/eval text-hash overlap: `{split_summary.get('train_eval_text_hash_overlap', 'N/A')}`", f"- Train size: `{split_summary.get('train_size')}`, eval size: `{split_summary.get('eval_size')}`", + "", + "| Dataset | Exact Text Overlap | Near Duplicate Count >= 0.95 | Interpretation |", + "|---|---:|---:|---|", + f"| `deepset/prompt-injections` | {text_hash_by_dataset.get('deepset/prompt-injections', 'N/A')} | {deepset_near_duplicates} | No exact normalized text overlap, but near duplicates remain; interpret custom split together with official split results. |", + f"| `protectai/prompt-injection-validation` | {text_hash_by_dataset.get('protectai/prompt-injection-validation', 'N/A')} | N/A | Exact train/eval text overlap is a limitation and may inflate held-out metrics. |", + f"| `Lakera/gandalf_ignore_instructions` | {text_hash_by_dataset.get('Lakera/gandalf_ignore_instructions', 'N/A')} | N/A | Exact train/eval text overlap is a limitation; this dataset is also positive-only, so precision/F1 are not measured. |", ] ) diff --git a/reports/deepset_official_split_report.md b/reports/deepset_official_split_report.md index 2c009ac..566ca96 100644 --- a/reports/deepset_official_split_report.md +++ b/reports/deepset_official_split_report.md @@ -1,16 +1,16 @@ # Deepset Official Split Comparison -- Generated at: `2026-05-18T22:08:45` +- Generated at: `2026-05-19T21:34:38` - Lightweight threshold: `0.30` -| Split Policy | Dataset | Model Version | Mode | Precision | Recall | F1 | Accuracy | TP | FP | TN | FN | -|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:| -| custom 70/30 eval | `deepset/prompt-injections` | external-tuned | Rule Only | 1.0000 | 0.0886 | 0.1628 | 0.6382 | 7 | 0 | 120 | 72 | -| custom 70/30 eval | `deepset/prompt-injections` | external-tuned | Lightweight Model Only | 1.0000 | 0.6076 | 0.7559 | 0.8442 | 48 | 0 | 120 | 31 | -| custom 70/30 eval | `deepset/prompt-injections` | external-tuned | Hybrid / Full Pipeline | 1.0000 | 0.6329 | 0.7752 | 0.8543 | 50 | 0 | 120 | 29 | -| official train/test | `deepset/prompt-injections` | deepset-official-train | Rule Only | 1.0000 | 0.0500 | 0.0952 | 0.5086 | 3 | 0 | 56 | 57 | -| official train/test | `deepset/prompt-injections` | deepset-official-train | Lightweight Model Only | 1.0000 | 0.7833 | 0.8785 | 0.8879 | 47 | 0 | 56 | 13 | -| official train/test | `deepset/prompt-injections` | deepset-official-train | Hybrid / Full Pipeline | 1.0000 | 0.7667 | 0.8679 | 0.8793 | 46 | 0 | 56 | 14 | +| Split Policy | Dataset | Model Version | Mode | Precision | Recall | F1 | Accuracy | TP | FP | TN | FN | Safe Guard Cancelled Model Hits | Cancelled TP | +|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| custom 70/30 eval | `deepset/prompt-injections` | external-tuned | Rule Only | 1.0000 | 0.0886 | 0.1628 | 0.6382 | 7 | 0 | 120 | 72 | N/A | N/A | +| custom 70/30 eval | `deepset/prompt-injections` | external-tuned | Lightweight Model Only | 1.0000 | 0.6076 | 0.7559 | 0.8442 | 48 | 0 | 120 | 31 | N/A | N/A | +| custom 70/30 eval | `deepset/prompt-injections` | external-tuned | Hybrid / Full Pipeline | 1.0000 | 0.6329 | 0.7752 | 0.8543 | 50 | 0 | 120 | 29 | 0 | 0 | +| official train/test | `deepset/prompt-injections` | deepset-official-train | Rule Only | 1.0000 | 0.0500 | 0.0952 | 0.5086 | 3 | 0 | 56 | 57 | N/A | N/A | +| official train/test | `deepset/prompt-injections` | deepset-official-train | Lightweight Model Only | 1.0000 | 0.7833 | 0.8785 | 0.8879 | 47 | 0 | 56 | 13 | N/A | N/A | +| official train/test | `deepset/prompt-injections` | deepset-official-train | Hybrid / Full Pipeline | 1.0000 | 0.7833 | 0.8785 | 0.8879 | 47 | 0 | 56 | 13 | 1 | 1 | ## Interpretation @@ -18,4 +18,5 @@ Official test split performance did not drop below the custom split result. This - `custom 70/30 eval` uses the project-generated held-out eval split and the saved `external-tuned` artifact. - `official train/test` trains a temporary lightweight model with internal samples plus deepset official train split, then evaluates deepset official test split. +- `Hybrid / Full Pipeline` predictions are counted as `rule_predicted OR model_predicted`; safe explanation guard cancellations are reported separately instead of lowering Hybrid TP. - If custom split performance is much higher than official test performance, custom split metrics may be easier or inflated by similar examples. diff --git a/reports/deepset_official_split_results.json b/reports/deepset_official_split_results.json index 6d63bfd..6d6aff9 100644 --- a/reports/deepset_official_split_results.json +++ b/reports/deepset_official_split_results.json @@ -1,5 +1,5 @@ { - "generated_at": "2026-05-18T22:08:45", + "generated_at": "2026-05-19T21:34:38", "threshold": 0.3, "results": [ { @@ -60,7 +60,14 @@ "model_status": "enabled", "dataset_status": "loaded", "note": "Loaded from held-out eval split: datasets\\external_splits\\eval_external_prompt_injection.jsonl", - "split_policy": "custom 70/30 eval" + "split_policy": "custom 70/30 eval", + "rule_predicted_count": 7, + "model_predicted_count": 48, + "hybrid_pipeline_predicted_count": 50, + "model_hit_cancelled_by_safe_guard_count": 0, + "model_hit_cancelled_by_safe_guard_tp": 0, + "hybrid_or_changed_prediction_count": 0, + "hybrid_prediction_formula": "rule_predicted OR model_predicted" }, { "dataset_name": "deepset/prompt-injections", @@ -108,19 +115,26 @@ "mode": "Hybrid / Full Pipeline", "size": 116, "precision": 1.0, - "recall": 0.7666666666666667, - "f1": 0.8679245283018869, - "accuracy": 0.8793103448275862, - "tp": 46, + "recall": 0.7833333333333333, + "f1": 0.8785046728971964, + "accuracy": 0.8879310344827587, + "tp": 47, "fp": 0, "tn": 56, - "fn": 14, + "fn": 13, "positive_only": false, "latency_ms_avg": 2.989, "model_status": "enabled", "dataset_status": "loaded", "note": "Loaded from deepset official test split.", - "split_policy": "official train/test" + "split_policy": "official train/test", + "rule_predicted_count": 3, + "model_predicted_count": 47, + "hybrid_pipeline_predicted_count": 46, + "model_hit_cancelled_by_safe_guard_count": 1, + "model_hit_cancelled_by_safe_guard_tp": 1, + "hybrid_or_changed_prediction_count": 1, + "hybrid_prediction_formula": "rule_predicted OR model_predicted" } ] -} \ No newline at end of file +} diff --git a/reports/external_dataset_compare_report.md b/reports/external_dataset_compare_report.md index 4a11aa2..1de9a5e 100644 --- a/reports/external_dataset_compare_report.md +++ b/reports/external_dataset_compare_report.md @@ -93,13 +93,19 @@ ## Data Leakage Control -- External datasets were split into train/eval subsets. -- Eval samples were not used for training. +- External datasets were split into train/eval subsets with no train/eval id overlap. +- Normalized text-hash overlap is not zero; treat the custom split metrics as potentially optimistic where exact duplicate text appears across train/eval. - Random seed: `42` - Train/eval id overlap: `0` - Train/eval text-hash overlap: `42` - Train size: `3421`, eval size: `1468` +| Dataset | Exact Text Overlap | Near Duplicate Count >= 0.95 | Interpretation | +|---|---:|---:|---| +| `deepset/prompt-injections` | 0 | 4 | No exact normalized text overlap, but near duplicates remain; interpret custom split together with official split results. | +| `protectai/prompt-injection-validation` | 41 | N/A | Exact train/eval text overlap is a limitation and may inflate held-out metrics. | +| `Lakera/gandalf_ignore_instructions` | 1 | N/A | Exact train/eval text overlap is a limitation; this dataset is also positive-only, so precision/F1 are not measured. | + ## Deepset Result Validation Note `deepset/prompt-injections`의 external-tuned 결과는 held-out eval split 기준으로 크게 개선되었다. 다만 이 평가는 all split을 프로젝트 내부에서 70/30으로 다시 나눈 custom split 기준이므로, 원본 official split 또는 text-hash leakage 검사를 함께 해석해야 한다. 특히 Precision 1.0000, FP 0이 관찰되므로 label mapping, text overlap, near-duplicate 여부를 추가 확인한다. @@ -128,7 +134,7 @@ external-tuned 모델에서는 held-out eval split 기준으로 Model Only Uniqu - `Rule Only`는 `backend/app/detection/injection_detector.py`의 규칙·휴리스틱 Prompt Injection 탐지만 사용한다. - `Lightweight Model Only`는 `models/lightweight/vectorizer.joblib`와 `models/lightweight/classifier.joblib`가 실제로 로드된 경우에만 측정한다. -- `Hybrid / Full Pipeline`은 현재 프로젝트의 다층형 탐지 파이프라인 실행 경로이며, 규칙 탐지와 경량 모델 계층을 함께 사용한다. +- `Hybrid / Full Pipeline`은 `rule_predicted OR model_predicted` 기준으로 집계한다. safe explanation guard가 model hit를 취소한 경우에는 JSON 결과의 `model_hit_cancelled_by_safe_guard_count`와 `model_hit_cancelled_by_safe_guard_tp`에 별도로 기록한다. - `Lakera/gandalf_ignore_instructions`는 공격 샘플 중심 데이터셋이므로 Precision, F1, FP, TN은 `N/A`로 표시하고 Recall과 Accuracy 중심으로 해석한다. - `model_status`가 `enabled`가 아니면 Hybrid 결과는 경량 분류 계층이 빠진 fallback 성격이므로 완전한 Hybrid 성능으로 과장하지 않는다. - sklearn artifact 버전 경고가 발생하면 같은 scikit-learn 버전으로 artifact를 재생성한 뒤 결과를 다시 확인한다. diff --git a/reports/external_dataset_compare_results.json b/reports/external_dataset_compare_results.json index 5a4f1c8..2762334 100644 --- a/reports/external_dataset_compare_results.json +++ b/reports/external_dataset_compare_results.json @@ -139,7 +139,14 @@ "latency_ms_avg": 4.138, "model_status": "enabled", "dataset_status": "loaded", - "note": "Loaded from held-out eval split: datasets\\external_splits\\eval_external_prompt_injection.jsonl" + "note": "Loaded from held-out eval split: datasets\\external_splits\\eval_external_prompt_injection.jsonl", + "rule_predicted_count": 7, + "model_predicted_count": 48, + "hybrid_pipeline_predicted_count": 50, + "model_hit_cancelled_by_safe_guard_count": 0, + "model_hit_cancelled_by_safe_guard_tp": 0, + "hybrid_or_changed_prediction_count": 0, + "hybrid_prediction_formula": "rule_predicted OR model_predicted" }, { "dataset_name": "protectai/prompt-injection-validation", @@ -196,7 +203,14 @@ "latency_ms_avg": 5.268, "model_status": "enabled", "dataset_status": "loaded", - "note": "Loaded from held-out eval split: datasets\\external_splits\\eval_external_prompt_injection.jsonl" + "note": "Loaded from held-out eval split: datasets\\external_splits\\eval_external_prompt_injection.jsonl", + "rule_predicted_count": 116, + "model_predicted_count": 373, + "hybrid_pipeline_predicted_count": 391, + "model_hit_cancelled_by_safe_guard_count": 0, + "model_hit_cancelled_by_safe_guard_tp": 0, + "hybrid_or_changed_prediction_count": 0, + "hybrid_prediction_formula": "rule_predicted OR model_predicted" }, { "dataset_name": "Lakera/gandalf_ignore_instructions", @@ -253,7 +267,14 @@ "latency_ms_avg": 3.548, "model_status": "enabled", "dataset_status": "loaded", - "note": "Loaded from held-out eval split: datasets\\external_splits\\eval_external_prompt_injection.jsonl" + "note": "Loaded from held-out eval split: datasets\\external_splits\\eval_external_prompt_injection.jsonl", + "rule_predicted_count": 129, + "model_predicted_count": 296, + "hybrid_pipeline_predicted_count": 296, + "model_hit_cancelled_by_safe_guard_count": 0, + "model_hit_cancelled_by_safe_guard_tp": 0, + "hybrid_or_changed_prediction_count": 0, + "hybrid_prediction_formula": "rule_predicted OR model_predicted" } ] -} \ No newline at end of file +} diff --git a/reports/external_overlap_analysis_report.md b/reports/external_overlap_analysis_report.md index e8ae018..a37a848 100644 --- a/reports/external_overlap_analysis_report.md +++ b/reports/external_overlap_analysis_report.md @@ -1,6 +1,6 @@ # External Rule/Model Overlap Analysis -- Generated at: `2026-05-18T22:04:42` +- Generated at: `2026-05-19T21:34:38` - Hugging Face split: `datasets/external_splits/eval_external_prompt_injection.jsonl` - Lightweight threshold: `0.30` - Model status: `enabled` @@ -8,11 +8,11 @@ ## Summary -| Dataset | Model Version | Rule TP | Model TP | Both TP | Rule Only TP | Model Only Unique TP | Hybrid TP | Hybrid Extra TP | -|---|---|---:|---:|---:|---:|---:|---:|---:| -| `deepset/prompt-injections` | external-tuned | 7 | 48 | 5 | 2 | 43 | 50 | 43 | -| `protectai/prompt-injection-validation` | external-tuned | 98 | 371 | 98 | 0 | 273 | 371 | 273 | -| `Lakera/gandalf_ignore_instructions` | external-tuned | 129 | 296 | 129 | 0 | 167 | 296 | 167 | +| Dataset | Model Version | Rule TP | Model TP | Both TP | Rule Only TP | Model Only Unique TP | Hybrid TP | Hybrid Extra TP | Pipeline TP | Safe Guard Cancelled Model Hits | Cancelled TP | +|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| `deepset/prompt-injections` | external-tuned | 7 | 48 | 5 | 2 | 43 | 50 | 43 | 50 | 0 | 0 | +| `protectai/prompt-injection-validation` | external-tuned | 98 | 371 | 98 | 0 | 273 | 371 | 273 | 371 | 0 | 0 | +| `Lakera/gandalf_ignore_instructions` | external-tuned | 129 | 296 | 129 | 0 | 167 | 296 | 167 | 296 | 0 | 0 | ## Interpretation @@ -20,6 +20,6 @@ Hybrid / Full Pipeline 성능이 Rule Only와 유사하게 나타나는 경우, 반대로 external-tuned 모델처럼 `Model Only Unique TP`가 증가하면 Hybrid TP도 Rule TP보다 커진다. 따라서 이 표는 Hybrid 개선 여부를 모델 계층의 독립 기여도로 설명하는 핵심 근거다. -`Hybrid Extra TP`는 실제 Hybrid 실행 결과가 Rule Only보다 추가로 맞춘 공격 샘플 수다. 이 값이 `Model Only Unique TP`와 다르면, 현재 Hybrid 내부의 model detector heuristic 또는 fallback reason이 순수 lightweight classifier와 다르게 작동했다는 뜻이다. +`Hybrid TP`와 `Hybrid Extra TP`는 `rule_predicted OR model_predicted` 기준이다. `Pipeline TP`는 safe explanation guard가 적용된 기존 `detect_hybrid()` 실행 결과이며, guard로 취소된 model hit는 별도 열에 기록한다. 샘플 단위의 `expected_injection`, `rule_predicted`, `model_predicted`, `hybrid_predicted` 값은 JSON 결과 파일의 `sample_predictions`에 저장한다. diff --git a/reports/external_overlap_analysis_results.csv b/reports/external_overlap_analysis_results.csv index 61915d5..f160ed1 100644 --- a/reports/external_overlap_analysis_results.csv +++ b/reports/external_overlap_analysis_results.csv @@ -1,4 +1,4 @@ -dataset_name,model_version,size,attack_samples,rule_tp,model_tp,both_tp,rule_only_tp,model_only_unique_tp,hybrid_tp,hybrid_extra_tp,hybrid_tp_equals_rule_plus_model_unique,hybrid_tp_equals_rule_plus_hybrid_extra,dataset_status,note -deepset/prompt-injections,external-tuned,199,79,7,48,5,2,43,50,43,True,True,loaded,Loaded from held-out eval split: datasets\external_splits\eval_external_prompt_injection.jsonl -protectai/prompt-injection-validation,external-tuned,969,418,98,371,98,0,273,371,273,True,True,loaded,Loaded from held-out eval split: datasets\external_splits\eval_external_prompt_injection.jsonl -Lakera/gandalf_ignore_instructions,external-tuned,300,300,129,296,129,0,167,296,167,True,True,loaded,Loaded from held-out eval split: datasets\external_splits\eval_external_prompt_injection.jsonl +dataset_name,model_version,size,attack_samples,rule_tp,model_tp,both_tp,rule_only_tp,model_only_unique_tp,hybrid_tp,hybrid_extra_tp,hybrid_pipeline_tp,model_hit_cancelled_by_safe_guard_count,model_hit_cancelled_by_safe_guard_tp,hybrid_tp_equals_rule_plus_model_unique,hybrid_tp_equals_rule_plus_hybrid_extra,dataset_status,note +deepset/prompt-injections,external-tuned,199,79,7,48,5,2,43,50,43,50,0,0,True,True,loaded,Loaded from held-out eval split: datasets\external_splits\eval_external_prompt_injection.jsonl +protectai/prompt-injection-validation,external-tuned,969,418,98,371,98,0,273,371,273,371,0,0,True,True,loaded,Loaded from held-out eval split: datasets\external_splits\eval_external_prompt_injection.jsonl +Lakera/gandalf_ignore_instructions,external-tuned,300,300,129,296,129,0,167,296,167,296,0,0,True,True,loaded,Loaded from held-out eval split: datasets\external_splits\eval_external_prompt_injection.jsonl diff --git a/reports/external_overlap_analysis_results.json b/reports/external_overlap_analysis_results.json index d25d4da..322aae1 100644 --- a/reports/external_overlap_analysis_results.json +++ b/reports/external_overlap_analysis_results.json @@ -1,5 +1,5 @@ { - "generated_at": "2026-05-18T22:04:42", + "generated_at": "2026-05-19T21:34:38", "threshold": 0.3, "split": "datasets/external_splits/eval_external_prompt_injection.jsonl", "classifier_status": { @@ -32,6 +32,9 @@ "model_only_unique_tp": 43, "hybrid_tp": 50, "hybrid_extra_tp": 43, + "hybrid_pipeline_tp": 50, + "model_hit_cancelled_by_safe_guard_count": 0, + "model_hit_cancelled_by_safe_guard_tp": 0, "hybrid_tp_equals_rule_plus_model_unique": true, "hybrid_tp_equals_rule_plus_hybrid_extra": true, "dataset_status": "loaded", @@ -49,6 +52,9 @@ "model_only_unique_tp": 273, "hybrid_tp": 371, "hybrid_extra_tp": 273, + "hybrid_pipeline_tp": 371, + "model_hit_cancelled_by_safe_guard_count": 0, + "model_hit_cancelled_by_safe_guard_tp": 0, "hybrid_tp_equals_rule_plus_model_unique": true, "hybrid_tp_equals_rule_plus_hybrid_extra": true, "dataset_status": "loaded", @@ -66,6 +72,9 @@ "model_only_unique_tp": 167, "hybrid_tp": 296, "hybrid_extra_tp": 167, + "hybrid_pipeline_tp": 296, + "model_hit_cancelled_by_safe_guard_count": 0, + "model_hit_cancelled_by_safe_guard_tp": 0, "hybrid_tp_equals_rule_plus_model_unique": true, "hybrid_tp_equals_rule_plus_hybrid_extra": true, "dataset_status": "loaded", @@ -16228,4 +16237,4 @@ } ] } -} \ No newline at end of file +}