Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions evaluation/deepset_official_split_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,8 @@ def _render_report(generated_at: str, threshold: float, rows: list[dict[str, Any
f"- Generated at: `{generated_at}`",
f"- Lightweight threshold: `{threshold:.2f}`",
"",
"| Split Policy | Dataset | Model Version | Mode | Precision | Recall | F1 | Accuracy | TP | FP | TN | FN |",
"|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|",
"| Split Policy | Dataset | Model Version | Mode | Precision | Recall | F1 | Accuracy | TP | FP | TN | FN | Safe Guard Cancelled Model Hits | Cancelled TP |",
"|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|",
]
for row in rows:
lines.append(
Expand All @@ -181,7 +181,9 @@ def _render_report(generated_at: str, threshold: float, rows: list[dict[str, Any
f"| {_fmt(row['tp'])} "
f"| {_fmt(row['fp'])} "
f"| {_fmt(row['tn'])} "
f"| {_fmt(row['fn'])} |"
f"| {_fmt(row['fn'])} "
f"| {_fmt(row.get('model_hit_cancelled_by_safe_guard_count'))} "
f"| {_fmt(row.get('model_hit_cancelled_by_safe_guard_tp'))} |"
)

lines.extend(
Expand All @@ -193,6 +195,7 @@ def _render_report(generated_at: str, threshold: float, rows: list[dict[str, Any
"",
"- `custom 70/30 eval` uses the project-generated held-out eval split and the saved `external-tuned` artifact.",
"- `official train/test` trains a temporary lightweight model with internal samples plus deepset official train split, then evaluates deepset official test split.",
"- `Hybrid / Full Pipeline` predictions are counted as `rule_predicted OR model_predicted`; safe explanation guard cancellations are reported separately instead of lowering Hybrid TP.",
"- If custom split performance is much higher than official test performance, custom split metrics may be easier or inflated by similar examples.",
"",
]
Expand Down
77 changes: 71 additions & 6 deletions evaluation/external_dataset_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,16 @@ class DatasetBundle:
note: str = ""


Predictor = Callable[[str], bool]
@dataclass(frozen=True, slots=True)
class PredictionDecision:
predicted: bool
rule_predicted: bool | None = None
model_predicted: bool | None = None
pipeline_predicted: bool | None = None
model_hit_cancelled_by_safe_guard: bool = False


Predictor = Callable[[str], bool | PredictionDecision]


DATASET_SPECS = (
Expand Down Expand Up @@ -182,12 +191,35 @@ def _hybrid_pipeline(classifier: LightweightClassifier, threshold: float) -> Pre
model_detector_fail_mode="warn",
)

def predict(text: str) -> PredictionDecision:
    """Classify *text* and report how each detection layer voted.

    The headline verdict is ``rule hit OR model hit``. The hybrid pipeline's
    own outcome, and whether a model hit failed to surface as a detection
    while the safe-explanation reason code was present, are returned as
    separate diagnostics.
    """
    rule_hit = _rule_only(text)
    outcome = detect_hybrid(text, classifier=classifier, settings=settings)

    raw_model_output = outcome.model_prediction
    model_hit = False
    if raw_model_output is not None:
        model_hit = _is_model_injection_prediction(raw_model_output)

    # Collect the injection detections once; both pipeline-level flags are
    # derived from this subset.
    injection_detections = [
        detection
        for detection in outcome.detections
        if detection.detector_type == DetectorType.INJECTION
    ]
    pipeline_hit = bool(injection_detections)
    model_detection_present = any(
        detection.detector_name == "llm" for detection in injection_detections
    )

    guard_cancelled = (
        model_hit
        and not model_detection_present
        and "SAFE_SECURITY_EXPLANATION" in outcome.reason_codes
    )

    return PredictionDecision(
        predicted=rule_hit or model_hit,
        rule_predicted=rule_hit,
        model_predicted=model_hit,
        pipeline_predicted=pipeline_hit,
        model_hit_cancelled_by_safe_guard=guard_cancelled,
    )

return predict

Expand Down Expand Up @@ -270,12 +302,41 @@ def _metric_result(
) -> dict[str, Any]:
tp = fp = fn = tn = 0
latencies: list[float] = []
decision_diagnostics = {
"rule_predicted_count": 0,
"model_predicted_count": 0,
"hybrid_pipeline_predicted_count": 0,
"model_hit_cancelled_by_safe_guard_count": 0,
"model_hit_cancelled_by_safe_guard_tp": 0,
"hybrid_or_changed_prediction_count": 0,
}
saw_decision_diagnostics = False

for sample in dataset.samples:
started = time.perf_counter()
predicted = predictor(sample.text)
prediction_result = predictor(sample.text)
latencies.append((time.perf_counter() - started) * 1000)

if isinstance(prediction_result, PredictionDecision):
saw_decision_diagnostics = True
predicted = prediction_result.predicted
if prediction_result.rule_predicted:
decision_diagnostics["rule_predicted_count"] += 1
if prediction_result.model_predicted:
decision_diagnostics["model_predicted_count"] += 1
if prediction_result.pipeline_predicted:
decision_diagnostics["hybrid_pipeline_predicted_count"] += 1
if prediction_result.model_hit_cancelled_by_safe_guard:
decision_diagnostics["model_hit_cancelled_by_safe_guard_count"] += 1
if sample.expected_injection:
decision_diagnostics["model_hit_cancelled_by_safe_guard_tp"] += 1
if prediction_result.pipeline_predicted is not None and (
prediction_result.predicted != prediction_result.pipeline_predicted
):
decision_diagnostics["hybrid_or_changed_prediction_count"] += 1
else:
predicted = prediction_result

if predicted and sample.expected_injection:
tp += 1
elif predicted and not sample.expected_injection:
Expand All @@ -292,7 +353,7 @@ def _metric_result(
f1 = None if precision is None else _safe_div(2 * precision * recall, precision + recall)
accuracy = _safe_div(tp + tn, size)

return {
row = {
"dataset_name": dataset.spec.name,
"model_version": model_version,
"mode": mode,
Expand All @@ -311,6 +372,10 @@ def _metric_result(
"dataset_status": dataset.status,
"note": dataset.note,
}
if saw_decision_diagnostics:
row.update(decision_diagnostics)
row["hybrid_prediction_formula"] = "rule_predicted OR model_predicted"
return row


def _na_result(
Expand Down Expand Up @@ -770,7 +835,7 @@ def _render_markdown(
"",
"- `Rule Only`는 `backend/app/detection/injection_detector.py`의 규칙·휴리스틱 Prompt Injection 탐지만 사용한다.",
"- `Lightweight Model Only`는 `models/lightweight/vectorizer.joblib`와 `models/lightweight/classifier.joblib`가 실제로 로드된 경우에만 측정한다.",
"- `Hybrid / Full Pipeline`은 현재 프로젝트의 다층형 탐지 파이프라인 실행 경로이며, 규칙 탐지와 경량 모델 계층을 함께 사용한다.",
"- `Hybrid / Full Pipeline`은 `rule_predicted OR model_predicted` 기준으로 집계한다. safe explanation guard가 model hit를 취소한 경우에는 JSON 결과의 `model_hit_cancelled_by_safe_guard_count`와 `model_hit_cancelled_by_safe_guard_tp`에 별도로 기록한다.",
"- `Lakera/gandalf_ignore_instructions`는 공격 샘플 중심 데이터셋이므로 Precision, F1, FP, TN은 `N/A`로 표시하고 Recall과 Accuracy 중심으로 해석한다.",
"- `model_status`가 `enabled`가 아니면 Hybrid 결과는 경량 분류 계층이 빠진 fallback 성격이므로 완전한 Hybrid 성능으로 과장하지 않는다.",
"- sklearn artifact 버전 경고가 발생하면 같은 scikit-learn 버전으로 artifact를 재생성한 뒤 결과를 다시 확인한다.",
Expand Down
60 changes: 52 additions & 8 deletions evaluation/external_overlap_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,34 @@
OVERLAP_CSV_PATH = Path("reports/external_overlap_analysis_results.csv")


def _hybrid_pipeline_prediction(
    text: str,
    classifier: LightweightClassifier,
    threshold: float,
) -> tuple[bool, bool]:
    """Run the hybrid pipeline on *text* and summarise its injection outcome.

    Returns a pair ``(pipeline_predicted, model_hit_cancelled_by_safe_guard)``.
    The first element is true when the pipeline emitted at least one injection
    detection. The second is true when the model prediction counted as an
    injection, no injection detection named ``"llm"`` was emitted, and the
    result's reason codes contain ``SAFE_SECURITY_EXPLANATION``.
    """
    hybrid_settings = DetectionSettings(
        detection_mode="hybrid",
        enable_model_detector=True,
        model_detector_fail_mode="warn",
        model_detector_threshold=threshold,
    )
    outcome = detect_hybrid(text, classifier=classifier, settings=hybrid_settings)

    # Collect the injection detections once; both flags are derived from them.
    injection_detections = [
        detection
        for detection in outcome.detections
        if detection.detector_type == DetectorType.INJECTION
    ]
    pipeline_hit = bool(injection_detections)
    model_detection_present = any(
        detection.detector_name == "llm" for detection in injection_detections
    )

    raw_model_output = outcome.model_prediction
    guard_cancelled = (
        raw_model_output is not None
        and _is_model_injection_prediction(raw_model_output)
        and not model_detection_present
        and "SAFE_SECURITY_EXPLANATION" in outcome.reason_codes
    )
    return pipeline_hit, guard_cancelled


def _analyze_dataset(
Expand All @@ -68,7 +80,12 @@ def _analyze_dataset(
rule_predicted = bool(detect_injection(sample.text))
model_prediction = classifier.classify(sample.text)
model_predicted = _is_model_injection_prediction(model_prediction)
hybrid_predicted = _hybrid_predicted(sample.text, classifier, threshold)
hybrid_pipeline_predicted, model_hit_cancelled_by_safe_guard = _hybrid_pipeline_prediction(
sample.text,
classifier,
threshold,
)
hybrid_predicted = rule_predicted or model_predicted
expected = bool(sample.expected_injection)
sample_rows.append(
{
Expand All @@ -79,6 +96,8 @@ def _analyze_dataset(
"rule_predicted": rule_predicted,
"model_predicted": model_predicted,
"hybrid_predicted": hybrid_predicted,
"hybrid_pipeline_predicted": hybrid_pipeline_predicted,
"model_hit_cancelled_by_safe_guard": model_hit_cancelled_by_safe_guard,
"model_label": model_prediction.label,
"model_confidence": model_prediction.confidence,
}
Expand Down Expand Up @@ -111,6 +130,19 @@ def _analyze_dataset(
for row in sample_rows
if row["expected_injection"] and row["hybrid_predicted"] and not row["rule_predicted"]
)
hybrid_pipeline_tp = sum(
1
for row in sample_rows
if row["expected_injection"] and row["hybrid_pipeline_predicted"]
)
model_hit_cancelled_by_safe_guard_count = sum(
1 for row in sample_rows if row["model_hit_cancelled_by_safe_guard"]
)
model_hit_cancelled_by_safe_guard_tp = sum(
1
for row in sample_rows
if row["expected_injection"] and row["model_hit_cancelled_by_safe_guard"]
)

summary = {
"dataset_name": dataset_name,
Expand All @@ -124,6 +156,9 @@ def _analyze_dataset(
"model_only_unique_tp": model_only_unique_tp,
"hybrid_tp": hybrid_tp,
"hybrid_extra_tp": hybrid_extra_tp,
"hybrid_pipeline_tp": hybrid_pipeline_tp,
"model_hit_cancelled_by_safe_guard_count": model_hit_cancelled_by_safe_guard_count,
"model_hit_cancelled_by_safe_guard_tp": model_hit_cancelled_by_safe_guard_tp,
"hybrid_tp_equals_rule_plus_model_unique": hybrid_tp == rule_tp + model_only_unique_tp,
"hybrid_tp_equals_rule_plus_hybrid_extra": hybrid_tp == rule_tp + hybrid_extra_tp,
}
Expand Down Expand Up @@ -169,6 +204,9 @@ def _run_analysis(
"model_only_unique_tp": None,
"hybrid_tp": None,
"hybrid_extra_tp": None,
"hybrid_pipeline_tp": None,
"model_hit_cancelled_by_safe_guard_count": None,
"model_hit_cancelled_by_safe_guard_tp": None,
"hybrid_tp_equals_rule_plus_model_unique": None,
"hybrid_tp_equals_rule_plus_hybrid_extra": None,
"dataset_status": dataset.status,
Expand Down Expand Up @@ -223,8 +261,8 @@ def _render_report(
"",
"## Summary",
"",
"| Dataset | Model Version | Rule TP | Model TP | Both TP | Rule Only TP | Model Only Unique TP | Hybrid TP | Hybrid Extra TP |",
"|---|---|---:|---:|---:|---:|---:|---:|---:|",
"| Dataset | Model Version | Rule TP | Model TP | Both TP | Rule Only TP | Model Only Unique TP | Hybrid TP | Hybrid Extra TP | Pipeline TP | Safe Guard Cancelled Model Hits | Cancelled TP |",
"|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|",
]
for row in summaries:
lines.append(
Expand All @@ -236,7 +274,10 @@ def _render_report(
f"| {_fmt(row['rule_only_tp'])} "
f"| {_fmt(row['model_only_unique_tp'])} "
f"| {_fmt(row['hybrid_tp'])} "
f"| {_fmt(row['hybrid_extra_tp'])} |"
f"| {_fmt(row['hybrid_extra_tp'])} "
f"| {_fmt(row['hybrid_pipeline_tp'])} "
f"| {_fmt(row['model_hit_cancelled_by_safe_guard_count'])} "
f"| {_fmt(row['model_hit_cancelled_by_safe_guard_tp'])} |"
)

lines.extend(
Expand All @@ -248,7 +289,7 @@ def _render_report(
"",
"반대로 external-tuned 모델처럼 `Model Only Unique TP`가 증가하면 Hybrid TP도 Rule TP보다 커진다. 따라서 이 표는 Hybrid 개선 여부를 모델 계층의 독립 기여도로 설명하는 핵심 근거다.",
"",
"`Hybrid Extra TP`는 실제 Hybrid 실행 결과가 Rule Only보다 추가로 맞춘 공격 샘플 수다. 이 값이 `Model Only Unique TP`와 다르면, 현재 Hybrid 내부의 model detector heuristic 또는 fallback reason이 순수 lightweight classifier와 다르게 작동했다는 뜻이다.",
"`Hybrid TP`와 `Hybrid Extra TP`는 `rule_predicted OR model_predicted` 기준이다. `Pipeline TP`는 safe explanation guard가 적용된 기존 `detect_hybrid()` 실행 결과이며, guard로 취소된 model hit는 별도 열에 기록한다.",
"",
"샘플 단위의 `expected_injection`, `rule_predicted`, `model_predicted`, `hybrid_predicted` 값은 JSON 결과 파일의 `sample_predictions`에 저장한다.",
"",
Expand Down Expand Up @@ -292,6 +333,9 @@ def _write_csv(rows: list[dict[str, Any]], path: Path) -> None:
"model_only_unique_tp",
"hybrid_tp",
"hybrid_extra_tp",
"hybrid_pipeline_tp",
"model_hit_cancelled_by_safe_guard_count",
"model_hit_cancelled_by_safe_guard_tp",
"hybrid_tp_equals_rule_plus_model_unique",
"hybrid_tp_equals_rule_plus_hybrid_extra",
"dataset_status",
Expand Down
Loading