From f6356309fcbc6f2fee31639e4c056a0512002bbd Mon Sep 17 00:00:00 2001
From: Ogulcan Aydogan <ogulcanaydogan@hotmail.com>
Date: Wed, 27 May 2026 12:48:20 +0100
Subject: [PATCH] feat: add owasp llm04 data poisoning rule pack

New rule file rules/owasp-llm04-data-poisoning.yaml adds 6 patterns
for OWASP LLM04 (2025) Data Poisoning attack scenarios not covered by
the 2 existing LLM04 rules in owasp-llm-top10.yaml:

- PIF-LLM04-003: adversarial example construction (FGSM, C&W, PGD)
- PIF-LLM04-004: backdoor trigger phrase injection
- PIF-LLM04-005: cross-session memory contamination
- PIF-LLM04-006: federated learning gradient poisoning
- PIF-LLM04-007: synthetic training data injection to bias a model
- PIF-LLM04-008: RLHF reward hacking instruction

loader_test.go updated to expect 4 rule files (was 3).
Starts v1.4.0 OWASP partial-coverage closure per ROADMAP.
---
 CHANGELOG.md                          |  3 ++
 pkg/rules/loader_test.go              |  2 +-
 rules/owasp-llm04-data-poisoning.yaml | 69 +++++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 rules/owasp-llm04-data-poisoning.yaml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 14fea24..7a0be01 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+- OWASP LLM04 (2025) Data Poisoning rule pack (`rules/owasp-llm04-data-poisoning.yaml`) with 6 new patterns covering adversarial example construction, backdoor trigger phrases, cross-session memory contamination, federated learning poisoning, synthetic training data injection, and RLHF reward hacking. Closes the v1.4.0 ROADMAP item to extend LLM04 partial coverage; the 2 existing LLM04 patterns in `rules/owasp-llm-top10.yaml` remain in place.
+
 ### Changed
 - CI and release workflows now pin `go-version: '1.26.x'` (was `1.25.x`); unblocks Dependabot rollups that bump `go.mod` to require Go 1.26.0 (e.g. `hugot` v0.7.2).
 
diff --git a/pkg/rules/loader_test.go b/pkg/rules/loader_test.go
index 09115f8..813cc37 100644
--- a/pkg/rules/loader_test.go
+++ b/pkg/rules/loader_test.go
@@ -106,7 +106,7 @@ func TestLoadDir(t *testing.T) {
 
 	sets, err := LoadDir(rulesDir)
 	require.NoError(t, err)
-	assert.Len(t, sets, 3, "should load all 3 rule files")
+	assert.Len(t, sets, 4, "should load all 4 rule files")
 
 	totalRules := 0
 	for _, rs := range sets {
diff --git a/rules/owasp-llm04-data-poisoning.yaml b/rules/owasp-llm04-data-poisoning.yaml
new file mode 100644
index 0000000..04ee7d2
--- /dev/null
+++ b/rules/owasp-llm04-data-poisoning.yaml
@@ -0,0 +1,69 @@
+name: "OWASP LLM04 - Data Poisoning (Extended)"
+version: "1.0.0"
+description: "Extended detection patterns for OWASP LLM04 (2025) Data Poisoning attacks covering adversarial inputs, backdoor triggers, cross-session memory contamination, federated learning poisoning, synthetic training data injection, and RLHF reward hacking"
+rules:
+  # ── Adversarial Example Construction ──
+  # Case-insensitive alternation of attack-technique phrases with no request verb, so a bare mention such as "fgsm attack" satisfies the pattern.
+  - id: "PIF-LLM04-003"
+    name: "Adversarial Example Construction"
+    description: "Detects requests to construct adversarial inputs designed to fool model outputs"
+    category: "prompt_injection"
+    severity: 3
+    pattern: '(?i)(adversarial\s+example|carefully\s+crafted\s+(input|prompt)|gradient[\s-]based\s+attack|fast\s+gradient\s+sign\s+method|fgsm\s+attack|adversarial\s+perturbation|projected\s+gradient\s+descent\s+attack|carlini[\s-]wagner\s+attack)'
+    enabled: true
+    tags: ["owasp-llm04", "data-poisoning", "adversarial"]
+
+  # ── Backdoor Trigger Phrase ──
+  # Two alternatives: "trigger phrase|word|token|sequence" + activation verb + hidden/secret/backdoor/debug/override, or "when you see|read|encounter|receive" + up to 40 characters + an action verb (activate, reveal, bypass, ignore, execute, switch to).
+  - id: "PIF-LLM04-004"
+    name: "Backdoor Trigger Phrase"
+    description: "Detects injection of backdoor trigger phrases intended to activate hidden model behaviour"
+    category: "prompt_injection"
+    severity: 4
+    pattern: '(?i)(trigger\s+(phrase|word|token|sequence)\s+(activates?|enables?|unlocks?|causes?)\s+(hidden|secret|backdoor|debug|override)|when\s+you\s+(see|read|encounter|receive)\s+.{0,40}(activate|reveal|bypass|ignore|execute|switch\s+to))'
+    enabled: true
+    tags: ["owasp-llm04", "data-poisoning", "backdoor"]
+
+  # ── Cross-Session Memory Contamination ──
+  # Requires the whole phrase: verb + this/these/the following + noun (memory, context, state, ...) + across/between/into + exactly one qualifier (other/future/new/all) + session/conversation/chat/user; "across all future sessions" does not match.
+  - id: "PIF-LLM04-005"
+    name: "Cross-Session Memory Contamination"
+    description: "Detects attempts to persist or propagate injected context across sessions or users"
+    category: "prompt_injection"
+    severity: 3
+    pattern: '(?i)(carry|persist|propagate|inject|spread)\s+(this|these|the\s+following)\s+(memor(y|ies)|context|state|knowledge|instructions?|rules?)\s+(across|between|into)\s+(other|future|new|all)\s+(session|conversation|chat|user)s?'
+    enabled: true
+    tags: ["owasp-llm04", "data-poisoning", "memory-contamination"]
+
+  # ── Federated Learning Poisoning ──
+  # Verb + adjective + noun shape: submit/upload/contribute/inject/send, then poisoned/malicious/crafted/backdoored/corrupted, then gradient/update/weight/delta/model update/local model (optionally plural).
+  - id: "PIF-LLM04-006"
+    name: "Federated Learning Poisoning"
+    description: "Detects attempts to submit poisoned model updates in federated or distributed training contexts"
+    category: "prompt_injection"
+    severity: 3
+    pattern: '(?i)(submit|upload|contribute|inject|send)\s+(poisoned|malicious|crafted|backdoored|corrupted)\s+(gradient|update|weight|delta|model\s+update|local\s+model)s?'
+    enabled: true
+    tags: ["owasp-llm04", "data-poisoning", "federated"]
+
+  # ── Synthetic Training Data Injection ──
+  # Generation verb, up to 50 characters, a training-data noun, up to 50 characters, then "to teach|train|..." or "that teaches|trains|...", then [the] model/ai/llm/assistant + "to" + a target verb (say, do, believe, ...).
+  - id: "PIF-LLM04-007"
+    name: "Synthetic Training Data Injection"
+    description: "Detects requests to generate synthetic training data designed to bias or manipulate a model"
+    category: "prompt_injection"
+    severity: 2
+    pattern: '(?i)(create|generate|produce|synthesize)\s+.{0,50}(training\s+data|fine[\s-]?tun(ing|e)\s+data|labeled\s+(examples?|samples?|pairs?))\s+.{0,50}(to\s+(teach|train|condition|bias|force)|that\s+(teach(es)?|train(s)?|bias(es)?|condition(s)?))\s+(the\s+)?(model|ai|llm|assistant)\s+to\s+(say|do|believe|output|generate|prefer|hate|attack|avoid)'
+    enabled: true
+    tags: ["owasp-llm04", "data-poisoning", "synthetic-data"]
+
+  # ── RLHF Reward Hacking ──
+  # Verb + optional "the" + reward/rlhf/preference/alignment/human feedback + signal/score/model/function/feedback + connector (to/by/via/through/and make); both nouns are required, so "hack the reward to" does not match.
+  - id: "PIF-LLM04-008"
+    name: "RLHF Reward Hacking"
+    description: "Detects attempts to exploit or game reinforcement learning from human feedback reward signals"
+    category: "prompt_injection"
+    severity: 3
+    pattern: '(?i)(maximize|game|exploit|hack|manipulate|abuse)\s+(the\s+)?(reward|rlhf|preference|alignment|human\s+feedback)\s+(signal|score|model|function|feedback)\s+(to|by|via|through|and\s+make)'
+    enabled: true
+    tags: ["owasp-llm04", "data-poisoning", "reward-hacking"]