From f6356309fcbc6f2fee31639e4c056a0512002bbd Mon Sep 17 00:00:00 2001 From: Ogulcan Aydogan Date: Wed, 27 May 2026 12:48:20 +0100 Subject: [PATCH] feat: add owasp llm04 data poisoning rule pack New rule file rules/owasp-llm04-data-poisoning.yaml adds 6 patterns for OWASP LLM04 (2025) Data Poisoning attack scenarios not covered by the 2 existing LLM04 rules in owasp-llm-top10.yaml: - PIF-LLM04-003: adversarial example construction (FGSM, C&W, PGD) - PIF-LLM04-004: backdoor trigger phrase injection - PIF-LLM04-005: cross-session memory contamination - PIF-LLM04-006: federated learning gradient poisoning - PIF-LLM04-007: synthetic training data injection to bias a model - PIF-LLM04-008: RLHF reward hacking instruction loader_test.go updated to expect 4 rule files (was 3). Starts v1.4.0 OWASP partial-coverage closure per ROADMAP. --- CHANGELOG.md | 3 ++ pkg/rules/loader_test.go | 2 +- rules/owasp-llm04-data-poisoning.yaml | 69 +++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 rules/owasp-llm04-data-poisoning.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 14fea24..7a0be01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- OWASP LLM04 (2025) Data Poisoning rule pack (`rules/owasp-llm04-data-poisoning.yaml`) with 6 new patterns covering adversarial example construction, backdoor trigger phrases, cross-session memory contamination, federated learning poisoning, synthetic training data injection, and RLHF reward hacking. Closes the v1.4.0 ROADMAP item to extend LLM04 partial coverage; existing 2 LLM04 patterns in `rules/owasp-llm-top10.yaml` remain in place. + ### Changed - CI and release workflows now pin `go-version: '1.26.x'` (was `1.25.x`); unblocks Dependabot rollups that bump `go.mod` to require Go 1.26.0 (e.g. `hugot` v0.7.2). 
diff --git a/pkg/rules/loader_test.go b/pkg/rules/loader_test.go index 09115f8..813cc37 100644 --- a/pkg/rules/loader_test.go +++ b/pkg/rules/loader_test.go @@ -106,7 +106,7 @@ func TestLoadDir(t *testing.T) { sets, err := LoadDir(rulesDir) require.NoError(t, err) - assert.Len(t, sets, 3, "should load all 3 rule files") + assert.Len(t, sets, 4, "should load all 4 rule files") totalRules := 0 for _, rs := range sets { diff --git a/rules/owasp-llm04-data-poisoning.yaml b/rules/owasp-llm04-data-poisoning.yaml new file mode 100644 index 0000000..04ee7d2 --- /dev/null +++ b/rules/owasp-llm04-data-poisoning.yaml @@ -0,0 +1,69 @@ +name: "OWASP LLM04 - Data Poisoning (Extended)" +version: "1.0.0" +description: "Extended detection patterns for OWASP LLM04 (2025) Data Poisoning attacks covering adversarial inputs, backdoor triggers, cross-session memory contamination, federated learning poisoning, synthetic training data injection, and RLHF reward hacking" +rules: + # ── Adversarial Example Construction: case-insensitive keyword match on adversarial-ML attack names (FGSM, PGD, Carlini-Wagner / C&W, each in long and short form) and related phrases ── + + - id: "PIF-LLM04-003" + name: "Adversarial Example Construction" + description: "Detects requests to construct adversarial inputs designed to fool model outputs" + category: "prompt_injection" + severity: 3 + pattern: '(?i)(adversarial\s+example|carefully\s+crafted\s+(input|prompt)|gradient[\s-]based\s+attack|fast\s+gradient\s+sign\s+method|fgsm\s+attack|adversarial\s+perturbation|projected\s+gradient\s+descent\s+attack|pgd\s+attack|carlini[\s-]wagner\s+attack|carlini\s+(?:&|and)\s+wagner\s+attack|c&w\s+attack)' + enabled: true + tags: ["owasp-llm04", "data-poisoning", "adversarial"] + + # ── Backdoor Trigger Phrase: either "trigger phrase/word/token/sequence activates/enables/unlocks/causes hidden/secret/backdoor/debug/override", or "when you see/read/encounter/receive" followed within 40 characters by activate/reveal/bypass/ignore/execute/switch to ── + + - id: "PIF-LLM04-004" + name: "Backdoor Trigger Phrase" + description: "Detects injection of backdoor trigger phrases intended to activate hidden model behaviour" + category: "prompt_injection" + severity: 4 + pattern: 
'(?i)(trigger\s+(phrase|word|token|sequence)\s+(activates?|enables?|unlocks?|causes?)\s+(hidden|secret|backdoor|debug|override)|when\s+you\s+(see|read|encounter|receive)\s+.{0,40}(activate|reveal|bypass|ignore|execute|switch\s+to))' + enabled: true + tags: ["owasp-llm04", "data-poisoning", "backdoor"] + + # ── Cross-Session Memory Contamination: carry/persist/propagate/inject/spread, then this/these/the following, then memory/context/state/knowledge/instructions/rules, then across/between/into, then other/future/new/all sessions/conversations/chats/users ── + + - id: "PIF-LLM04-005" + name: "Cross-Session Memory Contamination" + description: "Detects attempts to persist or propagate injected context across sessions or users" + category: "prompt_injection" + severity: 3 + pattern: '(?i)(carry|persist|propagate|inject|spread)\s+(this|these|the\s+following)\s+(memor(y|ies)|context|state|knowledge|instructions?|rules?)\s+(across|between|into)\s+(other|future|new|all)\s+(session|conversation|chat|user)s?' + enabled: true + tags: ["owasp-llm04", "data-poisoning", "memory-contamination"] + + # ── Federated Learning Poisoning: submit/upload/contribute/inject/send, then poisoned/malicious/crafted/backdoored/corrupted, then gradient/update/weight/delta/model update/local model (optionally plural) ── + + - id: "PIF-LLM04-006" + name: "Federated Learning Poisoning" + description: "Detects attempts to submit poisoned model updates in federated or distributed training contexts" + category: "prompt_injection" + severity: 3 + pattern: '(?i)(submit|upload|contribute|inject|send)\s+(poisoned|malicious|crafted|backdoored|corrupted)\s+(gradient|update|weight|delta|model\s+update|local\s+model)s?' 
+ enabled: true + tags: ["owasp-llm04", "data-poisoning", "federated"] + + # ── Synthetic Training Data Injection: create/generate/produce/synthesize, then (within 50 characters) training data, fine-tuning data or labeled examples/samples/pairs, then (within 50 characters) "to teach/train/condition/bias/force" or "that teaches/trains/biases/conditions", then the model/ai/llm/assistant "to" say/do/believe/output/generate/prefer/hate/attack/avoid ── + + - id: "PIF-LLM04-007" + name: "Synthetic Training Data Injection" + description: "Detects requests to generate synthetic training data designed to bias or manipulate a model" + category: "prompt_injection" + severity: 2 + pattern: '(?i)(create|generate|produce|synthesize)\s+.{0,50}(training\s+data|fine[\s-]?tun(ing|e)\s+data|labeled\s+(examples?|samples?|pairs?))\s+.{0,50}(to\s+(teach|train|condition|bias|force)|that\s+(teach(es)?|train(s)?|bias(es)?|condition(s)?))\s+(the\s+)?(model|ai|llm|assistant)\s+to\s+(say|do|believe|output|generate|prefer|hate|attack|avoid)' + enabled: true + tags: ["owasp-llm04", "data-poisoning", "synthetic-data"] + + # ── RLHF Reward Hacking: maximize/game/exploit/hack/manipulate/abuse, then an optional "the", then reward/rlhf/preference/alignment/human feedback, then signal/score/model/function/feedback, then to/by/via/through/and make ── + + - id: "PIF-LLM04-008" + name: "RLHF Reward Hacking" + description: "Detects attempts to exploit or game reinforcement learning from human feedback reward signals" + category: "prompt_injection" + severity: 3 + pattern: '(?i)(maximize|game|exploit|hack|manipulate|abuse)\s+(the\s+)?(reward|rlhf|preference|alignment|human\s+feedback)\s+(signal|score|model|function|feedback)\s+(to|by|via|through|and\s+make)' + enabled: true + tags: ["owasp-llm04", "data-poisoning", "reward-hacking"]