From a4d1e98096ed97779ced22a3e416f47913ad70dc Mon Sep 17 00:00:00 2001
From: Thibault <Thibault.ahn@gmail.com>
Date: Sat, 20 Jun 2026 14:02:24 +0200
Subject: [PATCH 01/11] Add finance bias evaluator script

---
 scripts/python/finance_bias_evaluator.py | 183 +++++++++++++++++++++++
 1 file changed, 183 insertions(+)
 create mode 100644 scripts/python/finance_bias_evaluator.py

diff --git a/scripts/python/finance_bias_evaluator.py b/scripts/python/finance_bias_evaluator.py
new file mode 100644
index 0000000..b6a7aad
--- /dev/null
+++ b/scripts/python/finance_bias_evaluator.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+"""Rule-based evaluator for finance security and algorithmic-bias reviews."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class Finding:
+    """Immutable record of one triggered risk rule or one missing control."""
+
+    code: str  # rule identifier, e.g. "protected_attribute" or "empty_input"
+    severity: float  # ranks findings and feeds the confidence formula in evaluate()
+    message: str  # explanation copied into "justification_technique" for the top finding
+
+
+RISK_PATTERNS: tuple[tuple[str, str, float, str], ...] = (  # (code, regex, severity, message); a regex hit is a finding
+    (
+        "protected_attribute",
+        r"\b(age|gender|sex|race|ethnicity|religion|nationality|disability|marital status|postal code|zip code)\b",
+        0.95,
+        "usage potentiel d'attribut protégé ou proxy discriminatoire",
+    ),
+    (
+        "unbounded_automation",
+        r"\b(auto[- ]?approve|automatic approval|fully automated|no human review|without human intervention)\b",
+        0.90,
+        "décision financière automatisée sans contrôle humain explicite",
+    ),
+    (  # grouped so the leading \b guards every alternative; no trailing \b so "no logging" still matches
+        "no_auditability",
+        r"\b(?:black box|opaque model|no logs?|without audit|no audit trail|untraceable)",
+        0.90,
+        "auditabilité ou traçabilité insuffisante",
+    ),
+    (
+        "security_secret",
+        r"\b(api[_-]?key|secret|password|token)\s*[:=]\s*['\"]?[A-Za-z0-9_\-]{12,}",
+        0.95,
+        "secret ou identifiant technique potentiellement exposé",
+    ),
+    (  # no \b after "%": "%" is a non-word character, so "100% accurate" has no boundary there
+        "unsupported_guarantee",
+        r"\bguarantee[ds]?\b|\b100\s?%|\bmathematically certain\b|\bno bias\b",
+        0.85,
+        "garantie absolue non démontrable dans un système algorithmique",
+    ),
+    (
+        "hallucinated_interface",
+        r"\b(call_magic_model|detect_all_bias|guarantee_fairness|remove_all_bias|perfect_explainability)\b",
+        0.80,
+        "fonction ou capacité probablement hallucinee/non spécifiée",
+    ),
+    (
+        "financial_exclusion",
+        r"\b(reject|deny|exclude|blacklist)\b.*\b(low income|unemployed|immigrant|neighbourhood|neighborhood)\b",
+        0.95,
+        "risque d'exclusion financière ou de discrimination indirecte",
+    ),
+)
+
+REQUIRED_CONTROLS: tuple[tuple[str, str, str], ...] = (  # (code, regex, message); evaluate() reports message at severity 0.70 when regex is absent
+    (
+        "fairness_metrics",
+        r"\b(fairness|bias|disparate impact|equal opportunity|demographic parity|equalized odds)\b",
+        "absence de métriques ou critères de fairness",
+    ),
+    (
+        "human_review",
+        r"\b(human review|manual review|appeal|contest|override|second line review)\b",
+        "absence de revue humaine ou mécanisme de contestation",
+    ),
+    (
+        "audit_logging",
+        r"\b(audit|log|traceability|monitoring|model card|decision record)\b",
+        "absence d'auditabilité ou de journalisation",
+    ),
+    (
+        "data_governance",
+        r"\b(data quality|data governance|lineage|consent|privacy|gdpr|retention)\b",
+        "absence de gouvernance des données",
+    ),
+    (
+        "security_controls",
+        r"\b(encryption|access control|least privilege|secret management|rate limit|authentication)\b",
+        "absence de contrôles de sécurité explicites",
+    ),
+)
+
+
+def normalize(text: str) -> str:
+    """Collapse every whitespace run to a single space and trim both ends."""
+    # str.split() without arguments splits on whitespace runs and drops empty edges.
+    return " ".join(text.split())
+
+
+def evaluate(text: str) -> dict[str, object]:
+    """Score a proposal against the static rules and build the JSON verdict.
+
+    Returns ``statut`` 0 with the highest-severity message when anything is
+    flagged (risk pattern hit, missing control, or empty input), else 1.
+    """
+    haystack = normalize(text).casefold()
+    issues: list[Finding] = []
+    if not haystack:
+        issues.append(Finding("empty_input", 1.0, "proposition vide ou non fournie"))
+
+    # Risk rules fire when their pattern is present in the proposal.
+    issues.extend(
+        Finding(code, severity, message)
+        for code, pattern, severity, message in RISK_PATTERNS
+        if re.search(pattern, haystack, flags=re.IGNORECASE)
+    )
+    # Control rules fire when their pattern is absent; each weighs 0.70.
+    issues.extend(
+        Finding(code, 0.70, message)
+        for code, pattern, message in REQUIRED_CONTROLS
+        if not re.search(pattern, haystack, flags=re.IGNORECASE)
+    )
+
+    if not issues:
+        return {
+            "statut": 1,
+            "confiance": 0.78,
+            "justification_technique": (
+                "Aucun biais, hallucination d'interface ou manque critique de contrôle détecté par les règles statiques."
+            ),
+        }
+
+    worst = max(issues, key=lambda item: item.severity)  # the first maximum wins on ties
+    score = min(0.99, 0.55 + worst.severity * 0.35 + len(issues) * 0.02)
+    return {
+        "statut": 0,
+        "confiance": round(score, 2),
+        "justification_technique": f"{worst.message}; {len(issues)} point(s) de risque ou de contrôle manquant détecté(s).",
+    }
+
+
+def parse_args(argv: list[str]) -> argparse.Namespace:
+    """Build the evaluator's CLI parser and parse ``argv``.
+
+    ``argv`` is the argument list without the program name.
+    """
+    cli = argparse.ArgumentParser(
+        description="Evaluate finance security and algorithmic-bias risks and print strict JSON."
+    )
+    # The positional is optional so the script can also sit at the end of a pipe.
+    file_help = (
+        "Optional UTF-8 file containing the technical proposal. Reads stdin when omitted."
+    )
+    cli.add_argument("file", nargs="?", help=file_help)
+    # A plain flag: False unless --pretty is given on the command line.
+    pretty_help = "Pretty-print JSON for manual review. The default is compact JSON."
+    cli.add_argument("--pretty", action="store_true", help=pretty_help)
+    return cli.parse_args(argv)
+
+
+def read_input(path: str | None) -> str:
+    """Return the proposal text from ``path`` (decoded as UTF-8) or from stdin."""
+    # A missing or empty path means the proposal is piped in.
+    if not path:
+        return sys.stdin.read()
+    return Path(path).read_text(encoding="utf-8")
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run the CLI: read the proposal, evaluate it, print one JSON object, return 0."""
+    # Only ``None`` selects sys.argv; ``argv or sys.argv[1:]`` would replace an
+    # explicit empty list with the real command line.
+    args = parse_args(sys.argv[1:] if argv is None else argv)
+    result = evaluate(read_input(args.file))
+    print(json.dumps(result, ensure_ascii=False, indent=2 if args.pretty else None))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From 7e26cea333f4867cc6ddda50579274a768310606 Mon Sep 17 00:00:00 2001
From: Thibault <Thibault.ahn@gmail.com>
Date: Sat, 20 Jun 2026 14:02:51 +0200
Subject: [PATCH 02/11] Document finance bias evaluator

---
 .../python/README_Finance_Bias_Evaluator.md   | 65 +++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 scripts/python/README_Finance_Bias_Evaluator.md

diff --git a/scripts/python/README_Finance_Bias_Evaluator.md b/scripts/python/README_Finance_Bias_Evaluator.md
new file mode 100644
index 0000000..d8bf662
--- /dev/null
+++ b/scripts/python/README_Finance_Bias_Evaluator.md
@@ -0,0 +1,65 @@
+# Finance Bias Evaluator
+
+Small rule-based evaluator for finance-oriented security and algorithmic-bias review prompts.
+
+The script reads a technical proposal from stdin or from a UTF-8 file and prints exactly one JSON object:
+
+```json
+{
+  "statut": 0,
+  "confiance": 0.99,
+  "justification_technique": "..."
+}
+```
+
+## Why this exists
+
+The evaluator is intentionally conservative for finance use cases. A proposal fails when it contains a risky pattern or when it omits core controls such as fairness metrics, human review, audit logging, data governance, or security controls.
+
+It can detect examples such as:
+
+- protected attributes or possible proxies, for example age, gender, postal code, or zip code;
+- fully automated financial decisions without human review;
+- opaque or unauditable systems;
+- exposed secrets or tokens;
+- unrealistic claims such as guaranteed fairness or 100% absence of bias;
+- hallucinated helper functions such as `detect_all_bias` or `guarantee_fairness`.
+
+## Usage
+
+```bash
+python scripts/python/finance_bias_evaluator.py proposal.txt
+```
+
+or:
+
+```bash
+echo "We use fairness metrics, human review, audit logs, data governance and encryption." | \
+  python scripts/python/finance_bias_evaluator.py
+```
+
+Pretty output:
+
+```bash
+python scripts/python/finance_bias_evaluator.py proposal.txt --pretty
+```
+
+## Validation examples
+
+Valid-looking proposal:
+
+```bash
+echo "We use fairness metrics, human review, audit logs, data governance and encryption to assess credit risk." | \
+  python scripts/python/finance_bias_evaluator.py
+```
+
+Risky proposal:
+
+```bash
+echo "The model auto-approves loans using age and zip code with no human review." | \
+  python scripts/python/finance_bias_evaluator.py
+```
+
+## Limitations
+
+This is a deterministic static checker, not a formal proof of fairness or security. It should be used as a first-pass guardrail before deeper review, statistical fairness testing, model validation, and legal/compliance assessment.

From 70665df6d56828f4dfab9b8e35f78b5bf403c6cc Mon Sep 17 00:00:00 2001
From: Tibo2403 <Tibo2403@users.noreply.github.com>
Date: Sat, 20 Jun 2026 15:50:35 +0200
Subject: [PATCH 03/11] fix: capture native validation diagnostics

---
 scripts/powershell/Optimize-CodexWorkspace.ps1 | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/scripts/powershell/Optimize-CodexWorkspace.ps1 b/scripts/powershell/Optimize-CodexWorkspace.ps1
index 08d2ca7..0a5133b 100644
--- a/scripts/powershell/Optimize-CodexWorkspace.ps1
+++ b/scripts/powershell/Optimize-CodexWorkspace.ps1
@@ -447,18 +447,8 @@ function Invoke-NativeValidation {
     $startInfo.UseShellExecute = $false
     $startInfo.WorkingDirectory = $resolvedProject.Path
     $nativeArguments = @($Arguments | ForEach-Object { ConvertTo-NativeArgument $_ })
-    if ($resolvedExecutable.Source -match '\.(cmd|bat)$') {
-        $startInfo.FileName = $env:ComSpec
-        $invocation = @(
-            ConvertTo-NativeArgument $resolvedExecutable.Source
-            $nativeArguments
-        ) -join ' '
-        $startInfo.Arguments = '/d /s /c "' + $invocation + '"'
-    }
-    else {
-        $startInfo.FileName = $resolvedExecutable.Source
-        $startInfo.Arguments = $nativeArguments -join ' '
-    }
+    $startInfo.FileName = $resolvedExecutable.Source
+    $startInfo.Arguments = $nativeArguments -join ' '
 
     $process = [System.Diagnostics.Process]::new()
     $process.StartInfo = $startInfo

From ab0fa18d0cae68c7851e4f78a5525afff91c3a8c Mon Sep 17 00:00:00 2001
From: Tibo2403 <Tibo2403@users.noreply.github.com>
Date: Sat, 20 Jun 2026 15:59:52 +0200
Subject: [PATCH 04/11] fix: run cmd validation portably

---
 scripts/powershell/Optimize-CodexWorkspace.ps1 | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/scripts/powershell/Optimize-CodexWorkspace.ps1 b/scripts/powershell/Optimize-CodexWorkspace.ps1
index 0a5133b..ad12b37 100644
--- a/scripts/powershell/Optimize-CodexWorkspace.ps1
+++ b/scripts/powershell/Optimize-CodexWorkspace.ps1
@@ -447,8 +447,18 @@ function Invoke-NativeValidation {
     $startInfo.UseShellExecute = $false
     $startInfo.WorkingDirectory = $resolvedProject.Path
     $nativeArguments = @($Arguments | ForEach-Object { ConvertTo-NativeArgument $_ })
-    $startInfo.FileName = $resolvedExecutable.Source
-    $startInfo.Arguments = $nativeArguments -join ' '
+    if ($resolvedExecutable.Source -match '\.(cmd|bat)$') {
+        $startInfo.FileName = if ($env:ComSpec) { $env:ComSpec } else { 'cmd.exe' }
+        $invocation = @(
+            ConvertTo-NativeArgument $resolvedExecutable.Source
+            $nativeArguments
+        ) -join ' '
+        $startInfo.Arguments = '/d /c ' + $invocation
+    }
+    else {
+        $startInfo.FileName = $resolvedExecutable.Source
+        $startInfo.Arguments = $nativeArguments -join ' '
+    }
 
     $process = [System.Diagnostics.Process]::new()
     $process.StartInfo = $startInfo

From 39e11d9ed40d52ef4dca5a9c8daa696a0a77e2fb Mon Sep 17 00:00:00 2001
From: Tibo2403 <Tibo2403@users.noreply.github.com>
Date: Sat, 20 Jun 2026 16:04:36 +0200
Subject: [PATCH 05/11] test: annotate workspace doctor smoke failures

---
 scripts/tests/Test-Optimize-CodexWorkspace.ps1 | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/scripts/tests/Test-Optimize-CodexWorkspace.ps1 b/scripts/tests/Test-Optimize-CodexWorkspace.ps1
index 92bb8b8..3b3150b 100644
--- a/scripts/tests/Test-Optimize-CodexWorkspace.ps1
+++ b/scripts/tests/Test-Optimize-CodexWorkspace.ps1
@@ -163,6 +163,13 @@ try {
 
     Write-Host 'Codex Workspace Doctor smoke test passed.'
 }
+catch {
+    $message = $_.Exception.Message -replace '\r?\n', ' '
+    if ($env:GITHUB_ACTIONS -eq 'true') {
+        Write-Output "::error title=Codex Workspace Doctor smoke test::$message"
+    }
+    throw
+}
 finally {
     if (Test-Path -LiteralPath $testRoot) {
         Remove-Item -LiteralPath $testRoot -Recurse -Force

From d40ec4fffb7729dea5a57bac8e2de790c7b3c139 Mon Sep 17 00:00:00 2001
From: Tibo2403 <Tibo2403@users.noreply.github.com>
Date: Sat, 20 Jun 2026 16:07:31 +0200
Subject: [PATCH 06/11] test: accept redacted diagnostics from stdout

---
 scripts/tests/Test-Optimize-CodexWorkspace.ps1 | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/tests/Test-Optimize-CodexWorkspace.ps1 b/scripts/tests/Test-Optimize-CodexWorkspace.ps1
index 3b3150b..25c0eca 100644
--- a/scripts/tests/Test-Optimize-CodexWorkspace.ps1
+++ b/scripts/tests/Test-Optimize-CodexWorkspace.ps1
@@ -108,7 +108,8 @@ try {
     if ($trustedNpmValidation.Count -ne 1 -or $trustedNpmValidation[0].Status -eq 'skipped') {
         throw 'Trusted project validation commands did not run after explicit opt-in.'
     }
-    if ($trustedNpmValidation[0].Status -ne 'failed' -or ($trustedNpmValidation[0].StdErrTail -join "`n") -notmatch '\[REDACTED\]') {
+    $trustedNpmOutputTail = @($trustedNpmValidation[0].StdOutTail) + @($trustedNpmValidation[0].StdErrTail)
+    if ($trustedNpmValidation[0].Status -ne 'failed' -or ($trustedNpmOutputTail -join "`n") -notmatch '\[REDACTED\]') {
         throw 'Failed validation diagnostics were not captured and redacted.'
     }
     if ((Get-Content -LiteralPath $trustedReportPath -Raw) -match [regex]::Escape($validationLogSecret)) {

From d264cebd4d3e41a1da93103f19eacb50b589556f Mon Sep 17 00:00:00 2001
From: Tibo2403 <Tibo2403@users.noreply.github.com>
Date: Sat, 20 Jun 2026 16:10:26 +0200
Subject: [PATCH 07/11] test: report validation diagnostic tails

---
 scripts/tests/Test-Optimize-CodexWorkspace.ps1 | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/tests/Test-Optimize-CodexWorkspace.ps1 b/scripts/tests/Test-Optimize-CodexWorkspace.ps1
index 25c0eca..5310793 100644
--- a/scripts/tests/Test-Optimize-CodexWorkspace.ps1
+++ b/scripts/tests/Test-Optimize-CodexWorkspace.ps1
@@ -110,7 +110,9 @@ try {
     }
     $trustedNpmOutputTail = @($trustedNpmValidation[0].StdOutTail) + @($trustedNpmValidation[0].StdErrTail)
     if ($trustedNpmValidation[0].Status -ne 'failed' -or ($trustedNpmOutputTail -join "`n") -notmatch '\[REDACTED\]') {
-        throw 'Failed validation diagnostics were not captured and redacted.'
+        $safeStdOut = (@($trustedNpmValidation[0].StdOutTail) -join ' | ') -replace [regex]::Escape($validationLogSecret), '[UNREDACTED-TEST-SECRET]'
+        $safeStdErr = (@($trustedNpmValidation[0].StdErrTail) -join ' | ') -replace [regex]::Escape($validationLogSecret), '[UNREDACTED-TEST-SECRET]'
+        throw "Failed validation diagnostics were not captured and redacted. Status=$($trustedNpmValidation[0].Status); StdOutTail=$safeStdOut; StdErrTail=$safeStdErr"
     }
     if ((Get-Content -LiteralPath $trustedReportPath -Raw) -match [regex]::Escape($validationLogSecret)) {
         throw 'A secret value leaked from validation diagnostics into the report.'

From 37899d447f2b17991241b40f6678e271e41866f4 Mon Sep 17 00:00:00 2001
From: Tibo2403 <Tibo2403@users.noreply.github.com>
Date: Sat, 20 Jun 2026 16:44:30 +0200
Subject: [PATCH 08/11] feat: add generic llm bias multi-agent reducer

---
 scripts/python/README_LLM_Bias_Multi_Agent.md |  45 +++
 scripts/python/llm_bias_multi_agent.py        | 340 ++++++++++++++++++
 .../python/tests/test_llm_bias_multi_agent.py |  59 +++
 3 files changed, 444 insertions(+)
 create mode 100644 scripts/python/README_LLM_Bias_Multi_Agent.md
 create mode 100644 scripts/python/llm_bias_multi_agent.py
 create mode 100644 scripts/python/tests/test_llm_bias_multi_agent.py

diff --git a/scripts/python/README_LLM_Bias_Multi_Agent.md b/scripts/python/README_LLM_Bias_Multi_Agent.md
new file mode 100644
index 0000000..df348c3
--- /dev/null
+++ b/scripts/python/README_LLM_Bias_Multi_Agent.md
@@ -0,0 +1,45 @@
+# LLM Bias Multi-Agent Reducer
+
+Standalone deterministic multi-agent review layer for reducing bias in generic LLM outputs.
+
+It does not call an LLM provider. It can sit after any model output and performs:
+
+- protected-attribute review;
+- stereotype and broad-generalization review;
+- overconfidence and weak-evidence review;
+- inclusion and alternatives review;
+- safeguards review for consequential domains such as finance, hiring, housing, medicine, education, or insurance.
+
+## Usage
+
+```bash
+python scripts/python/llm_bias_multi_agent.py answer.txt --prompt-file prompt.txt --pretty
+```
+
+or:
+
+```bash
+echo "All young users are risky, so the loan model should reject them." | \
+  python scripts/python/llm_bias_multi_agent.py --prompt "Evaluate a credit policy" --pretty
+```
+
+The output is JSON:
+
+```json
+{
+  "risk_score": 0.93,
+  "status": "needs_revision",
+  "finding_count": 6,
+  "agent_reports": [],
+  "revised_answer": "..."
+}
+```
+
+## How To Use With Any LLM
+
+1. Send a prompt to your LLM.
+2. Pass the LLM answer to `llm_bias_multi_agent.py`.
+3. If `status` is `needs_revision`, use `revised_answer` or feed the findings back into your LLM for another revision round.
+4. For consequential decisions, keep human review, audit logs, subgroup metrics, and domain-specific validation.
+
+This is a first-pass guardrail. It reduces obvious biased wording and flags missing safeguards, but it does not replace statistical fairness testing on real data.
diff --git a/scripts/python/llm_bias_multi_agent.py b/scripts/python/llm_bias_multi_agent.py
new file mode 100644
index 0000000..61a827a
--- /dev/null
+++ b/scripts/python/llm_bias_multi_agent.py
@@ -0,0 +1,340 @@
+#!/usr/bin/env python3
+"""Generic multi-agent bias reducer for LLM outputs.
+
+The module is intentionally dependency-free. It can be used as a post-processing
+layer after any LLM provider: pass the original prompt and model answer, then
+receive a revised answer plus a structured audit report.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from dataclasses import asdict, dataclass
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class Finding:
+    """One issue raised by a review agent (immutable; serialised with ``asdict``)."""
+
+    agent: str  # ``name`` of the agent that produced the finding
+    code: str  # machine-readable issue identifier
+    severity: float  # weight fed into BiasReducer._risk_score, which clamps it to [0, 1]
+    span: str  # matched excerpt, or a fixed label when the check is not span-based
+    recommendation: str  # remediation hint reused in the mitigation notes
+
+
+@dataclass(frozen=True)
+class AgentReport:
+    """Findings produced by one specialist agent (an empty tuple means nothing was flagged)."""
+
+    agent: str  # ``name`` of the reporting agent
+    findings: tuple[Finding, ...]  # findings in the order the agent produced them
+
+
+class ReviewAgent:
+    """Base class for deterministic review agents; subclasses override ``name`` and ``review``."""
+
+    name = "base"  # agent identifier; subclasses replace it with their own
+    # The base implementation only raises; concrete agents return an AgentReport.
+    def review(self, prompt: str, answer: str) -> AgentReport:
+        raise NotImplementedError
+
+
+def normalize_spaces(text: str) -> str:
+    """Squeeze whitespace runs to single spaces and trim; words are untouched."""
+    # str.split() without arguments splits on whitespace runs and drops empty edges.
+    return " ".join(text.split())
+
+
+def snippet(text: str, start: int, end: int, max_len: int = 90) -> str:
+    """Return ``text[start:end]`` whitespace-normalised and capped at ``max_len`` characters."""
+    # Over-long excerpts keep their first max_len - 3 characters and gain a "..." marker.
+    excerpt = normalize_spaces(text[start:end])
+    if len(excerpt) > max_len:
+        return excerpt[: max_len - 3].rstrip() + "..."
+    return excerpt
+
+
+class ProtectedAttributeAgent(ReviewAgent):
+    """Flags each mention of a protected or sensitive attribute in the answer."""
+
+    name = "protected_attribute_agent"
+    patterns: tuple[tuple[str, str], ...] = (
+        ("age", r"\b(age|old people|young people|teenagers|elderly)\b"),
+        ("gender", r"\b(gender|men|women|male|female|boys|girls)\b"),
+        ("origin", r"\b(race|ethnicity|nationality|immigrant|foreigners?)\b"),
+        ("religion", r"\b(religion|muslim|christian|jewish|hindu)\b"),
+        ("disability", r"\b(disability|disabled|neurodivergent)\b"),
+        ("socioeconomic", r"\b(poor people|low income|rich people|working class)\b"),
+    )
+
+    def review(self, prompt: str, answer: str) -> AgentReport:
+        advice = "Avoid using protected or sensitive attributes as a broad causal shortcut."
+        hits = tuple(
+            Finding(
+                self.name,
+                f"protected_attribute_{category}",
+                0.72,
+                snippet(answer, found.start(), found.end()),
+                advice,
+            )
+            for category, regex in self.patterns
+            for found in re.finditer(regex, answer, flags=re.IGNORECASE)
+        )
+        return AgentReport(self.name, hits)
+
+
+class StereotypeAgent(ReviewAgent):
+    """Flags sweeping group claims, deficit framing and essentialist wording."""
+
+    name = "stereotype_agent"
+    patterns: tuple[tuple[str, str, float, str], ...] = (
+        (
+            "group_generalization",
+            r"\b(all|always|never|everyone|nobody|most|typical|naturally|inherently)\b.{0,80}\b(people|users|customers|patients|employees|men|women|immigrants|students)\b",
+            0.82,
+            "Replace broad group claims with scoped, evidence-based language.",
+        ),
+        (
+            "deficit_framing",
+            r"\b(lazy|irrational|untrustworthy|aggressive|bad at|not suited|cannot handle|less capable)\b",
+            0.86,
+            "Remove deficit framing unless it is supported by specific, relevant evidence.",
+        ),
+        (
+            "culture_essentialism",
+            r"\b(culture makes them|because of their culture|born to|naturally better|naturally worse)\b",
+            0.88,
+            "Avoid essentialist explanations for behavior or capability.",
+        ),
+    )
+
+    def review(self, prompt: str, answer: str) -> AgentReport:
+        """Emit one finding per regex match, following the order of ``patterns``."""
+        hits = tuple(
+            Finding(
+                self.name,
+                label,
+                weight,
+                snippet(answer, found.start(), found.end()),
+                advice,
+            )
+            for label, regex, weight, advice in self.patterns
+            for found in re.finditer(regex, answer, flags=re.IGNORECASE)
+        )
+        return AgentReport(self.name, hits)
+
+
+class EvidenceAgent(ReviewAgent):
+    """Detects overconfident or universal claims; each regex match yields one 0.70 finding."""
+    # "certainty" ends with (?!\w), not \b: "%" is a non-word character, so \b never matched "100% sure".
+    name = "evidence_agent"
+    unsupported_patterns: tuple[tuple[str, str], ...] = (
+        ("certainty", r"\b(proves|guarantees|certainly|without doubt|100%|completely unbiased|no bias)(?!\w)"),
+        ("universal_policy", r"\b(best for everyone|works for all|one-size-fits-all|universally optimal)\b"),
+    )
+
+    def review(self, prompt: str, answer: str) -> AgentReport:
+        findings: list[Finding] = []
+        for code, pattern in self.unsupported_patterns:
+            for match in re.finditer(pattern, answer, flags=re.IGNORECASE):
+                findings.append(
+                    Finding(
+                        self.name,
+                        f"unsupported_{code}",
+                        0.70,
+                        snippet(answer, match.start(), match.end()),
+                        "Add uncertainty, evidence requirements, and limits of applicability.",
+                    )
+                )
+        return AgentReport(self.name, tuple(findings))
+
+
+class InclusionAgent(ReviewAgent):
+    """Flags exclusionary advice and answers that name no alternative, exception or review."""
+    name = "inclusion_agent"
+
+    def review(self, prompt: str, answer: str) -> AgentReport:
+        text = answer.casefold()  # ``prompt`` is not consulted by this agent
+        exclusion_phrases = ("should reject", "should exclude", "should avoid hiring", "deny access")
+        issues: list[Finding] = []
+        if any(phrase in text for phrase in exclusion_phrases):
+            issues.append(
+                Finding(
+                    self.name,
+                    "exclusionary_recommendation",
+                    0.84,
+                    "exclusionary recommendation",
+                    "Prefer least-restrictive alternatives, review paths, and context-specific criteria.",
+                )
+            )
+        if not any(word in text for word in ("alternative", "exception", "review")):
+            issues.append(
+                Finding(
+                    self.name,
+                    "missing_alternatives",
+                    0.45,
+                    "no alternatives or review path",
+                    "Mention alternatives, exceptions, or human review when making consequential recommendations.",
+                )
+            )
+        return AgentReport(self.name, tuple(issues))
+
+
+class SafetyAgent(ReviewAgent):
+    """Requires safeguard vocabulary when the prompt or answer names a consequential domain."""
+
+    name = "safety_agent"
+    consequential_context = re.compile(
+        r"\b(loan|credit|insurance|hiring|medical|diagnosis|housing|school admission|policing|benefit|welfare)\b",
+        flags=re.IGNORECASE,
+    )
+
+    def review(self, prompt: str, answer: str) -> AgentReport:
+        """Report every safeguard family that the answer never mentions."""
+        if self.consequential_context.search(f"{prompt}\n{answer}") is None:
+            return AgentReport(self.name, ())
+        # NOTE(review): plain substring tests, so e.g. "log" is also satisfied by "technology".
+        haystack = answer.casefold()
+        safeguards = (
+            ("human_review", ("human review", "manual review", "appeal", "contest", "override")),
+            ("auditability", ("audit", "log", "trace", "monitor", "record")),
+            ("fairness_testing", ("fairness", "bias", "disparate impact", "equal opportunity")),
+        )
+
+        missing = tuple(
+            Finding(
+                self.name,
+                f"missing_{code}",
+                0.76,
+                "consequential decision without safeguard",
+                f"Add {code.replace('_', ' ')} before using this output in a consequential context.",
+            )
+            for code, needles in safeguards
+            if all(needle not in haystack for needle in needles)
+        )
+        return AgentReport(self.name, missing)
+
+
+class BiasReducer:
+    """Coordinates review agents and applies conservative text revisions."""
+
+    def __init__(self, agents: tuple[ReviewAgent, ...] | None = None) -> None:
+        self.agents = agents or (  # NOTE: an empty tuple also selects these defaults
+            ProtectedAttributeAgent(),
+            StereotypeAgent(),
+            EvidenceAgent(),
+            InclusionAgent(),
+            SafetyAgent(),
+        )
+
+    def evaluate(self, prompt: str, answer: str) -> dict[str, object]:
+        """Run every agent and return the JSON-ready report with the revised answer."""
+        # Status is "needs_revision" as soon as any agent reports a finding.
+        reports = tuple(agent.review(prompt, answer) for agent in self.agents)
+        findings = tuple(finding for report in reports for finding in report.findings)
+        risk_score = self._risk_score(findings)
+        revised = self.revise(answer, findings)
+        return {
+            "risk_score": risk_score,
+            "status": "needs_revision" if findings else "accepted",
+            "finding_count": len(findings),
+            "agent_reports": [
+                {
+                    "agent": report.agent,
+                    "findings": [asdict(finding) for finding in report.findings],
+                }
+                for report in reports
+            ],
+            "revised_answer": revised,
+        }
+
+    def revise(self, answer: str, findings: tuple[Finding, ...]) -> str:
+        """Soften absolute wording and append mitigation notes; only strips when there are no findings."""
+        # "100%" is matched with (?!\w) because \b cannot match between "%" and a space.
+        if not findings:
+            return answer.strip()
+
+        revised = answer.strip()
+        replacements = (
+            (r"\bshould reject them\b", "should route them to documented review", re.IGNORECASE),
+            (r"\bshould exclude them\b", "should assess them with documented criteria", re.IGNORECASE),
+            (r"\ball\b", "some", re.IGNORECASE),
+            (r"\balways\b", "may sometimes", re.IGNORECASE),
+            (r"\bnever\b", "may not always", re.IGNORECASE),
+            (r"\beveryone\b", "many people", re.IGNORECASE),
+            (r"\bnobody\b", "not everyone", re.IGNORECASE),
+            (r"\bguarantees?\b", "may support", re.IGNORECASE),
+            (r"\bproves?\b", "may suggest", re.IGNORECASE),
+            (r"\b100%(?!\w)", "highly", re.IGNORECASE),
+            (r"\bcompletely unbiased\b", "designed to reduce measured bias", re.IGNORECASE),
+            (r"\bno bias\b", "lower measured bias", re.IGNORECASE),
+            (r"\bshould reject\b", "should review carefully", re.IGNORECASE),
+            (r"\bshould exclude\b", "should assess with documented criteria", re.IGNORECASE),
+        )
+        for pattern, replacement, flags in replacements:
+            revised = re.sub(pattern, replacement, revised, flags=flags)
+
+        notes = self._mitigation_notes(findings)
+        if notes:
+            revised = f"{revised}\n\nBias-mitigation notes:\n" + "\n".join(f"- {note}" for note in notes)
+        return revised
+
+    @staticmethod
+    def _risk_score(findings: tuple[Finding, ...]) -> float:
+        """Return 1 - prod(1 - 0.35 * severity), severity clamped to [0, 1], rounded to 3 places."""
+        if not findings:
+            return 0.0
+        combined = 1.0
+        for finding in findings:
+            combined *= 1.0 - min(max(finding.severity, 0.0), 1.0) * 0.35
+        return round(min(1.0, 1.0 - combined), 3)
+
+    @staticmethod
+    def _mitigation_notes(findings: tuple[Finding, ...]) -> list[str]:
+        """Return up to five distinct recommendations, highest severity first, plus a closing reminder."""
+        recommendations: list[str] = []
+        for finding in sorted(findings, key=lambda item: item.severity, reverse=True):
+            if finding.recommendation not in recommendations:
+                recommendations.append(finding.recommendation)
+        notes = recommendations[:5]
+        if findings:
+            notes.append("Validate remaining claims with task-specific data, subgroup metrics, and human review.")
+        return notes
+
+
+def read_text(path: str | None) -> str:
+    """Return the contents of ``path`` decoded as UTF-8, or all of stdin without a path."""
+    # A missing or empty path means the text is piped in.
+    if not path:
+        return sys.stdin.read()
+    return Path(path).read_text(encoding="utf-8")
+
+
+def parse_args(argv: list[str]) -> argparse.Namespace:
+    """Parse the reducer's CLI options from ``argv`` (program name excluded)."""
+    summary = "Reduce bias in generic LLM outputs using deterministic multi-agent review."
+    cli = argparse.ArgumentParser(description=summary)
+    cli.add_argument("answer_file", nargs="?", help="UTF-8 file containing the LLM answer. Reads stdin if omitted.")
+    cli.add_argument("--prompt-file", help="Optional UTF-8 file containing the original prompt.")
+    cli.add_argument("--prompt", default="", help="Optional original prompt text.")
+    cli.add_argument("--text", help="LLM answer text. Overrides answer_file/stdin when provided.")
+    cli.add_argument("--pretty", action="store_true", help="Pretty-print the JSON report.")
+    return cli.parse_args(argv)
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run the CLI and return 0; only ``argv=None`` selects ``sys.argv[1:]``, an explicit ``[]`` is parsed as given."""
+    args = parse_args(sys.argv[1:] if argv is None else argv)
+    prompt = read_text(args.prompt_file) if args.prompt_file else args.prompt
+    answer = args.text if args.text is not None else read_text(args.answer_file)
+    print(json.dumps(BiasReducer().evaluate(prompt, answer), ensure_ascii=False, indent=2 if args.pretty else None))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/python/tests/test_llm_bias_multi_agent.py b/scripts/python/tests/test_llm_bias_multi_agent.py
new file mode 100644
index 0000000..3be1e1a
--- /dev/null
+++ b/scripts/python/tests/test_llm_bias_multi_agent.py
@@ -0,0 +1,59 @@
+import importlib.util
+import sys
+import unittest
+from pathlib import Path
+
+
+MODULE_PATH = Path(__file__).resolve().parents[1] / "llm_bias_multi_agent.py"
+spec = importlib.util.spec_from_file_location("llm_bias_multi_agent", MODULE_PATH)
+llm_bias_multi_agent = importlib.util.module_from_spec(spec)
+assert spec.loader is not None
+sys.modules[spec.name] = llm_bias_multi_agent
+spec.loader.exec_module(llm_bias_multi_agent)
+
+
+class LLMBiasMultiAgentTests(unittest.TestCase):
+    def test_accepts_low_risk_answer(self):
+        reducer = llm_bias_multi_agent.BiasReducer()
+        result = reducer.evaluate(
+            "Summarize a training plan.",
+            "The plan should be evaluated with feedback, monitoring, and context-specific review.",
+        )
+
+        self.assertEqual(result["status"], "accepted")
+        self.assertEqual(result["risk_score"], 0.0)
+        self.assertTrue(result["revised_answer"].startswith("The plan"))
+
+    def test_rewrites_broad_biased_claims(self):
+        reducer = llm_bias_multi_agent.BiasReducer()
+        result = reducer.evaluate(
+            "Evaluate a credit policy.",
+            "All young people are risky, so the loan model should reject them with no bias.",
+        )
+
+        self.assertEqual(result["status"], "needs_revision")
+        self.assertGreater(result["risk_score"], 0.5)
+        revised = result["revised_answer"].casefold()
+        self.assertIn("some young people", revised)
+        self.assertIn("should route them to documented review", revised)
+        self.assertIn("bias-mitigation notes", revised)
+
+    def test_consequential_context_requires_safeguards(self):
+        reducer = llm_bias_multi_agent.BiasReducer()
+        result = reducer.evaluate(
+            "Write a hiring recommendation.",
+            "Use a ranking score to select candidates.",
+        )
+        findings = [
+            finding["code"]
+            for report in result["agent_reports"]
+            for finding in report["findings"]
+        ]
+
+        self.assertIn("missing_human_review", findings)
+        self.assertIn("missing_auditability", findings)
+        self.assertIn("missing_fairness_testing", findings)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 41caf16b6aa201732ff1c6d6fa42673f4c3676e3 Mon Sep 17 00:00:00 2001
From: Tibo2403 <Tibo2403@users.noreply.github.com>
Date: Sat, 20 Jun 2026 16:58:30 +0200
Subject: [PATCH 09/11] feat: generalize llm multi-agent prompt manager

---
 scripts/python/README_LLM_Bias_Multi_Agent.md |  37 +++-
 scripts/python/llm_bias_multi_agent.py        | 158 +++++++++++++++---
 .../python/tests/test_llm_bias_multi_agent.py |  44 +++++
 3 files changed, 210 insertions(+), 29 deletions(-)

diff --git a/scripts/python/README_LLM_Bias_Multi_Agent.md b/scripts/python/README_LLM_Bias_Multi_Agent.md
index df348c3..c312ce9 100644
--- a/scripts/python/README_LLM_Bias_Multi_Agent.md
+++ b/scripts/python/README_LLM_Bias_Multi_Agent.md
@@ -1,8 +1,16 @@
-# LLM Bias Multi-Agent Reducer
+# LLM Multi-Agent Prompt Manager
 
-Standalone deterministic multi-agent review layer for reducing bias in generic LLM outputs.
+Standalone deterministic multi-agent manager for reviewing and improving the answers that an LLM returns for a prompt.
 
-It does not call an LLM provider. It can sit after any model output and performs:
+It does not call an LLM provider. It sits after any model output and performs a standard orchestration loop:
+
+1. receive the original prompt and LLM answer;
+2. run a panel of review agents;
+3. aggregate findings and a risk score;
+4. revise the answer conservatively;
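+5. optionally repeat steps 2-4 for up to `--max-rounds` rounds, stopping early when the risk score is at or below `--stop-risk-score` or the revision no longer changes the answer.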
+
+The default panel focuses on bias and safeguard review:
 
 - protected-attribute review;
 - stereotype and broad-generalization review;
@@ -13,7 +21,7 @@ It does not call an LLM provider. It can sit after any model output and performs
 ## Usage
 
 ```bash
-python scripts/python/llm_bias_multi_agent.py answer.txt --prompt-file prompt.txt --pretty
+python scripts/python/llm_bias_multi_agent.py answer.txt --prompt-file prompt.txt --max-rounds 2 --pretty
 ```
 
 or:
@@ -27,6 +35,8 @@ The output is JSON:
 
 ```json
 {
+  "manager": "multi_agent_prompt_manager",
+  "round_count": 1,
   "risk_score": 0.93,
   "status": "needs_revision",
   "finding_count": 6,
@@ -38,8 +48,23 @@ The output is JSON:
 ## How To Use With Any LLM
 
 1. Send a prompt to your LLM.
-2. Pass the LLM answer to `llm_bias_multi_agent.py`.
+2. Pass the LLM answer to `llm_bias_multi_agent.py` or to `MultiAgentPromptManager` in Python.
 3. If `status` is `needs_revision`, use `revised_answer` or feed the findings back into your LLM for another revision round.
 4. For consequential decisions, keep human review, audit logs, subgroup metrics, and domain-specific validation.
 
-This is a first-pass guardrail. It reduces obvious biased wording and flags missing safeguards, but it does not replace statistical fairness testing on real data.
+## Python API
+
+```python
+from llm_bias_multi_agent import MultiAgentPromptManager
+
+manager = MultiAgentPromptManager(max_rounds=2)
+report = manager.evaluate(
+    prompt="Write a recommendation.",
+    answer="Everyone will certainly benefit from this policy.",
+)
+print(report["revised_answer"])
+```
+
+You can plug in custom agents by subclassing `ReviewAgent`, implementing `review(prompt, answer)` so that it returns an `AgentReport`, and passing the instances as `MultiAgentPromptManager(agents=(...))`.
+
+This is a first-pass manager. It reduces obvious biased wording and flags missing safeguards, but it does not replace task-specific evaluation, statistical fairness testing, security review, or human judgment.
diff --git a/scripts/python/llm_bias_multi_agent.py b/scripts/python/llm_bias_multi_agent.py
index 61a827a..4f2c181 100644
--- a/scripts/python/llm_bias_multi_agent.py
+++ b/scripts/python/llm_bias_multi_agent.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
-"""Generic multi-agent bias reducer for LLM outputs.
+"""Generic multi-agent prompt-return manager for LLM outputs.
 
 The module is intentionally dependency-free. It can be used as a post-processing
 layer after any LLM provider: pass the original prompt and model answer, then
-receive a revised answer plus a structured audit report.
+receive a managed multi-agent review, a revised answer, and a structured report.
 """
 
 from __future__ import annotations
@@ -35,6 +35,18 @@ class AgentReport:
     findings: tuple[Finding, ...]
 
 
+@dataclass(frozen=True)
+class ManagerRound:
+    """One manager review/revision pass."""
+
+    round_index: int
+    input_answer: str
+    revised_answer: str
+    risk_score: float
+    status: str
+    reports: tuple[AgentReport, ...]
+
+
 class ReviewAgent:
     """Base class for deterministic review agents."""
 
@@ -59,6 +71,12 @@ def snippet(text: str, start: int, end: int, max_len: int = 90) -> str:
     return value[: max_len - 3].rstrip() + "..."
 
 
+def strip_manager_notes(text: str) -> str:
+    """Remove notes previously appended by this manager before a new revision."""
+
+    return re.split(r"\n\nBias-mitigation notes:\n", text, maxsplit=1)[0].strip()
+
+
 class ProtectedAttributeAgent(ReviewAgent):
     """Detects protected attributes used as broad explanations."""
 
@@ -221,45 +239,103 @@ def review(self, prompt: str, answer: str) -> AgentReport:
 
 
 class BiasReducer:
-    """Coordinates review agents and applies conservative text revisions."""
+    """Backward-compatible facade for the generic prompt manager."""
 
     def __init__(self, agents: tuple[ReviewAgent, ...] | None = None) -> None:
-        self.agents = agents or (
-            ProtectedAttributeAgent(),
-            StereotypeAgent(),
-            EvidenceAgent(),
-            InclusionAgent(),
-            SafetyAgent(),
-        )
+        self.manager = MultiAgentPromptManager(agents=agents)
+        self.agents = self.manager.agents
 
     def evaluate(self, prompt: str, answer: str) -> dict[str, object]:
-        """Return a structured multi-agent review."""
+        """Return a structured multi-agent bias review."""
 
-        reports = tuple(agent.review(prompt, answer) for agent in self.agents)
-        findings = tuple(finding for report in reports for finding in report.findings)
-        risk_score = self._risk_score(findings)
-        revised = self.revise(answer, findings)
+        return self.manager.evaluate(prompt, answer)
+
+    def revise(self, answer: str, findings: tuple[Finding, ...]) -> str:
+        """Apply conservative debiasing rewrites without inventing facts."""
+
+        return self.manager.revise(answer, findings)
+
+
+class MultiAgentPromptManager:
+    """Standard manager for generic LLM prompt returns.
+
+    The manager has no dependency on a domain such as finance. Its job is to:
+    collect review-agent feedback, aggregate risk, revise the answer
+    conservatively, and optionally repeat the process for several rounds.
+    """
+
+    def __init__(
+        self,
+        agents: tuple[ReviewAgent, ...] | None = None,
+        max_rounds: int = 1,
+        stop_risk_score: float = 0.0,
+    ) -> None:
+        self.agents = agents or default_review_agents()
+        self.max_rounds = max(1, max_rounds)
+        self.stop_risk_score = max(0.0, min(1.0, stop_risk_score))
+
+    def evaluate(self, prompt: str, answer: str, max_rounds: int | None = None) -> dict[str, object]:
+        """Run managed review/revision rounds for one LLM answer."""
+
+        rounds = self.run_rounds(prompt, answer, max_rounds=max_rounds)
+        final_round = rounds[-1]
+        findings = tuple(finding for report in final_round.reports for finding in report.findings)
         return {
-            "risk_score": risk_score,
-            "status": "needs_revision" if findings else "accepted",
+            "manager": "multi_agent_prompt_manager",
+            "round_count": len(rounds),
+            "risk_score": final_round.risk_score,
+            "status": final_round.status,
             "finding_count": len(findings),
             "agent_reports": [
                 {
                     "agent": report.agent,
                     "findings": [asdict(finding) for finding in report.findings],
                 }
-                for report in reports
+                for report in final_round.reports
             ],
-            "revised_answer": revised,
+            "rounds": [self._round_to_dict(round_result) for round_result in rounds],
+            "revised_answer": final_round.revised_answer,
         }
 
+    def run_rounds(self, prompt: str, answer: str, max_rounds: int | None = None) -> tuple[ManagerRound, ...]:
+        """Return each review/revision pass as structured objects."""
+
+        limit = max(1, max_rounds if max_rounds is not None else self.max_rounds)
+        current_answer = answer.strip()
+        rounds: list[ManagerRound] = []
+        for round_index in range(1, limit + 1):
+            reports = tuple(agent.review(prompt, current_answer) for agent in self.agents)
+            findings = tuple(finding for report in reports for finding in report.findings)
+            risk_score = self._risk_score(findings)
+            revised_answer = self.revise(current_answer, findings)
+            status = "needs_revision" if findings else "accepted"
+            round_result = ManagerRound(
+                round_index=round_index,
+                input_answer=current_answer,
+                revised_answer=revised_answer,
+                risk_score=risk_score,
+                status=status,
+                reports=reports,
+            )
+            rounds.append(round_result)
+            current_answer = revised_answer
+            if risk_score <= self.stop_risk_score or revised_answer == round_result.input_answer:
+                break
+        return tuple(rounds)
+
+    def review_once(self, prompt: str, answer: str) -> tuple[AgentReport, ...]:
+        """Run agents without applying a revision."""
+
+        reports = tuple(agent.review(prompt, answer) for agent in self.agents)
+        return reports
+
     def revise(self, answer: str, findings: tuple[Finding, ...]) -> str:
-        """Apply conservative debiasing rewrites without inventing facts."""
+        """Apply conservative generic rewrites without inventing new facts."""
 
         if not findings:
             return answer.strip()
 
-        revised = answer.strip()
+        revised = strip_manager_notes(answer)
         replacements = (
             (r"\bshould reject them\b", "should route them to documented review", re.IGNORECASE),
             (r"\bshould exclude them\b", "should assess them with documented criteria", re.IGNORECASE),
@@ -268,6 +344,9 @@ def revise(self, answer: str, findings: tuple[Finding, ...]) -> str:
             (r"\bnever\b", "may not always", re.IGNORECASE),
             (r"\beveryone\b", "many people", re.IGNORECASE),
             (r"\bnobody\b", "not everyone", re.IGNORECASE),
+            (r"\bwill certainly\b", "may", re.IGNORECASE),
+            (r"\bcertainly\b", "may", re.IGNORECASE),
+            (r"\bwithout doubt\b", "with supporting evidence", re.IGNORECASE),
             (r"\bguarantees?\b", "may support", re.IGNORECASE),
             (r"\bproves?\b", "may suggest", re.IGNORECASE),
             (r"\b100%\b", "highly", re.IGNORECASE),
@@ -306,6 +385,29 @@ def _mitigation_notes(findings: tuple[Finding, ...]) -> list[str]:
             notes.append("Validate remaining claims with task-specific data, subgroup metrics, and human review.")
         return notes
 
+    @staticmethod
+    def _round_to_dict(round_result: ManagerRound) -> dict[str, object]:
+        findings = tuple(finding for report in round_result.reports for finding in report.findings)
+        return {
+            "round_index": round_result.round_index,
+            "risk_score": round_result.risk_score,
+            "status": round_result.status,
+            "finding_count": len(findings),
+            "revised_answer": round_result.revised_answer,
+        }
+
+
+def default_review_agents() -> tuple[ReviewAgent, ...]:
+    """Return the standard generic LLM-output review panel."""
+
+    return (
+        ProtectedAttributeAgent(),
+        StereotypeAgent(),
+        EvidenceAgent(),
+        InclusionAgent(),
+        SafetyAgent(),
+    )
+
 
 def read_text(path: str | None) -> str:
     """Read text from a UTF-8 file or stdin."""
@@ -317,12 +419,19 @@ def read_text(path: str | None) -> str:
 
 def parse_args(argv: list[str]) -> argparse.Namespace:
     parser = argparse.ArgumentParser(
-        description="Reduce bias in generic LLM outputs using deterministic multi-agent review."
+        description="Manage generic LLM prompt returns using deterministic multi-agent review."
     )
     parser.add_argument("answer_file", nargs="?", help="UTF-8 file containing the LLM answer. Reads stdin if omitted.")
     parser.add_argument("--prompt-file", help="Optional UTF-8 file containing the original prompt.")
     parser.add_argument("--prompt", default="", help="Optional original prompt text.")
     parser.add_argument("--text", help="LLM answer text. Overrides answer_file/stdin when provided.")
+    parser.add_argument("--max-rounds", type=int, default=1, help="Maximum manager review/revision rounds.")
+    parser.add_argument(
+        "--stop-risk-score",
+        type=float,
+        default=0.0,
+        help="Stop early when the round risk score is at or below this value.",
+    )
     parser.add_argument("--pretty", action="store_true", help="Pretty-print the JSON report.")
     return parser.parse_args(argv)
 
@@ -331,7 +440,10 @@ def main(argv: list[str] | None = None) -> int:
     args = parse_args(argv or sys.argv[1:])
     prompt = read_text(args.prompt_file) if args.prompt_file else args.prompt
     answer = args.text if args.text is not None else read_text(args.answer_file)
-    result = BiasReducer().evaluate(prompt, answer)
+    result = MultiAgentPromptManager(max_rounds=args.max_rounds, stop_risk_score=args.stop_risk_score).evaluate(
+        prompt,
+        answer,
+    )
     print(json.dumps(result, ensure_ascii=False, indent=2 if args.pretty else None))
     return 0
 
diff --git a/scripts/python/tests/test_llm_bias_multi_agent.py b/scripts/python/tests/test_llm_bias_multi_agent.py
index 3be1e1a..5ddbd2f 100644
--- a/scripts/python/tests/test_llm_bias_multi_agent.py
+++ b/scripts/python/tests/test_llm_bias_multi_agent.py
@@ -13,6 +13,23 @@
 
 
 class LLMBiasMultiAgentTests(unittest.TestCase):
+    def test_manager_returns_standard_prompt_report(self):
+        manager = llm_bias_multi_agent.MultiAgentPromptManager(max_rounds=2)
+        result = manager.evaluate(
+            "Summarize a general product launch plan.",
+            "Everyone will certainly love this launch.",
+        )
+
+        self.assertEqual(result["manager"], "multi_agent_prompt_manager")
+        self.assertEqual(result["status"], "accepted")
+        self.assertGreaterEqual(result["round_count"], 1)
+        self.assertEqual(result["rounds"][0]["status"], "needs_revision")
+        self.assertIn("rounds", result)
+        self.assertIn("revised_answer", result)
+        self.assertNotIn("certainly", result["revised_answer"].casefold())
+        self.assertNotIn("will may", result["revised_answer"].casefold())
+        self.assertEqual(result["revised_answer"].count("Bias-mitigation notes:"), 1)
+
     def test_accepts_low_risk_answer(self):
         reducer = llm_bias_multi_agent.BiasReducer()
         result = reducer.evaluate(
@@ -54,6 +71,33 @@ def test_consequential_context_requires_safeguards(self):
         self.assertIn("missing_auditability", findings)
         self.assertIn("missing_fairness_testing", findings)
 
+    def test_custom_agent_can_be_plugged_into_manager(self):
+        class LengthAgent(llm_bias_multi_agent.ReviewAgent):
+            name = "length_agent"
+
+            def review(self, prompt, answer):
+                if len(answer) <= 20:
+                    return llm_bias_multi_agent.AgentReport(self.name, ())
+                return llm_bias_multi_agent.AgentReport(
+                    self.name,
+                    (
+                        llm_bias_multi_agent.Finding(
+                            self.name,
+                            "too_long",
+                            0.5,
+                            "answer length",
+                            "Shorten the answer.",
+                        ),
+                    ),
+                )
+
+        manager = llm_bias_multi_agent.MultiAgentPromptManager(agents=(LengthAgent(),))
+        result = manager.evaluate("Reply briefly.", "This response is intentionally longer than requested.")
+        findings = result["agent_reports"][0]["findings"]
+
+        self.assertEqual(result["status"], "needs_revision")
+        self.assertEqual(findings[0]["code"], "too_long")
+
 
 if __name__ == "__main__":
     unittest.main()

From 3f53206a640069540e3b6442a6fc1a78c19bd4db Mon Sep 17 00:00:00 2001
From: Tibo2403 <Tibo2403@users.noreply.github.com>
Date: Sat, 20 Jun 2026 17:02:39 +0200
Subject: [PATCH 10/11] test: make workspace doctor smoke timeout robust

---
 scripts/tests/Test-Optimize-CodexWorkspace.ps1 | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/tests/Test-Optimize-CodexWorkspace.ps1 b/scripts/tests/Test-Optimize-CodexWorkspace.ps1
index 5310793..9d2945d 100644
--- a/scripts/tests/Test-Optimize-CodexWorkspace.ps1
+++ b/scripts/tests/Test-Optimize-CodexWorkspace.ps1
@@ -20,7 +20,7 @@ try {
     $validationLogSecret = 'validation-log-secret-value'
     Set-Content -LiteralPath (Join-Path $testRoot 'package.json') -Value '{"scripts":{"test":"node test.js","lint":"node slow.js","build":"node build.js"}}'
     Set-Content -LiteralPath (Join-Path $testRoot 'test.js') -Value "console.error('to' + 'ken=' + '$validationLogSecret'); process.exit(1);"
-    Set-Content -LiteralPath (Join-Path $testRoot 'slow.js') -Value "setTimeout(() => {}, 10000);"
+    Set-Content -LiteralPath (Join-Path $testRoot 'slow.js') -Value "setTimeout(() => {}, 30000);"
     Set-Content -LiteralPath (Join-Path $testRoot 'pyproject.toml') -Value '[tool.pytest.ini_options]'
     Set-Content -LiteralPath (Join-Path $testRoot '.env') -Value "OPENAI_API_KEY=$fakeKey"
     Set-Content -LiteralPath (Join-Path $testRoot 'tokens.txt') -Value $fakeAwsKey
@@ -102,7 +102,7 @@ try {
         throw 'A secret value from an excluded directory leaked into the report.'
     }
 
-    & $doctor -ProjectPath $testRoot -Validate -AllowProjectCommands -ValidationTimeoutSeconds 5 -ValidationLogLineLimit 2 -ReportPath $trustedReportPath
+    & $doctor -ProjectPath $testRoot -Validate -AllowProjectCommands -ValidationTimeoutSeconds 15 -ValidationLogLineLimit 2 -ReportPath $trustedReportPath
     $trustedReport = Get-Content -LiteralPath $trustedReportPath -Raw | ConvertFrom-Json
     $trustedNpmValidation = @($trustedReport.ValidationResults | Where-Object { $_.Command -eq 'npm run test' })
     if ($trustedNpmValidation.Count -ne 1 -or $trustedNpmValidation[0].Status -eq 'skipped') {
@@ -121,7 +121,7 @@ try {
     if ($trustedNpmLint.Count -ne 1 -or $trustedNpmLint[0].Status -ne 'timed-out') {
         throw 'Native validation commands did not stop after the configured timeout.'
     }
-    if ($trustedReport.ValidationTimeoutSeconds -ne 5 -or $trustedReport.ValidationLogLineLimit -ne 2) {
+    if ($trustedReport.ValidationTimeoutSeconds -ne 15 -or $trustedReport.ValidationLogLineLimit -ne 2) {
         throw 'Validation timeout and log limits were not written to the report.'
     }
 

From a05e23d7fe0dd0acf5a641ad37b3ccbdc74bbde0 Mon Sep 17 00:00:00 2001
From: Thibault <Thibault.ahn@gmail.com>
Date: Sun, 21 Jun 2026 12:28:22 +0200
Subject: [PATCH 11/11] Document finance bias evaluator

---
 README.md | 48 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 528b6b1..9ca25a3 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 [![Script Validation](https://github.com/Tibo2403/Scripting/actions/workflows/script-validation.yml/badge.svg)](https://github.com/Tibo2403/Scripting/actions/workflows/script-validation.yml)
 
-Collection of PowerShell, Bash, and Python scripts for system administration, security checks, Microsoft 365 operations, Linux dependency checks, MCP integrations, and authorized lab or pentest workflows.
+Collection of PowerShell, Bash, and Python scripts for system administration, security checks, Microsoft 365 operations, Linux dependency checks, MCP integrations, finance bias/security review helpers, and authorized lab or pentest workflows.
 
 ## Legal Notice
 
@@ -24,26 +24,28 @@ scripts/
 |   |-- setup_api.sh
 |   `-- stealth_post.sh
 |-- powershell/
-    |-- DiskUsageReport.ps1
-    |-- ExchangeOnlineManagement.ps1
-    |-- Get-SystemInfo.ps1
-    |-- LinkCrawler.ps1
-    |-- ManageServices.ps1
-    |-- Optimize-CodexWorkspace.ps1
-    |-- SecurityCheck.ps1
-    |-- SharePointManagement.ps1
-    |-- TeamsManagement.ps1
-    |-- Test-ScriptSyntax.ps1
-    |-- UserManagement.ps1
-    `-- VMManagement.ps1
+|   |-- DiskUsageReport.ps1
+|   |-- ExchangeOnlineManagement.ps1
+|   |-- Get-SystemInfo.ps1
+|   |-- LinkCrawler.ps1
+|   |-- ManageServices.ps1
+|   |-- Optimize-CodexWorkspace.ps1
+|   |-- SecurityCheck.ps1
+|   |-- SharePointManagement.ps1
+|   |-- TeamsManagement.ps1
+|   |-- Test-ScriptSyntax.ps1
+|   |-- UserManagement.ps1
+|   `-- VMManagement.ps1
 `-- python/
     |-- codex-cost-routing.cmd
     |-- codex_cost_router.py
+    |-- finance_bias_evaluator.py
     |-- litellm-cost-routing.yaml
     |-- Manage-CodexCostRouting.ps1
     |-- mcp_server.py
     |-- README.md
     |-- README_Codex_Cost_Routing.md
+    |-- README_Finance_Bias_Evaluator.md
     `-- requirements.txt
 ```
 
@@ -94,6 +96,12 @@ Run Bash static analysis:
 find scripts/linux -name "*.sh" -print0 | xargs -0 shellcheck --severity=error
 ```
 
+Validate Python syntax:
+
+```bash
+python -m py_compile scripts/python/mcp_server.py scripts/python/codex_cost_router.py scripts/python/finance_bias_evaluator.py
+```
+
 Check Linux dependencies:
 
 ```bash
@@ -148,7 +156,7 @@ Sensitive Linux scripts require either an interactive `AUTHORIZED` confirmation
 
 Use the safe placeholders in `examples/` for lab demos and documentation. Do not commit real targets, credentials, tenant identifiers, scan output, packet captures, or customer data.
 
-## MCP Server
+## Python Tools
 
 The read-only Python MCP server exposes tools to list, search, inspect, and
 validate scripts without executing them. It can also browse documentation and
@@ -167,6 +175,18 @@ The optional Codex cost router in `scripts/python/codex_cost_router.py` can
 compress one-shot prompts and route them through a self-hosted LiteLLM OSS proxy.
 See [`scripts/python/README_Codex_Cost_Routing.md`](scripts/python/README_Codex_Cost_Routing.md).
 
+The finance bias evaluator in `scripts/python/finance_bias_evaluator.py` checks
+technical proposals for finance-oriented security, fairness, auditability, and
+hallucinated-interface risks. Its output is always a strict JSON object with the
+keys `statut`, `confiance`, and `justification_technique`, for example:
+
+```bash
+echo "We use fairness metrics, human review, audit logs, data governance and encryption." | \
+  python scripts/python/finance_bias_evaluator.py
+```
+
+See [`scripts/python/README_Finance_Bias_Evaluator.md`](scripts/python/README_Finance_Bias_Evaluator.md).
+
 ## CI
 
 The `script-validation.yml` workflow checks: