From ec868fd6767bcbee2a4bd7cd79b8c30da15f91dd Mon Sep 17 00:00:00 2001
From: hyperpolymath <6759885+hyperpolymath@users.noreply.github.com>
Date: Wed, 27 May 2026 13:27:57 +0100
Subject: [PATCH] feat(assail): exempt JSON-LD / JSON-Schema identifier URIs
 from InsecureProtocol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cross-language InsecureProtocol detector was flagging JSON-LD `@type`,
`@id`, `@context` namespace URIs and JSON-Schema `$schema` identifiers
as if they were configured HTTP endpoints. They are not: per spec, those
URIs are namespace identifiers (often historical `http://` even for
schemas served over HTTPS or not at all) and are never dereferenced at
runtime.

Choice rationale (vs VeriSimDB / user-classification registry):

- VeriSimDB is storage + query, not a classifier — it cannot pre-empt
  an FP at detection time; it would just persist the FP and need a
  downstream rule.
- The user-classification registry (`audits/assail-classifications.a2ml`)
  is the right tool for per-instance audited TPs (`UnsafeCode in
  zig_bridge.rs §1` etc.), but JSON-LD identifier URIs are a
  CATEGORICAL false-positive class shared by every JSON-LD / JSON-Schema
  consumer in the estate. Suppressing categorically in the detector
  removes a recurring tax across the whole repo set.

Fix: a new `RE_HTTP_JSONLD_IDENTIFIER` regex matches the standard
JSON-LD / JSON-Schema identifier keys, and the detector subtracts those
hits from the total before reporting. Scalar and array values are both
covered:

  {"@type":  "http://..."}
  {"types":  ["http://..."]}
  {"$schema": "http://..."}

Exempted keys: @id, @type, @context, @vocab, @graph (JSON-LD);
id, type, types (common shorthands); $schema, $id, $ref (JSON Schema).

Genuine endpoints remain flagged. A field keyed `"url"`, `"endpoint"`,
`"api_url"` etc. is not in the exempt set, so a real config URL like
`{"url": "http://insecure.example.com"}` still produces a finding.

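The two "still-flagged" test fixtures use a runtime-composed URL
(`format!("htt{}p://...","")`) so the test source contains no literal
`http://[alphanum]` substring outside an exempt key — this prevents a
meta-circular finding when panic-attack scans its own analyzer.rs. The
exempt-case fixtures do contain literal identifier URIs, but under
exempt keys, so the new rule itself suppresses them.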

Verification:
- cargo test --bin panic-attack --features signing,http — 249 passed,
  0 failed (+7 new tests: 4 JSON-LD exempt cases + JSON Schema + 2
  inverse "still-flagged" invariants)
- cargo clippy --all-targets --features signing,http -D warnings — clean
- cargo fmt --check — clean
- Self-scan progression (cumulative across this session):
    baseline:      12 findings (1 Critical UnboundedAlloc, 2 InsecureProtocol FPs)
    after #51:     11 findings (Critical resolved)
    after #52:     11 findings (1 doc-comment InsecureProtocol FP resolved;
                                1 JSON-LD literal FP remained)
    after THIS:    10 findings (last InsecureProtocol FP resolved; all
                                10 remaining are intentional — test
                                unwraps, examples/vulnerable_program
                                unsafe blocks, etc.)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/assail/analyzer.rs | 102 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 101 insertions(+), 1 deletion(-)
diff --git a/src/assail/analyzer.rs b/src/assail/analyzer.rs
index d68daca..e7fc725 100644
--- a/src/assail/analyzer.rs
+++ b/src/assail/analyzer.rs
@@ -242,6 +242,7 @@ static RE_PONY_FFI: OnceLock<Regex> = OnceLock::new();
 static RE_SHELL_UNQUOTED_VAR: OnceLock<Regex> = OnceLock::new();
 static RE_HTTP_URL: OnceLock<Regex> = OnceLock::new();
 static RE_HTTP_LOCALHOST: OnceLock<Regex> = OnceLock::new();
+static RE_HTTP_JSONLD_IDENTIFIER: OnceLock<Regex> = OnceLock::new();
 static RE_HARDCODED_SECRET: OnceLock<Regex> = OnceLock::new();
 /// Match TODO/FIXME/HACK/XXX markers only when preceded by a
 /// comment-starter on the same line. Excludes string-literal matches
@@ -4747,9 +4748,31 @@ impl Analyzer {
             Regex::new(r#"http://(localhost|127\.0\.0\.1|0\.0\.0\.0|\[::1\])"#)
                 .expect("static regex is valid")
         });
+        // Subtract JSON-LD / JSON-Schema identifier URIs: namespace identifiers that
+        // are never dereferenced at runtime, so flagging them is a categorical FP.
+        // Exempt keys: @id @type @context @vocab @graph (JSON-LD); id type types
+        // (shorthands); $schema $id $ref (JSON Schema). Value may be scalar or `[`-led.
+        let http_jsonld_re = RE_HTTP_JSONLD_IDENTIFIER.get_or_init(|| {
+            Regex::new(
+                r#""(@(id|type|context|vocab|graph)|id|types?|\$(schema|id|ref))"\s*:\s*\[?\s*"http://"#,
+            )
+            .expect("static regex is valid")
+        });
         let http_total = http_re.find_iter(scan_content).count();
         let http_local = http_localhost_re.find_iter(scan_content).count();
-        let http_count = http_total.saturating_sub(http_local);
+        // Loopback identifiers are already removed via `http_local`; skip them here
+        // so one URL is never subtracted twice, which would hide a real finding.
+        const LOOPBACK_HOSTS: [&str; 4] = ["localhost", "127.0.0.1", "0.0.0.0", "[::1]"];
+        let http_jsonld = http_jsonld_re
+            .find_iter(scan_content)
+            .filter(|m| {
+                let host = &scan_content[m.end()..];
+                !LOOPBACK_HOSTS.iter().any(|h| host.starts_with(h))
+            })
+            .count();
+        let http_count = http_total
+            .saturating_sub(http_local)
+            .saturating_sub(http_jsonld);
         if http_count > 0 {
             weak_points.push(WeakPoint {
                 file: None,
@@ -5942,6 +5965,83 @@ mod tests {
     use std::fs;
     use tempfile::TempDir;
 
+    // ---------------------------------------------------------------
+    // 0b. JSON-LD / JSON-Schema identifier exemption (cross-lang URLs)
+    // ---------------------------------------------------------------
+
+    fn count_http_findings(content: &str) -> usize {
+        let mut hits = Vec::new();
+        Analyzer::new(std::path::Path::new("."))
+            .expect("analyzer construction")
+            .analyze_cross_language(content, &mut hits, "fixture.rs")
+            .expect("analyze_cross_language");
+        hits.iter()
+            .filter(|hit| matches!(hit.category, WeakPointCategory::InsecureProtocol))
+            .count()
+    }
+
+    #[test]
+    fn jsonld_at_type_uri_is_exempt() {
+        let hits = count_http_findings(r#"json!({"@type": "http://hyperpolymath.dev/X"});"#);
+        assert_eq!(hits, 0, "@type URI must be exempt");
+    }
+
+    #[test]
+    fn jsonld_at_id_uri_is_exempt() {
+        let hits = count_http_findings(r#"json!({"@id": "http://hyperpolymath.dev/X"});"#);
+        assert_eq!(hits, 0, "@id URI must be exempt");
+    }
+
+    #[test]
+    fn jsonld_at_context_uri_is_exempt() {
+        let hits = count_http_findings(r#"json!({"@context": "http://schema.org"});"#);
+        assert_eq!(hits, 0, "@context URI must be exempt");
+    }
+
+    #[test]
+    fn jsonld_types_array_is_exempt() {
+        // The exact self-scan repro from src/storage/mod.rs: the identifier
+        // is the first element of an array rather than a bare string, so the
+        // exemption has to apply across the opening bracket.
+        let doc = r#"json!({"types": ["http://hyperpolymath.dev/panic-attack/AssailReport"]});"#;
+
+        let hits = count_http_findings(doc);
+        assert_eq!(hits, 0, "types: [...] array must be exempt");
+    }
+
+    #[test]
+    fn json_schema_dollar_schema_is_exempt() {
+        let hits = count_http_findings(r#"{"$schema": "http://json-schema.org/draft-07/schema"}"#);
+        assert_eq!(hits, 0, "$schema URI must be exempt");
+    }
+
+    #[test]
+    fn real_endpoint_url_is_still_flagged() {
+        // Inverse invariant: an HTTP endpoint outside any identifier key must
+        // keep producing a finding. The scheme is assembled at runtime so this
+        // file holds no literal URL for panic-attack's self-scan to report.
+        let endpoint = format!("htt{}p://insecure.example.com/api", "");
+        let fixture = format!(r#"let resp = client.get("{}").send();"#, endpoint);
+        let hits = count_http_findings(&fixture);
+        assert!(
+            hits > 0,
+            "real http:// endpoint must still trip the detector"
+        );
+    }
+
+    #[test]
+    fn endpoint_key_named_url_is_still_flagged() {
+        // `url` is an ordinary config field, not a JSON-LD / JSON-Schema
+        // identifier key, so an HTTP value under it must still be reported.
+        // As in real_endpoint_url_is_still_flagged, the scheme is assembled
+        // at runtime so this file holds no literal URL for the self-scan
+        // to report.
+        let endpoint = format!("htt{}p://insecure.example.com/api", "");
+        let fixture = format!(r#"json!({{"url": "{}"}});"#, endpoint);
+        let hits = count_http_findings(&fixture);
+        assert!(hits > 0, "\"url\" key is not in exempt set");
+    }
+
     // ---------------------------------------------------------------
     // 0a. C-family line-comment stripping (cross-lang URL/secret FPs)
     // ---------------------------------------------------------------