#!/usr/bin/env python3
# NOTE(review): this text was recovered from a whitespace-collapsed git diff.
# Patch context carried over from the original diff header:
#   .gitignore (index 2100afd..f6391ea, mode 100644): adds `report.html`
#       after `*.ndjson`, before `.idea/` and `.vscode/`.
#   scripts/generate-report.py: new file, mode 100755,
#       index 0000000..d4ed1e8, hunk @@ -0,0 +1,1607 @@.
"""Generate an HTML report from bumblebee inventory.ndjson.

Requires Python 3.10+.

Usage:
    python3 scripts/generate-report.py                          # defaults
    python3 scripts/generate-report.py inventory.ndjson         # explicit input
    python3 scripts/generate-report.py -o report.html           # explicit output
    python3 scripts/generate-report.py scan.ndjson -o out.html  # both
"""

import json
import collections
import html as html_mod
import sys
import argparse

# ── Ecosystem colours ──
ECO_COLORS = {
    "go": "#00ADD8",
    "npm": "#CB3837",
    "pypi": "#306998",
    "rubygems": "#CC342D",
    "browser-extension": "#FF9500",
    "editor-extension": "#8B5CF6",
    "mcp": "#10B981",
    "unknown": "#6B7280",
}

# ── Helpers ──


def esc(s: object) -> str:
    """Return ``str(s)`` with HTML-special characters escaped.

    Delegates to ``html.escape`` with its default settings, so the result
    is safe to interpolate into element text and quoted attribute values.
    """
    return html_mod.escape(str(s))


def normalize_path(path: str) -> str:
    """Abbreviate a leading ``/Users/`` or ``/home/`` prefix to ``~/``.

    Only a prefix at the very start of *path* is rewritten, and only once.
    The path segment that follows the prefix is kept, so
    ``/Users/alice/proj`` becomes ``~/alice/proj``. Paths that do not start
    with either prefix are returned unchanged.

    The previous implementation chained two unanchored ``str.replace``
    calls, which also rewrote matches in the middle of a path and re-matched
    inside its own output (``/Users/bob/home/docs`` became ``~/bob~/docs``).
    """
    for prefix in ("/Users/", "/home/"):
        if path.startswith(prefix):
            return "~/" + path[len(prefix):]
    return path


# NOTE(review): `bar_html(pct, color, min_pct=2.5)` starts here in the
# original. Its visible prefix is reproduced verbatim below; the f-string
# markup it returns was lost in extraction, so the definition is left
# untouched rather than guessed at:
#     def bar_html(pct, color, min_pct=2.5):
#         pct = max(pct, min_pct)
#         return (
#             f'
" + ) + + +# ── Main generator ── + +def generate_report(ndjson_path: str, output_path: str) -> None: + # ── Load data ── + packages: list[dict] = [] + summary: dict | None = None + + with open(ndjson_path, encoding="utf-8") as f: + for line in f: + if not line.strip(): + continue + r = json.loads(line) + if r["record_type"] == "package": + packages.append(r) + elif r["record_type"] == "scan_summary": + summary = r + + if not summary: + print("Error: no scan_summary record found", file=sys.stderr) + sys.exit(1) + + if not packages: + print("Error: no package records found", file=sys.stderr) + sys.exit(1) + + # ── Compute aggregates ── + ecosystems = collections.Counter() + source_types = collections.Counter() + projects = collections.Counter() + confidence_levels = collections.Counter() + direct_deps = 0 + lifecycle_script_pkgs: list[dict] = [] + unique_by_eco: dict[str, set[str]] = collections.defaultdict(set) + multi_version: dict[tuple[str, str], set[str]] = collections.defaultdict(set) + + for p in packages: + eco = p.get("ecosystem", "unknown") + ecosystems[eco] += 1 + source_types[p.get("source_type", "unknown")] += 1 + + proj = normalize_path(p.get("project_path", "unknown")) + projects[proj] += 1 + + confidence_levels[p.get("confidence", "unknown")] += 1 + + if p.get("direct_dependency"): + direct_deps += 1 + + if p.get("has_lifecycle_scripts"): + lifecycle_script_pkgs.append( + { + "name": p.get("package_name"), + "ecosystem": eco, + "project": proj, + "scripts": p.get("lifecycle_scripts", []), + } + ) + + unique_by_eco[eco].add(p.get("normalized_name", "")) + + name = p.get("normalized_name", "") + ver = p.get("version", "?") + multi_version[(eco, name)].add(ver) + + multi_pkgs = {k: v for k, v in multi_version.items() if len(v) > 1} + top_multi = sorted(multi_pkgs.items(), key=lambda x: -len(x[1]))[:25] + eco_order = [e for e, _ in ecosystems.most_common()] + package_count = len(packages) + + # ── Group scan roots by kind ── + root_kinds_map: dict[str, 
list[str]] = collections.defaultdict(list) + for root in summary["roots"]: + rk = root["kind"] + path = normalize_path(root["path"]) + root_kinds_map[rk].append(path) + + # ── Build table rows ── + + # Ecosystems + max_eco = max(ecosystems.values()) + eco_rows_parts: list[str] = [] + for eco in eco_order: + cnt = ecosystems[eco] + unique = len(unique_by_eco[eco]) + pct = cnt / max_eco * 100 + color = ECO_COLORS.get(eco, "#6B7280") + eco_rows_parts.append(f""" +{esc(src)}{esc(name)}{esc(pkg["name"])}{esc(p)}What’s on this machine — a breakdown of every package discovered, grouped by language and runtime.
+| Ecosystem | Total | Unique | Distribution |
|---|
What’s dangerous — packages that run arbitrary code when installed or updated.
+ +| Ecosystem | Package | Scripts | Project |
|---|
What’s outdated — packages pinned to many different versions across projects, increasing patching burden.
+| Ecosystem | Package | Versions | Sample |
|---|
Where complexity concentrates — projects with the deepest dependency trees.
+| Project | Packages | Distribution |
|---|
How reliable is this data — detection confidence assigned to each package record.
+| Level | Count | Share | Distribution |
|---|
How packages were found — lockfiles, module caches, manifests, and extension metadata.
+| Source | Count | Distribution |
|---|
Reference — every directory bumblebee crawled during this scan.
+