diff --git a/.gitignore b/.gitignore index 2100afd..f6391ea 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.test *.out *.ndjson +report.html .idea/ .vscode/ diff --git a/report-dark.png b/report-dark.png new file mode 100644 index 0000000..eae6573 Binary files /dev/null and b/report-dark.png differ diff --git a/report-light-confidence.png b/report-light-confidence.png new file mode 100644 index 0000000..b82359b Binary files /dev/null and b/report-light-confidence.png differ diff --git a/report-light-ecosystems.png b/report-light-ecosystems.png new file mode 100644 index 0000000..e8db030 Binary files /dev/null and b/report-light-ecosystems.png differ diff --git a/report-light-lifecycle.png b/report-light-lifecycle.png new file mode 100644 index 0000000..7a0e02c Binary files /dev/null and b/report-light-lifecycle.png differ diff --git a/report-light-projects.png b/report-light-projects.png new file mode 100644 index 0000000..addada6 Binary files /dev/null and b/report-light-projects.png differ diff --git a/report-light-roots.png b/report-light-roots.png new file mode 100644 index 0000000..396619f Binary files /dev/null and b/report-light-roots.png differ diff --git a/report-light-sources.png b/report-light-sources.png new file mode 100644 index 0000000..df57c22 Binary files /dev/null and b/report-light-sources.png differ diff --git a/report-light-versions.png b/report-light-versions.png new file mode 100644 index 0000000..bd78324 Binary files /dev/null and b/report-light-versions.png differ diff --git a/report-light.png b/report-light.png new file mode 100644 index 0000000..e8db030 Binary files /dev/null and b/report-light.png differ diff --git a/scripts/generate-report.py b/scripts/generate-report.py new file mode 100755 index 0000000..64765e1 --- /dev/null +++ b/scripts/generate-report.py @@ -0,0 +1,1596 @@ +#!/usr/bin/env python3 +"""Generate an HTML report from bumblebee inventory.ndjson. + +Requires Python 3.10+. + +Usage: + python3 scripts/generate-report.py # defaults + python3 scripts/generate-report.py inventory.ndjson # explicit input + python3 scripts/generate-report.py -o report.html # explicit output + python3 scripts/generate-report.py scan.ndjson -o out.html # both +""" + +import json +import collections +import html as html_mod +import sys +import argparse + +# ── Ecosystem colours ── +ECO_COLORS = { + "go": "#00ADD8", + "npm": "#CB3837", + "pypi": "#306998", + "rubygems": "#CC342D", + "browser-extension": "#FF9500", + "editor-extension": "#8B5CF6", + "mcp": "#10B981", + "unknown": "#6B7280", +} + +# ── Helpers ── + +def esc(s): + return html_mod.escape(str(s)) + + +def bar_html(pct, color, min_pct=2.5): + pct = max(pct, min_pct) + return ( + f'
' + f'
" + f'' + f"
" + ) + + +# ── Main generator ── + +def generate_report(ndjson_path: str, output_path: str) -> None: + # ── Load data ── + packages: list[dict] = [] + summary: dict | None = None + + with open(ndjson_path) as f: + for line in f: + r = json.loads(line) + if r["record_type"] == "package": + packages.append(r) + elif r["record_type"] == "scan_summary": + summary = r + + if not summary: + print("Error: no scan_summary record found", file=sys.stderr) + sys.exit(1) + + if not packages: + print("Error: no package records found", file=sys.stderr) + sys.exit(1) + + # ── Compute aggregates ── + ecosystems = collections.Counter() + source_types = collections.Counter() + projects = collections.Counter() + confidence_levels = collections.Counter() + direct_deps = 0 + lifecycle_script_pkgs: list[dict] = [] + unique_by_eco: dict[str, set[str]] = collections.defaultdict(set) + multi_version: dict[tuple, dict[str, list]] = collections.defaultdict( + lambda: collections.defaultdict(list) + ) + + for p in packages: + eco = p.get("ecosystem", "unknown") + ecosystems[eco] += 1 + source_types[p.get("source_type", "unknown")] += 1 + + proj = p.get("project_path", "unknown") + proj = proj.replace("/Users/", "~/").replace("/home/", "~/") + projects[proj] += 1 + + confidence_levels[p.get("confidence", "unknown")] += 1 + + if p.get("direct_dependency"): + direct_deps += 1 + + if p.get("has_lifecycle_scripts"): + lifecycle_script_pkgs.append( + { + "name": p.get("package_name"), + "ecosystem": eco, + "project": proj, + "scripts": p.get("lifecycle_scripts", []), + } + ) + + unique_by_eco[eco].add(p.get("normalized_name", "")) + + name = p.get("normalized_name", "") + ver = p.get("version", "?") + multi_version[(eco, name)][ver].append(proj) + + multi_pkgs = {k: v for k, v in multi_version.items() if len(v) > 1} + top_multi = sorted(multi_pkgs.items(), key=lambda x: -len(x[1]))[:25] + eco_order = [e for e, _ in ecosystems.most_common()] + + # ── Group scan roots by kind ── + root_kinds_map: dict[str, list[str]] = collections.OrderedDict() + for root in summary["roots"]: + rk = root["kind"] + path = root["path"].replace("/Users/", "~/").replace("/home/", "~/") + root_kinds_map.setdefault(rk, []).append(path) + + # ── Build table rows ── + + # Ecosystems + max_eco = max(ecosystems.values()) + eco_rows = "" + for eco in eco_order: + cnt = ecosystems[eco] + unique = len(unique_by_eco[eco]) + pct = cnt / max_eco * 100 + color = ECO_COLORS.get(eco, "#6B7280") + eco_rows += f""" + + {esc(eco)} + {cnt:,} + {unique:,} + {bar_html(pct, color)} + """ + + # Source types + src_rows = "" + max_src = max(source_types.values()) + for src, cnt in source_types.most_common(): + pct = cnt / max_src * 100 + src_rows += f""" + + {esc(src)} + {cnt:,} + {bar_html(pct, "#64748b")} + """ + + # Confidence + conf_rows = "" + max_conf = max(confidence_levels.values()) + conf_colors = {"high": "#10b981", "medium": "#f59e0b", "low": "#ef4444"} + for lvl, cnt in confidence_levels.most_common(): + color = conf_colors.get(lvl, "#6B7280") + pct = cnt / max_conf * 100 + conf_rows += f""" + + {esc(lvl)} + {cnt:,} + {cnt / len(packages) * 100:.1f}% + {bar_html(pct, color)} + """ + + # Top projects + proj_rows = "" + max_proj = max(projects.values()) + for proj, cnt in projects.most_common(25): + pct = cnt / max_proj * 100 + proj_rows += f""" + + + {esc(proj)} + + {cnt:,} + {bar_html(pct, "#d97706")} + """ + + # Multi-version + multi_rows = "" + for (eco, name), versions in top_multi: + ver_list = sorted(versions.keys())[:3] + ver_str = ", ".join(ver_list) + if len(versions) > 3: + ver_str += f" … +{len(versions) - 3} more" + color = ECO_COLORS.get(eco, "#6B7280") + multi_rows += f""" + + {esc(eco)} + {esc(name)} + {len(versions)} + {esc(ver_str)} + """ + + # Lifecycle scripts + lifecycle_rows = "" + for pkg in sorted(lifecycle_script_pkgs, key=lambda x: x["name"]): + color = ECO_COLORS.get(pkg["ecosystem"], "#6B7280") + scripts_html = " ".join( + f'{esc(s)}' for s in pkg["scripts"] + ) + lifecycle_rows += f""" + + {esc(pkg["ecosystem"])} + {esc(pkg["name"])} + {scripts_html} + {esc(pkg["project"])} + """ + + # Scan roots + root_icons = { + "user_package_root": "📦", + "editor_extension_root": "🧩", + "mcp_config_root": "🔌", + "browser_extension_root": "🌐", + "homebrew_root": "🍺", + } + scan_roots_html = "" + for rk, paths in root_kinds_map.items(): + icon = root_icons.get(rk, "📁") + items = "".join(f"
  • {esc(p)}
  • " for p in paths) + scan_roots_html += f""" +
    +
    + {icon} + {esc(rk.replace('_', ' ').title())} + {len(paths)} +
    + +
    """ + + duration_s = summary["duration_ms"] / 1000 + + # ── Assemble HTML ── + html = f""" + + + + +Bumblebee Inventory Report — {esc(summary["endpoint"]["hostname"])} + + + + + + + + +
    + + + + + + + +
    + + +
    +
    + + + + Inventory Report +
    +

    Bumblebee Inventory

    +
    + {esc(summary["endpoint"]["hostname"])} + · + {esc(summary["endpoint"]["os"])}/{esc(summary["endpoint"]["arch"])} + · + {esc(summary["scan_time"][:19].replace("T", " "))} + · + {duration_s:.1f}s + · + profile: {esc(summary["profile"])} +
    +
    + + +
    +
    + {len(packages):,} + Total Packages +
    +
    + {sum(len(v) for v in unique_by_eco.values()):,} + Unique Names +
    +
    + {len(ecosystems)} + Ecosystems +
    +
    + {len(projects)} + Projects +
    +
    + {direct_deps:,} + Direct Deps +
    +
    + {len(lifecycle_script_pkgs)} + Lifecycle Scripts +
    +
    + {len(multi_pkgs)} + Multi-Version +
    +
    + {summary["files_considered"]:,} + Files Scanned +
    +
    + + + + +
    +
    + 01 +
    +

    Packages by Ecosystem

    + +
    +
    +
    +

    What’s on this machine — a breakdown of every package discovered, grouped by language and runtime.

    +
    + + {eco_rows} +
    EcosystemTotalUniqueDistribution
    +
    + + +
    +
    + 02 +
    +

    Lifecycle Scripts

    + +
    +
    +
    +

    What’s dangerous — packages that run arbitrary code when installed or updated.

    +
    + + These packages execute arbitrary code at install time (preinstall, postinstall, prepare). Review them for supply-chain risk. +
    +
    + + {lifecycle_rows} +
    EcosystemPackageScriptsProject
    +
    + + +
    +
    + 03 +
    +

    Version Sprawl

    + +
    +
    +
    +

    What’s outdated — packages pinned to many different versions across projects, increasing patching burden.

    +
    + + {multi_rows} +
    EcosystemPackageVersionsSample
    +
    + + +
    +
    + 04 +
    +

    Top Projects

    + +
    +
    +
    +

    Where complexity concentrates — projects with the deepest dependency trees.

    +
    + + {proj_rows} +
    ProjectPackagesDistribution
    +
    + + +
    +
    + 05 +
    +

    Confidence Levels

    +
    +
    +
    +

    How reliable is this data — detection confidence assigned to each package record.

    +
    + + {conf_rows} +
    LevelCountShareDistribution
    +
    + + +
    +
    + 06 +
    +

    Detection Sources

    + +
    +
    +
    +

    How packages were found — lockfiles, module caches, manifests, and extension metadata.

    +
    + + {src_rows} +
    SourceCountDistribution
    +
    + + +
    +
    + 07 +
    +

    Scan Roots

    + +
    +
    +
    +

    Reference — every directory bumblebee crawled during this scan.

    +
    {scan_roots_html}
    +
    + + + + + + + +
    + + + +""" + + with open(output_path, "w") as f: + f.write(html) + print(f"Report written to {output_path} ({len(html):,} bytes)") + + +# ── CLI entry point ── +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Generate an HTML report from bumblebee inventory.ndjson", + ) + parser.add_argument( + "input", + nargs="?", + default="inventory.ndjson", + help="Path to inventory.ndjson (default: inventory.ndjson)", + ) + parser.add_argument( + "-o", + "--output", + default="report.html", + help="Output HTML file path (default: report.html)", + ) + args = parser.parse_args() + generate_report(args.input, args.output)