diff --git a/artifact-preview-cache-guard/README.md b/artifact-preview-cache-guard/README.md new file mode 100644 index 00000000..79ac8ed4 --- /dev/null +++ b/artifact-preview-cache-guard/README.md @@ -0,0 +1,20 @@ +# Artifact Preview Cache Guard + +This module covers a metadata-aware preview and versioning slice of SCIBASE issue #14. + +It evaluates hosted scientific artifacts after uploads or version changes and decides whether previews can be reused, must be regenerated, or should be held because they could expose sensitive fields. The guard keeps spreadsheet previews, notebook previews, thumbnails, and metadata cards aligned with artifact hashes, upload versions, schema versions, and FAIR metadata. + +## What It Does + +- Classifies datasets, code, images, media, models, and supplementary artifacts. +- Detects stale previews when content hashes, upload versions, schema versions, or metadata digests drift. +- Blocks previews that expose sensitive fields such as participant IDs, emails, geolocation, or patient data. +- Produces dataset column diff summaries for upload version changes. +- Emits preview regeneration plans, FAIR metadata warnings, and a deterministic audit digest. + +## Run + +```bash +node artifact-preview-cache-guard/test.js +node artifact-preview-cache-guard/demo.js +``` diff --git a/artifact-preview-cache-guard/acceptance-notes.md b/artifact-preview-cache-guard/acceptance-notes.md new file mode 100644 index 00000000..a01d2a9b --- /dev/null +++ b/artifact-preview-cache-guard/acceptance-notes.md @@ -0,0 +1,30 @@ +# Acceptance Notes + +## Review Scenarios + +1. Current preview reuse + - Preview kind, artifact hash, upload version, schema version, and metadata digest match the artifact. + - The result is ready with a stable audit digest. + +2. Stale preview regeneration + - A new artifact version changes the content hash, schema, or metadata snapshot. + - The result requests preview regeneration and reports dataset column differences. + +3. 
"use strict"

const { assessArtifactPreviews } = require("./index")

// Public dataset at v3 whose cached preview was still generated from the v2 upload.
const doseResponseArtifact = {
  id: "dose-response-csv",
  path: "data/dose-response.csv",
  version: "v3",
  hash: "sha256:333",
  schemaVersion: "v3",
  access: "public",
  metadata: {
    title: "Dose response measurements",
    license: "CC-BY-4.0",
    creators: ["North Lab"],
    doi: "10.1234/scibase.demo",
    columns: ["sample_id", "dose", "response", "batch"],
  },
  previousVersion: {
    hash: "sha256:222",
    metadata: { columns: ["sample_id", "dose", "response"] },
  },
  preview: {
    kind: "table",
    generatedFromHash: "sha256:222",
    generatedFromVersion: "v2",
    schemaVersion: "v2",
    metadataDigest: "old",
  },
}

// Restricted roster whose preview redacts participant_id but still shows email.
const participantRosterArtifact = {
  id: "participant-roster",
  path: "data/participants.csv",
  version: "v1",
  hash: "sha256:111",
  schemaVersion: "v1",
  access: "restricted",
  metadata: {
    title: "Participant roster",
    license: "DUA-required",
    creators: ["Clinic Lab"],
    columns: ["participant_id", "email", "cohort"],
  },
  preview: {
    kind: "table",
    generatedFromHash: "sha256:111",
    generatedFromVersion: "v1",
    schemaVersion: "v1",
    metadataDigest: "old",
    redactedFields: ["participant_id"],
  },
}

const report = assessArtifactPreviews({
  artifacts: [doseResponseArtifact, participantRosterArtifact],
})
const { status, artifactCount, refreshCount, blockedCount, previewPlan, artifacts, auditDigest } = report

// Print a short, human-readable summary of the assessment.
console.log("Artifact Preview Cache Guard Demo")
console.log("=================================")
console.log(`status: ${status}`)
console.log(`artifacts checked: ${artifactCount}`)
console.log(`refresh count: ${refreshCount}`)
console.log(`blocked count: ${blockedCount}`)
console.log(`first plan: ${previewPlan[0].action}`)
console.log(`first diff added: ${artifacts[0].diffSummary.addedColumns.join(", ")}`)
console.log(`blocked reason: ${previewPlan[1].reason}`)
console.log(`digest: ${auditDigest.slice(0, 16)}...`)
+ + + Reuse + Preview matches artifact + + + + Regenerate + Hash or version drift + + + + Hold + Sensitive field exposed + + Inputs + content hash, upload version, schema version, metadata digest, redacted fields + Output: preview plan, dataset diff summary, FAIR warnings, stable audit digest + diff --git a/artifact-preview-cache-guard/index.js b/artifact-preview-cache-guard/index.js new file mode 100644 index 00000000..bb225de2 --- /dev/null +++ b/artifact-preview-cache-guard/index.js @@ -0,0 +1,176 @@ +"use strict" + +const crypto = require("node:crypto") +const path = require("node:path") + +function stableStringify(value) { + if (Array.isArray(value)) return `[${value.map(stableStringify).join(",")}]` + if (value && typeof value === "object") { + return `{${Object.keys(value) + .sort() + .map((key) => `${JSON.stringify(key)}:${stableStringify(value[key])}`) + .join(",")}}` + } + return JSON.stringify(value) +} + +function digest(value) { + return crypto.createHash("sha256").update(stableStringify(value)).digest("hex") +} + +function artifactType(filePath) { + const ext = path.extname(filePath || "").toLowerCase() + if ([".csv", ".tsv", ".xlsx", ".parquet", ".json"].includes(ext)) return "dataset" + if ([".ipynb", ".py", ".r", ".jl"].includes(ext)) return "code" + if ([".png", ".jpg", ".jpeg", ".svg", ".tif", ".tiff"].includes(ext)) return "image" + if ([".mp4", ".mov", ".avi"].includes(ext)) return "media" + if ([".pt", ".onnx", ".pkl", ".safetensors"].includes(ext)) return "model" + return "supplement" +} + +function previewKindFor(type) { + return { + dataset: "table", + code: "notebook-or-code", + image: "thumbnail", + media: "poster-frame", + model: "model-card", + supplement: "metadata-card", + }[type] +} + +function isSensitiveField(field) { + return /patient|subject|email|phone|ssn|dob|birth|address|lat|lon|geo|location|participant/i.test(field) +} + +function compareColumns(previous = [], current = []) { + const prevSet = new Set(previous) + const 
currSet = new Set(current) + return { + added: current.filter((column) => !prevSet.has(column)), + removed: previous.filter((column) => !currSet.has(column)), + } +} + +function normalizeArtifact(artifact) { + return { + id: artifact.id, + path: artifact.path || "", + version: artifact.version || null, + hash: artifact.hash || null, + schemaVersion: artifact.schemaVersion || artifact.metadata?.schemaVersion || null, + access: artifact.access || "private", + metadata: artifact.metadata || {}, + preview: artifact.preview || null, + previousVersion: artifact.previousVersion || null, + } +} + +function evaluateArtifact(artifactInput) { + const artifact = normalizeArtifact(artifactInput) + const type = artifactType(artifact.path) + const expectedPreviewKind = previewKindFor(type) + const blockers = [] + const warnings = [] + const refreshReasons = [] + const metadataDigest = digest(artifact.metadata) + const columns = artifact.metadata.columns || [] + const sensitiveFields = columns.filter(isSensitiveField) + + if (!artifact.id) blockers.push("artifact is missing an id") + if (!artifact.path) blockers.push("artifact is missing a path") + if (!artifact.version) blockers.push(`${artifact.id || "artifact"} is missing an upload version`) + if (!artifact.hash) blockers.push(`${artifact.id || "artifact"} is missing a content hash`) + + for (const field of ["title", "license", "creators"]) { + if (!artifact.metadata[field]) warnings.push(`${artifact.id} metadata is missing ${field}`) + } + if (artifact.access === "public" && !artifact.metadata.doi && !artifact.metadata.persistentId) { + warnings.push(`${artifact.id} public artifact is missing DOI or persistentId metadata`) + } + + if (!artifact.preview) { + refreshReasons.push("preview is missing") + } else { + if (artifact.preview.kind !== expectedPreviewKind) { + refreshReasons.push(`preview kind ${artifact.preview.kind} should be ${expectedPreviewKind}`) + } + if (artifact.preview.generatedFromHash !== artifact.hash) { + 
refreshReasons.push("preview content hash is stale") + } + if (artifact.preview.generatedFromVersion !== artifact.version) { + refreshReasons.push("preview upload version is stale") + } + if (artifact.preview.schemaVersion !== artifact.schemaVersion) { + refreshReasons.push("preview schema version is stale") + } + if (artifact.preview.metadataDigest !== metadataDigest) { + refreshReasons.push("preview metadata snapshot is stale") + } + } + + const redactedFields = new Set(artifact.preview?.redactedFields || []) + const unredactedSensitiveFields = sensitiveFields.filter((field) => !redactedFields.has(field)) + if (unredactedSensitiveFields.length > 0) { + blockers.push(`${artifact.id} preview exposes sensitive fields: ${unredactedSensitiveFields.join(", ")}`) + } + + const previousColumns = artifact.previousVersion?.metadata?.columns || [] + const columnDiff = compareColumns(previousColumns, columns) + const changedHash = artifact.previousVersion?.hash && artifact.previousVersion.hash !== artifact.hash + const diffSummary = { + changedHash: Boolean(changedHash), + addedColumns: columnDiff.added, + removedColumns: columnDiff.removed, + } + + const action = + blockers.length > 0 ? "hold-preview" : refreshReasons.length > 0 ? 
"regenerate-preview" : "reuse-preview" + + return { + id: artifact.id, + path: artifact.path, + type, + expectedPreviewKind, + action, + blockers, + warnings, + refreshReasons, + diffSummary, + metadataDigest, + } +} + +function assessArtifactPreviews(input) { + const artifacts = (input.artifacts || []).map(evaluateArtifact) + const blockers = artifacts.flatMap((artifact) => artifact.blockers) + const refreshCount = artifacts.filter((artifact) => artifact.action === "regenerate-preview").length + const warningCount = artifacts.reduce((count, artifact) => count + artifact.warnings.length, 0) + const previewPlan = artifacts.map((artifact) => ({ + artifactId: artifact.id, + action: artifact.action, + reason: artifact.blockers[0] || artifact.refreshReasons[0] || artifact.warnings[0] || "preview is current", + })) + const fairWarnings = artifacts.flatMap((artifact) => + artifact.warnings.map((warning) => ({ artifactId: artifact.id, warning })), + ) + + const result = { + status: blockers.length > 0 ? "blocked" : refreshCount > 0 ? "needs-refresh" : warningCount > 0 ? "needs-review" : "ready", + artifactCount: artifacts.length, + refreshCount, + blockedCount: artifacts.filter((artifact) => artifact.action === "hold-preview").length, + artifacts, + previewPlan, + fairWarnings, + } + + return { + ...result, + auditDigest: digest(result), + } +} + +module.exports = { + assessArtifactPreviews, +} diff --git a/artifact-preview-cache-guard/requirements-map.md b/artifact-preview-cache-guard/requirements-map.md new file mode 100644 index 00000000..78acbea7 --- /dev/null +++ b/artifact-preview-cache-guard/requirements-map.md @@ -0,0 +1,13 @@ +# Requirements Map + +| Issue #14 requirement | Coverage in this module | +| --- | --- | +| Support datasets, code files, supplementary files, images, videos, and models | Classifies common artifact extensions into dataset, code, image, media, model, or supplement preview lanes. 
"use strict"

const check = require("node:assert/strict")
const { createHash } = require("node:crypto")
const { assessArtifactPreviews } = require("./index")

/** Runs the guard on a single artifact and returns the full report. */
const assessOne = (artifact) => assessArtifactPreviews({ artifacts: [artifact] })

const currentMetadata = {
  title: "Dose response measurements",
  license: "CC-BY-4.0",
  creators: ["North Lab"],
  doi: "10.1234/scibase.demo",
  columns: ["sample_id", "dose", "response"],
  schemaVersion: "v2",
}

// Scenario 1: every preview field matches the artifact, so the preview is reused.
{
  // Key-sorted serialisation of currentMetadata, hashed the same way the module does.
  const serialisedMetadata =
    '{"columns":["sample_id","dose","response"],"creators":["North Lab"],"doi":"10.1234/scibase.demo","license":"CC-BY-4.0","schemaVersion":"v2","title":"Dose response measurements"}'
  const metadataDigest = createHash("sha256").update(serialisedMetadata).digest("hex")

  const report = assessOne({
    id: "artifact-ready",
    path: "data/dose-response.csv",
    version: "v2",
    hash: "sha256:222",
    schemaVersion: "v2",
    access: "public",
    metadata: currentMetadata,
    preview: {
      kind: "table",
      generatedFromHash: "sha256:222",
      generatedFromVersion: "v2",
      schemaVersion: "v2",
      metadataDigest,
    },
  })

  check.equal(report.status, "ready")
  check.equal(report.refreshCount, 0)
  check.equal(report.previewPlan[0].action, "reuse-preview")
  check.match(report.auditDigest, /^[0-9a-f]{64}$/)
}

// Scenario 2: a new upload makes the cached preview stale and adds a column.
{
  const report = assessOne({
    id: "artifact-stale",
    path: "data/results.csv",
    version: "v3",
    hash: "sha256:333",
    schemaVersion: "v3",
    access: "public",
    metadata: {
      title: "Updated measurements",
      license: "CC-BY-4.0",
      creators: ["North Lab"],
      doi: "10.1234/scibase.updated",
      columns: ["sample_id", "dose", "response", "batch"],
    },
    previousVersion: {
      hash: "sha256:222",
      metadata: { columns: ["sample_id", "dose", "response"] },
    },
    preview: {
      kind: "table",
      generatedFromHash: "sha256:222",
      generatedFromVersion: "v2",
      schemaVersion: "v2",
      metadataDigest: "old",
    },
  })
  const [evaluation] = report.artifacts

  check.equal(report.status, "needs-refresh")
  check.equal(report.refreshCount, 1)
  check.ok(evaluation.refreshReasons.includes("preview content hash is stale"))
  check.deepEqual(evaluation.diffSummary.addedColumns, ["batch"])
}

// Scenario 3: an unredacted email column holds the preview.
{
  const report = assessOne({
    id: "artifact-sensitive",
    path: "data/participants.csv",
    version: "v1",
    hash: "sha256:111",
    schemaVersion: "v1",
    access: "restricted",
    metadata: {
      title: "Participant metadata",
      license: "DUA-required",
      creators: ["Clinic Lab"],
      columns: ["participant_id", "email", "cohort"],
    },
    preview: {
      kind: "table",
      generatedFromHash: "sha256:111",
      generatedFromVersion: "v1",
      schemaVersion: "v1",
      metadataDigest: "stale",
      redactedFields: ["participant_id"],
    },
  })

  check.equal(report.status, "blocked")
  check.equal(report.blockedCount, 1)
  check.ok(report.previewPlan[0].reason.includes("email"))
}

// Scenario 4: missing license and creators are surfaced as FAIR warnings.
{
  const report = assessOne({
    id: "artifact-metadata-warning",
    path: "notebooks/analysis.ipynb",
    version: "v1",
    hash: "sha256:notebook",
    schemaVersion: "v1",
    access: "private",
    metadata: { title: "Notebook preview", columns: [] },
    preview: {
      kind: "notebook-or-code",
      generatedFromHash: "sha256:notebook",
      generatedFromVersion: "v1",
      schemaVersion: "v1",
      metadataDigest: "old",
    },
  })
  const warningMentions = (term) => report.fairWarnings.some((item) => item.warning.includes(term))

  check.equal(report.status, "needs-refresh")
  check.ok(warningMentions("license"))
  check.ok(warningMentions("creators"))
}

console.log("artifact-preview-cache-guard tests passed")