From d69abe39b815dc68740b1b07a642f4d89e277f2b Mon Sep 17 00:00:00 2001
From: Allen Rose <RoseAllenM@gmail.com>
Date: Mon, 20 Apr 2026 21:33:36 -0700
Subject: [PATCH] Add script to bootstrap SQLite & Mongo from files

Signed-off-by: Allen Rose <RoseAllenM@gmail.com>
---
 QUICKSTART.md                                 |  34 +-
 backend/.gitignore                            |   1 +
 backend/docker-compose.yml                    |   1 +
 backend/example.docker-compose.local.yml      |   3 +
 backend/makefile                              |   5 +-
 backend/src/dna/devtools/__init__.py          |   1 +
 backend/src/dna/devtools/bootstrap_dataset.py | 783 ++++++++++++++++++
 .../dna/prodtrack_providers/mock_provider.py  |  88 +-
 backend/tests/providers/test_mock_provider.py |  26 +
 backend/tests/test_bootstrap_dataset.py       | 531 ++++++++++++
 backend/uv.lock                               |   3 +
 sample_dailies_dataset/in_review.json         |   7 +
 sample_dailies_dataset/transcript.json        |   2 +-
 13 files changed, 1462 insertions(+), 23 deletions(-)
 create mode 100644 backend/src/dna/devtools/__init__.py
 create mode 100644 backend/src/dna/devtools/bootstrap_dataset.py
 create mode 100644 backend/tests/test_bootstrap_dataset.py
 create mode 100644 backend/uv.lock
 create mode 100644 sample_dailies_dataset/in_review.json

diff --git a/QUICKSTART.md b/QUICKSTART.md
index 705e9d9c..15f8f4fb 100644
--- a/QUICKSTART.md
+++ b/QUICKSTART.md
@@ -29,7 +29,7 @@ cp example.docker-compose.local.vexa.yml docker-compose.local.vexa.yml
 
 Edit `docker-compose.local.yml` with your credentials.
 
-**Production tracking (ShotGrid):** To run without a ShotGrid seat, set **`PRODTRACK_PROVIDER=mock`** in `docker-compose.local.yml`. The mock provider uses read-only SQLite with pre-seeded data. To use real ShotGrid, set `PRODTRACK_PROVIDER=shotgrid` (or leave it unset) and add `SHOTGRID_URL`, `SHOTGRID_SCRIPT_NAME`, and `SHOTGRID_API_KEY`. See [Mock setup](#mock-production-tracking-setup) below for how to refresh or customize the mock data.
+**Production tracking (ShotGrid):** To run without a ShotGrid seat, set **`PRODTRACK_PROVIDER=mock`** in `docker-compose.local.yml`. Set **`MOCK_PRODTRACK_DB_PATH=src/dna/prodtrack_providers/mock_data/mock.db`** to make the active mock DB explicit and easy to change later. The mock provider uses read-only SQLite with pre-seeded data. To use real ShotGrid, set `PRODTRACK_PROVIDER=shotgrid` (or leave it unset) and add `SHOTGRID_URL`, `SHOTGRID_SCRIPT_NAME`, and `SHOTGRID_API_KEY`. See [Mock setup](#mock-production-tracking-setup) below for how to refresh or customize the mock data.
 
 **LLM provider:** Set `LLM_PROVIDER` to choose which backend LLM integration to use.
 
@@ -140,6 +140,7 @@ The React app will be available at `http://localhost:5173`.
 | `SHOTGRID_API_KEY` | Yes\* | - | ShotGrid API key (required when using ShotGrid) |
 | `SHOTGRID_SCRIPT_NAME` | Yes\* | - | ShotGrid script name (required when using ShotGrid) |
 | `PRODTRACK_PROVIDER` | No | `shotgrid` | `shotgrid` or `mock`; set to `mock` to use the read-only mock DB without ShotGrid |
+| `MOCK_PRODTRACK_DB_PATH` | No | bundled `mock.db` | Path to the SQLite DB used when `PRODTRACK_PROVIDER=mock` |
 | `MONGODB_URL` | No | `mongodb://mongo:27017` | MongoDB connection string |
 | `STORAGE_PROVIDER` | No | `mongodb` | Storage provider type |
 | `VEXA_API_KEY` | Yes | - | API key for Vexa transcription service |
@@ -257,12 +258,13 @@ The DNA API serves as the central hub:
 
 ## Mock Production Tracking Setup
 
-When you set **`PRODTRACK_PROVIDER=mock`**, the backend uses a read-only mock provider backed by SQLite (`backend/src/dna/prodtrack_providers/mock_data/mock.db`). The app runs normally with this data so you can develop and test the UI without a ShotGrid seat.
+When you set **`PRODTRACK_PROVIDER=mock`**, the backend uses a read-only mock provider backed by SQLite. If `MOCK_PRODTRACK_DB_PATH` is unset, the bundled `mock.db` is used; setting **`MOCK_PRODTRACK_DB_PATH=src/dna/prodtrack_providers/mock_data/mock.db`** in `docker-compose.local.yml` is recommended so the active DB is explicit and easy to change. The app runs normally with this data so you can develop and test the UI without a ShotGrid seat.
 
 ### Using the mock provider
 
-- In `docker-compose.local.yml`, set **`PRODTRACK_PROVIDER=mock`**. You do not need to set any ShotGrid variables when using the mock.
+- In `docker-compose.local.yml`, set **`PRODTRACK_PROVIDER=mock`** and **`MOCK_PRODTRACK_DB_PATH=src/dna/prodtrack_providers/mock_data/mock.db`**.
 - The mock provider is used only when explicitly set; there is no automatic fallback if ShotGrid credentials are missing.
+- To switch the backend to a bootstrapped local DB, change `MOCK_PRODTRACK_DB_PATH` to `/app/.local/mock.db`.
 
 ### Refreshing or customizing mock data from ShotGrid
 
@@ -283,10 +285,34 @@ docker-compose -f docker-compose.yml -f docker-compose.local.yml run --rm api \
   --api-key 'YOUR_API_KEY'
 ```
 
-- This overwrites `mock_data/mock.db` with entities (projects, users, shots, assets, tasks, versions, playlists, notes) from the given ShotGrid project.
+- To bootstrap sample review data, first start Mongo, then run `bootstrap_dataset`.
+- By default, `bootstrap_dataset` writes to `backend/.local/mock.db`, so it does not modify the checked-in mock DB.
+- If you want the backend to use that bootstrapped DB, set `MOCK_PRODTRACK_DB_PATH=/app/.local/mock.db` in `docker-compose.local.yml`.
+- `seed-mock-db` overwrites `mock_data/mock.db` with entities (projects, users, shots, assets, tasks, versions, playlists, notes) from the given ShotGrid project.
 - Use `--skip-thumbnails` to skip downloading version thumbnails (faster seed; thumbnails will not work after signed URLs expire).
 - Without `--skip-thumbnails`, thumbnails are downloaded to `mock_data/thumbnails/` and served by the API at `/api/mock-thumbnails/{version_id}` so they keep working after ShotGrid signed URLs expire.
 
+Example sample bootstrap workflow:
+
+```bash
+cd backend
+
+# Start only Mongo first
+make start-mongo
+
+# See available arguments
+python -m dna.devtools.bootstrap_dataset --help
+
+# Preview the import without writing anything
+python -m dna.devtools.bootstrap_dataset ../sample_dailies_dataset --dry-run
+
+# Seed SQLite + Mongo using the default local SQLite output
+python -m dna.devtools.bootstrap_dataset ../sample_dailies_dataset
+
+# Then start the full backend stack so the API comes up with the seeded data
+make start-local
+```
+
 The mock provider is **read-only**: it does not write to ShotGrid or to the SQLite file at runtime. Writes such as publishing notes will raise an error when using the mock provider.
 
 ## Docker Compose Files
diff --git a/backend/.gitignore b/backend/.gitignore
index e30b5633..04e870fe 100644
--- a/backend/.gitignore
+++ b/backend/.gitignore
@@ -42,3 +42,4 @@ backend/docker-compose.vexa.yml
 # Local environment files
 docker-compose.local.yml
 docker-compose.local.vexa.yml
+.local/
diff --git a/backend/docker-compose.yml b/backend/docker-compose.yml
index 5be99118..ac5772ca 100644
--- a/backend/docker-compose.yml
+++ b/backend/docker-compose.yml
@@ -29,6 +29,7 @@ services:
       - ./pytest.ini:/app/pytest.ini
       - ./.coveragerc:/app/.coveragerc
       - ./htmlcov:/app/htmlcov
+      - ./.local:/app/.local
     environment:
       - PYTHONUNBUFFERED=1
       - SHOTGRID_URL=https://your-shotgrid-url.com
diff --git a/backend/example.docker-compose.local.yml b/backend/example.docker-compose.local.yml
index b0095bb4..f9a27f88 100644
--- a/backend/example.docker-compose.local.yml
+++ b/backend/example.docker-compose.local.yml
@@ -1,6 +1,8 @@
 # Local development override. Copy to docker-compose.local.yml and fill in secrets.
 # Auth: AUTH_PROVIDER=none uses the noop provider (sign in with any email; no token validation).
 # Prodtrack: Set PRODTRACK_PROVIDER=mock to use the read-only mock (no ShotGrid needed).
+# Mock DB path: change MOCK_PRODTRACK_DB_PATH to /app/.local/mock.db if you
+# want the backend to use a locally bootstrapped DB instead of the checked-in fixture.
 # To use real ShotGrid, set PRODTRACK_PROVIDER=shotgrid and SHOTGRID_URL, SHOTGRID_SCRIPT_NAME, SHOTGRID_API_KEY.
 services:
   api:
@@ -10,6 +12,7 @@ services:
       - SHOTGRID_API_KEY=************
       - SHOTGRID_SCRIPT_NAME=DNA_local_testing
       - PRODTRACK_PROVIDER=mock
+      - MOCK_PRODTRACK_DB_PATH=src/dna/prodtrack_providers/mock_data/mock.db
       - VEXA_API_KEY=**********
       - VEXA_API_URL=http://vexa:8056
       - OPENAI_API_KEY=your-openai-api-key
diff --git a/backend/makefile b/backend/makefile
index 9c2986ce..bdade8b9 100644
--- a/backend/makefile
+++ b/backend/makefile
@@ -56,4 +56,7 @@ format-python: venv-lint
 	.venv-lint/bin/isort src/ tests/
 
 seed-mock-db:
-	$(DOCKER_COMPOSE) -f docker-compose.yml -f docker-compose.local.yml run --rm api python -m dna.prodtrack_providers.mock_data.seed_db --project-id 124 --url https://aswf.shotgrid.autodesk.com --script-name DNA_local_testing --api-key '$(SHOTGRID_API_KEY)'
\ No newline at end of file
+	$(DOCKER_COMPOSE) -f docker-compose.yml -f docker-compose.local.yml run --rm api python -m dna.prodtrack_providers.mock_data.seed_db --project-id 124 --url https://aswf.shotgrid.autodesk.com --script-name DNA_local_testing --api-key '$(SHOTGRID_API_KEY)'
+
+start-mongo:
+	$(DOCKER_COMPOSE) -f docker-compose.yml -f docker-compose.local.yml up -d mongo
diff --git a/backend/src/dna/devtools/__init__.py b/backend/src/dna/devtools/__init__.py
new file mode 100644
index 00000000..450264d1
--- /dev/null
+++ b/backend/src/dna/devtools/__init__.py
@@ -0,0 +1 @@
+"""Developer tooling utilities for backend workflows."""
diff --git a/backend/src/dna/devtools/bootstrap_dataset.py b/backend/src/dna/devtools/bootstrap_dataset.py
new file mode 100644
index 00000000..63e4a06c
--- /dev/null
+++ b/backend/src/dna/devtools/bootstrap_dataset.py
@@ -0,0 +1,783 @@
+"""Bootstrap a standalone demo dataset into local development stores.
+
+This script seeds:
+- the mock prodtrack SQLite database used by the mock provider
+- MongoDB collections used by /generate-note and transcript viewing
+
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import hashlib
+import json
+import os
+import re
+import sqlite3
+import sys
+from dataclasses import dataclass, field
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Any, Optional
+
+from pymongo import AsyncMongoClient
+
+from dna.models.stored_segment import generate_segment_id
+
+BACKEND_ROOT = Path(__file__).resolve().parents[3]
+
+
+DEFAULT_SQLITE_PATH = BACKEND_ROOT / ".local" / "mock.db"
+SCHEMA_PATH = (
+    Path(__file__).resolve().parents[1]
+    / "prodtrack_providers"
+    / "mock_data"
+    / "schema.sql"
+)
+
+
+@dataclass(slots=True)
+class DatasetPlan:  # Everything parsed from one dataset directory, ready to be seeded.
+    dataset_name: str  # session id, or the directory name when session.json has none
+    dataset_path: Path
+    project_id: int  # derived via _stable_id from the session id and project code/name
+    project_code: str  # "DEMO" when session.json has no project code
+    project_name: str  # "Demo Project" when session.json has no project name
+    playlist: DatasetPlaylist  # forward refs are fine: annotations are lazy (__future__)
+    users: list[DatasetUser]
+    shots: list[DatasetShot]
+    tasks: list[DatasetTask]
+    versions: list[DatasetVersion]
+    segments: list[DatasetSegment]
+    in_review_version_id: int  # version id of the last event listed in in_review.json
+    sample_user_email: str  # used in the example payload and the Mongo user_settings doc
+    warnings: list[str] = field(default_factory=list)  # printed by _format_plan_summary
+
+
+@dataclass(slots=True)
+class DatasetPlaylist:  # The single playlist seeded for a dataset session.
+    id: int
+    code: str  # _build_dataset_plan uses the session id as the playlist code
+    description: str
+    created_at: str  # ISO-8601 UTC string built from session.json date_utc
+    updated_at: str  # same value as created_at when built by _build_dataset_plan
+    version_ids: list[int]  # one entry per review_set item, in review_set order
+
+
+@dataclass(slots=True)
+class DatasetSegment:  # One transcript utterance attached to a playlist/version pair.
+    segment_id: str  # from generate_segment_id(playlist id, version id, start time)
+    playlist_id: int
+    version_id: int
+    speaker: str  # "Unknown" when the utterance has no speaker
+    text: str  # utterance text with surrounding whitespace stripped
+    absolute_start_time: str  # session start + utterance offset, ISO-8601 UTC string
+    absolute_end_time: str  # next utterance's start, or start + 5 s as a fallback
+
+
+@dataclass(slots=True)
+class DatasetShot:  # Shot row that _seed_sqlite writes to the `shots` table.
+    id: int
+    name: str  # falls back to the review_set item name when the entity has no name
+    description: str  # _build_dataset_plan copies the version's description here
+
+
+@dataclass(slots=True)
+class DatasetTask:  # Task row that _seed_sqlite writes to the `tasks` table.
+    id: int
+    name: str  # "Review" when the source task has no name
+    status: str  # taken from the version's sg_status_list, "rev" when missing
+    pipeline_step_name: str  # task "step", else task "name", else "Review"
+    entity_type: str  # always "Shot" when built by _build_dataset_plan
+    entity_id: int  # id of the DatasetShot this task belongs to
+
+
+@dataclass(slots=True)
+class DatasetUser:  # Synthetic user built from a participant or version author name.
+    id: int
+    name: str
+    email: str  # "<_slugify(name)>@example.com"
+    login: str  # _slugify(name)
+
+
+@dataclass(slots=True)
+class DatasetVersion:  # Version row that _seed_sqlite writes to the `versions` table.
+    id: int  # synthetic id from _stable_id
+    source_id: int  # raw "id" from shotgrid_data.json, or 0 when absent
+    code: str  # stored in the `name` column by _seed_sqlite
+    description: str
+    status: str
+    created_at: Optional[str]
+    updated_at: Optional[str]  # _build_dataset_plan fills this from created_at as well
+    user_id: Optional[int]
+    shot_id: Optional[int]
+    task_id: Optional[int]
+    thumbnail: Optional[str]  # always None when built by _build_dataset_plan
+    movie_path: Optional[str] = None
+    frame_path: Optional[str] = None
+
+
+@dataclass(slots=True)
+class InReviewEvent:  # One in_review.json entry: a timestamp plus a review item name.
+    ts: str  # "H:M:S" offset string, parsed with _parse_hms
+    review_item: str  # _build_dataset_plan requires it to be in session.json review_set
+
+
+def _assign_utterances_to_in_review_events(  # map each utterance to the active event
+    utterances: list[dict[str, Any]], events: list[InReviewEvent]
+) -> list[tuple[str, int, dict[str, Any]]]:  # (review_item, utterance index, utterance)
+    event_offsets = [_parse_hms(event.ts) for event in events]
+    assignments: list[tuple[str, int, dict[str, Any]]] = []
+
+    event_index = -1  # -1 means no in-review event has started yet
+    for utterance_index, utterance in enumerate(utterances):
+        utterance_ts = utterance.get("ts")
+        if not isinstance(utterance_ts, str) or not utterance_ts:
+            raise ValueError(
+                f"transcript.json utterance at index {utterance_index} is missing a valid 'ts'."
+            )
+
+        utterance_offset = _parse_hms(utterance_ts)
+        while (
+            event_index + 1 < len(event_offsets)
+            and event_offsets[event_index + 1] <= utterance_offset
+        ):
+            event_index += 1  # cursor only moves forward; it never rewinds
+
+        if event_index < 0:
+            continue  # spoken before the first event: not assigned to any review item
+
+        assignments.append(
+            (events[event_index].review_item, utterance_index, utterance)
+        )
+
+    return assignments
+
+
+def _build_dataset_plan(dataset_path: Path) -> DatasetPlan:  # parse + validate dataset
+    session_path = dataset_path / "session.json"
+    shotgrid_path = dataset_path / "shotgrid_data.json"
+    transcript_path = dataset_path / "transcript.json"
+
+    for required_path in (session_path, shotgrid_path, transcript_path):
+        if not required_path.exists():
+            raise FileNotFoundError(f"Required dataset file not found: {required_path}")
+
+    session = json.loads(session_path.read_text(encoding="utf-8"))
+    shotgrid_data = json.loads(shotgrid_path.read_text(encoding="utf-8"))
+    transcript_data = json.loads(transcript_path.read_text(encoding="utf-8"))
+    in_review_events = _load_in_review_events(dataset_path)
+
+    warnings: list[str] = []  # nothing in this function appends to it yet
+
+    session_id = str(session.get("session_id") or dataset_path.name)
+    transcript_session_id = transcript_data.get("session_id")
+    if transcript_session_id and transcript_session_id != session_id:
+        raise ValueError(
+            f"session.json session_id ({session_id}) does not match transcript.json session_id ({transcript_session_id})."
+        )
+
+    project = session.get("project") or {}
+    project_code = project.get("code") or "DEMO"
+    project_name = project.get("name") or "Demo Project"
+    project_id = _stable_id(session_id, "project", f"{project_code}:{project_name}")
+
+    version_rows = shotgrid_data.get("versions") or []
+    versions_by_review_name = {  # "entity" may be an explicit JSON null, hence `or {}`
+        (row.get("entity") or {}).get("name"): row
+        for row in version_rows
+        if (row.get("entity") or {}).get("name")
+    }
+
+    review_set = session.get("review_set") or []
+    if not review_set:
+        raise ValueError("session.json does not contain a review_set.")
+    for event in in_review_events:
+        if event.review_item not in review_set:
+            raise ValueError(
+                f"in_review.json review_item {event.review_item} is not present in session.json review_set."
+            )
+
+    session_dt_raw = session.get("date_utc")
+    if not session_dt_raw:
+        raise ValueError("session.json does not contain date_utc.")
+    session_dt = datetime.fromisoformat(session_dt_raw.replace("Z", "+00:00"))
+
+    utterance_assignments = _assign_utterances_to_in_review_events(
+        transcript_data.get("utterances") or [], in_review_events
+    )
+
+    users_by_name: dict[str, DatasetUser] = {}
+
+    def ensure_user(name: str, hint: str) -> DatasetUser:  # get-or-create, keyed by name
+        existing = users_by_name.get(name)
+        if existing:
+            return existing
+        user = DatasetUser(
+            id=_stable_id(session_id, "user", hint),
+            name=name,
+            email=f"{_slugify(name)}@example.com",
+            login=_slugify(name),
+        )
+        users_by_name[name] = user
+        return user
+
+    for participant in session.get("participants") or []:
+        participant_name = participant.get("name")
+        if participant_name:
+            ensure_user(participant_name, f"participant:{participant_name}")
+
+    shots: list[DatasetShot] = []
+    tasks: list[DatasetTask] = []
+    versions: list[DatasetVersion] = []
+    segments: list[DatasetSegment] = []
+    version_ids_for_playlist: list[int] = []
+
+    version_id_by_review_name: dict[str, int] = {}
+
+    for review_name in review_set:  # one shot/task/version triple per review_set item
+        version_row = versions_by_review_name.get(review_name)
+        if version_row is None:
+            raise ValueError(
+                f"No version metadata found in shotgrid_data.json for review_set item {review_name}."
+            )
+
+        shot_row = version_row.get("entity") or {}
+        shot_id = _stable_id(session_id, "shot", str(shot_row.get("id") or review_name))
+        task_row = version_row.get("sg_task") or {}
+        task_id = _stable_id(
+            session_id,
+            "task",
+            str(
+                task_row.get("id") or f"{review_name}:{task_row.get('name') or 'task'}"
+            ),
+        )
+        version_id = _stable_id(
+            session_id, "version", str(version_row.get("id") or version_row.get("code"))
+        )
+        version_user_row = version_row.get("user") or {}
+        version_user = None
+        if version_user_row.get("name"):
+            version_user = ensure_user(
+                version_user_row["name"],
+                f"shotgrid-user:{version_user_row.get('id') or version_user_row['name']}",
+            )
+
+        shots.append(
+            DatasetShot(
+                id=shot_id,
+                name=shot_row.get("name") or review_name,
+                description=version_row.get("description") or "",
+            )
+        )
+        tasks.append(
+            DatasetTask(
+                id=task_id,
+                name=task_row.get("name") or "Review",
+                status=version_row.get("sg_status_list") or "rev",
+                pipeline_step_name=task_row.get("step")
+                or task_row.get("name")
+                or "Review",
+                entity_type="Shot",
+                entity_id=shot_id,
+            )
+        )
+        versions.append(
+            DatasetVersion(
+                id=version_id,
+                source_id=int(version_row.get("id") or 0),
+                code=version_row.get("code") or review_name,
+                description=version_row.get("description") or "",
+                status=version_row.get("sg_status_list") or "rev",
+                created_at=version_row.get("created_at"),
+                updated_at=version_row.get("created_at"),
+                user_id=version_user.id if version_user else None,
+                shot_id=shot_id,
+                task_id=task_id,
+                thumbnail=None,
+            )
+        )
+        version_id_by_review_name[review_name] = version_id
+        version_ids_for_playlist.append(version_id)
+
+    if not versions:
+        raise ValueError("No versions could be built from the dataset review_set.")
+
+    playlist_id = _stable_id(session_id, "playlist", session_id)
+    playlist = DatasetPlaylist(
+        id=playlist_id,
+        code=session_id,
+        description=f"Seeded demo dataset for {project_name}",
+        created_at=_isoformat_utc(session_dt),
+        updated_at=_isoformat_utc(session_dt),
+        version_ids=version_ids_for_playlist,
+    )
+
+    final_review_item = in_review_events[-1].review_item  # last event = still in review
+    in_review_version_id = version_id_by_review_name.get(final_review_item)
+    if in_review_version_id is None:
+        raise ValueError(
+            f"No version metadata found for final in_review item {final_review_item}."
+        )
+
+    utterances = transcript_data.get("utterances") or []
+    for review_name, utterance_index, utterance in utterance_assignments:
+        version_id = version_id_by_review_name.get(review_name)
+        if version_id is None:
+            raise ValueError(
+                f"No version metadata found in shotgrid_data.json for review item {review_name}."
+            )
+
+        start_offset = _parse_hms(utterance["ts"])
+        next_offset = None
+        if utterance_index + 1 < len(utterances):
+            next_offset = _parse_hms(utterances[utterance_index + 1]["ts"])
+        if next_offset is None or next_offset <= start_offset:
+            next_offset = start_offset + 5  # last/overlapping utterance: assume 5 seconds
+
+        start_dt = session_dt + timedelta(seconds=start_offset)
+        end_dt = session_dt + timedelta(seconds=next_offset)
+        start_iso = _isoformat_utc(start_dt)
+        end_iso = _isoformat_utc(end_dt)
+        segments.append(
+            DatasetSegment(
+                segment_id=generate_segment_id(playlist.id, version_id, start_iso),
+                playlist_id=playlist.id,
+                version_id=version_id,
+                speaker=utterance.get("speaker") or "Unknown",
+                text=(utterance.get("text") or "").strip(),
+                absolute_start_time=start_iso,
+                absolute_end_time=end_iso,
+            )
+        )
+
+    sample_user = users_by_name.get("Cameron") or next(
+        iter(users_by_name.values()), None
+    )
+    if sample_user is None:
+        sample_user = ensure_user("Demo User", "fallback-demo-user")
+
+    deduped_shots = list({shot.id: shot for shot in shots}.values())  # last one wins
+    deduped_tasks = list({task.id: task for task in tasks}.values())
+    deduped_versions = list({version.id: version for version in versions}.values())
+    deduped_users = list({user.id: user for user in users_by_name.values()}.values())
+
+    return DatasetPlan(
+        dataset_name=session_id,
+        dataset_path=dataset_path,
+        project_id=project_id,
+        project_code=project_code,
+        project_name=project_name,
+        playlist=playlist,
+        users=sorted(deduped_users, key=lambda user: user.name),
+        shots=sorted(deduped_shots, key=lambda shot: shot.name),
+        tasks=sorted(deduped_tasks, key=lambda task: task.name),
+        versions=deduped_versions,
+        segments=segments,
+        in_review_version_id=in_review_version_id,
+        sample_user_email=sample_user.email,
+        warnings=warnings,
+    )
+
+
+def _find_default_dataset_path() -> Optional[Path]:  # first valid dataset dir, or None
+    candidates = [  # probed in this order
+        Path.cwd() / "sample_dailies_dataset",
+        BACKEND_ROOT / "sample_dailies_dataset",
+        BACKEND_ROOT.parent / "sample_dailies_dataset",
+    ]
+
+    dev_datasets_dir = BACKEND_ROOT / "dev_datasets"
+    if dev_datasets_dir.exists():
+        candidates.extend(  # sorted, so subdirectories are tried in path order
+            sorted(path for path in dev_datasets_dir.iterdir() if path.is_dir())
+        )
+
+    seen: set[Path] = set()  # resolved paths already checked
+    for candidate in candidates:
+        resolved = candidate.resolve()
+        if resolved in seen:
+            continue  # same directory reached through a different candidate
+        seen.add(resolved)
+        if _is_dataset_dir(resolved):
+            return resolved
+    return None
+
+
+def _format_plan_summary(plan: DatasetPlan) -> str:  # human-readable report of the plan
+    # Start every version at zero so versions without transcript text are still listed.
+    segment_counts: dict[int, int] = {version.id: 0 for version in plan.versions}
+    for segment in plan.segments:
+        segment_counts[segment.version_id] = (
+            segment_counts.get(segment.version_id, 0) + 1
+        )
+
+    lines = [
+        f"Dataset: {plan.dataset_name}",
+        f"Path: {plan.dataset_path}",
+        f"Project: {plan.project_name} ({plan.project_code})",
+        f"Playlist: {plan.playlist.code} [id={plan.playlist.id}]",
+        f"Users: {len(plan.users)}",
+        f"Shots: {len(plan.shots)}",
+        f"Tasks: {len(plan.tasks)}",
+        f"Versions: {len(plan.versions)}",
+        f"Segments: {len(plan.segments)}",
+        f"Sample user email: {plan.sample_user_email}",
+        f"In-review version id: {plan.in_review_version_id}",
+        "",
+        "Version transcript coverage:",
+    ]
+
+    for version in plan.versions:
+        lines.append(
+            f"- {version.code} [id={version.id}]: {segment_counts.get(version.id, 0)} segments"
+        )
+
+    lines.extend(
+        [
+            "",
+            "Example generate-note payload:",
+            json.dumps(
+                {
+                    "playlist_id": plan.playlist.id,
+                    "version_id": plan.in_review_version_id,
+                    "user_email": plan.sample_user_email,
+                },
+                indent=2,
+            ),
+        ]
+    )
+
+    if plan.warnings:  # the Warnings section is omitted entirely when there are none
+        lines.append("")
+        lines.append("Warnings:")
+        lines.extend(f"- {warning}" for warning in plan.warnings)
+
+    return "\n".join(lines)
+
+
+def _is_dataset_dir(path: Path) -> bool:
+    # A directory counts as a dataset only when every one of these files exists.
+    required_names = (
+        "session.json",
+        "shotgrid_data.json",
+        "transcript.json",
+        "in_review.json",
+    )
+    missing = [name for name in required_names if not (path / name).exists()]
+    return not missing
+
+
+def _isoformat_utc(dt: datetime) -> str:  # naive input is read as UTC, never as local time
+    return dt.replace(tzinfo=dt.tzinfo or timezone.utc).astimezone(timezone.utc).isoformat().replace("+00:00", "Z")
+
+
+def _load_in_review_events(dataset_path: Path) -> list[InReviewEvent]:  # read + validate
+    in_review_path = dataset_path / "in_review.json"
+    if not in_review_path.exists():
+        raise FileNotFoundError(f"Required dataset file not found: {in_review_path}")
+
+    raw_events = json.loads(in_review_path.read_text(encoding="utf-8"))
+    if not isinstance(raw_events, list) or not raw_events:
+        raise ValueError("in_review.json must contain a non-empty list of events.")
+
+    events: list[InReviewEvent] = []
+    previous_offset: Optional[int] = None  # offset of the previous event, in seconds
+    for index, raw_event in enumerate(raw_events):
+        if not isinstance(raw_event, dict):
+            raise ValueError(
+                f"in_review.json event at index {index} must be an object."
+            )
+
+        ts = raw_event.get("ts")
+        review_item = raw_event.get("review_item")
+        if not isinstance(ts, str) or not ts:
+            raise ValueError(
+                f"in_review.json event at index {index} is missing a valid 'ts'."
+            )
+        if not isinstance(review_item, str) or not review_item:
+            raise ValueError(
+                f"in_review.json event at index {index} is missing a valid 'review_item'."
+            )
+
+        offset = _parse_hms(ts)
+        if previous_offset is not None and offset <= previous_offset:
+            raise ValueError(  # equal timestamps are rejected too, not just decreasing ones
+                "in_review.json events must be strictly ordered by ascending timestamp."
+            )
+        previous_offset = offset
+        events.append(InReviewEvent(ts=ts, review_item=review_item))
+
+    return events
+
+
+def _parse_hms(value: str) -> int:  # "H:M:S" -> total seconds; needs exactly three fields
+    h, m, s = value.split(":")
+    return (int(h) * 60 + int(m)) * 60 + int(s)
+
+
+async def _seed_mongo(plan: DatasetPlan) -> None:  # upsert plan data into MongoDB
+    mongo_url = os.getenv("MONGODB_URL", "mongodb://localhost:27017")
+    mongo_db_name = os.getenv("MONGODB_DB", "dna")
+    client: AsyncMongoClient[Any] = AsyncMongoClient(mongo_url)
+    try:
+        db = client[mongo_db_name]
+        now = datetime.now(timezone.utc)  # one timestamp shared by every document written
+
+        await db.playlist_metadata.find_one_and_update(  # one doc per playlist_id
+            {"playlist_id": plan.playlist.id},
+            {
+                "$set": {
+                    "in_review": plan.in_review_version_id,
+                    "transcription_paused": False,
+                },
+                "$setOnInsert": {"playlist_id": plan.playlist.id},
+            },
+            upsert=True,
+        )
+
+        for segment in plan.segments:  # one awaited upsert per segment, sequentially
+            await db.segments.find_one_and_update(
+                {
+                    "segment_id": segment.segment_id,
+                    "playlist_id": segment.playlist_id,
+                    "version_id": segment.version_id,
+                },
+                {
+                    "$set": {
+                        "text": segment.text,
+                        "speaker": segment.speaker,
+                        "absolute_start_time": segment.absolute_start_time,
+                        "absolute_end_time": segment.absolute_end_time,
+                        "updated_at": now,
+                    },
+                    "$setOnInsert": {  # written only when the segment is first created
+                        "created_at": now,
+                        "segment_id": segment.segment_id,
+                        "playlist_id": segment.playlist_id,
+                        "version_id": segment.version_id,
+                    },
+                },
+                upsert=True,
+            )
+
+        await db.user_settings.find_one_and_update(
+            {"user_email": plan.sample_user_email},
+            {
+                "$set": {"updated_at": now},  # existing settings only get a new updated_at
+                "$setOnInsert": {
+                    "created_at": now,
+                    "user_email": plan.sample_user_email,
+                    "note_prompt": "",
+                    "regenerate_on_version_change": False,
+                    "regenerate_on_transcript_update": False,
+                },
+            },
+            upsert=True,
+        )
+    finally:
+        await client.close()  # always release the client, even when an upsert fails
+
+
+def _seed_sqlite(plan: DatasetPlan, db_path: Path) -> None:
+    db_path.parent.mkdir(parents=True, exist_ok=True)
+    if not os.access(db_path.parent, os.W_OK):
+        raise PermissionError(
+            "SQLite output directory is not writable: "
+            f"{db_path.parent}. Use --output-sqlite-path to choose a writable path, "
+            "or fix the directory permissions."
+        )
+
+    try:
+        conn = sqlite3.connect(db_path)
+    except sqlite3.OperationalError as exc:
+        raise RuntimeError(
+            f"Could not open SQLite database at {db_path}: {exc}. "
+            "Use --output-sqlite-path to choose a writable path, or fix the directory permissions."
+        ) from exc
+
+    try:
+        conn.executescript(SCHEMA_PATH.read_text())
+
+        conn.execute(
+            "INSERT OR REPLACE INTO projects (id, name) VALUES (?, ?)",
+            (plan.project_id, plan.project_name),
+        )
+
+        for user in plan.users:
+            conn.execute(
+                "INSERT OR REPLACE INTO users (id, name, email, login) VALUES (?, ?, ?, ?)",
+                (user.id, user.name, user.email, user.login),
+            )
+            conn.execute(
+                "INSERT OR IGNORE INTO project_users (project_id, user_id) VALUES (?, ?)",
+                (plan.project_id, user.id),
+            )
+
+        for shot in plan.shots:
+            conn.execute(
+                "INSERT OR REPLACE INTO shots (id, name, description, project_id) VALUES (?, ?, ?, ?)",
+                (shot.id, shot.name, shot.description, plan.project_id),
+            )
+
+        for task in plan.tasks:
+            conn.execute(
+                """INSERT OR REPLACE INTO tasks (
+                       id, name, status, pipeline_step_id, pipeline_step_name,
+                       project_id, entity_type, entity_id
+                   ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
+                (
+                    task.id,
+                    task.name,
+                    task.status,
+                    None,
+                    task.pipeline_step_name,
+                    plan.project_id,
+                    task.entity_type,
+                    task.entity_id,
+                ),
+            )
+
+        for version in plan.versions:
+            conn.execute(
+                """INSERT OR REPLACE INTO versions (
+                       id, name, description, status, user_id, created_at, updated_at,
+                       movie_path, frame_path, thumbnail, project_id, entity_type, entity_id, task_id
+                   ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+                (
+                    version.id,
+                    version.code,
+                    version.description,
+                    version.status,
+                    version.user_id,
+                    version.created_at,
+                    version.updated_at,
+                    version.movie_path,
+                    version.frame_path,
+                    version.thumbnail,
+                    plan.project_id,
+                    "Shot" if version.shot_id else None,
+                    version.shot_id,
+                    version.task_id,
+                ),
+            )
+
+        conn.execute(
+            "DELETE FROM playlist_versions WHERE playlist_id = ?", (plan.playlist.id,)
+        )
+        conn.execute(
+            """INSERT OR REPLACE INTO playlists (
+                   id, code, description, project_id, created_at, updated_at
+               ) VALUES (?, ?, ?, ?, ?, ?)""",
+            (
+                plan.playlist.id,
+                plan.playlist.code,
+                plan.playlist.description,
+                plan.project_id,
+                plan.playlist.created_at,
+                plan.playlist.updated_at,
+            ),
+        )
+        for version_id in plan.playlist.version_ids:
+            conn.execute(
+                "INSERT OR IGNORE INTO playlist_versions (playlist_id, version_id) VALUES (?, ?)",
+                (plan.playlist.id, version_id),
+            )
+
+        status_codes = sorted({version.status or "rev" for version in plan.versions})
+        for status_code in status_codes:
+            conn.execute(
+                "INSERT OR REPLACE INTO version_statuses (code, name, project_id) VALUES (?, ?, ?)",
+                (status_code, status_code.upper(), plan.project_id),
+            )
+
+        conn.commit()
+    finally:
+        conn.close()
+
+
+def _slugify(value: str) -> str:
+    """Lowercase *value* and hyphen-join its [a-z0-9] runs; "user" if none remain."""
+    return "-".join(re.findall(r"[a-z0-9]+", value.lower())) or "user"
+
+
+def _stable_id(dataset_name: str, category: str, source_key: str) -> int:
+    """Derive a repeatable id (>= 100_000_000) from a SHA-256 of the three keys."""
+    seed = f"{dataset_name}:{category}:{source_key}".encode("utf-8")
+    hex_prefix = hashlib.sha256(seed).hexdigest()[:7]  # 7 hex digits = 28 bits
+    return int(hex_prefix, 16) + 100_000_000
+
+
+async def _run_import(plan: DatasetPlan, sqlite_path: Path) -> None:
+    _seed_sqlite(plan, sqlite_path)  # plain synchronous call; finishes (or raises) first
+    await _seed_mongo(plan)  # only reached when the SQLite seeding did not raise
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run the bootstrap CLI and return its process exit code.
+
+    ``argv`` defaults to ``sys.argv[1:]``; a relative ``--output-sqlite-path`` is
+    resolved against ``BACKEND_ROOT``. Planning/import errors print to stderr, exit 1.
+    """
+    description = "Bootstrap a standalone demo dataset into local dev stores."
+    parser = argparse.ArgumentParser(description=description)
+    parser.add_argument(
+        "dataset_path",
+        nargs="?",
+        type=Path,
+        default=None,
+        help=(
+            "Path to a dataset directory containing session.json, shotgrid_data.json, "
+            "transcript.json, and in_review.json. If omitted, the script searches common dataset locations."
+        ),
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Parse the dataset and print what would be seeded without writing anything.",
+    )
+    parser.add_argument(
+        "--output-sqlite-path",
+        type=Path,
+        default=DEFAULT_SQLITE_PATH,
+        help=(
+            "SQLite DB path to write for the mock prodtrack provider "
+            "(default: backend/.local/mock.db)"
+        ),
+    )
+    args = parser.parse_args(argv)
+
+    try:
+        if args.dataset_path is not None:
+            dataset_path = args.dataset_path.resolve()
+        else:
+            dataset_path = _find_default_dataset_path()
+        if dataset_path is None:
+            raise FileNotFoundError(
+                "Could not find a default dataset directory. Pass dataset_path explicitly."
+            )
+        plan = _build_dataset_plan(dataset_path)
+        print(_format_plan_summary(plan))
+        if args.dry_run:
+            return 0
+        sqlite_path = args.output_sqlite_path
+        if not sqlite_path.is_absolute():
+            sqlite_path = (BACKEND_ROOT / sqlite_path).resolve()
+        asyncio.run(_run_import(plan, sqlite_path))
+    except Exception as exc:
+        print(f"Error: {exc}", file=sys.stderr)
+        return 1
+
+    mongo_url = os.getenv("MONGODB_URL", "mongodb://localhost:27017")
+    print(f"\nSeeded SQLite: {sqlite_path}\nSeeded MongoDB URL: {mongo_url}")
+    print(
+        "Reminder: set MOCK_PRODTRACK_DB_PATH=/app/.local/mock.db in "
+        "backend/docker-compose.local.yml and restart the stack if you want "
+        "the app to use this bootstrapped SQLite DB."
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the exit status
diff --git a/backend/src/dna/prodtrack_providers/mock_provider.py b/backend/src/dna/prodtrack_providers/mock_provider.py
index 07d9eb18..06e0cf4e 100644
--- a/backend/src/dna/prodtrack_providers/mock_provider.py
+++ b/backend/src/dna/prodtrack_providers/mock_provider.py
@@ -3,9 +3,12 @@
 import os
 import sqlite3
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Optional, cast
 
 THUMBNAIL_LOCAL = "__local__"
+MOCK_PRODTRACK_DB_PATH_ENV = "MOCK_PRODTRACK_DB_PATH"
+BACKEND_ROOT = Path(__file__).resolve().parents[3]
+DEFAULT_MOCK_DB_PATH = Path(__file__).parent / "mock_data" / "mock.db"
 
 from dna.models.entity import (
     ENTITY_MODELS,
@@ -40,12 +43,23 @@ def _project_link(project_id: int) -> dict[str, Any]:
 def _shallow_entity(
     entity_type: str, entity_id: int, name: Optional[str] = None
 ) -> EntityBase:
-    model_class = ENTITY_MODELS.get(entity_type)
-    if not model_class:
-        return EntityBase(id=entity_id)
     if entity_type == "playlist":
-        return model_class(id=entity_id, code=name)
-    return model_class(id=entity_id, name=name)
+        return Playlist(id=entity_id, code=name)  # the label is passed as "code" here
+    if entity_type == "project":
+        return Project(id=entity_id, name=name)
+    if entity_type == "shot":
+        return Shot(id=entity_id, name=name)
+    if entity_type == "asset":
+        return Asset(id=entity_id, name=name)
+    if entity_type == "task":
+        return Task(id=entity_id, name=name)
+    if entity_type == "version":
+        return Version(id=entity_id, name=name)
+    if entity_type == "note":
+        return Note(id=entity_id, subject=name)  # the label is passed as "subject" here
+    if entity_type == "user":
+        return User(id=entity_id, name=name)
+    return EntityBase(id=entity_id)  # unrecognised entity_type: id-only fallback
 
 
 class MockProdtrackProvider(ProdtrackProviderBase):
@@ -57,14 +71,37 @@ def __init__(
         base_url: Optional[str] = None,
     ):
         super().__init__()
-        if db_path is None:
-            db_path = Path(__file__).parent / "mock_data" / "mock.db"
-        self._db_path = Path(db_path)
+        self._db_path = self._resolve_db_path(db_path)
         self._base_url = (
             base_url or os.getenv("API_BASE_URL", "http://localhost:8000")
         ).rstrip("/")
         self._conn: Optional[sqlite3.Connection] = None
 
+    @staticmethod
+    def _resolve_configured_path(path_str: str) -> Path:
+        """Resolve ``path_str`` to an absolute path.
+
+        Relative paths use the cwd candidate if it exists, else ``BACKEND_ROOT``.
+        """
+        raw = Path(path_str)
+        if not raw.is_absolute():
+            from_cwd = (Path.cwd() / raw).resolve()
+            return from_cwd if from_cwd.exists() else (BACKEND_ROOT / raw).resolve()
+        return raw.resolve()
+
+    @classmethod
+    def _resolve_db_path(cls, db_path: Optional[Path]) -> Path:
+        """Use ``db_path``, else an existing env-configured path, else the default."""
+        if db_path is not None:
+            return Path(db_path).resolve()
+
+        env_value = os.getenv(MOCK_PRODTRACK_DB_PATH_ENV)
+        # An unset/empty variable or a path missing on disk falls through silently.
+        candidate = cls._resolve_configured_path(env_value) if env_value else None
+        if candidate is not None and candidate.exists():
+            return candidate
+        return DEFAULT_MOCK_DB_PATH.resolve()
+
     def _get_conn(self) -> sqlite3.Connection:
         if self._conn is None:
             uri = f"file:{self._db_path}?mode=ro"
@@ -239,8 +276,11 @@ def get_entity(
             if resolve_links and row["entity_type"] and row["entity_id"]:
                 dna_type = _SG_TYPE_TO_DNA.get(row["entity_type"], "shot")
                 if dna_type in ("shot", "asset"):
-                    entity = self.get_entity(
-                        dna_type, row["entity_id"], resolve_links=False
+                    entity = cast(
+                        Shot | Asset,
+                        self.get_entity(
+                            dna_type, row["entity_id"], resolve_links=False
+                        ),
                     )
             return self._task_from_row(row, row["project_id"], entity)
         if entity_type == "version":
@@ -260,13 +300,22 @@ def get_entity(
                 if row["entity_type"] and row["entity_id"]:
                     dna_type = _SG_TYPE_TO_DNA.get(row["entity_type"], "shot")
                     if dna_type in ("shot", "asset"):
-                        entity = self.get_entity(
-                            dna_type, row["entity_id"], resolve_links=False
+                        entity = cast(
+                            Shot | Asset,
+                            self.get_entity(
+                                dna_type, row["entity_id"], resolve_links=False
+                            ),
                         )
                 if row["task_id"]:
-                    task = self.get_entity("task", row["task_id"], resolve_links=False)
+                    task = cast(
+                        Task,
+                        self.get_entity("task", row["task_id"], resolve_links=False),
+                    )
                 if row["user_id"]:
-                    user = self.get_entity("user", row["user_id"], resolve_links=False)
+                    user = cast(
+                        User,
+                        self.get_entity("user", row["user_id"], resolve_links=False),
+                    )
                 for n in conn.execute(
                     "SELECT nl.note_id FROM note_links nl WHERE nl.entity_type = 'Version' AND nl.entity_id = ?",
                     (entity_id,),
@@ -307,8 +356,9 @@ def get_entity(
             note_links = []
             if resolve_links:
                 if row["author_id"]:
-                    author = self.get_entity(
-                        "user", row["author_id"], resolve_links=False
+                    author = cast(
+                        User,
+                        self.get_entity("user", row["author_id"], resolve_links=False),
                     )
                 for link in conn.execute(
                     "SELECT entity_type, entity_id FROM note_links WHERE note_id = ?",
@@ -379,6 +429,8 @@ def _build_where(
         params: list[Any] = []
         for f in filters:
             field = f.get("field")
+            if not isinstance(field, str):
+                raise ValueError("Filter field must be a string")
             operator = f.get("operator", "is")
             value = f.get("value")
             if isinstance(value, dict) and "id" in value:
@@ -392,6 +444,8 @@ def _build_where(
                 conditions.append(f"{sql_col} = ?")
                 params.append(value)
             elif operator == "in":
+                if value is None:
+                    raise ValueError("Filter value for 'in' operator cannot be None")
                 ids = [
                     v["id"] if isinstance(v, dict) and "id" in v else v for v in value
                 ]
diff --git a/backend/tests/providers/test_mock_provider.py b/backend/tests/providers/test_mock_provider.py
index b37bded2..a0cbb163 100644
--- a/backend/tests/providers/test_mock_provider.py
+++ b/backend/tests/providers/test_mock_provider.py
@@ -386,6 +386,32 @@ def test_provider_init_uses_env_base_url(mock_db_path):
     assert version.thumbnail == "http://api.test/api/mock-thumbnails/300"
 
 
+def test_provider_init_uses_env_db_path(mock_db_path):
+    """An existing MOCK_PRODTRACK_DB_PATH is used when no db_path is passed."""
+    env_override = {"MOCK_PRODTRACK_DB_PATH": str(mock_db_path)}
+    with mock.patch.dict(os.environ, env_override, clear=False):
+        provider = MockProdtrackProvider()
+
+    expected_path = mock_db_path.resolve()
+    # Checked outside the block: the path was captured during construction.
+    assert provider._db_path == expected_path
+
+
+def test_provider_missing_env_db_path_falls_back_to_default(mock_db_path):
+    """A configured path that does not exist is ignored in favour of the default.
+
+    DEFAULT_MOCK_DB_PATH is patched to the fixture DB so the fallback is observable.
+    """
+    missing_env = {"MOCK_PRODTRACK_DB_PATH": "does/not/exist/mock.db"}
+    default_target = "dna.prodtrack_providers.mock_provider.DEFAULT_MOCK_DB_PATH"
+    with (
+        mock.patch.dict(os.environ, missing_env, clear=False),
+        mock.patch(default_target, mock_db_path),
+    ):
+        provider = MockProdtrackProvider()
+    assert provider._db_path == mock_db_path.resolve()
+
+
 def test_find_with_filters(mock_provider):
     shots = mock_provider.find(
         "shot",
diff --git a/backend/tests/test_bootstrap_dataset.py b/backend/tests/test_bootstrap_dataset.py
new file mode 100644
index 00000000..d38f0be1
--- /dev/null
+++ b/backend/tests/test_bootstrap_dataset.py
@@ -0,0 +1,531 @@
+import asyncio
+import json
+import sqlite3
+import sys
+from os import PathLike, fsdecode
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+import dna.devtools.bootstrap_dataset as bootstrap_dataset
+from dna.devtools.bootstrap_dataset import (
+    _assign_utterances_to_in_review_events,
+    _build_dataset_plan,
+    _find_default_dataset_path,
+    _format_plan_summary,
+    _is_dataset_dir,
+    _load_in_review_events,
+    _parse_hms,
+    _seed_mongo,
+    _seed_sqlite,
+    _slugify,
+    _stable_id,
+)
+
+
+def _sample_dataset_path(tmp_path: Path) -> Path:
+    """Create a minimal four-file dataset directory under ``tmp_path``.
+
+    Writes session, shotgrid, transcript and in_review JSON for two review items.
+    """
+    dataset_path = tmp_path / "sample_dailies_dataset"
+    dataset_path.mkdir(parents=True)
+
+    session = {
+        "session_id": "demo_dailies_2025_10_02",
+        "project": {"code": "HSM", "name": "Hyperspace Mini"},
+        "date_utc": "2025-10-02T16:00:00Z",
+        "participants": [
+            {"name": "Cameron", "role": "Supervisor"},
+            {"name": "Sonia", "role": "Lighting"},
+            {"name": "Lars", "role": "Compositor"},
+        ],
+        "review_set": ["HSM_SATL_0010", "HSM_SATL_0015"],
+    }
+
+    # One version per review_set entry, each with its shot, user and task links.
+    shotgrid_data = {
+        "versions": [
+            {
+                "id": 6720,
+                "code": "HSM_SATL_0010_TD",
+                "entity": {"id": 1162, "name": "HSM_SATL_0010", "type": "Shot"},
+                "sg_status_list": "rev",
+                "description": "Lighting pass",
+                "created_at": "2016-08-15T14:34:22-04:00",
+                "user": {"id": 123, "name": "Sonia Demo", "type": "HumanUser"},
+                "sg_task": {
+                    "id": 5632,
+                    "name": "Lighting",
+                    "type": "Task",
+                    "step": "Light",
+                },
+            },
+            {
+                "id": 6722,
+                "code": "HSM_SATL_0015_TD",
+                "entity": {"id": 1163, "name": "HSM_SATL_0015", "type": "Shot"},
+                "sg_status_list": "rev",
+                "description": "Comp pass",
+                "created_at": "2016-08-15T14:34:23-04:00",
+                "user": {"id": 122, "name": "Lars Demo", "type": "HumanUser"},
+                "sg_task": {
+                    "id": 5636,
+                    "name": "Compositing",
+                    "type": "Task",
+                    "step": "Comp",
+                },
+            },
+        ]
+    }
+
+    transcript = {
+        "session_id": "demo_dailies_2025_10_02",
+        "utterances": [
+            {
+                "ts": "00:00:00",
+                "speaker": "Cameron",
+                "text": "Let's start with HSM SATL 0010.",
+            },
+            {
+                "ts": "00:00:10",
+                "speaker": "Sonia",
+                "text": "The sun reflection still needs work.",
+            },
+            {
+                "ts": "00:00:20",
+                "speaker": "Cameron",
+                "text": "Next up is HSM SATL 0015 for comp review.",
+            },
+            {
+                "ts": "00:00:30",
+                "speaker": "Lars",
+                "text": "I want to reduce the visor reflection.",
+            },
+        ],
+    }
+
+    in_review = [
+        {"ts": "00:00:16", "review_item": "HSM_SATL_0010"},
+        {"ts": "00:00:21", "review_item": "HSM_SATL_0015"},
+    ]
+
+    payloads = {
+        "session.json": session,
+        "shotgrid_data.json": shotgrid_data,
+        "transcript.json": transcript,
+        "in_review.json": in_review,
+    }
+    for filename, payload in payloads.items():
+        (dataset_path / filename).write_text(json.dumps(payload))
+    return dataset_path
+
+
+def test_build_dataset_plan_from_sample_dataset(tmp_path: Path):
+    """The sample dataset yields a warning-free plan with the expected metadata."""
+    dataset_dir = _sample_dataset_path(tmp_path)
+    plan = _build_dataset_plan(dataset_dir)
+    assert (plan.project_code, plan.project_name) == ("HSM", "Hyperspace Mini")
+    assert plan.playlist.code == "demo_dailies_2025_10_02"
+    assert len(plan.versions) == 2
+    assert len(plan.segments) > 0
+    assert plan.sample_user_email == "cameron@example.com"
+    assert plan.warnings == []
+    assert plan.in_review_version_id == plan.versions[1].id
+
+
+def test_build_dataset_plan_assigns_segments_to_each_review_version(tmp_path: Path):
+    """Every planned version receives at least one transcript segment."""
+    plan = _build_dataset_plan(_sample_dataset_path(tmp_path))
+    per_version = dict.fromkeys((v.id for v in plan.versions), 0)
+    for seg in plan.segments:
+        # Indexing (not .get) so a segment for an unknown version fails loudly.
+        per_version[seg.version_id] += 1
+    assert 0 not in per_version.values()
+
+
+def test_format_plan_summary_contains_generate_note_payload(tmp_path: Path):
+    """The summary embeds the example payload, playlist id and sample user email."""
+    plan = _build_dataset_plan(_sample_dataset_path(tmp_path))
+    rendered = _format_plan_summary(plan)
+
+    expected_fragments = ("Example generate-note payload", str(plan.playlist.id))
+    for fragment in (*expected_fragments, plan.sample_user_email):
+        assert fragment in rendered
+
+
+def test_seed_sqlite_writes_playlist_and_versions(tmp_path: Path):
+    """Seeding creates the playlist row, its version links and the sample user."""
+    plan = _build_dataset_plan(_sample_dataset_path(tmp_path))
+    db_path = tmp_path / "seeded.db"
+    _seed_sqlite(plan, db_path)
+
+    playlist_key = (plan.playlist.id,)
+    queries = (
+        ("SELECT code FROM playlists WHERE id = ?", playlist_key),
+        ("SELECT COUNT(*) FROM playlist_versions WHERE playlist_id = ?", playlist_key),
+        ("SELECT email FROM users WHERE email = ?", (plan.sample_user_email,)),
+    )
+    conn = sqlite3.connect(db_path)
+    try:
+        # Fetch everything before closing; the assertions run on plain tuples.
+        playlist_row, count_row, user_row = (
+            conn.execute(sql, params).fetchone() for sql, params in queries
+        )
+    finally:
+        conn.close()
+
+    assert playlist_row == (plan.playlist.code,)
+    assert count_row[0] == len(plan.playlist.version_ids)
+    assert user_row == (plan.sample_user_email,)
+
+
+def test_seed_sqlite_rejects_unwritable_output_directory(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+):
+    """A directory reported as non-writable makes _seed_sqlite raise PermissionError."""
+    plan = _build_dataset_plan(_sample_dataset_path(tmp_path))
+    db_path = tmp_path / "locked" / "seeded.db"
+    db_path.parent.mkdir()
+    real_access = bootstrap_dataset.os.access
+
+    def fake_access(
+        path: str | bytes | PathLike[str] | PathLike[bytes], mode: int
+    ) -> bool:
+        # Only the write check on the output directory is faked; the rest delegates.
+        if Path(fsdecode(path)) == db_path.parent and mode == bootstrap_dataset.os.W_OK:
+            return False
+        return real_access(path, mode)
+
+    monkeypatch.setattr(bootstrap_dataset.os, "access", fake_access)
+    with pytest.raises(
+        PermissionError, match="SQLite output directory is not writable"
+    ):
+        _seed_sqlite(plan, db_path)
+
+
+def test_build_dataset_plan_rejects_in_review_item_outside_review_set(tmp_path: Path):
+    """An in_review event naming an item absent from review_set is a ValueError."""
+    dataset_path = _sample_dataset_path(tmp_path)
+    events = [
+        {"ts": "00:00:16", "review_item": "HSM_SATL_0010"},
+        {"ts": "00:00:21", "review_item": "HSM_SATL_0099"},  # not in review_set
+    ]
+    (dataset_path / "in_review.json").write_text(json.dumps(events))
+
+    # match= is applied as a regex search; the pattern text is unchanged.
+    expected_message = (
+        "in_review.json review_item HSM_SATL_0099 is not present in "
+        "session.json review_set"
+    )
+
+    with pytest.raises(ValueError, match=expected_message):
+        _build_dataset_plan(dataset_path)
+
+
+def test_build_dataset_plan_rejects_missing_version_metadata_for_review_set_item(
+    tmp_path: Path,
+):
+    """Dropping the second version leaves HSM_SATL_0015 without metadata."""
+    dataset_path = _sample_dataset_path(tmp_path)
+    shotgrid_file = dataset_path / "shotgrid_data.json"
+    payload = json.loads(shotgrid_file.read_text())
+    # Keep only the first version (HSM_SATL_0010).
+    del payload["versions"][1:]
+    shotgrid_file.write_text(json.dumps(payload))
+
+    expected_message = "No version metadata found in shotgrid_data.json for review_set item HSM_SATL_0015"
+    with pytest.raises(ValueError, match=expected_message):
+        _build_dataset_plan(dataset_path)
+
+
+def test_build_dataset_plan_requires_in_review_json(tmp_path: Path):
+    """Removing in_review.json makes plan building raise FileNotFoundError."""
+    dataset_path = _sample_dataset_path(tmp_path)
+    dataset_path.joinpath("in_review.json").unlink()
+    with pytest.raises(FileNotFoundError, match="in_review.json"):
+        _build_dataset_plan(dataset_path)
+
+
+@pytest.mark.parametrize(
+    ("payload", "match"),
+    [
+        ([], "non-empty list of events"),
+        (["bad-event"], "must be an object"),
+        ([{"review_item": "HSM_SATL_0010"}], "missing a valid 'ts'"),
+        ([{"ts": "00:00:16"}], "missing a valid 'review_item'"),
+        (
+            [
+                {"ts": "00:00:16", "review_item": "HSM_SATL_0010"},
+                {"ts": "00:00:16", "review_item": "HSM_SATL_0015"},  # repeated ts
+            ],
+            "strictly ordered by ascending timestamp",
+        ),
+    ],
+)
+def test_load_in_review_events_validates_input(
+    tmp_path: Path, payload: object, match: str
+):
+    """Each malformed in_review.json payload is rejected with a matching ValueError."""
+    dataset_dir = tmp_path / "dataset"
+    dataset_dir.mkdir()
+    dataset_dir.joinpath("in_review.json").write_text(json.dumps(payload))
+    with pytest.raises(ValueError, match=match):
+        _load_in_review_events(dataset_dir)
+
+
+def test_assign_utterances_to_in_review_events_skips_prelude_and_requires_ts():
+    """Utterances before the first event are dropped; one without 'ts' is an error."""
+    event = bootstrap_dataset.InReviewEvent(ts="00:00:05", review_item="HSM_SATL_0010")
+    events = [event]
+    prelude = {"ts": "00:00:00", "text": "intro"}  # earlier than the 00:00:05 event
+    covered = {"ts": "00:00:06", "text": "covered"}
+    utterances = [prelude, covered]
+
+    assignments = _assign_utterances_to_in_review_events(utterances, events)
+
+    assert assignments == [("HSM_SATL_0010", 1, covered)]
+
+    untimed = [{"text": "bad"}]
+    with pytest.raises(ValueError, match="missing a valid 'ts'"):
+        _assign_utterances_to_in_review_events(untimed, events)
+
+
+def test_build_dataset_plan_rejects_mismatched_transcript_session_id(tmp_path: Path):
+    """A transcript whose session_id differs from session.json is rejected."""
+    dataset_path = _sample_dataset_path(tmp_path)
+    transcript_file = dataset_path / "transcript.json"
+    doc = json.loads(transcript_file.read_text())
+    doc.update(session_id="different_session")
+    transcript_file.write_text(json.dumps(doc))
+    with pytest.raises(ValueError, match="does not match transcript.json session_id"):
+        _build_dataset_plan(dataset_path)
+
+
+def test_build_dataset_plan_requires_review_set_and_date_utc(tmp_path: Path):
+    """An empty review_set and a missing date_utc are each rejected."""
+    dataset_path = _sample_dataset_path(tmp_path)
+    session_file = dataset_path / "session.json"
+    session = json.loads(session_file.read_text())
+    session.update(review_set=[])
+    session_file.write_text(json.dumps(session))
+    with pytest.raises(ValueError, match="does not contain a review_set"):
+        _build_dataset_plan(dataset_path)
+
+    # A second, freshly written dataset so that only date_utc is missing this time.
+    other_dataset = _sample_dataset_path(tmp_path / "other")
+    other_session_file = other_dataset / "session.json"
+    other_session = json.loads(other_session_file.read_text())
+    del other_session["date_utc"]
+    other_session_file.write_text(json.dumps(other_session))
+    with pytest.raises(ValueError, match="does not contain date_utc"):
+        _build_dataset_plan(other_dataset)
+
+
+def test_build_dataset_plan_falls_back_to_demo_user_when_no_users_present(
+    tmp_path: Path,
+):
+    """With no participants and no version users, the demo user email is used."""
+    dataset_path = _sample_dataset_path(tmp_path)
+
+    session_file = dataset_path / "session.json"
+    session = json.loads(session_file.read_text())
+    session_file.write_text(json.dumps({**session, "participants": []}))
+
+    shotgrid_file = dataset_path / "shotgrid_data.json"
+    shotgrid = json.loads(shotgrid_file.read_text())
+    # Strip the user link from every version.
+    shotgrid["versions"] = [{**v, "user": {}} for v in shotgrid["versions"]]
+    shotgrid_file.write_text(json.dumps(shotgrid))
+
+    plan = _build_dataset_plan(dataset_path)
+    assert plan.sample_user_email == "demo-user@example.com"
+
+
+def test_find_default_dataset_path_and_is_dataset_dir(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+):
+    """A directory holding the four dataset files is recognised and discovered."""
+    dataset_path = tmp_path / "sample_dailies_dataset"
+    dataset_path.mkdir()
+    for filename in (
+        "session.json",
+        "shotgrid_data.json",
+        "transcript.json",
+        "in_review.json",
+    ):
+        dataset_path.joinpath(filename).write_text("{}")
+    # Redirect BACKEND_ROOT so the default-path lookup can find the temp dataset.
+    monkeypatch.setattr(bootstrap_dataset, "BACKEND_ROOT", tmp_path)
+    assert _is_dataset_dir(dataset_path) is True
+    assert _find_default_dataset_path() == dataset_path.resolve()
+
+
+def test_format_plan_summary_includes_warnings():
+    """A plan carrying a warning renders both "Warnings:" and the warning text."""
+    playlist = bootstrap_dataset.DatasetPlaylist(
+        id=10,
+        code="demo",
+        description="desc",
+        created_at="2025-10-02T16:00:00Z",
+        updated_at="2025-10-02T16:00:00Z",
+        version_ids=[20],
+    )
+    version = bootstrap_dataset.DatasetVersion(
+        id=20,
+        source_id=1,
+        code="HSM_SATL_0010_TD",
+        description="desc",
+        status="rev",
+        created_at=None,
+        updated_at=None,
+        user_id=None,
+        shot_id=None,
+        task_id=None,
+        thumbnail=None,
+    )
+    plan = bootstrap_dataset.DatasetPlan(
+        dataset_name="dataset",
+        dataset_path=Path("/tmp/dataset"),
+        project_id=1,
+        project_code="HSM",
+        project_name="Hyperspace Mini",
+        playlist=playlist,
+        users=[],
+        shots=[],
+        tasks=[],
+        versions=[version],
+        segments=[],
+        in_review_version_id=20,
+        sample_user_email="demo@example.com",
+        warnings=["warning one"],
+    )
+
+    summary = _format_plan_summary(plan)
+    assert "Warnings:" in summary
+    assert "warning one" in summary
+
+
+def test_helper_functions_are_stable():
+    """_parse_hms and _slugify give fixed outputs; _stable_id repeats for equal input."""
+    assert _parse_hms("01:02:03") == 3723
+    assert _slugify("Demo User") == "demo-user"
+    id_args = ("dataset", "version", "abc")
+    assert _stable_id(*id_args) == _stable_id(*id_args)
+
+
+def test_seed_mongo_writes_expected_documents(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+):
+    """_seed_mongo upserts playlist metadata, segments and user settings, then closes."""
+    plan = _build_dataset_plan(_sample_dataset_path(tmp_path))
+    created_clients: list[Any] = []
+
+    class FakeCollection:
+        """Records every find_one_and_update call instead of touching MongoDB."""
+
+        def __init__(self):
+            self.calls: list[
+                tuple[dict[str, Any], dict[str, Any], bool, dict[str, Any]]
+            ] = []
+
+        async def find_one_and_update(self, query, update, upsert=False, **kwargs):
+            self.calls.append((query, update, upsert, kwargs))
+            return {}
+
+    class FakeDatabase:
+        def __init__(self):
+            self.playlist_metadata = FakeCollection()
+            self.segments = FakeCollection()
+            self.user_settings = FakeCollection()
+
+    class FakeClient:
+        def __init__(self, url: str):
+            self.url = url
+            self.closed = False
+            self.db = FakeDatabase()
+            created_clients.append(self)
+
+        def __getitem__(self, name: str) -> FakeDatabase:
+            return self.db  # every database name maps to the one fake database
+
+        async def close(self):
+            self.closed = True
+
+    monkeypatch.setattr(bootstrap_dataset, "AsyncMongoClient", FakeClient)
+
+    asyncio.run(_seed_mongo(plan))
+
+    assert created_clients
+    client = created_clients[-1]
+    metadata_query, metadata_update, *_ = client.db.playlist_metadata.calls[0]
+    assert metadata_query == {"playlist_id": plan.playlist.id}
+    assert metadata_update["$set"]["in_review"] == plan.in_review_version_id
+    assert len(client.db.segments.calls) == len(plan.segments)
+    assert client.db.user_settings.calls[0][0] == {"user_email": plan.sample_user_email}
+    assert client.closed is True
+
+
+def test_run_import_calls_sqlite_then_mongo(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+):
+    """_run_import seeds SQLite first and MongoDB second, passing the plan to both."""
+    plan = _build_dataset_plan(_sample_dataset_path(tmp_path))
+    target_db = tmp_path / "mock.db"
+    events: list[tuple[str, object]] = []
+
+    def fake_seed_sqlite(received_plan, received_path):
+        events.append(("sqlite", received_path))
+        assert received_plan == plan
+
+    async def fake_seed_mongo(received_plan):
+        events.append(("mongo", received_plan.playlist.id))
+        assert received_plan == plan
+
+    monkeypatch.setattr(bootstrap_dataset, "_seed_sqlite", fake_seed_sqlite)
+    monkeypatch.setattr(bootstrap_dataset, "_seed_mongo", fake_seed_mongo)
+    asyncio.run(bootstrap_dataset._run_import(plan, target_db))
+    assert events == [("sqlite", target_db), ("mongo", plan.playlist.id)]
+
+
+def test_main_supports_dry_run_and_reports_errors(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
+):
+    """--dry-run exits 0 after printing the summary; a missing dataset exits 1."""
+    dataset_path = _sample_dataset_path(tmp_path)
+    dry_run_argv = ["bootstrap_dataset", str(dataset_path), "--dry-run"]
+
+    monkeypatch.setattr(sys, "argv", dry_run_argv)
+    exit_code = bootstrap_dataset.main()
+    stdout = capsys.readouterr().out
+    assert exit_code == 0
+    assert "Dataset: demo_dailies_2025_10_02" in stdout
+
+    # No positional argument and no discoverable default dataset.
+    monkeypatch.setattr(sys, "argv", ["bootstrap_dataset"])
+    monkeypatch.setattr(bootstrap_dataset, "_find_default_dataset_path", lambda: None)
+    exit_code = bootstrap_dataset.main()
+    assert exit_code == 1
+    assert "Could not find a default dataset directory" in capsys.readouterr().err
+
+
+def test_main_prints_bootstrap_db_reminder_on_success(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
+):
+    """A successful run reports the SQLite path and the MOCK_PRODTRACK_DB_PATH hint."""
+    dataset_path = _sample_dataset_path(tmp_path)
+    argv = ["bootstrap_dataset", str(dataset_path), "--output-sqlite-path", "mock.db"]
+
+    async def fake_run_import(plan, sqlite_path):
+        # No-op replacement for _run_import, so nothing is actually seeded.
+        return None
+
+    monkeypatch.setattr(bootstrap_dataset, "_run_import", fake_run_import)
+    monkeypatch.setattr(sys, "argv", argv)
+
+    exit_code = bootstrap_dataset.main()
+    stdout = capsys.readouterr().out
+    assert exit_code == 0
+    assert "Seeded SQLite:" in stdout
+    assert "MOCK_PRODTRACK_DB_PATH=/app/.local/mock.db" in stdout
diff --git a/backend/uv.lock b/backend/uv.lock
new file mode 100644
index 00000000..a5bc5147
--- /dev/null
+++ b/backend/uv.lock
@@ -0,0 +1,3 @@
+version = 1
+revision = 3
+requires-python = ">=3.14"
diff --git a/sample_dailies_dataset/in_review.json b/sample_dailies_dataset/in_review.json
new file mode 100644
index 00000000..c0fe6765
--- /dev/null
+++ b/sample_dailies_dataset/in_review.json
@@ -0,0 +1,7 @@
+[
+  { "ts": "00:00:16", "review_item": "HSM_SATL_0010" },
+  { "ts": "00:01:26", "review_item": "HSM_SATL_0015" },
+  { "ts": "00:02:29", "review_item": "HSM_SATL_0020" },
+  { "ts": "00:05:39", "review_item": "HSM_SATL_0050" },
+  { "ts": "00:08:13", "review_item": "HSM_SATL_0010" }
+]
diff --git a/sample_dailies_dataset/transcript.json b/sample_dailies_dataset/transcript.json
index 551c984c..361e5c2c 100644
--- a/sample_dailies_dataset/transcript.json
+++ b/sample_dailies_dataset/transcript.json
@@ -1,5 +1,5 @@
 {
-  "session_id": "demo_dailies_2025_10_03",
+  "session_id": "demo_dailies_2025_10_02",
   "utterances": [
     {
       "ts": "00:00:00",