From d69abe39b815dc68740b1b07a642f4d89e277f2b Mon Sep 17 00:00:00 2001 From: Allen Rose Date: Mon, 20 Apr 2026 21:33:36 -0700 Subject: [PATCH] Add script to bootstrap SQLite & Mongo from files Signed-off-by: Allen Rose --- QUICKSTART.md | 34 +- backend/.gitignore | 1 + backend/docker-compose.yml | 1 + backend/example.docker-compose.local.yml | 3 + backend/makefile | 5 +- backend/src/dna/devtools/__init__.py | 1 + backend/src/dna/devtools/bootstrap_dataset.py | 783 ++++++++++++++++++ .../dna/prodtrack_providers/mock_provider.py | 88 +- backend/tests/providers/test_mock_provider.py | 26 + backend/tests/test_bootstrap_dataset.py | 531 ++++++++++++ backend/uv.lock | 3 + sample_dailies_dataset/in_review.json | 7 + sample_dailies_dataset/transcript.json | 2 +- 13 files changed, 1462 insertions(+), 23 deletions(-) create mode 100644 backend/src/dna/devtools/__init__.py create mode 100644 backend/src/dna/devtools/bootstrap_dataset.py create mode 100644 backend/tests/test_bootstrap_dataset.py create mode 100644 backend/uv.lock create mode 100644 sample_dailies_dataset/in_review.json diff --git a/QUICKSTART.md b/QUICKSTART.md index 705e9d9c..15f8f4fb 100644 --- a/QUICKSTART.md +++ b/QUICKSTART.md @@ -29,7 +29,7 @@ cp example.docker-compose.local.vexa.yml docker-compose.local.vexa.yml Edit `docker-compose.local.yml` with your credentials. -**Production tracking (ShotGrid):** To run without a ShotGrid seat, set **`PRODTRACK_PROVIDER=mock`** in `docker-compose.local.yml`. The mock provider uses read-only SQLite with pre-seeded data. To use real ShotGrid, set `PRODTRACK_PROVIDER=shotgrid` (or leave it unset) and add `SHOTGRID_URL`, `SHOTGRID_SCRIPT_NAME`, and `SHOTGRID_API_KEY`. See [Mock setup](#mock-production-tracking-setup) below for how to refresh or customize the mock data. +**Production tracking (ShotGrid):** To run without a ShotGrid seat, set **`PRODTRACK_PROVIDER=mock`** in `docker-compose.local.yml`. 
Set **`MOCK_PRODTRACK_DB_PATH=src/dna/prodtrack_providers/mock_data/mock.db`** to make the active mock DB explicit and easy to change later. The mock provider uses read-only SQLite with pre-seeded data. To use real ShotGrid, set `PRODTRACK_PROVIDER=shotgrid` (or leave it unset) and add `SHOTGRID_URL`, `SHOTGRID_SCRIPT_NAME`, and `SHOTGRID_API_KEY`. See [Mock setup](#mock-production-tracking-setup) below for how to refresh or customize the mock data. **LLM provider:** Set `LLM_PROVIDER` to choose which backend LLM integration to use. @@ -140,6 +140,7 @@ The React app will be available at `http://localhost:5173`. | `SHOTGRID_API_KEY` | Yes\* | - | ShotGrid API key (required when using ShotGrid) | | `SHOTGRID_SCRIPT_NAME` | Yes\* | - | ShotGrid script name (required when using ShotGrid) | | `PRODTRACK_PROVIDER` | No | `shotgrid` | `shotgrid` or `mock`; set to `mock` to use the read-only mock DB without ShotGrid | +| `MOCK_PRODTRACK_DB_PATH` | No | bundled `mock.db` | Path to the SQLite DB used when `PRODTRACK_PROVIDER=mock` | | `MONGODB_URL` | No | `mongodb://mongo:27017` | MongoDB connection string | | `STORAGE_PROVIDER` | No | `mongodb` | Storage provider type | | `VEXA_API_KEY` | Yes | - | API key for Vexa transcription service | @@ -257,12 +258,13 @@ The DNA API serves as the central hub: ## Mock Production Tracking Setup -When you set **`PRODTRACK_PROVIDER=mock`**, the backend uses a read-only mock provider backed by SQLite (`backend/src/dna/prodtrack_providers/mock_data/mock.db`). The app runs normally with this data so you can develop and test the UI without a ShotGrid seat. +When you set **`PRODTRACK_PROVIDER=mock`**, the backend uses a read-only mock provider backed by SQLite. By default, set **`MOCK_PRODTRACK_DB_PATH=src/dna/prodtrack_providers/mock_data/mock.db`** in `docker-compose.local.yml` so the active DB is explicit and easy to change. The app runs normally with this data so you can develop and test the UI without a ShotGrid seat. 
### Using the mock provider -- In `docker-compose.local.yml`, set **`PRODTRACK_PROVIDER=mock`**. You do not need to set any ShotGrid variables when using the mock. +- In `docker-compose.local.yml`, set **`PRODTRACK_PROVIDER=mock`** and **`MOCK_PRODTRACK_DB_PATH=src/dna/prodtrack_providers/mock_data/mock.db`**. - The mock provider is used only when explicitly set; there is no automatic fallback if ShotGrid credentials are missing. +- To switch the backend to a bootstrapped local DB, change `MOCK_PRODTRACK_DB_PATH` to `/app/.local/mock.db`. ### Refreshing or customizing mock data from ShotGrid @@ -283,10 +285,34 @@ docker-compose -f docker-compose.yml -f docker-compose.local.yml run --rm api \ --api-key 'YOUR_API_KEY' ``` -- This overwrites `mock_data/mock.db` with entities (projects, users, shots, assets, tasks, versions, playlists, notes) from the given ShotGrid project. +- To bootstrap sample review data, first start Mongo, then run `bootstrap_dataset`. +- By default, `bootstrap_dataset` writes to `backend/.local/mock.db`, so it does not modify the checked-in mock DB. +- If you want the backend to use that bootstrapped DB, set `MOCK_PRODTRACK_DB_PATH=/app/.local/mock.db` in `docker-compose.local.yml`. +- `seed-mock-db` overwrites `mock_data/mock.db` with entities (projects, users, shots, assets, tasks, versions, playlists, notes) from the given ShotGrid project. - Use `--skip-thumbnails` to skip downloading version thumbnails (faster seed; thumbnails will not work after signed URLs expire). - Without `--skip-thumbnails`, thumbnails are downloaded to `mock_data/thumbnails/` and served by the API at `/api/mock-thumbnails/{version_id}` so they keep working after ShotGrid signed URLs expire. 
+Example sample bootstrap workflow: + +```bash +cd backend + +# Start only Mongo first +make start-mongo + +# See available arguments +python -m dna.devtools.bootstrap_dataset --help + +# Preview the import without writing anything +python -m dna.devtools.bootstrap_dataset ../sample_dailies_dataset --dry-run + +# Seed SQLite + Mongo using the default local SQLite output +python -m dna.devtools.bootstrap_dataset ../sample_dailies_dataset + +# Then start the full backend stack so the API comes up with the seeded data +make start-local +``` + The mock provider is **read-only**: it does not write to ShotGrid or to the SQLite file at runtime. Writes such as publishing notes will raise an error when using the mock provider. ## Docker Compose Files diff --git a/backend/.gitignore b/backend/.gitignore index e30b5633..04e870fe 100644 --- a/backend/.gitignore +++ b/backend/.gitignore @@ -42,3 +42,4 @@ backend/docker-compose.vexa.yml # Local environment files docker-compose.local.yml docker-compose.local.vexa.yml +.local/ diff --git a/backend/docker-compose.yml b/backend/docker-compose.yml index 5be99118..ac5772ca 100644 --- a/backend/docker-compose.yml +++ b/backend/docker-compose.yml @@ -29,6 +29,7 @@ services: - ./pytest.ini:/app/pytest.ini - ./.coveragerc:/app/.coveragerc - ./htmlcov:/app/htmlcov + - ./.local:/app/.local environment: - PYTHONUNBUFFERED=1 - SHOTGRID_URL=https://your-shotgrid-url.com diff --git a/backend/example.docker-compose.local.yml b/backend/example.docker-compose.local.yml index b0095bb4..f9a27f88 100644 --- a/backend/example.docker-compose.local.yml +++ b/backend/example.docker-compose.local.yml @@ -1,6 +1,8 @@ # Local development override. Copy to docker-compose.local.yml and fill in secrets. # Auth: AUTH_PROVIDER=none uses the noop provider (sign in with any email; no token validation). # Prodtrack: Set PRODTRACK_PROVIDER=mock to use the read-only mock (no ShotGrid needed). 
+# Mock DB path: change MOCK_PRODTRACK_DB_PATH to /app/.local/mock.db if you +# want the backend to use a locally bootstrapped DB instead of the checked-in fixture. # To use real ShotGrid, set PRODTRACK_PROVIDER=shotgrid and SHOTGRID_URL, SHOTGRID_SCRIPT_NAME, SHOTGRID_API_KEY. services: api: @@ -10,6 +12,7 @@ services: - SHOTGRID_API_KEY=************ - SHOTGRID_SCRIPT_NAME=DNA_local_testing - PRODTRACK_PROVIDER=mock + - MOCK_PRODTRACK_DB_PATH=src/dna/prodtrack_providers/mock_data/mock.db - VEXA_API_KEY=********** - VEXA_API_URL=http://vexa:8056 - OPENAI_API_KEY=your-openai-api-key diff --git a/backend/makefile b/backend/makefile index 9c2986ce..bdade8b9 100644 --- a/backend/makefile +++ b/backend/makefile @@ -56,4 +56,7 @@ format-python: venv-lint .venv-lint/bin/isort src/ tests/ seed-mock-db: - $(DOCKER_COMPOSE) -f docker-compose.yml -f docker-compose.local.yml run --rm api python -m dna.prodtrack_providers.mock_data.seed_db --project-id 124 --url https://aswf.shotgrid.autodesk.com --script-name DNA_local_testing --api-key '$(SHOTGRID_API_KEY)' \ No newline at end of file + $(DOCKER_COMPOSE) -f docker-compose.yml -f docker-compose.local.yml run --rm api python -m dna.prodtrack_providers.mock_data.seed_db --project-id 124 --url https://aswf.shotgrid.autodesk.com --script-name DNA_local_testing --api-key '$(SHOTGRID_API_KEY)' + +start-mongo: + $(DOCKER_COMPOSE) -f docker-compose.yml -f docker-compose.local.yml up -d mongo diff --git a/backend/src/dna/devtools/__init__.py b/backend/src/dna/devtools/__init__.py new file mode 100644 index 00000000..450264d1 --- /dev/null +++ b/backend/src/dna/devtools/__init__.py @@ -0,0 +1 @@ +"""Developer tooling utilities for backend workflows.""" diff --git a/backend/src/dna/devtools/bootstrap_dataset.py b/backend/src/dna/devtools/bootstrap_dataset.py new file mode 100644 index 00000000..63e4a06c --- /dev/null +++ b/backend/src/dna/devtools/bootstrap_dataset.py @@ -0,0 +1,783 @@ +"""Bootstrap a standalone demo dataset into local 
development stores. + +This script seeds: +- the mock prodtrack SQLite database used by the mock provider +- MongoDB collections used by /generate-note and transcript viewing + +""" + +from __future__ import annotations + +import argparse +import asyncio +import hashlib +import json +import os +import re +import sqlite3 +import sys +from dataclasses import dataclass, field +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Any, Optional + +from pymongo import AsyncMongoClient + +from dna.models.stored_segment import generate_segment_id + +BACKEND_ROOT = Path(__file__).resolve().parents[3] + + +DEFAULT_SQLITE_PATH = BACKEND_ROOT / ".local" / "mock.db" +SCHEMA_PATH = ( + Path(__file__).resolve().parents[1] + / "prodtrack_providers" + / "mock_data" + / "schema.sql" +) + + +@dataclass(slots=True) +class DatasetPlan: + dataset_name: str + dataset_path: Path + project_id: int + project_code: str + project_name: str + playlist: DatasetPlaylist + users: list[DatasetUser] + shots: list[DatasetShot] + tasks: list[DatasetTask] + versions: list[DatasetVersion] + segments: list[DatasetSegment] + in_review_version_id: int + sample_user_email: str + warnings: list[str] = field(default_factory=list) + + +@dataclass(slots=True) +class DatasetPlaylist: + id: int + code: str + description: str + created_at: str + updated_at: str + version_ids: list[int] + + +@dataclass(slots=True) +class DatasetSegment: + segment_id: str + playlist_id: int + version_id: int + speaker: str + text: str + absolute_start_time: str + absolute_end_time: str + + +@dataclass(slots=True) +class DatasetShot: + id: int + name: str + description: str + + +@dataclass(slots=True) +class DatasetTask: + id: int + name: str + status: str + pipeline_step_name: str + entity_type: str + entity_id: int + + +@dataclass(slots=True) +class DatasetUser: + id: int + name: str + email: str + login: str + + +@dataclass(slots=True) +class DatasetVersion: + id: int + source_id: int + 
def _assign_utterances_to_in_review_events(
    utterances: list[dict[str, Any]], events: list[InReviewEvent]
) -> list[tuple[str, int, dict[str, Any]]]:
    """Attach each transcript utterance to the review item active at its time.

    Args:
        utterances: Raw utterance objects from transcript.json; each must be
            a JSON object carrying a non-empty string ``ts`` in ``HH:MM:SS``.
        events: In-review events whose ``ts`` offsets are ascending (the
            in_review.json loader enforces strict ordering).

    Returns:
        ``(review_item, utterance_index, utterance)`` tuples in transcript
        order. ``utterance_index`` is the position in ``utterances``.
        Utterances that occur before the first event are left out.

    Raises:
        ValueError: If an utterance is not an object or has no usable ``ts``.
    """
    event_offsets = [_parse_hms(event.ts) for event in events]
    event_count = len(event_offsets)
    assignments: list[tuple[str, int, dict[str, Any]]] = []

    # Cursor into ``events``; -1 means "no review item has started yet".
    event_index = -1
    for utterance_index, utterance in enumerate(utterances):
        # Same contract the in_review.json loader applies to its entries:
        # report a malformed entry by index instead of crashing on ``.get``.
        if not isinstance(utterance, dict):
            raise ValueError(
                f"transcript.json utterance at index {utterance_index} must be an object."
            )

        utterance_ts = utterance.get("ts")
        if not isinstance(utterance_ts, str) or not utterance_ts:
            raise ValueError(
                f"transcript.json utterance at index {utterance_index} is missing a valid 'ts'."
            )

        utterance_offset = _parse_hms(utterance_ts)
        # Advance past every event that has started by this utterance's time.
        while (
            event_index + 1 < event_count
            and event_offsets[event_index + 1] <= utterance_offset
        ):
            event_index += 1

        if event_index < 0:
            # Spoken before anything was put in review; nothing to attach to.
            continue

        assignments.append(
            (events[event_index].review_item, utterance_index, utterance)
        )

    return assignments
def _build_dataset_plan(dataset_path: Path) -> DatasetPlan:
    """Parse a dataset directory into an in-memory seeding plan.

    Reads ``session.json``, ``shotgrid_data.json``, ``transcript.json`` and
    ``in_review.json`` from ``dataset_path`` and derives the project,
    playlist, users, shots, tasks, versions and transcript segments to seed.
    All ids are produced by ``_stable_id`` from the session id, so the same
    dataset always yields the same ids.

    Args:
        dataset_path: Directory containing the four dataset files.

    Returns:
        The fully populated ``DatasetPlan``.

    Raises:
        FileNotFoundError: If a required dataset file is missing.
        ValueError: If the files are inconsistent with each other (session id
            mismatch, empty ``review_set``, missing ``date_utc``, a review
            item without version metadata, ...).
    """
    session_path = dataset_path / "session.json"
    shotgrid_path = dataset_path / "shotgrid_data.json"
    transcript_path = dataset_path / "transcript.json"

    for required_path in (session_path, shotgrid_path, transcript_path):
        if not required_path.exists():
            raise FileNotFoundError(f"Required dataset file not found: {required_path}")

    # JSON is UTF-8; do not let the platform locale decide how it is decoded.
    session = json.loads(session_path.read_text(encoding="utf-8"))
    shotgrid_data = json.loads(shotgrid_path.read_text(encoding="utf-8"))
    transcript_data = json.loads(transcript_path.read_text(encoding="utf-8"))
    in_review_events = _load_in_review_events(dataset_path)

    warnings: list[str] = []

    session_id = str(session.get("session_id") or dataset_path.name)
    transcript_session_id = transcript_data.get("session_id")
    if transcript_session_id and transcript_session_id != session_id:
        raise ValueError(
            f"session.json session_id ({session_id}) does not match transcript.json session_id ({transcript_session_id})."
        )

    project = session.get("project") or {}
    project_code = project.get("code") or "DEMO"
    project_name = project.get("name") or "Demo Project"
    project_id = _stable_id(session_id, "project", f"{project_code}:{project_name}")

    # Index version rows by the name of the entity (shot) they are linked to.
    # ``or {}`` also covers an explicit ``"entity": null`` in the export.
    version_rows = shotgrid_data.get("versions") or []
    versions_by_review_name: dict[str, dict[str, Any]] = {}
    for row in version_rows:
        entity_name = (row.get("entity") or {}).get("name")
        if entity_name:
            versions_by_review_name[entity_name] = row

    review_set = session.get("review_set") or []
    if not review_set:
        raise ValueError("session.json does not contain a review_set.")
    for event in in_review_events:
        if event.review_item not in review_set:
            raise ValueError(
                f"in_review.json review_item {event.review_item} is not present in session.json review_set."
            )

    session_dt_raw = session.get("date_utc")
    if not session_dt_raw:
        raise ValueError("session.json does not contain date_utc.")
    session_dt = datetime.fromisoformat(session_dt_raw.replace("Z", "+00:00"))
    if session_dt.tzinfo is None:
        # The field is named date_utc: a value without an offset is UTC.
        # Leaving it naive would make astimezone() assume the machine's local
        # zone, so segment timestamps (and the ids derived from them) would
        # differ from host to host.
        session_dt = session_dt.replace(tzinfo=timezone.utc)

    utterances = transcript_data.get("utterances") or []
    utterance_assignments = _assign_utterances_to_in_review_events(
        utterances, in_review_events
    )

    users_by_name: dict[str, DatasetUser] = {}

    def ensure_user(name: str, hint: str) -> DatasetUser:
        """Return the user registered under ``name``, creating it on first use."""
        existing = users_by_name.get(name)
        if existing:
            return existing
        user = DatasetUser(
            id=_stable_id(session_id, "user", hint),
            name=name,
            email=f"{_slugify(name)}@example.com",
            login=_slugify(name),
        )
        users_by_name[name] = user
        return user

    for participant in session.get("participants") or []:
        participant_name = participant.get("name")
        if participant_name:
            ensure_user(participant_name, f"participant:{participant_name}")

    shots: list[DatasetShot] = []
    tasks: list[DatasetTask] = []
    versions: list[DatasetVersion] = []
    segments: list[DatasetSegment] = []
    version_ids_for_playlist: list[int] = []

    version_id_by_review_name: dict[str, int] = {}

    for review_name in review_set:
        version_row = versions_by_review_name.get(review_name)
        if version_row is None:
            raise ValueError(
                f"No version metadata found in shotgrid_data.json for review_set item {review_name}."
            )

        shot_row = version_row.get("entity") or {}
        shot_id = _stable_id(session_id, "shot", str(shot_row.get("id") or review_name))
        task_row = version_row.get("sg_task") or {}
        task_id = _stable_id(
            session_id,
            "task",
            str(
                task_row.get("id") or f"{review_name}:{task_row.get('name') or 'task'}"
            ),
        )
        version_id = _stable_id(
            session_id, "version", str(version_row.get("id") or version_row.get("code"))
        )
        version_user_row = version_row.get("user") or {}
        version_user = None
        if version_user_row.get("name"):
            version_user = ensure_user(
                version_user_row["name"],
                f"shotgrid-user:{version_user_row.get('id') or version_user_row['name']}",
            )

        shots.append(
            DatasetShot(
                id=shot_id,
                name=shot_row.get("name") or review_name,
                description=version_row.get("description") or "",
            )
        )
        tasks.append(
            DatasetTask(
                id=task_id,
                name=task_row.get("name") or "Review",
                status=version_row.get("sg_status_list") or "rev",
                pipeline_step_name=task_row.get("step")
                or task_row.get("name")
                or "Review",
                entity_type="Shot",
                entity_id=shot_id,
            )
        )
        versions.append(
            DatasetVersion(
                id=version_id,
                source_id=int(version_row.get("id") or 0),
                code=version_row.get("code") or review_name,
                description=version_row.get("description") or "",
                status=version_row.get("sg_status_list") or "rev",
                created_at=version_row.get("created_at"),
                # Prefer the row's own update time; fall back to the creation
                # time when the export does not carry one.
                updated_at=version_row.get("updated_at")
                or version_row.get("created_at"),
                user_id=version_user.id if version_user else None,
                shot_id=shot_id,
                task_id=task_id,
                thumbnail=None,
            )
        )
        version_id_by_review_name[review_name] = version_id
        version_ids_for_playlist.append(version_id)

    if not versions:
        raise ValueError("No versions could be built from the dataset review_set.")

    playlist_id = _stable_id(session_id, "playlist", session_id)
    playlist = DatasetPlaylist(
        id=playlist_id,
        code=session_id,
        description=f"Seeded demo dataset for {project_name}",
        created_at=_isoformat_utc(session_dt),
        updated_at=_isoformat_utc(session_dt),
        version_ids=version_ids_for_playlist,
    )

    # The last in-review event decides which version is currently in review.
    final_review_item = in_review_events[-1].review_item
    in_review_version_id = version_id_by_review_name.get(final_review_item)
    if in_review_version_id is None:
        raise ValueError(
            f"No version metadata found for final in_review item {final_review_item}."
        )

    for review_name, utterance_index, utterance in utterance_assignments:
        version_id = version_id_by_review_name.get(review_name)
        if version_id is None:
            raise ValueError(
                f"No version metadata found in shotgrid_data.json for review item {review_name}."
            )

        # A segment lasts until the next utterance starts; the final one (or
        # one followed by a non-increasing timestamp) gets a 5 second length.
        start_offset = _parse_hms(utterance["ts"])
        next_offset = None
        if utterance_index + 1 < len(utterances):
            next_offset = _parse_hms(utterances[utterance_index + 1]["ts"])
        if next_offset is None or next_offset <= start_offset:
            next_offset = start_offset + 5

        start_dt = session_dt + timedelta(seconds=start_offset)
        end_dt = session_dt + timedelta(seconds=next_offset)
        start_iso = _isoformat_utc(start_dt)
        end_iso = _isoformat_utc(end_dt)
        segments.append(
            DatasetSegment(
                segment_id=generate_segment_id(playlist.id, version_id, start_iso),
                playlist_id=playlist.id,
                version_id=version_id,
                speaker=utterance.get("speaker") or "Unknown",
                text=(utterance.get("text") or "").strip(),
                absolute_start_time=start_iso,
                absolute_end_time=end_iso,
            )
        )

    sample_user = users_by_name.get("Cameron") or next(
        iter(users_by_name.values()), None
    )
    if sample_user is None:
        sample_user = ensure_user("Demo User", "fallback-demo-user")

    # Several review items may share a shot/task/version; keep one per id.
    deduped_shots = list({shot.id: shot for shot in shots}.values())
    deduped_tasks = list({task.id: task for task in tasks}.values())
    deduped_versions = list({version.id: version for version in versions}.values())
    deduped_users = list({user.id: user for user in users_by_name.values()}.values())

    return DatasetPlan(
        dataset_name=session_id,
        dataset_path=dataset_path,
        project_id=project_id,
        project_code=project_code,
        project_name=project_name,
        playlist=playlist,
        users=sorted(deduped_users, key=lambda user: user.name),
        shots=sorted(deduped_shots, key=lambda shot: shot.name),
        tasks=sorted(deduped_tasks, key=lambda task: task.name),
        versions=deduped_versions,
        segments=segments,
        in_review_version_id=in_review_version_id,
        sample_user_email=sample_user.email,
        warnings=warnings,
    )
def _format_plan_summary(plan: DatasetPlan) -> str:
    """Render a human-readable, multi-line summary of ``plan``.

    The summary lists entity counts, the number of transcript segments per
    version, an example ``generate-note`` request payload and, when present,
    the plan's warnings.

    Args:
        plan: The dataset plan to describe.

    Returns:
        The summary as a single newline-joined string (no trailing newline).
    """
    # Tally segments per version id; versions without segments report 0.
    segment_counts: dict[int, int] = {}
    for segment in plan.segments:
        segment_counts[segment.version_id] = (
            segment_counts.get(segment.version_id, 0) + 1
        )

    lines = [
        f"Dataset: {plan.dataset_name}",
        f"Path: {plan.dataset_path}",
        f"Project: {plan.project_name} ({plan.project_code})",
        f"Playlist: {plan.playlist.code} [id={plan.playlist.id}]",
        f"Users: {len(plan.users)}",
        f"Shots: {len(plan.shots)}",
        f"Tasks: {len(plan.tasks)}",
        f"Versions: {len(plan.versions)}",
        f"Segments: {len(plan.segments)}",
        f"Sample user email: {plan.sample_user_email}",
        f"In-review version id: {plan.in_review_version_id}",
        "",
        "Version transcript coverage:",
    ]
    lines.extend(
        f"- {version.code} [id={version.id}]: {segment_counts.get(version.id, 0)} segments"
        for version in plan.versions
    )

    lines.extend(
        [
            "",
            "Example generate-note payload:",
            json.dumps(
                {
                    "playlist_id": plan.playlist.id,
                    "version_id": plan.in_review_version_id,
                    "user_email": plan.sample_user_email,
                },
                indent=2,
            ),
        ]
    )

    if plan.warnings:
        lines.append("")
        lines.append("Warnings:")
        lines.extend(f"- {warning}" for warning in plan.warnings)

    return "\n".join(lines)
def _load_in_review_events(dataset_path: Path) -> list[InReviewEvent]:
    """Load and validate the in-review timeline from ``in_review.json``.

    Args:
        dataset_path: Dataset directory that contains ``in_review.json``.

    Returns:
        The events in file order; their timestamps are strictly ascending.

    Raises:
        FileNotFoundError: If ``in_review.json`` does not exist.
        ValueError: If the file is not a non-empty list of objects, an event
            lacks a usable ``ts`` or ``review_item``, a ``ts`` is not
            ``HH:MM:SS``, or the events are not strictly ordered by time.
    """
    in_review_path = dataset_path / "in_review.json"
    if not in_review_path.exists():
        raise FileNotFoundError(f"Required dataset file not found: {in_review_path}")

    # JSON is UTF-8; do not let the platform locale decide how it is decoded.
    raw_events = json.loads(in_review_path.read_text(encoding="utf-8"))
    if not isinstance(raw_events, list) or not raw_events:
        raise ValueError("in_review.json must contain a non-empty list of events.")

    events: list[InReviewEvent] = []
    previous_offset: Optional[int] = None
    for index, raw_event in enumerate(raw_events):
        if not isinstance(raw_event, dict):
            raise ValueError(
                f"in_review.json event at index {index} must be an object."
            )

        ts = raw_event.get("ts")
        review_item = raw_event.get("review_item")
        if not isinstance(ts, str) or not ts:
            raise ValueError(
                f"in_review.json event at index {index} is missing a valid 'ts'."
            )
        if not isinstance(review_item, str) or not review_item:
            raise ValueError(
                f"in_review.json event at index {index} is missing a valid 'review_item'."
            )

        try:
            offset = _parse_hms(ts)
        except ValueError as exc:
            # Say which event is malformed instead of surfacing a bare
            # unpacking / int() error from the timestamp parser.
            raise ValueError(
                f"in_review.json event at index {index} has an invalid 'ts' "
                f"({ts!r}); expected HH:MM:SS."
            ) from exc
        if previous_offset is not None and offset <= previous_offset:
            raise ValueError(
                "in_review.json events must be strictly ordered by ascending timestamp."
                f" Event at index {index} ({ts}) is not later than the previous event."
            )
        previous_offset = offset
        events.append(InReviewEvent(ts=ts, review_item=review_item))

    return events
def _parse_hms(value: str) -> int:
    """Convert an ``HH:MM:SS`` offset into a whole number of seconds.

    Args:
        value: Three integer fields separated by ``:``, e.g. ``"01:02:03"``.

    Returns:
        ``hours * 3600 + minutes * 60 + seconds``.

    Raises:
        ValueError: If ``value`` does not consist of exactly three integer
            fields; the message names the offending value.
    """
    parts = value.split(":")
    if len(parts) != 3:
        raise ValueError(f"Invalid timestamp {value!r}: expected HH:MM:SS.")
    try:
        hours, minutes, seconds = (int(part) for part in parts)
    except ValueError:
        # Re-raise with the whole value so the caller can find it in the data.
        raise ValueError(
            f"Invalid timestamp {value!r}: expected HH:MM:SS."
        ) from None
    return hours * 3600 + minutes * 60 + seconds
writable: " + f"{db_path.parent}. Use --output-sqlite-path to choose a writable path, " + "or fix the directory permissions." + ) + + try: + conn = sqlite3.connect(db_path) + except sqlite3.OperationalError as exc: + raise RuntimeError( + f"Could not open SQLite database at {db_path}: {exc}. " + "Use --output-sqlite-path to choose a writable path, or fix the directory permissions." + ) from exc + + try: + conn.executescript(SCHEMA_PATH.read_text()) + + conn.execute( + "INSERT OR REPLACE INTO projects (id, name) VALUES (?, ?)", + (plan.project_id, plan.project_name), + ) + + for user in plan.users: + conn.execute( + "INSERT OR REPLACE INTO users (id, name, email, login) VALUES (?, ?, ?, ?)", + (user.id, user.name, user.email, user.login), + ) + conn.execute( + "INSERT OR IGNORE INTO project_users (project_id, user_id) VALUES (?, ?)", + (plan.project_id, user.id), + ) + + for shot in plan.shots: + conn.execute( + "INSERT OR REPLACE INTO shots (id, name, description, project_id) VALUES (?, ?, ?, ?)", + (shot.id, shot.name, shot.description, plan.project_id), + ) + + for task in plan.tasks: + conn.execute( + """INSERT OR REPLACE INTO tasks ( + id, name, status, pipeline_step_id, pipeline_step_name, + project_id, entity_type, entity_id + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)""", + ( + task.id, + task.name, + task.status, + None, + task.pipeline_step_name, + plan.project_id, + task.entity_type, + task.entity_id, + ), + ) + + for version in plan.versions: + conn.execute( + """INSERT OR REPLACE INTO versions ( + id, name, description, status, user_id, created_at, updated_at, + movie_path, frame_path, thumbnail, project_id, entity_type, entity_id, task_id + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", + ( + version.id, + version.code, + version.description, + version.status, + version.user_id, + version.created_at, + version.updated_at, + version.movie_path, + version.frame_path, + version.thumbnail, + plan.project_id, + "Shot" if version.shot_id else None, + 
def _slugify(value: str) -> str:
    """Turn a display name into a lowercase ASCII slug.

    Accented letters are transliterated to their base letter ("José" ->
    "jose") instead of being dropped; every remaining run of characters
    outside ``a-z0-9`` becomes a single ``-`` and leading/trailing dashes are
    removed. Pure-ASCII input produces the same slug as before.

    Args:
        value: Arbitrary text, typically a person's name.

    Returns:
        The slug, or ``"user"`` when nothing usable remains.
    """
    # Local import: only this helper needs it.
    import unicodedata

    # NFKD splits "é" into "e" + combining accent; the ASCII encode then
    # discards the accent and keeps the base letter.
    ascii_value = (
        unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
    )
    slug = re.sub(r"[^a-z0-9]+", "-", ascii_value.lower()).strip("-")
    return slug or "user"
def main() -> int:
    """Command-line entry point for bootstrapping a demo dataset.

    Builds a plan from the dataset directory, prints its summary and, unless
    ``--dry-run`` is given, seeds the SQLite database and MongoDB.

    Returns:
        ``0`` on success (including a dry run), ``1`` if anything failed; the
        failure reason is printed to stderr.
    """
    parser = argparse.ArgumentParser(
        description="Bootstrap a standalone demo dataset into local dev stores.",
    )
    parser.add_argument(
        "dataset_path",
        nargs="?",
        type=Path,
        default=None,
        help=(
            "Path to a dataset directory containing session.json, shotgrid_data.json, "
            "transcript.json, and in_review.json. If omitted, the script searches common dataset locations."
        ),
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Parse the dataset and print what would be seeded without writing anything.",
    )
    parser.add_argument(
        "--output-sqlite-path",
        type=Path,
        default=DEFAULT_SQLITE_PATH,
        help=(
            "SQLite DB path to write for the mock prodtrack provider "
            "(default: backend/.local/mock.db)"
        ),
    )
    args = parser.parse_args()

    # Top-level boundary of a CLI: report any failure as a one-line error and
    # a non-zero exit status instead of a traceback.
    try:
        dataset_path = (
            args.dataset_path.resolve()
            if args.dataset_path is not None
            else _find_default_dataset_path()
        )
        if dataset_path is None:
            raise FileNotFoundError(
                "Could not find a default dataset directory. Pass dataset_path explicitly."
            )
        plan = _build_dataset_plan(dataset_path)
        print(_format_plan_summary(plan))
        if args.dry_run:
            return 0
        sqlite_path = args.output_sqlite_path
        if not sqlite_path.is_absolute():
            # Relative output paths are anchored at the backend root, not cwd.
            sqlite_path = (BACKEND_ROOT / sqlite_path).resolve()
        asyncio.run(_run_import(plan, sqlite_path))
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    print("")
    print(f"Seeded SQLite: {sqlite_path}")
    print(
        f"Seeded MongoDB URL: {os.getenv('MONGODB_URL', 'mongodb://localhost:27017')}"
    )

    # docker-compose.yml mounts backend/.local at /app/.local, so only files
    # under that directory have a known path inside the api container. Point
    # the reminder at the file that was actually written.
    try:
        relative_db_path = sqlite_path.relative_to(BACKEND_ROOT / ".local")
    except ValueError:
        relative_db_path = None

    if relative_db_path is not None:
        print(
            f"Reminder: set MOCK_PRODTRACK_DB_PATH=/app/.local/{relative_db_path.as_posix()} in "
            "backend/docker-compose.local.yml and restart the stack if you want "
            "the app to use this bootstrapped SQLite DB."
        )
    else:
        print(
            f"Reminder: {sqlite_path} is not under backend/.local (mounted at "
            "/app/.local in the api container). Set MOCK_PRODTRACK_DB_PATH in "
            "backend/docker-compose.local.yml to the path where the backend can "
            "see this file, then restart the stack if you want the app to use "
            "this bootstrapped SQLite DB."
        )
    return 0
self._base_url = ( base_url or os.getenv("API_BASE_URL", "http://localhost:8000") ).rstrip("/") self._conn: Optional[sqlite3.Connection] = None + @staticmethod + def _resolve_configured_path(path_str: str) -> Path: + configured_path = Path(path_str) + if configured_path.is_absolute(): + return configured_path.resolve() + + cwd_path = (Path.cwd() / configured_path).resolve() + if cwd_path.exists(): + return cwd_path + + return (BACKEND_ROOT / configured_path).resolve() + + @classmethod + def _resolve_db_path(cls, db_path: Optional[Path]) -> Path: + if db_path is not None: + return Path(db_path).resolve() + + configured_path = os.getenv(MOCK_PRODTRACK_DB_PATH_ENV) + if configured_path: + resolved_path = cls._resolve_configured_path(configured_path) + if resolved_path.exists(): + return resolved_path + + return DEFAULT_MOCK_DB_PATH.resolve() + def _get_conn(self) -> sqlite3.Connection: if self._conn is None: uri = f"file:{self._db_path}?mode=ro" @@ -239,8 +276,11 @@ def get_entity( if resolve_links and row["entity_type"] and row["entity_id"]: dna_type = _SG_TYPE_TO_DNA.get(row["entity_type"], "shot") if dna_type in ("shot", "asset"): - entity = self.get_entity( - dna_type, row["entity_id"], resolve_links=False + entity = cast( + Shot | Asset, + self.get_entity( + dna_type, row["entity_id"], resolve_links=False + ), ) return self._task_from_row(row, row["project_id"], entity) if entity_type == "version": @@ -260,13 +300,22 @@ def get_entity( if row["entity_type"] and row["entity_id"]: dna_type = _SG_TYPE_TO_DNA.get(row["entity_type"], "shot") if dna_type in ("shot", "asset"): - entity = self.get_entity( - dna_type, row["entity_id"], resolve_links=False + entity = cast( + Shot | Asset, + self.get_entity( + dna_type, row["entity_id"], resolve_links=False + ), ) if row["task_id"]: - task = self.get_entity("task", row["task_id"], resolve_links=False) + task = cast( + Task, + self.get_entity("task", row["task_id"], resolve_links=False), + ) if row["user_id"]: - user = 
self.get_entity("user", row["user_id"], resolve_links=False) + user = cast( + User, + self.get_entity("user", row["user_id"], resolve_links=False), + ) for n in conn.execute( "SELECT nl.note_id FROM note_links nl WHERE nl.entity_type = 'Version' AND nl.entity_id = ?", (entity_id,), @@ -307,8 +356,9 @@ def get_entity( note_links = [] if resolve_links: if row["author_id"]: - author = self.get_entity( - "user", row["author_id"], resolve_links=False + author = cast( + User, + self.get_entity("user", row["author_id"], resolve_links=False), ) for link in conn.execute( "SELECT entity_type, entity_id FROM note_links WHERE note_id = ?", @@ -379,6 +429,8 @@ def _build_where( params: list[Any] = [] for f in filters: field = f.get("field") + if not isinstance(field, str): + raise ValueError("Filter field must be a string") operator = f.get("operator", "is") value = f.get("value") if isinstance(value, dict) and "id" in value: @@ -392,6 +444,8 @@ def _build_where( conditions.append(f"{sql_col} = ?") params.append(value) elif operator == "in": + if value is None: + raise ValueError("Filter value for 'in' operator cannot be None") ids = [ v["id"] if isinstance(v, dict) and "id" in v else v for v in value ] diff --git a/backend/tests/providers/test_mock_provider.py b/backend/tests/providers/test_mock_provider.py index b37bded2..a0cbb163 100644 --- a/backend/tests/providers/test_mock_provider.py +++ b/backend/tests/providers/test_mock_provider.py @@ -386,6 +386,32 @@ def test_provider_init_uses_env_base_url(mock_db_path): assert version.thumbnail == "http://api.test/api/mock-thumbnails/300" +def test_provider_init_uses_env_db_path(mock_db_path): + with mock.patch.dict( + os.environ, + {"MOCK_PRODTRACK_DB_PATH": str(mock_db_path)}, + clear=False, + ): + provider = MockProdtrackProvider() + + assert provider._db_path == mock_db_path.resolve() + + +def test_provider_missing_env_db_path_falls_back_to_default(mock_db_path): + with mock.patch.dict( + os.environ, + 
{"MOCK_PRODTRACK_DB_PATH": "does/not/exist/mock.db"}, + clear=False, + ): + with mock.patch( + "dna.prodtrack_providers.mock_provider.DEFAULT_MOCK_DB_PATH", + mock_db_path, + ): + provider = MockProdtrackProvider() + + assert provider._db_path == mock_db_path.resolve() + + def test_find_with_filters(mock_provider): shots = mock_provider.find( "shot", diff --git a/backend/tests/test_bootstrap_dataset.py b/backend/tests/test_bootstrap_dataset.py new file mode 100644 index 00000000..d38f0be1 --- /dev/null +++ b/backend/tests/test_bootstrap_dataset.py @@ -0,0 +1,531 @@ +import asyncio +import json +import sqlite3 +import sys +from os import PathLike, fsdecode +from pathlib import Path +from typing import Any + +import pytest + +import dna.devtools.bootstrap_dataset as bootstrap_dataset +from dna.devtools.bootstrap_dataset import ( + _assign_utterances_to_in_review_events, + _build_dataset_plan, + _find_default_dataset_path, + _format_plan_summary, + _is_dataset_dir, + _load_in_review_events, + _parse_hms, + _seed_mongo, + _seed_sqlite, + _slugify, + _stable_id, +) + + +def _sample_dataset_path(tmp_path: Path) -> Path: + dataset_path = tmp_path / "sample_dailies_dataset" + dataset_path.mkdir(parents=True) + + (dataset_path / "session.json").write_text( + json.dumps( + { + "session_id": "demo_dailies_2025_10_02", + "project": {"code": "HSM", "name": "Hyperspace Mini"}, + "date_utc": "2025-10-02T16:00:00Z", + "participants": [ + {"name": "Cameron", "role": "Supervisor"}, + {"name": "Sonia", "role": "Lighting"}, + {"name": "Lars", "role": "Compositor"}, + ], + "review_set": ["HSM_SATL_0010", "HSM_SATL_0015"], + } + ) + ) + (dataset_path / "shotgrid_data.json").write_text( + json.dumps( + { + "versions": [ + { + "id": 6720, + "code": "HSM_SATL_0010_TD", + "entity": {"id": 1162, "name": "HSM_SATL_0010", "type": "Shot"}, + "sg_status_list": "rev", + "description": "Lighting pass", + "created_at": "2016-08-15T14:34:22-04:00", + "user": {"id": 123, "name": "Sonia Demo", "type": 
"HumanUser"}, + "sg_task": { + "id": 5632, + "name": "Lighting", + "type": "Task", + "step": "Light", + }, + }, + { + "id": 6722, + "code": "HSM_SATL_0015_TD", + "entity": {"id": 1163, "name": "HSM_SATL_0015", "type": "Shot"}, + "sg_status_list": "rev", + "description": "Comp pass", + "created_at": "2016-08-15T14:34:23-04:00", + "user": {"id": 122, "name": "Lars Demo", "type": "HumanUser"}, + "sg_task": { + "id": 5636, + "name": "Compositing", + "type": "Task", + "step": "Comp", + }, + }, + ] + } + ) + ) + (dataset_path / "transcript.json").write_text( + json.dumps( + { + "session_id": "demo_dailies_2025_10_02", + "utterances": [ + { + "ts": "00:00:00", + "speaker": "Cameron", + "text": "Let's start with HSM SATL 0010.", + }, + { + "ts": "00:00:10", + "speaker": "Sonia", + "text": "The sun reflection still needs work.", + }, + { + "ts": "00:00:20", + "speaker": "Cameron", + "text": "Next up is HSM SATL 0015 for comp review.", + }, + { + "ts": "00:00:30", + "speaker": "Lars", + "text": "I want to reduce the visor reflection.", + }, + ], + } + ) + ) + (dataset_path / "in_review.json").write_text( + json.dumps( + [ + {"ts": "00:00:16", "review_item": "HSM_SATL_0010"}, + {"ts": "00:00:21", "review_item": "HSM_SATL_0015"}, + ] + ) + ) + + return dataset_path + + +def test_build_dataset_plan_from_sample_dataset(tmp_path: Path): + plan = _build_dataset_plan(_sample_dataset_path(tmp_path)) + + assert plan.project_name == "Hyperspace Mini" + assert plan.project_code == "HSM" + assert plan.playlist.code == "demo_dailies_2025_10_02" + assert len(plan.versions) == 2 + assert len(plan.segments) > 0 + assert plan.sample_user_email == "cameron@example.com" + assert plan.warnings == [] + assert plan.in_review_version_id == plan.versions[1].id + + +def test_build_dataset_plan_assigns_segments_to_each_review_version(tmp_path: Path): + plan = _build_dataset_plan(_sample_dataset_path(tmp_path)) + + segment_counts: dict[int, int] = {version.id: 0 for version in plan.versions} + for 
segment in plan.segments: + segment_counts[segment.version_id] += 1 + + assert all(count > 0 for count in segment_counts.values()) + + +def test_format_plan_summary_contains_generate_note_payload(tmp_path: Path): + plan = _build_dataset_plan(_sample_dataset_path(tmp_path)) + + summary = _format_plan_summary(plan) + + assert "Example generate-note payload" in summary + assert str(plan.playlist.id) in summary + assert plan.sample_user_email in summary + + +def test_seed_sqlite_writes_playlist_and_versions(tmp_path: Path): + plan = _build_dataset_plan(_sample_dataset_path(tmp_path)) + db_path = tmp_path / "seeded.db" + + _seed_sqlite(plan, db_path) + + conn = sqlite3.connect(db_path) + try: + playlist_row = conn.execute( + "SELECT code FROM playlists WHERE id = ?", (plan.playlist.id,) + ).fetchone() + version_count = conn.execute( + "SELECT COUNT(*) FROM playlist_versions WHERE playlist_id = ?", + (plan.playlist.id,), + ).fetchone()[0] + user_row = conn.execute( + "SELECT email FROM users WHERE email = ?", (plan.sample_user_email,) + ).fetchone() + finally: + conn.close() + + assert playlist_row == (plan.playlist.code,) + assert version_count == len(plan.playlist.version_ids) + assert user_row == (plan.sample_user_email,) + + +def test_seed_sqlite_rejects_unwritable_output_directory( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +): + plan = _build_dataset_plan(_sample_dataset_path(tmp_path)) + db_path = tmp_path / "locked" / "seeded.db" + db_path.parent.mkdir() + + original_access = bootstrap_dataset.os.access + + def fake_access( + path: str | bytes | PathLike[str] | PathLike[bytes], mode: int + ) -> bool: + if Path(fsdecode(path)) == db_path.parent and mode == bootstrap_dataset.os.W_OK: + return False + return original_access(path, mode) + + monkeypatch.setattr(bootstrap_dataset.os, "access", fake_access) + + with pytest.raises( + PermissionError, match="SQLite output directory is not writable" + ): + _seed_sqlite(plan, db_path) + + +def 
test_build_dataset_plan_rejects_in_review_item_outside_review_set(tmp_path: Path): + dataset_path = _sample_dataset_path(tmp_path) + in_review_path = dataset_path / "in_review.json" + in_review_path.write_text( + json.dumps( + [ + {"ts": "00:00:16", "review_item": "HSM_SATL_0010"}, + {"ts": "00:00:21", "review_item": "HSM_SATL_0099"}, + ] + ) + ) + + with pytest.raises( + ValueError, + match="in_review.json review_item HSM_SATL_0099 is not present in session.json review_set", + ): + _build_dataset_plan(dataset_path) + + +def test_build_dataset_plan_rejects_missing_version_metadata_for_review_set_item( + tmp_path: Path, +): + dataset_path = _sample_dataset_path(tmp_path) + shotgrid_path = dataset_path / "shotgrid_data.json" + shotgrid_data = json.loads(shotgrid_path.read_text()) + shotgrid_data["versions"] = shotgrid_data["versions"][:1] + shotgrid_path.write_text(json.dumps(shotgrid_data)) + + with pytest.raises( + ValueError, + match="No version metadata found in shotgrid_data.json for review_set item HSM_SATL_0015", + ): + _build_dataset_plan(dataset_path) + + +def test_build_dataset_plan_requires_in_review_json(tmp_path: Path): + dataset_path = _sample_dataset_path(tmp_path) + (dataset_path / "in_review.json").unlink() + + with pytest.raises(FileNotFoundError, match="in_review.json"): + _build_dataset_plan(dataset_path) + + +@pytest.mark.parametrize( + ("payload", "match"), + [ + ([], "non-empty list of events"), + (["bad-event"], "must be an object"), + ([{"review_item": "HSM_SATL_0010"}], "missing a valid 'ts'"), + ([{"ts": "00:00:16"}], "missing a valid 'review_item'"), + ( + [ + {"ts": "00:00:16", "review_item": "HSM_SATL_0010"}, + {"ts": "00:00:16", "review_item": "HSM_SATL_0015"}, + ], + "strictly ordered by ascending timestamp", + ), + ], +) +def test_load_in_review_events_validates_input( + tmp_path: Path, payload: object, match: str +): + dataset_path = tmp_path / "dataset" + dataset_path.mkdir() + (dataset_path / 
"in_review.json").write_text(json.dumps(payload)) + + with pytest.raises(ValueError, match=match): + _load_in_review_events(dataset_path) + + +def test_assign_utterances_to_in_review_events_skips_prelude_and_requires_ts(): + events = [ + bootstrap_dataset.InReviewEvent(ts="00:00:05", review_item="HSM_SATL_0010") + ] + utterances = [ + {"ts": "00:00:00", "text": "intro"}, + {"ts": "00:00:06", "text": "covered"}, + ] + + assignments = _assign_utterances_to_in_review_events(utterances, events) + + assert assignments == [("HSM_SATL_0010", 1, utterances[1])] + + with pytest.raises(ValueError, match="missing a valid 'ts'"): + _assign_utterances_to_in_review_events([{"text": "bad"}], events) + + +def test_build_dataset_plan_rejects_mismatched_transcript_session_id(tmp_path: Path): + dataset_path = _sample_dataset_path(tmp_path) + transcript_path = dataset_path / "transcript.json" + transcript = json.loads(transcript_path.read_text()) + transcript["session_id"] = "different_session" + transcript_path.write_text(json.dumps(transcript)) + + with pytest.raises(ValueError, match="does not match transcript.json session_id"): + _build_dataset_plan(dataset_path) + + +def test_build_dataset_plan_requires_review_set_and_date_utc(tmp_path: Path): + dataset_path = _sample_dataset_path(tmp_path) + session_path = dataset_path / "session.json" + session = json.loads(session_path.read_text()) + session["review_set"] = [] + session_path.write_text(json.dumps(session)) + + with pytest.raises(ValueError, match="does not contain a review_set"): + _build_dataset_plan(dataset_path) + + dataset_path = _sample_dataset_path(tmp_path / "other") + session_path = dataset_path / "session.json" + session = json.loads(session_path.read_text()) + session.pop("date_utc") + session_path.write_text(json.dumps(session)) + + with pytest.raises(ValueError, match="does not contain date_utc"): + _build_dataset_plan(dataset_path) + + +def test_build_dataset_plan_falls_back_to_demo_user_when_no_users_present( + 
tmp_path: Path, +): + dataset_path = _sample_dataset_path(tmp_path) + session_path = dataset_path / "session.json" + session = json.loads(session_path.read_text()) + session["participants"] = [] + session_path.write_text(json.dumps(session)) + + shotgrid_path = dataset_path / "shotgrid_data.json" + shotgrid_data = json.loads(shotgrid_path.read_text()) + for version in shotgrid_data["versions"]: + version["user"] = {} + shotgrid_path.write_text(json.dumps(shotgrid_data)) + + plan = _build_dataset_plan(dataset_path) + + assert plan.sample_user_email == "demo-user@example.com" + + +def test_find_default_dataset_path_and_is_dataset_dir( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +): + dataset_path = tmp_path / "sample_dailies_dataset" + dataset_path.mkdir() + for name in ( + "session.json", + "shotgrid_data.json", + "transcript.json", + "in_review.json", + ): + (dataset_path / name).write_text("{}") + + monkeypatch.setattr(bootstrap_dataset, "BACKEND_ROOT", tmp_path) + + assert _is_dataset_dir(dataset_path) is True + assert _find_default_dataset_path() == dataset_path.resolve() + + +def test_format_plan_summary_includes_warnings(): + plan = bootstrap_dataset.DatasetPlan( + dataset_name="dataset", + dataset_path=Path("/tmp/dataset"), + project_id=1, + project_code="HSM", + project_name="Hyperspace Mini", + playlist=bootstrap_dataset.DatasetPlaylist( + id=10, + code="demo", + description="desc", + created_at="2025-10-02T16:00:00Z", + updated_at="2025-10-02T16:00:00Z", + version_ids=[20], + ), + users=[], + shots=[], + tasks=[], + versions=[ + bootstrap_dataset.DatasetVersion( + id=20, + source_id=1, + code="HSM_SATL_0010_TD", + description="desc", + status="rev", + created_at=None, + updated_at=None, + user_id=None, + shot_id=None, + task_id=None, + thumbnail=None, + ) + ], + segments=[], + in_review_version_id=20, + sample_user_email="demo@example.com", + warnings=["warning one"], + ) + + summary = _format_plan_summary(plan) + + assert "Warnings:" in summary + 
assert "warning one" in summary + + +def test_helper_functions_are_stable(): + assert _parse_hms("01:02:03") == 3723 + assert _slugify("Demo User") == "demo-user" + assert _stable_id("dataset", "version", "abc") == _stable_id( + "dataset", "version", "abc" + ) + + +def test_seed_mongo_writes_expected_documents( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +): + plan = _build_dataset_plan(_sample_dataset_path(tmp_path)) + + class FakeCollection: + def __init__(self): + self.calls: list[ + tuple[dict[str, Any], dict[str, Any], bool, dict[str, Any]] + ] = [] + + async def find_one_and_update(self, query, update, upsert=False, **kwargs): + self.calls.append((query, update, upsert, kwargs)) + return {} + + class FakeDatabase: + def __init__(self): + self.playlist_metadata = FakeCollection() + self.segments = FakeCollection() + self.user_settings = FakeCollection() + + class FakeClient: + last_instance = None + + def __init__(self, url: str): + self.url = url + self.closed = False + self.db = FakeDatabase() + FakeClient.last_instance = self + + def __getitem__(self, name: str) -> FakeDatabase: + return self.db + + async def close(self): + self.closed = True + + monkeypatch.setattr(bootstrap_dataset, "AsyncMongoClient", FakeClient) + + asyncio.run(_seed_mongo(plan)) + + client = FakeClient.last_instance + assert client is not None + assert client.db.playlist_metadata.calls[0][0] == {"playlist_id": plan.playlist.id} + assert ( + client.db.playlist_metadata.calls[0][1]["$set"]["in_review"] + == plan.in_review_version_id + ) + assert len(client.db.segments.calls) == len(plan.segments) + assert client.db.user_settings.calls[0][0] == {"user_email": plan.sample_user_email} + assert client.closed is True + + +def test_run_import_calls_sqlite_then_mongo( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +): + plan = _build_dataset_plan(_sample_dataset_path(tmp_path)) + calls: list[tuple[str, object]] = [] + + def fake_seed_sqlite(received_plan, received_path): + 
calls.append(("sqlite", received_path)) + assert received_plan == plan + + async def fake_seed_mongo(received_plan): + calls.append(("mongo", received_plan.playlist.id)) + assert received_plan == plan + + monkeypatch.setattr(bootstrap_dataset, "_seed_sqlite", fake_seed_sqlite) + monkeypatch.setattr(bootstrap_dataset, "_seed_mongo", fake_seed_mongo) + + asyncio.run(bootstrap_dataset._run_import(plan, tmp_path / "mock.db")) + + assert calls == [("sqlite", tmp_path / "mock.db"), ("mongo", plan.playlist.id)] + + +def test_main_supports_dry_run_and_reports_errors( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +): + dataset_path = _sample_dataset_path(tmp_path) + + monkeypatch.setattr( + sys, + "argv", + ["bootstrap_dataset", str(dataset_path), "--dry-run"], + ) + assert bootstrap_dataset.main() == 0 + captured = capsys.readouterr() + assert "Dataset: demo_dailies_2025_10_02" in captured.out + + monkeypatch.setattr(sys, "argv", ["bootstrap_dataset"]) + monkeypatch.setattr(bootstrap_dataset, "_find_default_dataset_path", lambda: None) + assert bootstrap_dataset.main() == 1 + captured = capsys.readouterr() + assert "Could not find a default dataset directory" in captured.err + + +def test_main_prints_bootstrap_db_reminder_on_success( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +): + dataset_path = _sample_dataset_path(tmp_path) + + async def fake_run_import(plan, sqlite_path): + return None + + monkeypatch.setattr(bootstrap_dataset, "_run_import", fake_run_import) + monkeypatch.setattr( + sys, + "argv", + ["bootstrap_dataset", str(dataset_path), "--output-sqlite-path", "mock.db"], + ) + + assert bootstrap_dataset.main() == 0 + captured = capsys.readouterr() + assert "Seeded SQLite:" in captured.out + assert "MOCK_PRODTRACK_DB_PATH=/app/.local/mock.db" in captured.out diff --git a/backend/uv.lock b/backend/uv.lock new file mode 100644 index 00000000..a5bc5147 --- /dev/null +++ 
b/backend/uv.lock @@ -0,0 +1,3 @@ +version = 1 +revision = 3 +requires-python = ">=3.14" diff --git a/sample_dailies_dataset/in_review.json b/sample_dailies_dataset/in_review.json new file mode 100644 index 00000000..c0fe6765 --- /dev/null +++ b/sample_dailies_dataset/in_review.json @@ -0,0 +1,7 @@ +[ + { "ts": "00:00:16", "review_item": "HSM_SATL_0010" }, + { "ts": "00:01:26", "review_item": "HSM_SATL_0015" }, + { "ts": "00:02:29", "review_item": "HSM_SATL_0020" }, + { "ts": "00:05:39", "review_item": "HSM_SATL_0050" }, + { "ts": "00:08:13", "review_item": "HSM_SATL_0010" } +] diff --git a/sample_dailies_dataset/transcript.json b/sample_dailies_dataset/transcript.json index 551c984c..361e5c2c 100644 --- a/sample_dailies_dataset/transcript.json +++ b/sample_dailies_dataset/transcript.json @@ -1,5 +1,5 @@ { - "session_id": "demo_dailies_2025_10_03", + "session_id": "demo_dailies_2025_10_02", "utterances": [ { "ts": "00:00:00",