diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 12d3319..a425355 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -89,63 +89,11 @@ jobs: python -m pip install --upgrade pip pip install -c backend/constraints-${{ matrix.lancedb }}.txt \ -r backend/requirements.txt - pip install httpx # Required for TestClient + # httpx is required by TestClient; <0.28 keeps the app= kwarg + # that the starlette version pinned by fastapi 0.104 still uses + pip install pytest "httpx<0.28" - - name: Debug dependency versions + - name: Run API endpoint tests run: | cd backend - python -c " - import lancedb - import pyarrow - import fastapi - import starlette - from fastapi.testclient import TestClient - import inspect - - print(f'=== Lance {lancedb.__version__} Dependencies ===') - print(f'LanceDB: {lancedb.__version__}') - print(f'PyArrow: {pyarrow.__version__}') - print(f'FastAPI: {fastapi.__version__}') - print(f'Starlette: {starlette.__version__}') - - print(f'\\n=== TestClient signature ===') - sig = inspect.signature(TestClient.__init__) - print(f'TestClient.__init__{sig}') - - print(f'\\n=== App module structure ===') - import app - print(f'app module type: {type(app)}') - if hasattr(app, 'app'): - print(f'app.app type: {type(app.app)}') - print(f'app.app class: {app.app.__class__.__name__}') - else: - print('No app.app attribute found') - " - - - name: Test API endpoints - run: | - cd backend - python -c " - import app - import lancedb - import pyarrow - from fastapi.testclient import TestClient - - # Print version information first - print(f'Testing with LanceDB {lancedb.__version__}, PyArrow {pyarrow.__version__}') - - # Test health endpoint only - skip TestClient for now - # response = client.get('/healthz') - # assert response.status_code == 200 - # assert response.json()['ok'] == True - print('✓ Health check skipped (debugging TestClient)') - - # Test datasets endpoint (will fail without data but should not crash) - # try: - # response = client.get('/datasets') - # print('✓ Datasets endpoint accessible') - # except Exception as e: - # print(f'✓ Datasets endpoint handled error gracefully: {e}') - - print('✓ Debug completed - TestClient investigation needed') - " \ No newline at end of file + python -m pytest tests/ -v diff --git a/CHANGELOG.md b/CHANGELOG.md index cdabc4f..032e93b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- API endpoint test suite (pytest + FastAPI TestClient) covering all six endpoints, pagination, column filtering, value serialization, and corrupted-dataset handling. The CI test job now runs it against every supported Lance version (#28). + ### Fixed - Frontend fetch calls now check `response.ok` before parsing JSON, so HTTP error responses surface as error states instead of being parsed as data (#27). +- `/rows` with unknown column names and `/vector/preview` with a missing or non-vector column now return 400 as intended; the error was previously masked as a generic 500 (#28). ## [0.2.0] - 2026-04-16 diff --git a/backend/app.py b/backend/app.py index 815ea55..e399d54 100644 --- a/backend/app.py +++ b/backend/app.py @@ -341,6 +341,8 @@ async def get_dataset_rows( "offset": offset } + except HTTPException: + raise except Exception as e: logger.error(f"Error getting rows for {dataset_name}: {e}") raise HTTPException(status_code=500, detail="Failed to get dataset rows") @@ -392,6 +394,8 @@ async def get_vector_preview( return {"stats": stats, "preview": preview} + except HTTPException: + raise except Exception as e: logger.error(f"Error getting vector preview for {dataset_name}.{column}: {e}") raise HTTPException(status_code=500, detail="Failed to get vector preview") diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py new file mode 100644 index 0000000..84f5f69 --- /dev/null +++ b/backend/tests/conftest.py @@ -0,0 +1,84 @@ +"""Shared fixtures: a temporary Lance database and a FastAPI test client. + +Run from the backend directory: + + python -m pytest tests/ -v + +The sample data is written once per session with whatever lancedb/pyarrow +versions are installed, so the same tests run against every constraints file. +""" + +import os +import sys +from pathlib import Path + +import lancedb +import pyarrow as pa +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +ROWS = 10 +VEC_DIM = 4 +CLIP_DIM = 512 + + +def _sample_table() -> pa.Table: + ids = list(range(ROWS)) + texts = [None if i == 3 else f"row {i}" for i in ids] + scores = [i * 1.5 for i in ids] + blobs = [b"hello" if i % 2 == 0 else b"\xff\xfe\x01\x02" for i in ids] + vecs = [None if i == 5 else [float(i), -1.0, 0.5, 2.0] for i in ids] + unit = 1.0 / CLIP_DIM ** 0.5 + embeddings = [[unit] * CLIP_DIM for _ in ids] + + return pa.table({ + "id": pa.array(ids, type=pa.int64()), + "text": pa.array(texts, type=pa.string()), + "score": pa.array(scores, type=pa.float64()), + "blob": pa.array(blobs, type=pa.binary()), + "vec": pa.array(vecs, type=pa.list_(pa.float32())), + "embedding": pa.array(embeddings, type=pa.list_(pa.float32(), CLIP_DIM)), + }) + + +def _corrupt_table(db_dir: Path, name: str) -> None: + """Overwrite the data fragments of a table so reads fail but the + manifest stays intact and open_table() still succeeds.""" + table_dir = db_dir / f"{name}.lance" + data_files = [p for p in table_dir.rglob("*.lance") if p.is_file()] + assert data_files, f"no data files found under {table_dir}" + for path in data_files: + path.write_bytes(b"not a lance data file") + + +@pytest.fixture(scope="session") +def data_dir(tmp_path_factory): + path = tmp_path_factory.mktemp("lance-data") + db = lancedb.connect(str(path)) + db.create_table("sample", _sample_table()) + db.create_table("broken", pa.table({"id": pa.array([1, 2, 3], type=pa.int64())})) + _corrupt_table(path, "broken") + return path + + +@pytest.fixture(scope="session") +def vec_nulls_preserved(data_dir): + """Lance format v1 (lancedb 0.3.x/0.5) stores a null list as an empty + list. Detect what the installed version actually does so tests can + assert the matching serialization.""" + db = lancedb.connect(str(data_dir)) + values = db.open_table("sample").to_arrow().column("vec").to_pylist() + return values[5] is None + + +@pytest.fixture(scope="session") +def client(data_dir): + # DATA_PATH is read at import time, so set it before importing app + os.environ["DATA_PATH"] = str(data_dir) + import app as app_module + app_module.DATA_PATH = data_dir + + from fastapi.testclient import TestClient + with TestClient(app_module.app) as test_client: + yield test_client diff --git a/backend/tests/test_api.py b/backend/tests/test_api.py new file mode 100644 index 0000000..bd5db28 --- /dev/null +++ b/backend/tests/test_api.py @@ -0,0 +1,258 @@ +"""API endpoint tests, based on docs/spec.md. + +Covers /healthz, /datasets, /schema, /columns, /rows (pagination, column +filtering, serialization), /vector/preview, and the graceful-degradation +path for unreadable datasets. +""" + +import base64 + +import lancedb +import pytest +from packaging.version import parse as parse_version + +from conftest import CLIP_DIM, ROWS, VEC_DIM + + +# /healthz + +def test_healthz_reports_versions(client): + response = client.get("/healthz") + assert response.status_code == 200 + body = response.json() + assert body["ok"] is True + assert body["lancedb_version"] == lancedb.__version__ + assert body["build_tag"] == f"app-{body['app_version']}_lancedb-{lancedb.__version__}" + + +def test_healthz_compat_flags(client): + compat = client.get("/healthz").json()["compat"] + installed = parse_version(lancedb.__version__) + assert compat["vector_preview"] is True + assert compat["schema_evolution"] == (installed >= parse_version("0.5")) + assert compat["lance_v2_format"] == (installed >= parse_version("0.16")) + + +# /datasets + +def test_datasets_lists_created_tables(client): + response = client.get("/datasets") + assert response.status_code == 200 + names = response.json()["datasets"] + assert "sample" in names + assert "broken" in names + + +# /datasets/{name}/schema + +def test_schema_fields(client): + response = client.get("/datasets/sample/schema") + assert response.status_code == 200 + body = response.json() + assert "metadata" in body + fields = {f["name"]: f for f in body["fields"]} + assert set(fields) == {"id", "text", "score", "blob", "vec", "embedding"} + assert fields["id"]["type"] == "int64" + assert fields["id"]["nullable"] is True + assert fields["vec"]["type"] == "list" + assert fields["embedding"]["type"].startswith("fixed_size_list") + assert str(CLIP_DIM) in fields["embedding"]["type"] + + +def test_schema_vector_dim_only_on_vector_fields(client): + fields = {f["name"]: f for f in client.get("/datasets/sample/schema").json()["fields"]} + assert "vector_dim" in fields["vec"] + assert "vector_dim" in fields["embedding"] + assert "vector_dim" not in fields["id"] + assert "vector_dim" not in fields["text"] + + +def test_schema_invalid_dataset_name(client): + assert client.get("/datasets/bad.name/schema").status_code == 400 + + +def test_schema_missing_dataset(client): + assert client.get("/datasets/nosuchtable/schema").status_code == 500 + + +def test_schema_readable_on_corrupted_dataset(client): + response = client.get("/datasets/broken/schema") + assert response.status_code == 200 + assert [f["name"] for f in response.json()["fields"]] == ["id"] + + +# /datasets/{name}/columns + +def test_columns_vector_flags(client): + response = client.get("/datasets/sample/columns") + assert response.status_code == 200 + columns = {c["name"]: c for c in response.json()["columns"]} + assert columns["vec"]["is_vector"] is True + assert columns["vec"]["dim"] is None + assert columns["embedding"]["is_vector"] is True + assert columns["id"]["is_vector"] is False + assert "dim" not in columns["id"] + + +def test_columns_invalid_dataset_name(client): + assert client.get("/datasets/bad.name/columns").status_code == 400 + + +# /datasets/{name}/rows + +def test_rows_defaults(client): + response = client.get("/datasets/sample/rows") + assert response.status_code == 200 + body = response.json() + assert body["total"] == ROWS + assert body["limit"] == 50 + assert body["offset"] == 0 + assert len(body["rows"]) == ROWS + + +def test_rows_pagination(client): + body = client.get("/datasets/sample/rows", params={"limit": 3, "offset": 0}).json() + assert [r["id"] for r in body["rows"]] == [0, 1, 2] + assert body["total"] == ROWS + assert body["limit"] == 3 + + body = client.get("/datasets/sample/rows", params={"limit": 3, "offset": 8}).json() + assert [r["id"] for r in body["rows"]] == [8, 9] + + +def test_rows_offset_past_end(client): + body = client.get("/datasets/sample/rows", params={"offset": ROWS}).json() + assert body["rows"] == [] + assert body["total"] == ROWS + + +def test_rows_limit_bounds(client): + assert client.get("/datasets/sample/rows", params={"limit": 0}).status_code == 422 + assert client.get("/datasets/sample/rows", params={"limit": 201}).status_code == 422 + assert client.get("/datasets/sample/rows", params={"offset": -1}).status_code == 422 + + +def test_rows_column_filtering(client): + body = client.get("/datasets/sample/rows", params={"columns": "id,text"}).json() + assert all(set(row) == {"id", "text"} for row in body["rows"]) + + +def test_rows_invalid_column_returns_400(client): + response = client.get("/datasets/sample/rows", params={"columns": "id,nope"}) + assert response.status_code == 400 + assert "nope" in response.json()["detail"] + + +def test_rows_invalid_dataset_name(client): + assert client.get("/datasets/bad.name/rows").status_code == 400 + + +def test_rows_missing_dataset(client): + assert client.get("/datasets/nosuchtable/rows").status_code == 500 + + +def test_rows_scalar_serialization(client): + rows = client.get("/datasets/sample/rows").json()["rows"] + assert rows[0]["id"] == 0 + assert rows[0]["text"] == "row 0" + assert rows[0]["score"] == 0.0 + assert rows[1]["score"] == 1.5 + assert rows[3]["text"] is None + + +def test_rows_binary_serialization(client): + rows = client.get("/datasets/sample/rows").json()["rows"] + # UTF-8 decodable bytes come back as text, the rest as base64 + assert rows[0]["blob"] == "hello" + assert rows[1]["blob"] == base64.b64encode(b"\xff\xfe\x01\x02").decode() + + +def test_rows_vector_serialization(client): + rows = client.get("/datasets/sample/rows").json()["rows"] + vec = rows[0]["vec"] + assert vec["type"] == "vector" + assert vec["dim"] == VEC_DIM + assert vec["preview"] == [0.0, -1.0, 0.5, 2.0] + assert vec["norm"] == pytest.approx(5.25 ** 0.5) + assert vec["min"] == -1.0 + assert vec["max"] == 2.0 + assert vec["mean"] == pytest.approx(0.375) + + +def test_rows_null_vector(client, vec_nulls_preserved): + rows = client.get("/datasets/sample/rows").json()["rows"] + if vec_nulls_preserved: + assert rows[5]["vec"] is None + else: + # storage turned the null into an empty list, which serializes + # to the invalid-vector error object + assert rows[5]["vec"] == {"type": "vector", "error": "Invalid vector data"} + + +def test_rows_clip_detection(client): + embedding = client.get("/datasets/sample/rows").json()["rows"][0]["embedding"] + assert embedding["type"] == "vector" + assert embedding["dim"] == CLIP_DIM + assert embedding["model"] == "likely_clip" + assert len(embedding["preview"]) == 32 + assert embedding["norm"] == pytest.approx(1.0, abs=1e-3) + assert embedding["stats"]["normalized"] is True + assert embedding["stats"]["sparsity"] == 0.0 + assert embedding["stats"]["positive_ratio"] == 1.0 + + +def test_rows_graceful_degradation_on_corrupted_dataset(client): + response = client.get("/datasets/broken/rows") + assert response.status_code == 200 + body = response.json() + assert body["total"] == 1 + row = body["rows"][0] + assert row["error"] == "Unable to read dataset" + assert row["dataset"] == "broken" + assert row["details"].startswith("Error:") + + +# /datasets/{name}/vector/preview + +def test_vector_preview_stats(client, vec_nulls_preserved): + response = client.get("/datasets/sample/vector/preview", params={"column": "vec"}) + assert response.status_code == 200 + body = response.json() + # the null vector is filtered from the count; on format v1 it comes + # back as an empty list instead, which counts but adds no values + assert body["stats"]["count"] == (ROWS - 1 if vec_nulls_preserved else ROWS) + assert body["stats"]["dim"] == VEC_DIM + assert body["stats"]["min"] == -1.0 + assert body["stats"]["max"] == 9.0 + assert body["stats"]["mean"] == pytest.approx(53.5 / 36) + assert len(body["preview"]) == ROWS - 1 + first = body["preview"][0] + assert first["sample"] == [0.0, -1.0, 0.5, 2.0] + assert first["norm"] == pytest.approx(5.25 ** 0.5) + + +def test_vector_preview_sample_capped_at_32(client): + body = client.get( + "/datasets/sample/vector/preview", params={"column": "embedding"} + ).json() + assert body["stats"]["dim"] == CLIP_DIM + assert all(len(p["sample"]) == 32 for p in body["preview"]) + + +def test_vector_preview_non_vector_column(client): + response = client.get("/datasets/sample/vector/preview", params={"column": "score"}) + assert response.status_code == 400 + + +def test_vector_preview_missing_column(client): + response = client.get("/datasets/sample/vector/preview", params={"column": "nope"}) + assert response.status_code == 400 + + +def test_vector_preview_column_param_required(client): + assert client.get("/datasets/sample/vector/preview").status_code == 422 + + +def test_vector_preview_invalid_dataset_name(client): + response = client.get("/datasets/bad.name/vector/preview", params={"column": "vec"}) + assert response.status_code == 400