From 73ddcb622282660f29f79bde4ae5b7eaaaadda0c Mon Sep 17 00:00:00 2001 From: Jorge Rivera Date: Thu, 11 Jun 2026 15:21:12 +0200 Subject: [PATCH] fix(loading): treat uncached remote deps as soft; correct OECD DAC notice (0.1.1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Partial remote caches are first-class: declared dependencies on remote packs absent from the local cache are skipped during dependency validation and explicit-mode queuing, instead of raising MissingModuleDependencyError from Resolver.auto() or transitively queuing ~796 MB of siblings. Bundled, cached, and unknown dependencies remain hard errors. NOTICE.md: OECD distributes content under CC BY 4.0 (commercial use permitted) since July 2024 — the previous wording claimed a non-commercial restriction. Documents the DAC contribution to the org entity store and fixes the upstream URL. --- CHANGELOG.md | 16 ++ docs/getting-started/install.md | 2 +- docs/reference/resolver.md | 2 +- pyproject.toml | 2 +- src/resolvekit/NOTICE.md | 25 +- .../core/api/loading/module_catalog.py | 55 +++- tests/core/test_partial_remote_cache.py | 237 ++++++++++++++++++ uv.lock | 2 +- 8 files changed, 329 insertions(+), 12 deletions(-) create mode 100644 tests/core/test_partial_remote_cache.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 34c94c9..02c2a33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,21 @@ # Changelog +## 0.1.1 (2026-06-11) + +**Fixed.** Partial remote caches no longer error: after downloading a single +remote tier (e.g. `download("geo.admin1")`), `Resolver.auto()` raised +`MissingModuleDependencyError` demanding sibling tiers, and explicitly +requesting one tier via `module_ids` transitively queued every declared +sibling (a ~796 MB download). Declared dependencies on remote packs that are +not in the local cache are now skipped during loading and validation; bundled +and cached dependencies are still enforced. + +**Docs.** Corrected the OECD DAC entry in NOTICE.md: OECD distributes its +content under CC BY 4.0 (attribution, commercial use permitted) since July +2024 — the previous wording incorrectly claimed a non-commercial restriction. +Also documented the DAC contribution to the org entity store and updated the +upstream URL. + ## 0.1.0 (2026-06-11) First public beta release. diff --git a/docs/getting-started/install.md b/docs/getting-started/install.md index 6c8f981..4bba8f8 100644 --- a/docs/getting-started/install.md +++ b/docs/getting-started/install.md @@ -75,7 +75,7 @@ rk.download("geo.admin1") # ~12 MB; verifies checksum, then marks is_available ```python >>> import resolvekit as rk >>> rk.__version__ -'0.1.0' +'0.1.1' >>> rk.resolve_id("United States") 'country/USA' ``` diff --git a/docs/reference/resolver.md b/docs/reference/resolver.md index a3335f9..efb875a 100644 --- a/docs/reference/resolver.md +++ b/docs/reference/resolver.md @@ -660,7 +660,7 @@ Resolver.lite().domains # ['geo'] ```python r.info.data_version # "2026.06" -r.info.resolvekit_version # "0.1.0" +r.info.resolvekit_version # "0.1.1" r.info.domains # ("geo", "org") r.info.routing_mode # "auto" r.info.closed # False diff --git a/pyproject.toml b/pyproject.toml index b60aa2a..821d843 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "resolvekit" -version = "0.1.0" +version = "0.1.1" description = "Entity and place resolution system that maps messy place/entity strings and codes to canonical entities" requires-python = ">=3.12" diff --git a/src/resolvekit/NOTICE.md b/src/resolvekit/NOTICE.md index bfcd860..4ff7f2b 100644 --- a/src/resolvekit/NOTICE.md +++ b/src/resolvekit/NOTICE.md @@ -43,12 +43,25 @@ license requires attribution, that attribution is given below. ## OECD DAC (Development Assistance Committee) -- **Contributes:** DAC country codes and recipient classifications used in the - geo entity store. -- **Upstream:** https://www.oecd.org/dac/financing-sustainable-development/development-finance-standards/dacandcrscodelists.htm -- **License:** OECD terms of use — data is freely available for non-commercial - and research use with attribution. - https://www.oecd.org/termsandconditions/ +- **Contributes:** DAC recipient, provider, channel, and agency codes with + their English/French names and an ISO3 crosswalk. These populate DAC codes + on countries and regions in the geo entity store and provider and + government-agency entities in the org entity store. +- **Upstream:** https://development-finance-codelists.oecd.org/ (DAC and CRS + code lists) +- **License:** OECD Terms and Conditions. Since 1 July 2024 most OECD data and + content is published under Creative Commons Attribution 4.0 International + (CC BY 4.0), which permits reuse — including commercial use — with + attribution; earlier content is available on terms similar to CC BY 4.0. + https://www.oecd.org/en/about/terms-conditions.html + https://creativecommons.org/licenses/by/4.0/ +- **Modifications:** only the codelists (codes, names, and the ISO3 crosswalk) + were extracted and repackaged; no OECD statistical, financial, or aid-flow + data is redistributed. +- **Attribution required:** OECD DAC codelists are used under the OECD Terms + and Conditions (CC BY 4.0). Source: OECD Development Assistance Committee + (DAC). The OECD logo and branding are not covered by this license and are + not used. ## HDX Python Country / UN M49 diff --git a/src/resolvekit/core/api/loading/module_catalog.py b/src/resolvekit/core/api/loading/module_catalog.py index 802052f..37f7865 100644 --- a/src/resolvekit/core/api/loading/module_catalog.py +++ b/src/resolvekit/core/api/loading/module_catalog.py @@ -134,12 +134,41 @@ def _load_and_separate_datapacks( return base_packs, overlay_packs +def _remote_dependency_uncached( + module_id: str, + available: dict[str, Path], + manifest_overrides: dict[str, dict[str, object]], +) -> bool: + """Return True if ``module_id`` is a remote-distribution module whose data + is not in the local cache. + + Such a dependency may be absent from a load set without error: remote + packs the user hasn't downloaded are not hard errors (partial caches are + first-class, and ``Resolver.auto()`` never triggers a network fetch). + Bundled dependencies and unknown module ids stay hard errors. + """ + from resolvekit.core.module_registry import load_module_metadata + from resolvekit.core.remote import is_cached + + if module_id not in available: + return False + path = available[module_id] + if not (path / "metadata.json").exists(): + return False + metadata = load_module_metadata(module_id, path, overrides=manifest_overrides) + return metadata.distribution == "remote" and not is_cached(metadata) + + def _validate_module_dependencies( base_packs: dict[str, LoadedDataPack], overlay_packs: dict[str, LoadedDataPack], pack_filter: set[str], ) -> None: + from resolvekit.core.module_registry import get_manifest_overrides + available_module_ids = set(base_packs) | set(overlay_packs) + registry: dict[str, Path] | None = None + manifest_overrides: dict[str, dict[str, object]] | None = None for loaded in [*base_packs.values(), *overlay_packs.values()]: if pack_filter and loaded.pack_id not in pack_filter: continue @@ -149,6 +178,22 @@ def _validate_module_dependencies( for module_id in loaded.metadata.module_dependencies if module_id not in available_module_ids ] + if missing: + # An absent dependency is only a hard error when its data could + # have been loaded — a declared dep on a remote pack the user + # hasn't downloaded is skipped, mirroring the auto-mode intent in + # _resolve_requested_module_paths. + if registry is None: + registry = list_available_modules() + manifest_overrides = get_manifest_overrides() + assert manifest_overrides is not None + missing = [ + module_id + for module_id in missing + if not _remote_dependency_uncached( + module_id, registry, manifest_overrides + ) + ] if missing: raise MissingModuleDependencyError(loaded.module_id, missing) @@ -250,8 +295,10 @@ def _resolve_requested_module_paths( continue # In auto mode, silently skip dependencies whose data isn't # locally available (remote packs the user hasn't downloaded - # are not hard errors — see v1-scope §225). In explicit mode - # we always queue them so the queue-loop's own + # are not hard errors — see v1-scope §225). In explicit mode, + # likewise skip remote-uncached dependencies so requesting one + # module never transitively forces sibling downloads; unknown + # module ids are still queued so the queue-loop's own # ``module_id not in available`` check raises # ``DataModuleNotFoundError``. if auto_mode and ( @@ -261,6 +308,10 @@ def _resolve_requested_module_paths( ) ): continue + if not auto_mode and _remote_dependency_uncached( + dependency, available, manifest_overrides + ): + continue queue.append(dependency) if metadata.is_overlay: for dependency in metadata.base_module_ids or []: diff --git a/tests/core/test_partial_remote_cache.py b/tests/core/test_partial_remote_cache.py new file mode 100644 index 0000000..a46f833 --- /dev/null +++ b/tests/core/test_partial_remote_cache.py @@ -0,0 +1,237 @@ +"""Partial remote caches are first-class: a module whose declared dependency +is a remote pack the user hasn't downloaded must load without error, and an +explicit request for one module must not transitively queue (or download) its +remote siblings. + +Regression tests for the 0.1.0 failure where ``download("geo.admin1")`` +followed by ``Resolver.auto()`` raised ``MissingModuleDependencyError`` +demanding geo.admin2/geo.admin3/geo.cities. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from resolvekit.core.api.loading import module_catalog +from resolvekit.core.datapack import DataPackMetadata, LoadedDataPack +from resolvekit.core.errors import MissingModuleDependencyError + + +def _metadata( + module_id: str, + *, + distribution: str = "bundled", + module_dependencies: list[str] | None = None, +) -> DataPackMetadata: + kwargs: dict = { + "datapack_id": f"{module_id}-v2026.1", + "module_id": module_id, + "domain_pack_id": module_id.split(".", maxsplit=1)[0], + "entity_schema_version": "1.0", + "feature_schema_version": f"{module_id.split('.', maxsplit=1)[0]}.features.v1", + "build_timestamp": "2026-01-01T00:00:00Z", + "distribution": distribution, + "module_dependencies": module_dependencies or [], + } + if distribution == "remote": + from resolvekit.core.datapack import RemoteArtifactSpec + + kwargs["remote_artifacts"] = { + "sqlite": RemoteArtifactSpec( + url=f"https://example.com/{module_id}.sqlite.gz", + sha256="0" * 64, + gz_sha256="0" * 64, + size_mb=1.0, + ), + } + return DataPackMetadata(**kwargs) + + +@pytest.fixture() +def fake_registry(tmp_path: Path, monkeypatch: pytest.MonkeyPatch): + """A synthetic module registry with controllable metadata and cache state. + + Returns a helper to register modules; patches list_available_modules, + load_module_metadata, and is_cached accordingly. + """ + paths: dict[str, Path] = {} + metadatas: dict[str, DataPackMetadata] = {} + cached: set[str] = set() + + def register( + module_id: str, + *, + distribution: str = "bundled", + module_dependencies: list[str] | None = None, + is_cached: bool = False, + data_present: bool = True, + ) -> DataPackMetadata: + path = tmp_path / module_id.replace(".", "_") + path.mkdir(exist_ok=True) + (path / "metadata.json").touch() + meta = _metadata( + module_id, + distribution=distribution, + module_dependencies=module_dependencies, + ) + if distribution == "bundled" and data_present: + (path / meta.store_file).touch() + if is_cached: + cached.add(module_id) + paths[module_id] = path + metadatas[module_id] = meta + return meta + + def fake_list_available_modules() -> dict[str, Path]: + return dict(paths) + + def fake_load_module_metadata( + module_id: str, path: Path, *, overrides=None + ) -> DataPackMetadata: + return metadatas[module_id] + + def fake_is_cached(metadata: DataPackMetadata) -> bool: + return metadata.module_id in cached + + monkeypatch.setattr( + module_catalog, "list_available_modules", fake_list_available_modules + ) + monkeypatch.setattr( + "resolvekit.core.module_registry.list_available_modules", + fake_list_available_modules, + ) + monkeypatch.setattr( + "resolvekit.core.module_registry.load_module_metadata", + fake_load_module_metadata, + ) + monkeypatch.setattr( + "resolvekit.core.module_registry.get_manifest_overrides", lambda: {} + ) + monkeypatch.setattr("resolvekit.core.remote.is_cached", fake_is_cached) + return register + + +def _loaded_pack(meta: DataPackMetadata, path: Path) -> LoadedDataPack: + return LoadedDataPack(meta, path) + + +class TestValidateModuleDependencies: + def test_remote_uncached_dependency_is_soft(self, fake_registry, tmp_path): + meta = fake_registry( + "geo.admin1", + distribution="remote", + module_dependencies=["geo.admin2", "geo.countries"], + is_cached=True, + ) + fake_registry("geo.admin2", distribution="remote", is_cached=False) + countries = fake_registry("geo.countries") + + base_packs = { + "geo.admin1": _loaded_pack(meta, tmp_path / "geo_admin1"), + "geo.countries": _loaded_pack(countries, tmp_path / "geo_countries"), + } + # geo.admin2 is declared but remote-and-uncached: no error. + module_catalog._validate_module_dependencies(base_packs, {}, set()) + + def test_remote_cached_dependency_still_hard(self, fake_registry, tmp_path): + meta = fake_registry( + "geo.admin1", + distribution="remote", + module_dependencies=["geo.admin2"], + is_cached=True, + ) + fake_registry("geo.admin2", distribution="remote", is_cached=True) + + base_packs = {"geo.admin1": _loaded_pack(meta, tmp_path / "geo_admin1")} + # geo.admin2 is cached, so its absence from the load set is a real + # loading bug and must still raise. + with pytest.raises(MissingModuleDependencyError): + module_catalog._validate_module_dependencies(base_packs, {}, set()) + + def test_bundled_missing_dependency_still_hard(self, fake_registry, tmp_path): + meta = fake_registry( + "org.governments", + module_dependencies=["org.providers"], + ) + fake_registry("org.providers") + + base_packs = { + "org.governments": _loaded_pack(meta, tmp_path / "org_governments") + } + with pytest.raises(MissingModuleDependencyError): + module_catalog._validate_module_dependencies(base_packs, {}, set()) + + def test_unknown_dependency_still_hard(self, fake_registry, tmp_path): + meta = fake_registry( + "geo.admin1", + distribution="remote", + module_dependencies=["geo.nonexistent"], + is_cached=True, + ) + base_packs = {"geo.admin1": _loaded_pack(meta, tmp_path / "geo_admin1")} + with pytest.raises(MissingModuleDependencyError): + module_catalog._validate_module_dependencies(base_packs, {}, set()) + + +class TestResolveRequestedModulePaths: + def test_explicit_request_skips_remote_uncached_deps(self, fake_registry): + fake_registry( + "geo.admin1", + distribution="remote", + module_dependencies=["geo.admin2", "geo.cities", "geo.countries"], + is_cached=True, + ) + fake_registry("geo.admin2", distribution="remote", is_cached=False) + fake_registry("geo.cities", distribution="remote", is_cached=False) + fake_registry("geo.countries") + + resolved = module_catalog._resolve_requested_module_paths(["geo.admin1"]) + assert set(resolved) == {"geo.admin1", "geo.countries"} + + def test_explicit_request_queues_cached_remote_deps(self, fake_registry): + fake_registry( + "geo.admin2", + distribution="remote", + module_dependencies=["geo.admin1"], + is_cached=True, + ) + fake_registry("geo.admin1", distribution="remote", is_cached=True) + + resolved = module_catalog._resolve_requested_module_paths(["geo.admin2"]) + assert set(resolved) == {"geo.admin2", "geo.admin1"} + + def test_auto_mode_skips_uncached_remote_modules(self, fake_registry): + fake_registry( + "geo.admin1", + distribution="remote", + module_dependencies=["geo.admin2", "geo.countries"], + is_cached=True, + ) + fake_registry("geo.admin2", distribution="remote", is_cached=False) + fake_registry("geo.countries") + + resolved = module_catalog._resolve_requested_module_paths(None) + assert set(resolved) == {"geo.admin1", "geo.countries"} + + +@pytest.mark.requires_remote_data +def test_auto_resolver_with_only_admin1_downloaded( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """End-to-end repro of the 0.1.0 bug: download exactly one remote tier + into a fresh cache, then Resolver.auto() must work and resolve through it. + """ + import resolvekit + from resolvekit.core.config import _reset_config + + _reset_config() + monkeypatch.setenv("RESOLVEKIT_CACHE_DIR", str(tmp_path / "cache")) + monkeypatch.setenv("RESOLVEKIT_AUTO_DOWNLOAD", "1") + try: + resolvekit.download("geo.admin1") + result = resolvekit.resolve("Bavaria", as_result=True) + assert result.entity_id is not None + finally: + _reset_config() diff --git a/uv.lock b/uv.lock index 00ec364..4b9fb3e 100644 --- a/uv.lock +++ b/uv.lock @@ -1700,7 +1700,7 @@ wheels = [ [[package]] name = "resolvekit" -version = "0.1.0" +version = "0.1.1" source = { editable = "." } dependencies = [ { name = "packaging" },