diff --git a/plan.md b/plan.md new file mode 100644 index 000000000..4b8173e71 --- /dev/null +++ b/plan.md @@ -0,0 +1,493 @@ +# Plan: Replace Custom Cache with CacheControl + +Fixes https://github.com/python-jsonschema/check-jsonschema/issues/668 + +## Problem + +The current cache implementation in `cachedownloader.py` uses only the +`Last-Modified` header for cache validation. When a server omits that header +(but provides `ETag` or `Cache-Control`), `_lastmod_from_response()` returns +`0.0` and `_cache_hit()` always returns `True`, so the cache never invalidates. + +## Solution + +Replace the hand-rolled mtime-based cache logic with the +[CacheControl](https://github.com/psf/cachecontrol) library (v0.14, maintained +under the PSF). CacheControl wraps a `requests.Session` and implements full HTTP +caching semantics — `ETag`/`If-None-Match`, `Last-Modified`/`If-Modified-Since`, +and `Cache-Control` headers — transparently. On-disk persistence uses its +`FileCache` backend (backed by `filelock` for concurrency safety). 
+ +## Behavior Changes + +| Aspect | Before | After | +|---|---|---| +| ETag support | None | Full (`If-None-Match` / 304) | +| `Cache-Control` header | Ignored | Respected (`max-age`, `no-cache`, `no-store`) | +| `Last-Modified` | Only mechanism (buggy when absent) | Supported via `If-Modified-Since` / 304 | +| No caching headers at all | Cache never invalidates (bug) | Not cached — see note below | +| Existing cache files on upgrade | Used | Ignored (incompatible format) — see open question below | +| Cache file format | Raw content, SHA-256 filename | CacheControl msgpack-serialized HTTP response, SHA-224 filename in 5-level dir tree | +| `--no-cache` flag | Unchanged | Unchanged — plain session, no caching | +| Retry on error / bad data | Unchanged | Unchanged — custom retry loop preserved | +| Concurrency safety | `_atomic_write` (tempfile + shutil.copy) | `filelock` (via `FileCache`) | + +### Servers with no caching headers + +When a server provides no caching headers at all (no `ETag`, no `Last-Modified`, +no `Cache-Control`, no `Expires`), the current code caches the response forever +due to the `_lastmod_from_response` returning `0.0` bug. After this change, such +responses will not be cached, following correct HTTP semantics. In practice, +nearly all real-world schema endpoints provide at least one caching header. + +CacheControl has built-in heuristic support for overriding this behavior if +needed. The `ExpiresAfter` heuristic (bundled, not custom) can apply a TTL to all +responses. For a conditional variant that only applies to responses lacking +server-provided caching headers, the documented `BaseHeuristic` extensibility +point allows a small subclass (~10 lines). Neither is a hack — heuristics are a +first-class CacheControl feature designed for exactly this purpose, and +[RFC 7234](https://tools.ietf.org/html/rfc7234) explicitly permits caching +systems to use heuristics for responses that lack caching headers. 
+ +### Open question: clean up old cache files? + +CacheControl uses the same cache directories (`check_jsonschema/schemas/`, +`check_jsonschema/refs/`) but a different file layout (5-level SHA-224 tree vs. +flat SHA-256 files). Old cache files will sit alongside the new CacheControl +tree, ignored but taking up space. + +Options: +1. **Do nothing** (current default) — old files are harmless. Users can manually + delete the cache directory if they want to reclaim space. +2. **Delete on first run** — on `_build_session()`, detect and remove files + matching the old naming pattern (64-char hex + optional extension, no + subdirectories). Risk: could delete user files if they happen to match. +3. **One-time migration warning** — log a message suggesting users clear their + cache directory. Non-invasive but adds noise. + +**Proceeding with option 1 (do nothing) unless revisited.** + +--- + +## 1. Dependency Changes + +**File: `pyproject.toml`** (line 39-46) + +Add `CacheControl[filecache]` to runtime dependencies: + +```toml +dependencies = [ + 'tomli>=2.0;python_version<"3.11"', + "ruamel.yaml>=0.18.10,<0.20.0", + "jsonschema>=4.18.0,<5.0", + "regress>=2024.11.1", + "requests<3.0", + "click>=8,<9", + "CacheControl[filecache]>=0.14,<0.15", +] +``` + +The `[filecache]` extra pulls in `filelock`. CacheControl also depends on +`msgpack` (for response serialization) and `requests` (already a dependency). + +--- + +## 2. Rewrite `src/check_jsonschema/cachedownloader.py` + +### Remove + +| Symbol | Lines | Reason | +|---|---|---| +| `_LASTMOD_FMT` | 16 | CacheControl handles Last-Modified natively | +| `_lastmod_from_response()` | 45-54 | Same | +| `_cache_hit()` | 88-97 | CacheControl handles freshness and conditional requests | +| `_atomic_write()` | 77-85 | FileCache uses `filelock` for concurrency | + +### Preserve existing imports + +`from __future__ import annotations` (line 1) stays. `hashlib` stays (used by +`url_to_cache_filename`). 
`platform` stays (used by `_base_cache_dir`). + +### Remove imports no longer needed + +`calendar`, `shutil`, `tempfile`, `time` — verify each is truly unused after +the rewrite. + +### Add imports (top-level, third-party group) + +```python +import cachecontrol +from cachecontrol.caches.file_cache import FileCache +``` + +These go in the third-party import group alongside `import requests`, following +the project's PEP 8 / isort convention (no lazy imports in production code). + +### Keep unchanged + +| Symbol | Lines | Reason | +|---|---|---| +| `_base_cache_dir()` | 19-35 | Still needed for platform-specific cache path | +| `_resolve_cache_dir()` | 38-42 | Still needed | +| `url_to_cache_filename()` | 100-116 | Useful utility; not used by CacheControl but no reason to remove | +| `FailedDownloadError` | 119-120 | Still raised by retry logic | + +### Modify `_get_request()` (lines 57-74) + +- Add `session: requests.Session` as the first parameter. +- Call `session.get(file_url)` instead of `requests.get(file_url, stream=True)`. +- Remove `stream=True` — CacheControl needs the full response to cache it, and + schemas are small. +- Keep the retry loop (2 retries) and `response_ok` callback — CacheControl does + not provide retry logic. + +**Retry / CacheControl interaction:** When a server returns a 200 with corrupt +body *and* caching headers (e.g., ETag), CacheControl caches the response before +our `response_ok` callback runs. On a naive retry, `session.get()` would serve +the cached corrupt response and every retry would fail identically. To avoid +this, retry attempts (`_attempt > 0`) pass `Cache-Control: no-cache` in the +request headers. This is standard HTTP (RFC 7234 §5.2.1.4) — it tells +CacheControl to revalidate with the server rather than serve from its local +cache. CacheControl implements this natively; no custom logic is needed. 
+ +```python +def _get_request( + session: requests.Session, + file_url: str, + *, + response_ok: t.Callable[[requests.Response], bool], +) -> requests.Response: + num_retries = 2 + r: requests.Response | None = None + for _attempt in range(num_retries + 1): + try: + # On retries, bypass CacheControl's local cache to avoid + # re-serving a cached bad response. + headers = {"Cache-Control": "no-cache"} if _attempt > 0 else {} + r = session.get(file_url, headers=headers) + except requests.RequestException as e: + if _attempt == num_retries: + raise FailedDownloadError("encountered error during download") from e + continue + if r.ok and response_ok(r): + return r + assert r is not None + raise FailedDownloadError( + f"got response with status={r.status_code}, retries exhausted" + ) +``` + +### Rewrite `CacheDownloader` (lines 123-180) + +- `__init__`: store config only. Do **not** build the session eagerly — + `test_default_cache_dir` creates `CacheDownloader` with fake/relative cache + paths purely to assert `_cache_dir` resolution and never makes HTTP requests. + Eager `os.makedirs` + `FileCache()` in `__init__` would create real + directories using those fake paths. +- Build the session lazily via a `_session` property on first access (i.e., + first `open()` call). This also avoids creating sessions that are never used + (e.g., if the schema turns out to be local after all). +- Remove `_download()` entirely — CacheControl manages on-disk cache files. +- Simplify `open()` — always yields `io.BytesIO(response.content)`. No + branching needed: when cache is enabled, `self._session` is a + CacheControl-wrapped session (handles caching transparently); when disabled, + it is a plain `Session`. +- `bind()` — unchanged API. 
+ +CacheControl imports go at the **top of the module** alongside `requests`, +following the project convention (no lazy/deferred imports in production code): + +```python +import cachecontrol +from cachecontrol.caches.file_cache import FileCache +``` + +```python +class CacheDownloader: + def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None: + self._cache_dir = _resolve_cache_dir(cache_dir) + self._disable_cache = disable_cache + self._cached_session: requests.Session | None = None + + @property + def _session(self) -> requests.Session: + if self._cached_session is None: + self._cached_session = self._build_session() + return self._cached_session + + def _build_session(self) -> requests.Session: + session = requests.Session() + if self._cache_dir and not self._disable_cache: + os.makedirs(self._cache_dir, exist_ok=True) + session = cachecontrol.CacheControl( + session, cache=FileCache(self._cache_dir) + ) + return session + + @contextlib.contextmanager + def open( + self, + file_url: str, + filename: str, + validate_response: t.Callable[[requests.Response], bool], + ) -> t.Iterator[t.IO[bytes]]: + response = _get_request( + self._session, file_url, response_ok=validate_response + ) + yield io.BytesIO(response.content) + + def bind( + self, + file_url: str, + filename: str | None = None, + validation_callback: t.Callable[[bytes], t.Any] | None = None, + ) -> BoundCacheDownloader: + return BoundCacheDownloader( + file_url, self, filename=filename, + validation_callback=validation_callback, + ) +``` + +The `filename` parameter in `open()` and `bind()` is kept for API compatibility +but ignored — CacheControl manages its own filenames (SHA-224 hash of normalized +URL in a 5-level directory tree). + +`BoundCacheDownloader.__init__` still computes `self._filename = filename or +url_to_cache_filename(file_url)` (line 193). This is now dead code — the value +is passed through to `CacheDownloader.open()` which ignores it. 
Kept for now +since `test_default_filename_from_uri` asserts on `cd._filename`. Can be cleaned +up in a follow-up. + +### Modify `BoundCacheDownloader._validate_response()` (lines 206-213) + +Skip validation on cache hits. CacheControl sets `response.from_cache = True` on +every response served from cache (either a direct hit or after a 304 +revalidation). This preserves the fix for +[#453](https://github.com/python-jsonschema/check-jsonschema/issues/453) +(validation callback must not run on cache hits). + +```python +def _validate_response(self, response: requests.Response) -> bool: + if not self._validation_callback: + return True + # CacheControl sets from_cache=True on cache hits; skip re-validation + if getattr(response, "from_cache", False): + return True + try: + self._validation_callback(response.content) + return True + except ValueError: + return False +``` + +--- + +## 3. Changes to Callers + +**No changes** to `readers.py` or `resolver.py`. Both use `CacheDownloader` via +`bind()` and `open()`, whose signatures are unchanged. + +- `readers.py:80` — `CacheDownloader("schemas", disable_cache=...).bind(url, validation_callback=...)` +- `resolver.py:53` — `CacheDownloader("refs", disable_cache=...)` + +Cache subdirectory names (`"schemas"`, `"refs"`) are preserved. + +**Update `cli/main_command.py:96`**: The `--schemafile` help text says schemas +will be `"downloaded and cached locally based on mtime."` Replace with generic +wording (e.g., `"downloaded and cached locally"`) since caching is no longer +mtime-based. + +--- + +## 4. Test Changes + +### Test strategy: trust CacheControl, test integration only + +CacheControl's `FileCache` write mechanism relies on a `CallbackFileWrapper` +that monitors when a response stream is fully consumed. The `responses` mock +library creates `BytesIO` bodies that never report `.closed = True` after +exhaustion, so the cache-write callback never fires. 
**CacheControl never +populates its FileCache when responses come from `responses` mocks.** + +This means tests cannot verify CacheControl's caching behavior (cache hits, +ETag revalidation, max-age expiry) using `responses`. Rather than switching test +infrastructure (e.g., adding a real HTTP server), the test strategy is: + +- **Trust CacheControl** for caching correctness — it is a well-maintained PSF + project with its own test suite. +- **Test our integration**: retry logic, validation callbacks, `--no-cache` flag, + session construction, error handling. +- **Do not assert** on cache file existence, `response.from_cache`, or + `len(responses.calls)` staying constant across requests. + +### `responses` library compatibility + +The `responses` library (v0.26.0) patches `HTTPAdapter.send` at the class level. +CacheControl's `CacheControlAdapter` calls `super().send()`, which hits the +`responses` mock. HTTP request/response flow works normally — only the cache-write +side effect is broken. Tests that use `responses` for mocking HTTP (retry tests, +validation tests, error tests) work fine. + +### `tests/conftest.py` — Fixture changes + +**Update `patch_cache_dir`** (line 67): Works as-is — it monkeypatches +`_base_cache_dir`, which is still called by `_resolve_cache_dir`, which is still +called by `CacheDownloader.__init__`. CacheControl's `FileCache` will use the +patched temp directory. 
+ +**Remove the following fixtures** — they either inject raw cache files (no longer +valid format) or predict cache file paths using `url_to_cache_filename` +(SHA-256), which CacheControl does not use: + +| Fixture | Line | Action | +|---|---|---| +| `url2cachepath` | 77 | Remove — only consumed by the path-prediction fixtures below | +| `inject_cached_download` | 100 | Remove — writes raw content; CacheControl uses msgpack format | +| `inject_cached_ref` | 126 | Remove — same | +| `get_download_cache_loc` | 92 | Remove — path prediction no longer valid | +| `get_ref_cache_loc` | 118 | Remove — same | + +**Keep `downloads_cache_dir` / `refs_cache_dir`** (lines 87, 113) and +**keep `cache_dir`** (line 63) — still useful for asserting cache directory +existence when needed. + +### `tests/unit/test_cachedownloader.py` — Specific changes + +**Update the import block** (lines 10-16): Remove `_cache_hit` and +`_lastmod_from_response` — both symbols are deleted from `cachedownloader.py`. + +**Tests to remove:** + +| Test | Lines | Reason | +|---|---|---| +| `test_cache_hit_by_mtime` | 95-113 | mtime-based caching removed entirely | +| `test_cachedownloader_handles_bad_lastmod_header` | 257-310 | `_lastmod_from_response` removed | +| `test_lastmod_from_header_uses_gmtime` | 352-383 | Same | +| `test_cachedownloader_cached_file` | 116-129 | Tests monkeypatched `_download` method, which is removed | +| `test_cachedownloader_validation_is_not_invoked_on_hit` | 313-345 | Depends on `inject_cached_download` (removed) and `response.from_cache` (not testable with `responses` mocks) | + +**Tests to simplify** (remove `get_download_cache_loc` dependency and file- +existence assertions — these tests are about retry behavior, not caching): + +`test_cachedownloader_on_success` (line 132): +- Remove `get_download_cache_loc` fixture usage and `f.exists()` assertions. +- Keep: verify `cd.open()` returns correct content (`b"{}"`). 
+ +`test_cachedownloader_succeeds_after_few_errors` (line 160): +- Remove `get_download_cache_loc` fixture usage and `f.exists()` assertions. +- Keep: verify `cd.open()` returns correct content after retries. + +`test_cachedownloader_fails_after_many_errors` (line 194): +- Remove `get_download_cache_loc` fixture usage and `f.exists()` assertion. +- Keep: verify `FailedDownloadError` is raised. + +`test_cachedownloader_retries_on_bad_data` (line 225): +- Remove `get_download_cache_loc` fixture usage and `f.exists()` assertions. +- Keep: verify `cd.open()` returns correct content after retrying past bad data. + +`test_cachedownloader_using_alternate_target_dir` (line 149): +- Remove `url2cachepath` / path-based assertions. +- Keep: verify `cd.open()` returns correct content. + +**Tests to keep as-is:** + +| Test | Lines | Notes | +|---|---|---| +| `test_default_filename_from_uri` | 37-39 | `url_to_cache_filename` unchanged | +| `test_default_cache_dir` | 42-92 | `_base_cache_dir` unchanged. Works as-is thanks to lazy session building — `CacheDownloader("downloads")` only stores `_cache_dir`, no `os.makedirs` or `FileCache` until `open()` is called. | + +**New tests to add (integration-focused):** + +| Test | Purpose | +|---|---| +| `test_disable_cache_uses_plain_session` | When `disable_cache=True`, verify `_build_session` returns a plain `requests.Session` (not wrapped by CacheControl). | +| `test_enable_cache_uses_cachecontrol_session` | When `disable_cache=False` and cache dir is valid, verify `_build_session` returns a CacheControl-wrapped session (check for `CacheControlAdapter` on the session). | +| `test_cache_dir_none_uses_plain_session` | When `_resolve_cache_dir` returns `None` (e.g., Windows with no env vars), verify `_build_session` returns a plain session. 
| + +**Aspirational cache-behavior tests** (require a real HTTP server or alternative +to `responses`; not included in this PR but tracked for follow-up): + +| Test | Purpose | +|---|---| +| `test_etag_cache_revalidation` | Server provides `ETag`, no `Last-Modified`. Second request sends `If-None-Match`, gets 304. | +| `test_cache_control_max_age` | Server provides `Cache-Control: max-age=3600`. Second request served from cache. | +| `test_last_modified_revalidation` | Server provides `Last-Modified`. Second request sends `If-Modified-Since`. | +| `test_no_caching_headers_not_cached` | Server provides no caching headers. Response is not cached. | + +### `tests/acceptance/test_remote_ref_resolution.py` + +`test_remote_ref_resolution_cache_control` (line 68): +- Remove `get_ref_cache_loc` fixture dependency and file-path assertions. +- When `disable_cache=False`: verify the command succeeds (exit code 0). + Cache behavior is trusted to CacheControl. +- When `disable_cache=True`: verify the command succeeds (exit code 0). + +`test_remote_ref_resolution_loads_from_cache` (line 105): +- Remove `inject_cached_ref` and `get_ref_cache_loc` fixture dependencies. +- This test's premise (inject good cache data, serve bad HTTP, verify cache is + used) cannot be replicated without direct FileCache manipulation. **Remove + this test.** Cache-hit behavior is trusted to CacheControl. + +`test_remote_ref_resolution_callout_count_is_scale_free_in_instancefiles` +(line 251): +- Should work as-is. Within-run deduplication is handled by `lru_cache` on + `_get_validator` and the in-memory `ResourceCache` — not by the file cache. + CacheControl's file cache only affects cross-run persistence. + +### `tests/acceptance/test_nonjson_schema_handling.py` + +`test_can_load_remote_yaml_schema_ref_from_cache` (line 140): +- Remove `inject_cached_ref` fixture dependency. This test's premise (inject + good cache, serve bad HTTP) cannot be replicated. 
**Remove this test.** + Cache-hit behavior is trusted to CacheControl. + +--- + +## 5. Pytest Warning Filter + +CacheControl's `FileCache` uses `filelock` for concurrency safety. When connection +errors occur during tests (e.g., `test_cachedownloader_fails_after_many_errors`), +filelock may leave temporary files open, triggering a `ResourceWarning` that pytest +converts to `PytestUnraisableExceptionWarning`. + +This is not a bug in our code — it's a side effect of how filelock handles error +conditions. Add a filter to `pyproject.toml` to ignore this specific warning: + +```toml +[tool.pytest.ini_options] +filterwarnings = [ + # ... existing filters ... + # filelock (used by CacheControl) may leave temp files open during connection errors + 'ignore:Exception ignored in.*FileIO.*:pytest.PytestUnraisableExceptionWarning', +] +``` + +**TODO:** Investigate whether this warning can be avoided rather than suppressed. + +**Investigation results (2026-03-27):** +- The warning originates from CacheControl's `CallbackFileWrapper` class in + `filewrapper.py`, not filelock. +- `CallbackFileWrapper.__init__` creates a `NamedTemporaryFile` (line 37), which + is only closed in `_close()` when the response is fully read (line 105-106). +- When a `requests.ConnectionError` occurs, the temp file is never closed because + `_close()` is never called. +- No existing upstream issues found in CacheControl or filelock for this specific + problem. +- The `CallbackFileWrapper` class lacks a `__del__` method or context manager + protocol that would ensure cleanup on exceptions. +- **Recommendation:** File an upstream issue with CacheControl suggesting that + `CallbackFileWrapper` implement `__del__` to close `self.__buf` if not already + closed, or use a weak reference callback to ensure cleanup. + +--- + +## 6. Execution Order + +1. Add `CacheControl[filecache]` dependency to `pyproject.toml` +2. Rewrite `cachedownloader.py` +3. Update `conftest.py` fixtures +4. 
Rewrite/update unit tests in `test_cachedownloader.py` +5. Update acceptance tests +6. Add pytest warning filter for filelock ResourceWarning +7. Run full test suite, fix any `responses` library compatibility issues +8. Commit, push, open PR diff --git a/pyproject.toml b/pyproject.toml index 75a2fce27..a1c43ce2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "regress>=2024.11.1", "requests<3.0", "click>=8,<9", + "CacheControl[filecache]>=0.14,<0.15", ] [project.readme] @@ -80,6 +81,12 @@ filterwarnings = [ "error", # dateutil has a Python 3.12 compatibility issue. 'ignore:datetime\.datetime\.utcfromtimestamp\(\) is deprecated:DeprecationWarning', + # CacheControl's CallbackFileWrapper (filewrapper.py) creates a NamedTemporaryFile + # in __init__ that is only closed when the response is fully read. When a + # requests.ConnectionError occurs, _close() is never called and the temp file + # leaks. This is a CacheControl limitation, not a check-jsonschema bug. + # See plan.md section "5. Pytest Warning Filter" for details. 
+ 'ignore:Exception ignored in.*FileIO.*:pytest.PytestUnraisableExceptionWarning', ] addopts = [ "--color=yes", diff --git a/src/check_jsonschema/cachedownloader.py b/src/check_jsonschema/cachedownloader.py index 86aad0e62..c75cd932e 100644 --- a/src/check_jsonschema/cachedownloader.py +++ b/src/check_jsonschema/cachedownloader.py @@ -1,19 +1,17 @@ from __future__ import annotations -import calendar import contextlib -import hashlib import io +import logging import os import platform -import shutil -import tempfile -import time import typing as t +import cachecontrol import requests +from cachecontrol.caches.file_cache import FileCache -_LASTMOD_FMT = "%a, %d %b %Y %H:%M:%S %Z" +log = logging.getLogger(__name__) def _base_cache_dir() -> str | None: @@ -42,26 +40,23 @@ def _resolve_cache_dir(dirname: str) -> str | None: return cache_dir -def _lastmod_from_response(response: requests.Response) -> float: - try: - return calendar.timegm( - time.strptime(response.headers["last-modified"], _LASTMOD_FMT) - ) - # OverflowError: time outside of platform-specific bounds - # ValueError: malformed/unparseable - # LookupError: no such header - except (OverflowError, ValueError, LookupError): - return 0.0 - - def _get_request( - file_url: str, *, response_ok: t.Callable[[requests.Response], bool] + session: requests.Session, + file_url: str, + *, + response_ok: t.Callable[[requests.Response], bool], ) -> requests.Response: num_retries = 2 r: requests.Response | None = None for _attempt in range(num_retries + 1): try: - r = requests.get(file_url, stream=True) + # On retries, bypass CacheControl's local cache to avoid + # re-serving a cached bad response. Ideally we'd delete the + # cache entry directly, but CacheControl doesn't expose a public + # API for this (see https://github.com/psf/cachecontrol/issues/219). + # The no-cache header forces revalidation with the origin server. 
+ headers = {"Cache-Control": "no-cache"} if _attempt > 0 else {} + r = session.get(file_url, headers=headers) except requests.RequestException as e: if _attempt == num_retries: raise FailedDownloadError("encountered error during download") from e @@ -74,48 +69,6 @@ def _get_request( ) -def _atomic_write(dest: str, content: bytes) -> None: - # download to a temp file and then move to the dest - # this makes the download safe if run in parallel (parallel runs - # won't create a new empty file for writing and cause failures) - fp = tempfile.NamedTemporaryFile(mode="wb", delete=False) - fp.write(content) - fp.close() - shutil.copy(fp.name, dest) - os.remove(fp.name) - - -def _cache_hit(cachefile: str, response: requests.Response) -> bool: - # no file? miss - if not os.path.exists(cachefile): - return False - - # compare mtime on any cached file against the remote last-modified time - # it is considered a hit if the local file is at least as new as the remote file - local_mtime = os.path.getmtime(cachefile) - remote_mtime = _lastmod_from_response(response) - return local_mtime >= remote_mtime - - -def url_to_cache_filename(ref_url: str) -> str: - """ - Given a schema URL, convert it to a filename for caching in a cache dir. - - Rules are as follows: - - the base filename is an sha256 hash of the URL - - if the filename ends in an extension (.json, .yaml, etc) that extension - is appended to the hash - - Preserving file extensions preserves the extension-based logic used for parsing, and - it also helps a local editor (browsing the cache) identify filetypes. - """ - filename = hashlib.sha256(ref_url.encode()).hexdigest() - if "." 
in (last_part := ref_url.rpartition("/")[-1]): - _, _, extension = last_part.rpartition(".") - filename = f"{filename}.{extension}" - return filename - - class FailedDownloadError(Exception): pass @@ -124,59 +77,42 @@ class CacheDownloader: def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None: self._cache_dir = _resolve_cache_dir(cache_dir) self._disable_cache = disable_cache - - def _download( - self, - file_url: str, - filename: str, - response_ok: t.Callable[[requests.Response], bool], - ) -> str: - assert self._cache_dir is not None - os.makedirs(self._cache_dir, exist_ok=True) - dest = os.path.join(self._cache_dir, filename) - - def check_response_for_download(r: requests.Response) -> bool: - # if the response indicates a cache hit, treat it as valid - # this ensures that we short-circuit any further evaluation immediately on - # a hit - if _cache_hit(dest, r): - return True - # we now know it's not a hit, so validate the content (forces download) - return response_ok(r) - - response = _get_request(file_url, response_ok=check_response_for_download) - # check to see if we have a file which matches the connection - # only download if we do not (cache miss, vs hit) - if not _cache_hit(dest, response): - _atomic_write(dest, response.content) - - return dest + self._cached_session: requests.Session | None = None + + @property + def _session(self) -> requests.Session: + if self._cached_session is None: + self._cached_session = self._build_session() + return self._cached_session + + def _build_session(self) -> requests.Session: + session = requests.Session() + if self._cache_dir and not self._disable_cache: + log.debug("using cache dir: %s", self._cache_dir) + os.makedirs(self._cache_dir, exist_ok=True) + session = cachecontrol.CacheControl( + session, cache=FileCache(self._cache_dir) + ) + else: + log.debug("caching disabled") + return session @contextlib.contextmanager def open( self, file_url: str, - filename: str, validate_response:
t.Callable[[requests.Response], bool], ) -> t.Iterator[t.IO[bytes]]: - if (not self._cache_dir) or self._disable_cache: - yield io.BytesIO( - _get_request(file_url, response_ok=validate_response).content - ) - else: - with open( - self._download(file_url, filename, response_ok=validate_response), "rb" - ) as fp: - yield fp + response = _get_request(self._session, file_url, response_ok=validate_response) + yield io.BytesIO(response.content) def bind( self, file_url: str, - filename: str | None = None, validation_callback: t.Callable[[bytes], t.Any] | None = None, ) -> BoundCacheDownloader: return BoundCacheDownloader( - file_url, self, filename=filename, validation_callback=validation_callback + file_url, self, validation_callback=validation_callback ) @@ -186,11 +122,9 @@ def __init__( file_url: str, downloader: CacheDownloader, *, - filename: str | None = None, validation_callback: t.Callable[[bytes], t.Any] | None = None, ) -> None: self._file_url = file_url - self._filename = filename or url_to_cache_filename(file_url) self._downloader = downloader self._validation_callback = validation_callback @@ -198,7 +132,6 @@ def __init__( def open(self) -> t.Iterator[t.IO[bytes]]: with self._downloader.open( self._file_url, - self._filename, validate_response=self._validate_response, ) as fp: yield fp @@ -206,7 +139,11 @@ def open(self) -> t.Iterator[t.IO[bytes]]: def _validate_response(self, response: requests.Response) -> bool: if not self._validation_callback: return True - + # CacheControl sets from_cache=True on cache hits; skip re-validation. + # Plain requests.Session (used when disable_cache=True) doesn't set this + # attribute at all, so we use getattr with a default. 
+ if getattr(response, "from_cache", False): + return True try: self._validation_callback(response.content) return True diff --git a/src/check_jsonschema/cli/main_command.py b/src/check_jsonschema/cli/main_command.py index 62d79bb35..96dc9d5cb 100644 --- a/src/check_jsonschema/cli/main_command.py +++ b/src/check_jsonschema/cli/main_command.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import os import textwrap import typing as t @@ -43,6 +44,18 @@ def set_color_mode(ctx: click.Context, param: str, value: str) -> None: }[value] +def configure_logging( + ctx: click.Context, param: click.Parameter, value: str | None +) -> None: + if value is None: + return + level = getattr(logging, value.upper()) + logging.basicConfig( + level=level, + format="%(name)s [%(levelname)s]: %(message)s", + ) + + def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str: return textwrap.indent( "\n".join( @@ -88,12 +101,20 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str: ) @click.help_option("-h", "--help") @click.version_option() +@click.option( + "--log-level", + help="Set the log level for debug output (e.g., DEBUG, INFO, WARNING).", + type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"], case_sensitive=False), + callback=configure_logging, + expose_value=False, + is_eager=True, +) @click.option( "--schemafile", help=( "The path to a file containing the JSON Schema to use or an " "HTTP(S) URI for the schema. If a remote file is used, " - "it will be downloaded and cached locally based on mtime. " + "it will be downloaded and cached locally. " "Use '-' for stdin." 
), metavar="[PATH|URI]", diff --git a/tests/acceptance/test_nonjson_schema_handling.py b/tests/acceptance/test_nonjson_schema_handling.py index 4e56d25e2..ffa325bf1 100644 --- a/tests/acceptance/test_nonjson_schema_handling.py +++ b/tests/acceptance/test_nonjson_schema_handling.py @@ -135,31 +135,3 @@ def test_can_load_remote_yaml_schema_ref(run_line, tmp_path, passing_data): result = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)]) assert result.exit_code == (0 if passing_data else 1) - - -def test_can_load_remote_yaml_schema_ref_from_cache( - run_line, inject_cached_ref, tmp_path -): - retrieval_uri = "https://example.org/retrieval/schemas/main.yaml" - responses.add( - "GET", - retrieval_uri, - body="""\ -"$schema": "http://json-schema.org/draft-07/schema" -properties: - "title": {"$ref": "./title_schema.yaml"} -additionalProperties: false -""", - ) - - ref_loc = "https://example.org/retrieval/schemas/title_schema.yaml" - # populate a bad schema, but then "override" that with a good cache value - # this can only pass (in the success case) if the cache loading really works - responses.add("GET", ref_loc, body="false") - inject_cached_ref(ref_loc, "type: string") - - doc = tmp_path / "doc.json" - doc.write_text(json.dumps(PASSING_DOCUMENT)) - - result = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)]) - assert result.exit_code == 0 diff --git a/tests/acceptance/test_remote_ref_resolution.py b/tests/acceptance/test_remote_ref_resolution.py index 3dafc4c8a..59670cdf6 100644 --- a/tests/acceptance/test_remote_ref_resolution.py +++ b/tests/acceptance/test_remote_ref_resolution.py @@ -66,16 +66,14 @@ def test_remote_ref_resolution_simple_case(run_line, check_passes, casename, tmp @pytest.mark.parametrize("casename", ("case1", "case2")) @pytest.mark.parametrize("disable_cache", (True, False)) def test_remote_ref_resolution_cache_control( - run_line, tmp_path, get_ref_cache_loc, casename, disable_cache + run_line, tmp_path, 
casename, disable_cache, schemas_cache_dir, refs_cache_dir ): main_schema_loc = "https://example.com/main.json" responses.add("GET", main_schema_loc, json=CASES[casename]["main_schema"]) - ref_locs = [] for name, subschema in CASES[casename]["other_schemas"].items(): other_schema_loc = f"https://example.com/{name}.json" responses.add("GET", other_schema_loc, json=subschema) - ref_locs.append(other_schema_loc) instance_path = tmp_path / "instance.json" instance_path.write_text(json.dumps(CASES[casename]["passing_document"])) @@ -88,58 +86,16 @@ def test_remote_ref_resolution_cache_control( output = f"\nstdout:\n{result.stdout}\n\nstderr:\n{result.stderr}" assert result.exit_code == 0, output - cache_locs = [] - for ref_loc in ref_locs: - cache_locs.append(get_ref_cache_loc(ref_loc)) - assert cache_locs # sanity check + # Cache directories are created only when caching is enabled + cache_dirs = [schemas_cache_dir, refs_cache_dir] if disable_cache: - for loc in cache_locs: + for loc in cache_dirs: assert not loc.exists() else: - for loc in cache_locs: + for loc in cache_dirs: assert loc.exists() -@pytest.mark.parametrize("casename", ("case1", "case2")) -@pytest.mark.parametrize("check_passes", (True, False)) -def test_remote_ref_resolution_loads_from_cache( - run_line, tmp_path, get_ref_cache_loc, inject_cached_ref, casename, check_passes -): - main_schema_loc = "https://example.com/main.json" - responses.add("GET", main_schema_loc, json=CASES[casename]["main_schema"]) - - ref_locs = [] - cache_locs = [] - for name, subschema in CASES[casename]["other_schemas"].items(): - other_schema_loc = f"https://example.com/{name}.json" - # intentionally populate the HTTP location with "bad data" - responses.add("GET", other_schema_loc, json="{}") - ref_locs.append(other_schema_loc) - - # but populate the cache with "good data" - inject_cached_ref(other_schema_loc, json.dumps(subschema)) - cache_locs.append(get_ref_cache_loc(other_schema_loc)) - - instance_path = tmp_path / 
"instance.json" - instance_path.write_text( - json.dumps( - CASES[casename]["passing_document"] - if check_passes - else CASES[casename]["failing_document"] - ) - ) - - # run the command - result = run_line( - ["check-jsonschema", "--schemafile", main_schema_loc, str(instance_path)] - ) - output = f"\nstdout:\n{result.stdout}\n\nstderr:\n{result.stderr}" - if check_passes: - assert result.exit_code == 0, output - else: - assert result.exit_code == 1, output - - # this test ensures that `$id` is preferred for the base URI over # the retrieval URI @pytest.mark.parametrize("check_passes", (True, False)) diff --git a/tests/conftest.py b/tests/conftest.py index e8fc84176..82dc3139d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -74,39 +74,8 @@ def patch_cache_dir(monkeypatch, cache_dir): @pytest.fixture -def url2cachepath(): - from check_jsonschema.cachedownloader import url_to_cache_filename - - def _get(cache_dir, url): - return cache_dir / url_to_cache_filename(url) - - return _get - - -@pytest.fixture -def downloads_cache_dir(tmp_path): - return tmp_path / ".cache" / "check_jsonschema" / "downloads" - - -@pytest.fixture -def get_download_cache_loc(downloads_cache_dir, url2cachepath): - def _get(url): - return url2cachepath(downloads_cache_dir, url) - - return _get - - -@pytest.fixture -def inject_cached_download(downloads_cache_dir, get_download_cache_loc): - def _write(uri, content): - downloads_cache_dir.mkdir(parents=True) - path = get_download_cache_loc(uri) - if isinstance(content, str): - path.write_text(content) - else: - path.write_bytes(content) - - return _write +def schemas_cache_dir(tmp_path): + return tmp_path / ".cache" / "check_jsonschema" / "schemas" @pytest.fixture @@ -114,18 +83,7 @@ def refs_cache_dir(tmp_path): return tmp_path / ".cache" / "check_jsonschema" / "refs" +# Alias for unit tests that use "downloads" as the cache dir name @pytest.fixture -def get_ref_cache_loc(refs_cache_dir, url2cachepath): - def _get(url): - return 
url2cachepath(refs_cache_dir, url) - - return _get - - -@pytest.fixture -def inject_cached_ref(refs_cache_dir, get_ref_cache_loc): - def _write(uri, content): - refs_cache_dir.mkdir(parents=True) - get_ref_cache_loc(uri).write_text(content) - - return _write +def downloads_cache_dir(tmp_path): + return tmp_path / ".cache" / "check_jsonschema" / "downloads" diff --git a/tests/unit/test_cachedownloader.py b/tests/unit/test_cachedownloader.py index b906ca03a..402bb2c56 100644 --- a/tests/unit/test_cachedownloader.py +++ b/tests/unit/test_cachedownloader.py @@ -1,7 +1,6 @@ import json import os import platform -import time import pytest import requests @@ -10,9 +9,6 @@ from check_jsonschema.cachedownloader import ( CacheDownloader, FailedDownloadError, - _cache_hit, - _lastmod_from_response, - url_to_cache_filename, ) DEFAULT_RESPONSE_URL = "https://example.com/schema1.json" @@ -34,11 +30,6 @@ def default_response(): add_default_response() -def test_default_filename_from_uri(default_response): - cd = CacheDownloader("downloads").bind(DEFAULT_RESPONSE_URL) - assert cd._filename == url_to_cache_filename(DEFAULT_RESPONSE_URL) - - @pytest.mark.parametrize( "sysname, fakeenv, expect_value", [ @@ -92,48 +83,10 @@ def fake_expanduser(path): assert expanduser_path is None -def test_cache_hit_by_mtime(monkeypatch, default_response): - monkeypatch.setattr(os.path, "exists", lambda x: True) - - # local mtime = NOW, cache hit - monkeypatch.setattr(os.path, "getmtime", lambda x: time.time()) - assert _cache_hit( - "/tmp/schema1.json", - requests.get(DEFAULT_RESPONSE_URL, stream=True), - ) - - # local mtime = 0, cache miss - monkeypatch.setattr(os.path, "getmtime", lambda x: 0) - assert ( - _cache_hit( - "/tmp/schema1.json", - requests.get(DEFAULT_RESPONSE_URL, stream=True), - ) - is False - ) - - -def test_cachedownloader_cached_file(tmp_path, monkeypatch, default_response): - # create a file - f = tmp_path / "foo.json" - f.write_text("{}") - - # set the cache_dir to the tmp dir 
(so that cache_dir will always be set) - cd = CacheDownloader(tmp_path).bind(str(f), filename="foo.json") - # patch the downloader to skip any download "work" - monkeypatch.setattr( - cd._downloader, "_download", lambda file_uri, filename, response_ok: str(f) - ) - - with cd.open() as fp: - assert fp.read() == b"{}" - - @pytest.mark.parametrize("disable_cache", (True, False)) def test_cachedownloader_on_success( - get_download_cache_loc, disable_cache, default_response + disable_cache, default_response, downloads_cache_dir ): - f = get_download_cache_loc(DEFAULT_RESPONSE_URL) cd = CacheDownloader("downloads", disable_cache=disable_cache).bind( DEFAULT_RESPONSE_URL ) @@ -141,26 +94,24 @@ def test_cachedownloader_on_success( with cd.open() as fp: assert fp.read() == b"{}" if disable_cache: - assert not f.exists() + assert not downloads_cache_dir.exists() else: - assert f.exists() + assert downloads_cache_dir.exists() -def test_cachedownloader_using_alternate_target_dir( - cache_dir, default_response, url2cachepath -): - cache_dir = cache_dir / "check_jsonschema" / "otherdir" - f = url2cachepath(cache_dir, DEFAULT_RESPONSE_URL) +def test_cachedownloader_using_alternate_target_dir(cache_dir, default_response): cd = CacheDownloader("otherdir").bind(DEFAULT_RESPONSE_URL) with cd.open() as fp: assert fp.read() == b"{}" - assert f.exists() + + # Cache directory is created for the alternate target dir + assert cache_dir.joinpath("check_jsonschema", "otherdir").exists() @pytest.mark.parametrize("disable_cache", (True, False)) @pytest.mark.parametrize("failures", (1, 2, requests.ConnectionError)) def test_cachedownloader_succeeds_after_few_errors( - get_download_cache_loc, disable_cache, failures + disable_cache, failures, downloads_cache_dir ): if isinstance(failures, int): for _i in range(failures): @@ -178,7 +129,6 @@ def test_cachedownloader_succeeds_after_few_errors( match_querystring=None, ) add_default_response() - f = get_download_cache_loc(DEFAULT_RESPONSE_URL) cd = 
CacheDownloader("downloads", disable_cache=disable_cache).bind( DEFAULT_RESPONSE_URL ) @@ -186,15 +136,15 @@ def test_cachedownloader_succeeds_after_few_errors( with cd.open() as fp: assert fp.read() == b"{}" if disable_cache: - assert not f.exists() + assert not downloads_cache_dir.exists() else: - assert f.exists() + assert downloads_cache_dir.exists() @pytest.mark.parametrize("disable_cache", (True, False)) @pytest.mark.parametrize("connection_error", (True, False)) def test_cachedownloader_fails_after_many_errors( - get_download_cache_loc, disable_cache, connection_error + disable_cache, connection_error, downloads_cache_dir ): for _i in range(10): if connection_error: @@ -212,18 +162,20 @@ def test_cachedownloader_fails_after_many_errors( match_querystring=None, ) add_default_response() # never reached, the 11th response - f = get_download_cache_loc(DEFAULT_RESPONSE_URL) cd = CacheDownloader("downloads", disable_cache=disable_cache).bind( DEFAULT_RESPONSE_URL ) with pytest.raises(FailedDownloadError): with cd.open(): pass - assert not f.exists() + + # Cache directory is created only when caching is enabled + # (even though the request failed, the session was built) + assert downloads_cache_dir.exists() is not disable_cache @pytest.mark.parametrize("disable_cache", (True, False)) -def test_cachedownloader_retries_on_bad_data(get_download_cache_loc, disable_cache): +def test_cachedownloader_retries_on_bad_data(disable_cache, downloads_cache_dir): responses.add( "GET", DEFAULT_RESPONSE_URL, @@ -232,7 +184,6 @@ def test_cachedownloader_retries_on_bad_data(get_download_cache_loc, disable_cac match_querystring=None, ) add_default_response() - f = get_download_cache_loc(DEFAULT_RESPONSE_URL) cd = CacheDownloader( "downloads", disable_cache=disable_cache, @@ -245,139 +196,39 @@ def test_cachedownloader_retries_on_bad_data(get_download_cache_loc, disable_cac assert fp.read() == b"{}" if disable_cache: - assert not f.exists() + assert not downloads_cache_dir.exists() 
else: - assert f.exists() + assert downloads_cache_dir.exists() -@pytest.mark.parametrize("file_exists", (True, False)) -@pytest.mark.parametrize( - "failure_mode", ("header_missing", "header_malformed", "time_overflow") -) -def test_cachedownloader_handles_bad_lastmod_header( - monkeypatch, - get_download_cache_loc, - inject_cached_download, - file_exists, - failure_mode, -): - if failure_mode == "header_missing": - responses.add( - "GET", DEFAULT_RESPONSE_URL, headers={}, json={}, match_querystring=None - ) - elif failure_mode == "header_malformed": - responses.add( - "GET", - DEFAULT_RESPONSE_URL, - headers={"Last-Modified": "Jan 2000 00:00:01"}, - json={}, - match_querystring=None, - ) - elif failure_mode == "time_overflow": - add_default_response() - - def fake_timegm(*args): - raise OverflowError("uh-oh") - - monkeypatch.setattr("calendar.timegm", fake_timegm) - else: - raise NotImplementedError - - original_file_contents = b'{"foo": "bar"}' - file_path = get_download_cache_loc(DEFAULT_RESPONSE_URL) - - assert not file_path.exists() - if file_exists: - inject_cached_download(DEFAULT_RESPONSE_URL, original_file_contents) - - cd = CacheDownloader("downloads").bind(DEFAULT_RESPONSE_URL) - - # if the file already existed, it will not be overwritten by the cachedownloader - # so the returned value for both the downloader and a direct file read should be the - # original contents - if file_exists: - with cd.open() as fp: - assert fp.read() == original_file_contents - assert file_path.read_bytes() == original_file_contents - # otherwise, the file will have been created with new content - # both reads will show that new content - else: - with cd.open() as fp: - assert fp.read() == b"{}" - assert file_path.read_bytes() == b"{}" - - # at the end, the file always exists on disk - assert file_path.exists() - - -def test_cachedownloader_validation_is_not_invoked_on_hit( - monkeypatch, default_response, inject_cached_download -): - """ - Regression test for 
https://github.com/python-jsonschema/check-jsonschema/issues/453 - - This was a bug in which the validation callback was invoked eagerly, even on a cache - hit. As a result, cache hits did not demonstrate their expected performance gain. - """ - # 1: construct some perfectly good data (it doesn't really matter what it is) - # <> - # 2: put equivalent data on disk - inject_cached_download(DEFAULT_RESPONSE_URL, "{}") - - # 3: construct a validator which marks that it ran in a variable - validator_ran = False - - def dummy_validate_bytes(data): - nonlocal validator_ran - validator_ran = True - - # construct a downloader pointed at the schema and file, expecting a cache hit - # and use the above validation method - cd = CacheDownloader("downloads").bind( - DEFAULT_RESPONSE_URL, - validation_callback=dummy_validate_bytes, - ) - - # read data from the downloader - with cd.open() as fp: - assert fp.read() == b"{}" - # assert that the validator was not run - assert validator_ran is False - +def test_disable_cache_uses_plain_session(): + """When disable_cache=True, verify _build_session returns a plain Session.""" + cd = CacheDownloader("downloads", disable_cache=True) + session = cd._build_session() + # A plain requests.Session does not have CacheControlAdapter + assert type(session) is requests.Session -@pytest.mark.skipif( - platform.system() == "Windows", - reason="time.tzset() is not available on Windows", -) -def test_lastmod_from_header_uses_gmtime(request, monkeypatch, default_response): - """ - Regression test for https://github.com/python-jsonschema/check-jsonschema/pull/565 - - The time was converted in local time, when UTC/GMT was desired. 
- """ - - def final_tzset(): - time.tzset() - request.addfinalizer(final_tzset) +def test_enable_cache_uses_cachecontrol_session(tmp_path, patch_cache_dir): + """When disable_cache=False, verify _build_session returns a CacheControl session.""" + from cachecontrol import CacheControlAdapter - response = requests.get(DEFAULT_RESPONSE_URL, stream=True) + cd = CacheDownloader("downloads", disable_cache=False) + session = cd._build_session() + # CacheControl wraps the session and attaches CacheControlAdapter + assert isinstance(session.get_adapter("https://"), CacheControlAdapter) + assert isinstance(session.get_adapter("http://"), CacheControlAdapter) - with monkeypatch.context() as m: - m.setenv("TZ", "GMT0") - time.tzset() - gmt_parsed_time = _lastmod_from_response(response) - with monkeypatch.context() as m: - m.setenv("TZ", "EST5") - time.tzset() - est_parsed_time = _lastmod_from_response(response) - - with monkeypatch.context() as m: - m.setenv("TZ", "UTC0") - time.tzset() - utc_parsed_time = _lastmod_from_response(response) - - # assert that they all match - assert gmt_parsed_time == utc_parsed_time - assert gmt_parsed_time == est_parsed_time +def test_cache_dir_none_uses_plain_session(monkeypatch, patch_cache_dir): + """When _resolve_cache_dir returns None, _build_session returns plain Session.""" + # Undo the patch and simulate Windows with no env vars + patch_cache_dir.undo() + monkeypatch.delenv("LOCALAPPDATA", raising=False) + monkeypatch.delenv("APPDATA", raising=False) + monkeypatch.setattr(platform, "system", lambda: "Windows") + + cd = CacheDownloader("downloads", disable_cache=False) + assert cd._cache_dir is None + session = cd._build_session() + assert type(session) is requests.Session