Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
493 changes: 493 additions & 0 deletions plan.md

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ dependencies = [
"regress>=2024.11.1",
"requests<3.0",
"click>=8,<9",
"CacheControl[filecache]>=0.14,<0.15",
]

[project.readme]
Expand Down Expand Up @@ -80,6 +81,12 @@ filterwarnings = [
"error",
# dateutil has a Python 3.12 compatibility issue.
'ignore:datetime\.datetime\.utcfromtimestamp\(\) is deprecated:DeprecationWarning',
# CacheControl's CallbackFileWrapper (filewrapper.py) creates a NamedTemporaryFile
# in __init__ that is only closed when the response is fully read. When a
# requests.ConnectionError occurs, _close() is never called and the temp file
# leaks. This is a CacheControl limitation, not a check-jsonschema bug.
# See plan.md section "5. Pytest Warning Filter" for details.
'ignore:Exception ignored in.*FileIO.*:pytest.PytestUnraisableExceptionWarning',
]
addopts = [
"--color=yes",
Expand Down
147 changes: 42 additions & 105 deletions src/check_jsonschema/cachedownloader.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
from __future__ import annotations

import calendar
import contextlib
import hashlib
import io
import logging
import os
import platform
import shutil
import tempfile
import time
import typing as t

import cachecontrol
import requests
from cachecontrol.caches.file_cache import FileCache

_LASTMOD_FMT = "%a, %d %b %Y %H:%M:%S %Z"
log = logging.getLogger(__name__)


def _base_cache_dir() -> str | None:
Expand Down Expand Up @@ -42,26 +40,23 @@ def _resolve_cache_dir(dirname: str) -> str | None:
return cache_dir


def _lastmod_from_response(response: requests.Response) -> float:
try:
return calendar.timegm(
time.strptime(response.headers["last-modified"], _LASTMOD_FMT)
)
# OverflowError: time outside of platform-specific bounds
# ValueError: malformed/unparseable
# LookupError: no such header
except (OverflowError, ValueError, LookupError):
return 0.0


def _get_request(
file_url: str, *, response_ok: t.Callable[[requests.Response], bool]
session: requests.Session,
file_url: str,
*,
response_ok: t.Callable[[requests.Response], bool],
) -> requests.Response:
num_retries = 2
r: requests.Response | None = None
for _attempt in range(num_retries + 1):
try:
r = requests.get(file_url, stream=True)
# On retries, bypass CacheControl's local cache to avoid
# re-serving a cached bad response. Ideally we'd delete the
# cache entry directly, but CacheControl doesn't expose a public
# API for this (see https://github.com/psf/cachecontrol/issues/219).
# The no-cache header forces revalidation with the origin server.
headers = {"Cache-Control": "no-cache"} if _attempt > 0 else {}
r = session.get(file_url, headers=headers)
except requests.RequestException as e:
if _attempt == num_retries:
raise FailedDownloadError("encountered error during download") from e
Expand All @@ -74,48 +69,6 @@ def _get_request(
)


def _atomic_write(dest: str, content: bytes) -> None:
# download to a temp file and then move to the dest
# this makes the download safe if run in parallel (parallel runs
# won't create a new empty file for writing and cause failures)
fp = tempfile.NamedTemporaryFile(mode="wb", delete=False)
fp.write(content)
fp.close()
shutil.copy(fp.name, dest)
os.remove(fp.name)


def _cache_hit(cachefile: str, response: requests.Response) -> bool:
# no file? miss
if not os.path.exists(cachefile):
return False

# compare mtime on any cached file against the remote last-modified time
# it is considered a hit if the local file is at least as new as the remote file
local_mtime = os.path.getmtime(cachefile)
remote_mtime = _lastmod_from_response(response)
return local_mtime >= remote_mtime


def url_to_cache_filename(ref_url: str) -> str:
    """
    Given a schema URL, convert it to a filename for caching in a cache dir.

    Rules are as follows:
    - the base filename is an sha256 hash of the URL
    - if the filename ends in an extension (.json, .yaml, etc) that extension
      is appended to the hash

    Preserving file extensions preserves the extension-based logic used for
    parsing, and it also helps a local editor (browsing the cache) identify
    filetypes.
    """
    filename = hashlib.sha256(ref_url.encode()).hexdigest()
    if "." in (last_part := ref_url.rpartition("/")[-1]):
        _, _, extension = last_part.rpartition(".")
        # append the extension to the hash; the hash must be kept as the
        # base name so that distinct URLs never collide in the cache dir
        filename = f"{filename}.{extension}"
    return filename


class FailedDownloadError(Exception):
    """Raised when a remote file cannot be downloaded successfully."""

    pass

Expand All @@ -124,59 +77,42 @@ class CacheDownloader:
def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None:
self._cache_dir = _resolve_cache_dir(cache_dir)
self._disable_cache = disable_cache

def _download(
self,
file_url: str,
filename: str,
response_ok: t.Callable[[requests.Response], bool],
) -> str:
assert self._cache_dir is not None
os.makedirs(self._cache_dir, exist_ok=True)
dest = os.path.join(self._cache_dir, filename)

def check_response_for_download(r: requests.Response) -> bool:
# if the response indicates a cache hit, treat it as valid
# this ensures that we short-circuit any further evaluation immediately on
# a hit
if _cache_hit(dest, r):
return True
# we now know it's not a hit, so validate the content (forces download)
return response_ok(r)

response = _get_request(file_url, response_ok=check_response_for_download)
# check to see if we have a file which matches the connection
# only download if we do not (cache miss, vs hit)
if not _cache_hit(dest, response):
_atomic_write(dest, response.content)

return dest
self._cached_session: requests.Session | None = None

    @property
    def _session(self) -> requests.Session:
        # Lazily build the session on first access and memoize it, so one
        # session (and its cache wiring) is reused for all downloads made
        # through this CacheDownloader.
        if self._cached_session is None:
            self._cached_session = self._build_session()
        return self._cached_session

    def _build_session(self) -> requests.Session:
        """Create the requests session used for all downloads.

        When a cache dir was resolved and caching is enabled, the session is
        wrapped with CacheControl backed by an on-disk FileCache in that dir
        (created here if missing); otherwise a plain Session is returned.
        """
        session = requests.Session()
        if self._cache_dir and not self._disable_cache:
            log.debug("using cache dir: %s", self._cache_dir)
            os.makedirs(self._cache_dir, exist_ok=True)
            session = cachecontrol.CacheControl(
                session, cache=FileCache(self._cache_dir)
            )
        else:
            log.debug("caching disabled")
        return session

@contextlib.contextmanager
def open(
self,
file_url: str,
filename: str,
validate_response: t.Callable[[requests.Response], bool],
) -> t.Iterator[t.IO[bytes]]:
if (not self._cache_dir) or self._disable_cache:
yield io.BytesIO(
_get_request(file_url, response_ok=validate_response).content
)
else:
with open(
self._download(file_url, filename, response_ok=validate_response), "rb"
) as fp:
yield fp
response = _get_request(self._session, file_url, response_ok=validate_response)
yield io.BytesIO(response.content)

def bind(
self,
file_url: str,
filename: str | None = None,
validation_callback: t.Callable[[bytes], t.Any] | None = None,
) -> BoundCacheDownloader:
return BoundCacheDownloader(
file_url, self, filename=filename, validation_callback=validation_callback
file_url, self, validation_callback=validation_callback
)


Expand All @@ -186,27 +122,28 @@ def __init__(
file_url: str,
downloader: CacheDownloader,
*,
filename: str | None = None,
validation_callback: t.Callable[[bytes], t.Any] | None = None,
) -> None:
self._file_url = file_url
self._filename = filename or url_to_cache_filename(file_url)
self._downloader = downloader
self._validation_callback = validation_callback

@contextlib.contextmanager
def open(self) -> t.Iterator[t.IO[bytes]]:
with self._downloader.open(
self._file_url,
self._filename,
validate_response=self._validate_response,
) as fp:
yield fp

def _validate_response(self, response: requests.Response) -> bool:
if not self._validation_callback:
return True

# CacheControl sets from_cache=True on cache hits; skip re-validation.
# Plain requests.Session (used when disable_cache=True) doesn't set this
# attribute at all, so we use getattr with a default.
if getattr(response, "from_cache", False):
return True
try:
self._validation_callback(response.content)
return True
Expand Down
23 changes: 22 additions & 1 deletion src/check_jsonschema/cli/main_command.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import logging
import os
import textwrap
import typing as t
Expand Down Expand Up @@ -43,6 +44,18 @@ def set_color_mode(ctx: click.Context, param: str, value: str) -> None:
}[value]


def configure_logging(
    ctx: click.Context, param: click.Parameter, value: str | None
) -> None:
    """Eager click callback which initializes logging from ``--log-level``.

    A ``value`` of ``None`` means the option was not passed, in which case
    logging is left untouched.
    """
    if value is not None:
        logging.basicConfig(
            level=getattr(logging, value.upper()),
            format="%(name)s [%(levelname)s]: %(message)s",
        )


def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
return textwrap.indent(
"\n".join(
Expand Down Expand Up @@ -88,12 +101,20 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
)
@click.help_option("-h", "--help")
@click.version_option()
@click.option(
"--log-level",
help="Set the log level for debug output (e.g., DEBUG, INFO, WARNING).",
type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"], case_sensitive=False),
callback=configure_logging,
expose_value=False,
is_eager=True,
)
@click.option(
"--schemafile",
help=(
"The path to a file containing the JSON Schema to use or an "
"HTTP(S) URI for the schema. If a remote file is used, "
"it will be downloaded and cached locally based on mtime. "
"it will be downloaded and cached locally. "
"Use '-' for stdin."
),
metavar="[PATH|URI]",
Expand Down
28 changes: 0 additions & 28 deletions tests/acceptance/test_nonjson_schema_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,31 +135,3 @@ def test_can_load_remote_yaml_schema_ref(run_line, tmp_path, passing_data):

result = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)])
assert result.exit_code == (0 if passing_data else 1)


def test_can_load_remote_yaml_schema_ref_from_cache(
    run_line, inject_cached_ref, tmp_path
):
    # Verify that a remote YAML '$ref' is resolved from the local cache in
    # preference to the network.
    retrieval_uri = "https://example.org/retrieval/schemas/main.yaml"
    responses.add(
        "GET",
        retrieval_uri,
        body="""\
"$schema": "http://json-schema.org/draft-07/schema"
properties:
  "title": {"$ref": "./title_schema.yaml"}
additionalProperties: false
""",
    )

    ref_loc = "https://example.org/retrieval/schemas/title_schema.yaml"
    # populate a bad schema, but then "override" that with a good cache value
    # this can only pass (in the success case) if the cache loading really works
    responses.add("GET", ref_loc, body="false")
    inject_cached_ref(ref_loc, "type: string")

    doc = tmp_path / "doc.json"
    doc.write_text(json.dumps(PASSING_DOCUMENT))

    result = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)])
    assert result.exit_code == 0
Loading