diff --git a/.github/workflows/trivy.yaml b/.github/workflows/trivy.yaml index aa6735f4..94b60a64 100644 --- a/.github/workflows/trivy.yaml +++ b/.github/workflows/trivy.yaml @@ -49,6 +49,7 @@ jobs: template: "@/contrib/sarif.tpl" output: "trivy-results.sarif" severity: "CRITICAL,HIGH" + timeout: 15m - name: Upload Trivy scan results to GitHub Security tab uses: github/codeql-action/upload-sarif@v3 diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..2afd8536 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,52 @@ +# CDM Data Loaders Changelog + +- [CDM Data Loaders Changelog](#cdm-data-loaders-changelog) + - [v0.1.8](#v018) + - [v0.1.7](#v017) + - [v0.1.6](#v016) + - [v0.1.5](#v015) + - [v0.1.4](#v014) + - [v0.1.3](#v013) + - [v0.1.2](#v012) + - [v0.1.1](#v011) + - [v0.1.0](#v010) + + +### v0.1.8 + +- Add rotating file log handler for easier debugging. + +### v0.1.7 + +- Add in AllTheBacteria file download client. + +### v0.1.6 + +- Make NCBI REST API client more resilient to errors and ensure existing imports are not lost. + +### v0.1.5 + +- Add batch size parameter to the NCBI REST API interface. + + +### v0.1.4 + +- Add in NCBI REST API interface. + + +### v0.1.3 + +- Add in file batcher for use with file-based importers. + + +### v0.1.2 + +- Update XML File Splitter to use the latest version, which includes the `gzip` parameter. + +### v0.1.1 + +- Add [XML File Splitter](https://github.com/ialarmedalien/xml_file_splitter) to the container. + +### v0.1.0 + +- Initial release. diff --git a/Dockerfile b/Dockerfile index 35a2085c..a218a8bc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -54,7 +54,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Place executables in the environment at the front of the path ENV PATH="/app/.venv/bin:$PATH" -COPY --chmod=+x ./scripts/entrypoint.sh /app/ +RUN chmod +x ./scripts/entrypoint.sh # Use the non-root user to run our application USER nonroot -ENTRYPOINT ["./entrypoint.sh"] +ENTRYPOINT ["./scripts/entrypoint.sh"] diff --git a/README.md b/README.md index 4ecc8b96..322ad2ed 100644 --- a/README.md +++ b/README.md @@ -11,15 +11,6 @@ Repo for CDM input data loading and wrangling - [Tests](#tests) - [Loading genomes, contigs, and features](#loading-genomes-contigs-and-features) - [Running bbmap stats and checkm2 on genome or contigset files](#running-bbmap-stats-and-checkm2-on-genome-or-contigset-files) - - [Changelog](#changelog) - - [v0.1.7](#v017) - - [v0.1.6](#v016) - - [v0.1.5](#v015) - - [v0.1.4](#v014) - - [v0.1.3](#v013) - - [v0.1.2](#v012) - - [v0.1.1](#v011) - - [v0.1.0](#v010) @@ -168,41 +159,3 @@ Run the stats and checkm2 tools with the following command: bash scripts/run_tools.sh path/to/genome_paths_file.json output_dir ``` where `path/to/genome_paths_file.json` specifies the path to the genome paths file (format specified above) and `output_dir` is the directory for the results. - - -## Changelog - -### v0.1.7 - -- Add in AllTheBacteria file download client. - -### v0.1.6 - -- Make NCBI REST API client more resilient to errors and ensure existing imports are not lost. - -### v0.1.5 - -- Add batch size parameter to the NCBI REST API interface. - - -### v0.1.4 - -- Add in NCBI REST API interface. - - -### v0.1.3 - -- Add in file batcher for use with file-based importers. - - -### v0.1.2 - -- Update XML File Splitter to use the latest version, which includes the `gzip` parameter. - -### v0.1.1 - -- Add [XML File Splitter](https://github.com/ialarmedalien/xml_file_splitter) to the container. - -### v0.1.0 - -- Initial release. 
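The v0.1.8 changelog entry above adds a rotating file log handler; a minimal sketch of how it is expected to be enabled, based on init_logger() in src/cdm_data_loaders/utils/cdm_logger.py later in this diff (the log path and level shown are illustrative assumptions, not repo defaults):

```python
# Sketch only: the log path and level are hypothetical examples.
from cdm_data_loaders.utils.cdm_logger import init_logger

# Attaches a RotatingFileHandler capped at 1 GiB with 5 rotated backups.
logger = init_logger(
    log_level="DEBUG",
    enable_file_logging=True,
    log_file="logs/cdm_data_loader.log",  # hypothetical path; defaults to cdm_data_loader.log in the CWD
)
logger.info("rotating file logging enabled")
```

Setting ENABLE_FILE_LOGGING=true and LOG_LEVEL in the environment achieves the same opt-in without code changes.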
diff --git a/pyproject.toml b/pyproject.toml index b5cce288..c1642088 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "cdm-data-loaders" -version = "0.1.7" +version = "0.1.8" description = "Data loaders and wranglers for the CDM." requires-python = ">= 3.13" readme = "README.md" @@ -17,7 +17,7 @@ dependencies = [ "dlt[deltalake,duckdb,filesystem,parquet]>=1.22.2", "frictionless[aws]>=5.18.1", "frozendict>=2.4.7", - "lxml>=6.0.2", + "lxml>=6.1.0", "pydantic>=2.12.5", "pydantic-settings>=2.12.0", "tqdm>=4.67.3", diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh index 2be0475f..f9edcbfd 100755 --- a/scripts/entrypoint.sh +++ b/scripts/entrypoint.sh @@ -1,9 +1,16 @@ #!/usr/bin/env bash set -euo pipefail -# Ensure at least one argument is provided +VALID_COMMANDS=(all_the_bacteria ncbi_rest_api uniprot uniref xml_split test bash) + +usage() { + local joined + joined=$(IFS='|'; echo "${VALID_COMMANDS[*]}") + echo "Usage: $0 {${joined}} [args...]" >&2 +} + if [ "$#" -eq 0 ]; then - echo "Usage: $0 {all_the_bacteria|ncbi_rest_api|uniprot|uniref|xml_split|test} [args...]" + usage exit 1 fi @@ -12,34 +19,29 @@ shift case "$cmd" in all_the_bacteria) - # All the Bacteria file importer exec /usr/bin/tini -- uv run --no-sync all_the_bacteria "$@" ;; ncbi_rest_api) - # Run the NCBI datasets API importer exec /usr/bin/tini -- uv run --no-sync ncbi_rest_api "$@" ;; uniprot) - # Run the uniprot pipeline with any additional arguments exec /usr/bin/tini -- uv run --no-sync uniprot "$@" ;; uniref) - # Run the uniref pipeline with any additional arguments exec /usr/bin/tini -- uv run --no-sync uniref "$@" ;; xml_split) - # Run the xml_file_splitter app exec /usr/bin/tini -- xml_file_splitter "$@" ;; test) - # run the tests exec /usr/bin/tini -- uv run --no-sync pytest -m "not requires_spark" ;; bash) exec /usr/bin/tini -- /bin/bash ;; *) - echo "Error: unknown command '$cmd'; valid commands are 'all_the_bacteria', 'ncbi_rest_api', 'uniprot', 'uniref', or 'xml_split'." >&2 + echo "Error: unknown command '$cmd'." 
>&2 + usage exit 1 ;; esac diff --git a/src/cdm_data_loaders/pipelines/all_the_bacteria.py b/src/cdm_data_loaders/pipelines/all_the_bacteria.py index bb32836a..d874b8b8 100644 --- a/src/cdm_data_loaders/pipelines/all_the_bacteria.py +++ b/src/cdm_data_loaders/pipelines/all_the_bacteria.py @@ -17,9 +17,10 @@ import dlt from dlt.extract.items import DataItemWithMeta +from dlt.sources.helpers import requests from dlt.sources.helpers.rest_client.client import RESTClient from frozendict import frozendict -from pydantic import AliasChoices, Field +from pydantic import AliasChoices, Field, computed_field from pydantic_settings import SettingsConfigDict from cdm_data_loaders.pipelines.core import ( @@ -28,20 +29,31 @@ ) from cdm_data_loaders.pipelines.cts_defaults import DEFAULT_SETTINGS_CONFIG_DICT, CtsSettings from cdm_data_loaders.utils.download.sync_client import FileDownloader +from cdm_data_loaders.utils.s3 import stream_to_s3 logger = logging.getLogger("dlt") DATASET_NAME = "all_the_bacteria" ALL_FILES_TSV_FILE_ID = "R6gcp" +ALL_ATB_FILE_NAME = "all_atb_files.tsv" +REGEX_FILE = "filters.txt" ATB_VERSION = "2025-05" ARG_ALIASES = frozendict( { "version": ["-v", "--version"], - } + "pattern_file": ["-f", "--pattern-file", "--pattern_file"], }, ) +# project parts needed: +PROJECT_PARTS = ["Annotation/Bakta", "Assembly", "Metadata"] +PROJECT_PART_REGEX = re.compile(f"^AllTheBacteria/({'|'.join(PROJECT_PARTS)})") + +EXPECTED_ATB_FIELDNAMES = ["project", "project_id", "filename", "url", "md5", "size(MB)"] +REQUIRED_ATB_FIELDNAMES = {"project", "filename", "url", "md5"} + class AtbSettings(CtsSettings): """Configuration for running the AllTheBacteria import pipeline.""" @@ -54,37 +66,71 @@ class AtbSettings(CtsSettings): validation_alias=AliasChoices(*[alias.strip("-") for alias in ARG_ALIASES["version"]]), ) + pattern_file: str | None = Field( + default=None, + description="Path, relative to the input dir, of a file containing patterns to match when downloading ATB files", + validation_alias=AliasChoices(*[alias.strip("-") for alias in ARG_ALIASES["pattern_file"]]), + ) + + @computed_field @property def raw_data_dir(self) -> str: """Directory in which to save the raw data files that are downloaded. Set to the output directory / "raw_data" / version. """ - return str(Path(self.output) / "raw_data" / self.version) - - -# project parts needed: -PROJECT_PARTS = ["Annotation/Bakta", "Assembly", "Metadata"] + if self.use_destination == "local_fs": + return str(Path(self.output) / "raw_data" / self.version) + return f"{self.output}/raw_data/{self.version}" -PROJECT_PART_REGEX = re.compile(f"^AllTheBacteria/({'|'.join(PROJECT_PARTS)})") + @computed_field + @property + def pattern_matches(self) -> re.Pattern: + """The regular expression pattern to be used to select files for download. -EXPECTED_ATB_FIELDNAMES = ["project", "project_id", "filename", "url", "md5", "size(MB)"] -REQUIRED_ATB_FIELDNAMES = {"project", "filename", "url", "md5"} + If a pattern_file is supplied, the file at {input_dir}/{pattern_file} is read in + and its contents converted into a regular expression. If no file is supplied, or the file + is empty or contains no usable patterns, the default PROJECT_PART_REGEX will be used instead.
+ """ + if self.pattern_file: + pattern_file = Path(self.input_dir) / self.pattern_file + regex = load_patterns(pattern_file) + if regex is not None: + return regex + # return the default + return PROJECT_PART_REGEX + + +def load_patterns(pattern_file: Path) -> re.Pattern | None: + """Load the pattern file and convert it into a set of regexes.""" + patterns = [] + try: + for line in pattern_file.read_text(encoding="utf-8").splitlines(): + trimmed_line = line.strip() + # skip blank lines + if not trimmed_line: + continue + patterns.append( + re.escape(trimmed_line[:-1]) + ".*" if trimmed_line.endswith("*") else re.escape(trimmed_line) + ) + + if patterns: + return re.compile("^(" + "|".join(patterns) + ")$") + except Exception: + logger.exception("Could not load patterns from %s", str(pattern_file)) + + return None def download_atb_index_tsv(settings: AtbSettings) -> Path: """Download the ATB file index TSV file from the OSF and save it to disk. :param settings: pipeline config - :type settings: Settings + :type settings: AtbSettings :raises RuntimeError: if the download URL cannot be found :return: path to the downloaded file :rtype: Path """ - # make sure that the directory structure to save the file in can be written to - raw_data_dir = Path(settings.raw_data_dir) - raw_data_dir.mkdir(parents=True, exist_ok=True) - # get the all_atb_files.tsv file info from the OSF API and retrieve the download link osf_client = RESTClient( base_url="https://api.osf.io/v2/", @@ -102,20 +148,40 @@ def download_atb_index_tsv(settings: AtbSettings) -> Path: err_msg = f"Could not find download URL in response from 'https://api.osf.io/v2/files/{ALL_FILES_TSV_FILE_ID}/'" raise RuntimeError(err_msg) - atb_files_tsv = raw_data_dir / "all_atb_files.tsv" + if settings.use_destination == "s3": + # download to a local temp file and also copy the file to s3 + save_path = f"{settings.raw_data_dir}/{ALL_ATB_FILE_NAME}" + try: + stream_to_s3(url=all_files_tsv_download, s3_path=save_path, requests=requests) + except Exception: + logger.exception("Could not transfer %s to s3", ALL_ATB_FILE_NAME) + raise + # TODO: save as a temporary file, delete after pipeline has completed + # save a local copy of the file to current dir + atb_files_tsv = Path(ALL_ATB_FILE_NAME) + else: + # make sure that the directory structure to save the file in can be written to + raw_data_dir = Path(settings.raw_data_dir) + raw_data_dir.mkdir(parents=True, exist_ok=True) + atb_files_tsv = raw_data_dir / ALL_ATB_FILE_NAME + # download the file listing and save it - FileDownloader().download(all_files_tsv_download, atb_files_tsv) + FileDownloader().download(url=all_files_tsv_download, destination=atb_files_tsv) + logger.info("Downloaded TSV index file.") return atb_files_tsv -def get_file_download_links(atb_files_tsv: Path) -> Generator[list[dict[str, Any]], Any]: +def get_file_download_links(settings: AtbSettings, atb_files_tsv: Path) -> Generator[list[dict[str, Any]], Any]: """Parse the ATB file index TSV and to yield a list of files to download. 
+ :param settings: pipeline config + :type settings: AtbSettings :param atb_files_tsv: path to the ATB file index TSV file :type atb_files_tsv: Path :yield: list of files to download :rtype: Generator[list[dict[str, Any]], Any] """ + pattern_to_match = settings.pattern_matches with atb_files_tsv.open() as index_file: reader = csv.DictReader(index_file, delimiter="\t") all_lines = list(reader) @@ -134,7 +200,7 @@ logger.error(err_msg) raise RuntimeError(err_msg) - files_to_download = [row for row in all_lines if PROJECT_PART_REGEX.match(row["project"])] + files_to_download = [row for row in all_lines if pattern_to_match.match(row["project"])] yield files_to_download @@ -144,16 +210,24 @@ :param settings: pipeline config :type settings: AtbSettings - :param atb_file_list: list of dictionaries + :param atb_file_list: info about files to transfer, as a list of dictionaries :type atb_file_list: list[dict[str, Any]] """ client = FileDownloader() - raw_data_dir = Path(settings.raw_data_dir) successful_downloads = [] for f in atb_file_list: try: - save_path = raw_data_dir / f["filename"] - client.download(f["url"], save_path, expected_checksum=f["md5"], checksum_fn="md5") + project_part = f["project"].removeprefix("AllTheBacteria").removeprefix("/").rstrip("/") + if settings.use_destination == "s3": + if project_part: + project_part = f"{project_part}/" + save_path = f"{settings.raw_data_dir}/{project_part}{f['filename']}" + stream_to_s3(url=f["url"], s3_path=save_path, requests=requests) + logger.debug("Successfully transferred file from %s to %s", f["url"], save_path) + else: + save_path = Path(settings.raw_data_dir) / project_part / f["filename"] + client.download(url=f["url"], destination=save_path, expected_checksum=f["md5"], checksum_fn="md5") + f["path"] = str(save_path) successful_downloads.append(f) except Exception as e: @@ -169,7 +243,7 @@ def atb_file_list(settings: AtbSettings) -> Generator[list[dict[str, Any]], Any, Any]: """Generate a list of files to download from the list of all ATB files.""" atb_files_tsv = download_atb_index_tsv(settings) - return get_file_download_links(atb_files_tsv) + return get_file_download_links(settings, atb_files_tsv) @dlt.transformer(name="file_downloader", data_from=atb_file_list, parallelized=True) diff --git a/src/cdm_data_loaders/pipelines/cts_defaults.py b/src/cdm_data_loaders/pipelines/cts_defaults.py index 5f59ff91..2ea95128 100644 --- a/src/cdm_data_loaders/pipelines/cts_defaults.py +++ b/src/cdm_data_loaders/pipelines/cts_defaults.py @@ -1,6 +1,5 @@ """Common defaults for running pipelines on the KBase CTS.""" -from pathlib import Path from typing import Any, Self import dlt.common.configuration.accessors @@ -86,7 +85,7 @@ class CtsSettings(BaseSettings): ) use_destination: str = Field( default=DEFAULT_CTS_SETTINGS["use_destination"], - description=f"DLT destination configuration to use for data output. Choices: {VALID_DESTINATIONS}", + description=f"DLT destination configuration to use for data output. Data to be saved to s3 should use the destination 's3'; to save data locally, use the destination 'local_fs'. The output directory can be specified using the 'output' field. 
Choices: {VALID_DESTINATIONS}", validation_alias=AliasChoices(*[alias.strip("-") for alias in ARG_ALIASES["use_destination"]]), ) use_output_dir_for_pipeline_metadata: bool = Field( @@ -120,6 +119,27 @@ def reconcile_with_dlt_config(self) -> Self: raise ValueError(err_msg) self.output = self.dlt_config[f"destination.{self.use_destination}.bucket_url"] + if self.output != "/": + self.output.rstrip("/") + + # TODO: this should never happen + if not self.output: + err_msg = "No output specified!" + raise ValueError(err_msg) + + # ensure that the use_destination value does not conflict with whether or not pipeline data should be saved + destination_is_s3 = False + if self.output.startswith("s3://") or self.output.startswith("s3a://"): + destination_is_s3 = True + + # self.use_destination should be "s3" if the output is an s3 url and vice versa + if bool(self.use_destination == "s3") != destination_is_s3: + err_msg = "Mismatch between output location and use_destination. To ensure internal settings functions work correctly, set use_destination to 's3' for writing files to s3, and 'local_fs' for writing files locally. The output directory can be configured using the 'output' parameter." + raise ValueError(err_msg) + + if self.use_output_dir_for_pipeline_metadata and destination_is_s3: + err_msg = "It is not currently possible to have the pipeline directory on s3." + raise ValueError(err_msg) return self @@ -148,7 +168,7 @@ def raw_data_dir(self) -> str: If not set, defaults to a 'raw_data' directory within the output directory after reconciling with dlt config. """ - return str(Path(self.output or "") / "raw_data") + return f"{self.output}{'' if self.output in ('', '/') else '/'}raw_data" @computed_field @property @@ -158,7 +178,7 @@ def pipeline_dir(self) -> str | None: If use_output_dir_for_pipeline_metadata is true, this defaults to a `.dlt_conf` directory within the output directory. """ if self.use_output_dir_for_pipeline_metadata: - return str(Path(self.output or "") / ".dlt_conf") + return f"{self.output}{'' if self.output in ('', '/') else '/'}.dlt_conf" return None diff --git a/src/cdm_data_loaders/utils/cdm_logger.py b/src/cdm_data_loaders/utils/cdm_logger.py index 25c341b7..cb91cae4 100644 --- a/src/cdm_data_loaders/utils/cdm_logger.py +++ b/src/cdm_data_loaders/utils/cdm_logger.py @@ -2,138 +2,224 @@ Provides structured logging with contextual metadata for CDM data import pipelines. """ +import json import logging +import logging.config import logging.handlers import os -import sys from pathlib import Path -DEFAULT_LOGGER_NAME = "cdm_data_loader" +from frozendict import frozendict +DEFAULT_LOGGER_NAME = "cdm_data_loader" GENERIC_ERROR_MESSAGE = "An error of unknown origin occurred." 
LOG_FILENAME = "cdm_data_loader.log" MAX_LOG_FILE_SIZE = 2**30 # 1 GiB MAX_LOG_BACKUPS = 5 -__LOGGER = None - -# TODO: adopt logging config, set just once -LOGGING_CONFIG = { - "root": {"name": "cdm_data_loader", "level": "INFO", "handlers": ["console", "file"]}, - "version": 1, - "handlers": { - "console": { - "class": "logging.StreamHandler", - "formatter": "json", - "level": "INFO", - "stream": "ext://sys.stdout", - }, - "file": { - "class": "logging.handlers.RotatingFileHandler", - "formatter": "json", - "filename": LOG_FILENAME, - "maxBytes": MAX_LOG_FILE_SIZE, - "backupCount": MAX_LOG_BACKUPS, +VALID_LOG_LEVELS = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"} +LOGGING_CONFIG_FILENAME = "logging_config.json" + +JSON_LOG_CONFIG = '{"time": "%(asctime)s", "level": "%(levelname)s", "module": "%(module)s", "msg": "%(message)s"}' + +_module_logger = logging.getLogger(__name__) + +# Immutable fallback config used when no external config file can be found. +# Note: disable_existing_loggers is intentionally absent — it is always +# injected at runtime by _load_logging_config() to guarantee it is False. +LOGGING_CONFIG = frozendict( + { + "version": 1, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "formatter": "json", + "level": "INFO", + "stream": "ext://sys.stdout", + }, }, - }, - "formatters": { - "json": { - "format": '{"time": "%(asctime)s", "level": "%(levelname)s", "module": "%(module)s", "msg": "%(message)s"}' - } - }, -} + "formatters": {"json": {"format": JSON_LOG_CONFIG}}, + "loggers": {DEFAULT_LOGGER_NAME: {"level": "INFO", "handlers": ["console"]}}, + } +) -def get_cdm_logger( - logger_name: str | None = None, log_level: str | None = None, log_dir: str | None = None -) -> logging.Logger: - """Retrieve the logger, initialising it if necessary. +def _load_config_from_path(path: Path) -> dict: + """Attempt to load and parse a JSON logging config from the given path. + + :param path: path to a JSON logging config file + :type path: Path + :return: parsed config dict + :rtype: dict + :raises FileNotFoundError: if no file exists at the given path + :raises ValueError: if the file content is not valid JSON + """ + with path.open() as f: + return json.load(f) - If the logger name is not set, the default name "cdm_data_loader" will be used. - :param logger_name: name for the logger, defaults to None - :type logger_name: str | None, optional - :param log_level: logger level, defaults to None - :type log_level: str | None, optional - :param log_dir: directory to save log files to, optional. If no directory is specified, logs will just be emitted to the console. - :type log_dir: str | None - :return: initialised logger - :rtype: logging.Logger +def _load_logging_config(config_file: str | Path | None = None) -> dict: + """Resolve and load a logging config, working through a priority chain until a source succeeds. + + Resolution order: + 1. ``config_file`` argument (if provided) + 2. ``LOG_CONFIG_FILE`` environment variable (if set) + 3. ``logging_config.json`` in the current working directory + 4. Built-in ``LOGGING_CONFIG`` frozendict + + If a file is found but sets ``disable_existing_loggers`` to ``True``, + a warning is emitted and the value is overridden. The key is always + set to ``False`` in the returned dict, regardless of source, to prevent + ``dictConfig`` from silently disabling pre-existing loggers. 
+ + :param config_file: explicit path to a logging config file, defaults to None + :type config_file: str | Path | None, optional + :return: logging config dict ready to pass to dictConfig + :rtype: dict """ - global __LOGGER - if not __LOGGER: - __LOGGER = init_logger(logger_name, log_level, log_dir) - return __LOGGER + candidates: list[tuple[Path, str]] = [] + + if config_file is not None: + candidates.append((Path(config_file), "config_file argument")) + + env_path = os.getenv("LOG_CONFIG_FILE") + if env_path: + candidates.append((Path(env_path), "LOG_CONFIG_FILE env var")) + + candidates.append((Path.cwd() / LOGGING_CONFIG_FILENAME, "current working directory")) + + for path, source in candidates: + try: + config = _load_config_from_path(path) + if config.get("disable_existing_loggers", True): + _module_logger.warning( + "Logging config loaded from %s (%s) sets disable_existing_loggers " + "to True. Overriding to prevent existing loggers being silently disabled.", + path, + source, + ) + _module_logger.info("Loaded logging config from %s (%s).", path, source) + break + except FileNotFoundError: + _module_logger.warning("Logging config not found at %s (%s). Trying next source.", path, source) + except ValueError: + _module_logger.warning("Logging config at %s (%s) is not valid JSON. Trying next source.", path, source) + else: + _module_logger.warning("No logging config file found. Falling back to built-in config.") + config = dict(LOGGING_CONFIG) + + config["disable_existing_loggers"] = False + return config + + +def _json_formatter() -> logging.Formatter: + """Construct the standard CDM JSON log formatter. + + :return: configured Formatter instance + :rtype: logging.Formatter + """ + return logging.Formatter(JSON_LOG_CONFIG) -def init_logger( - logger_name: str | None = None, log_level: str | None = None, log_dir: str | None = None -) -> logging.Logger: - """Initialise the logger for the module. +def _set_level_safe(logger: logging.Logger, level: str) -> None: + """Set the log level on a logger, raising a descriptive error for invalid values. - If the logger name is not set, the default name "cdm_data_loader" will be used. + :param logger: the logger to configure + :type logger: logging.Logger + :param level: log level string + :type level: str + :raises ValueError: if the level string is not a recognised logging level + """ + normalised = level.upper() + if normalised not in VALID_LOG_LEVELS: + msg = f"Invalid log level {level!r}. Must be one of: {', '.join(sorted(VALID_LOG_LEVELS))}" + raise ValueError(msg) + logger.setLevel(normalised) + + +def get_cdm_logger() -> logging.Logger: + """Retrieve the default CDM logger, initialising it if necessary. + + Prefers the 'dlt' logger if it has already been configured, otherwise + falls back to the CDM logger, creating it if needed. - :param logger_name: name for the logger, defaults to None - :type logger_name: str | None, optional - :param log_level: logger level, defaults to None - :type log_level: str | None, optional - :param log_dir: directory to save log files to, optional. If no directory is specified, logs will just be emitted to the console. 
- :type log_dir: str | None :return: initialised logger :rtype: logging.Logger """ - if not logger_name: - logger_name = DEFAULT_LOGGER_NAME + all_loggers = logging.root.manager.loggerDict + if "dlt" in all_loggers: + return logging.getLogger("dlt") + if DEFAULT_LOGGER_NAME in all_loggers: + return logging.getLogger(DEFAULT_LOGGER_NAME) + return init_logger() + - # Always get the same logger by name - logger = logging.getLogger(logger_name) +def _attach_file_handler(logger: logging.Logger, log_file: str | Path) -> None: + """Attach a file handler to the given logger if no file handler is already present. - # Determine log level (argument > env var > default) - effective_log_level = (log_level or os.getenv("LOG_LEVEL", "INFO")).upper() - logger.setLevel(getattr(logging, effective_log_level, logging.DEBUG)) + Checks for any existing handler whose class name contains 'FileHandler' + to avoid attaching duplicate file handlers of any type. Creates the parent + directory of log_file if it does not already exist. - # JSON-style structured formatter - formatter = logging.Formatter( - '{"time": "%(asctime)s", "level": "%(levelname)s", "module": "%(module)s", "msg": "%(message)s"}' + :param logger: the logger to attach the handler to + :type logger: logging.Logger + :param log_file: path to the log file + :type log_file: str | Path + """ + if any("FileHandler" in type(h).__name__ for h in logger.handlers): + return + + log_path = Path(log_file) + log_path.parent.mkdir(parents=True, exist_ok=True) + + handler = logging.handlers.RotatingFileHandler( + log_path, + maxBytes=MAX_LOG_FILE_SIZE, + backupCount=MAX_LOG_BACKUPS, ) + handler.setFormatter(_json_formatter()) + logger.addHandler(handler) - # Console handler - ch = logging.StreamHandler(sys.stdout) - ch.setFormatter(formatter) - logger.addHandler(ch) - - if log_dir: - log_dir_path = Path(log_dir) - if not log_dir_path.exists() and log_dir_path.is_dir(): - msg = f"{log_dir} does not exist or is not a directory." - raise FileNotFoundError(msg) - # Add the log message handler to the logger - file_handler = logging.handlers.RotatingFileHandler( - LOG_FILENAME, maxBytes=MAX_LOG_FILE_SIZE, backupCount=MAX_LOG_BACKUPS - ) - logger.addHandler(file_handler) - return logger +def init_logger( + log_level: str | None = None, + config_file: str | Path | None = None, + enable_file_logging: bool = False, # noqa: FBT001, FBT002 + log_file: str | Path | None = None, +) -> logging.Logger: + """Initialise the logger for the module. + + Loads config by working through the following priority chain: + 1. ``config_file`` argument (if provided) + 2. ``LOG_CONFIG_FILE`` environment variable (if set) + 3. ``logging_config.json`` in the current working directory + 4. Built-in ``LOGGING_CONFIG`` frozendict -def log_and_die(error_msg: str, error_class: type[Exception], logger_name: str | None = None) -> None: - """Log an error message and then raise the error. + If ``log_level`` is specified or the ``LOG_LEVEL`` env var is set, the + logger is set to that level. File logging is opt-in via + ``enable_file_logging`` or the ``ENABLE_FILE_LOGGING`` env var. 
- :param error_msg: error message string - :type error_msg: str - :param error_class: class of error to throw - :type error_class: type[Exception] - :param logger_name: name of the logger to use, defaults to None - :type logger_name: str | None, optional + :param log_level: logger level, defaults to None + :type log_level: str | None, optional + :param config_file: explicit path to a logging config file, defaults to None + :type config_file: str | Path | None, optional + :param enable_file_logging: attach a file handler, defaults to False + :type enable_file_logging: bool, optional + :param log_file: path to the log file, defaults to LOG_FILENAME in the CWD + :type log_file: str | Path, optional + :return: initialised logger + :rtype: logging.Logger """ - logger = get_cdm_logger(logger_name) + logging.config.dictConfig(_load_logging_config(config_file)) - if not error_msg: - logger.warning("No error supplied to log_and_die. Using generic error message.") - error_msg = GENERIC_ERROR_MESSAGE + logger = logging.getLogger(DEFAULT_LOGGER_NAME) - if not isinstance(error_class, type) or not issubclass(error_class, BaseException): - error_class = RuntimeError + new_log_level = log_level or os.getenv("LOG_LEVEL") + if new_log_level: + _set_level_safe(logger, new_log_level) - logger.error(error_msg) - raise error_class(error_msg) + if enable_file_logging or os.getenv("ENABLE_FILE_LOGGING", "").lower() == "true": + _attach_file_handler(logger, log_file or LOG_FILENAME) + + return logger diff --git a/src/cdm_data_loaders/utils/s3.py b/src/cdm_data_loaders/utils/s3.py index 8253e2cd..fd7e8849 100644 --- a/src/cdm_data_loaders/utils/s3.py +++ b/src/cdm_data_loaders/utils/s3.py @@ -1,12 +1,14 @@ """Utilities for s3 interaction.""" from pathlib import Path +from types import ModuleType from typing import Any import boto3 import botocore import botocore.client import tqdm +from botocore.config import Config CDM_LAKE_BUCKET = "cdm-lake" DEFAULT_EXTRA_ARGS = {"ChecksumAlgorithm": "CRC64NVME"} @@ -14,6 +16,12 @@ VALID_S3_PREFIXES = ["s3://", "s3a://"] VALID_BUCKETS = [CDM_LAKE_BUCKET, "cts"] +# "legacy", "standard", "adaptive" +AWS_CLIENT_RETRY_MODE = "adaptive" +# how many times to retry, including the initial attempt +AWS_CLIENT_TOTAL_MAX_ATTEMPTS = 10 + + _s3_client: botocore.client.BaseClient | None = None @@ -33,9 +41,19 @@ def get_s3_client(args: dict[str, str] | None = None) -> botocore.client.BaseCli if _s3_client is not None: return _s3_client + config = Config(retries={"total_max_attempts": AWS_CLIENT_TOTAL_MAX_ATTEMPTS, "mode": AWS_CLIENT_RETRY_MODE}) + if not args: + # try using env vars and skip manual configuration + client = boto3.client("s3", config=config) + # check for credentials and endpoint_url + credentials = client._request_signer._credentials # noqa: SLF001 + if credentials.access_key and credentials.secret_key and client.meta.endpoint_url: + _s3_client = client + return _s3_client + try: - from berdl_notebook_utils.berdl_settings import get_settings + from berdl_notebook_utils.berdl_settings import get_settings # noqa: PLC0415 settings = get_settings() args = { @@ -56,7 +74,7 @@ def get_s3_client(args: dict[str, str] | None = None) -> botocore.client.BaseCli msg = "Cannot initialise s3 client: missing arguments: " + ", ".join(missing) raise ValueError(msg) - _s3_client = boto3.client("s3", **keyword_args) + _s3_client = boto3.client("s3", config=config, **keyword_args) return _s3_client @@ -126,6 +144,19 @@ def list_matching_objects(s3_path: str) -> list[dict[str, Any]]: return 
contents +def head_object(s3_path: str) -> dict[str, Any]: + """Retrieve the metadata for an object on s3. + + :param s3_path: path to the object on s3, INCLUDING the bucket name + :type s3_path: str + :return: response from the head_object request + :rtype: dict[str, Any] + """ + s3 = get_s3_client() + (bucket, key) = split_s3_path(s3_path) + return s3.head_object(Bucket=bucket, Key=key) + + def object_exists(s3_path: str) -> bool: """Check whether an object exists on s3. @@ -134,11 +165,8 @@ :return: True if the object exists, False otherwise :rtype: bool """ - s3 = get_s3_client() - - (bucket, key) = split_s3_path(s3_path) try: - s3.head_object(Bucket=bucket, Key=key) + head_object(s3_path) except Exception as e: error_string = str(e) if not error_string.startswith("An error occurred (404) when calling the HeadObject operation: Not Found"): @@ -199,6 +227,35 @@ def upload_file( return True +def stream_to_s3(url: str, s3_path: str, requests: ModuleType) -> str: + """Stream directly from an HTTP download to s3. + + :param url: address of the object to transfer to s3 + :type url: str + :param s3_path: save path on s3 + :type s3_path: str + :param requests: module implementing requests.get and returning a response + :type requests: ModuleType + :return: path of the file on s3, in the form bucket/key + :rtype: str + """ + s3_client = get_s3_client() + (bucket, key) = split_s3_path(s3_path) + with requests.get(url, stream=True) as response: + response.raise_for_status() + s3_client.upload_fileobj( + # raw stream from urllib3 + response.raw, + bucket, + key, + ExtraArgs={ + **DEFAULT_EXTRA_ARGS, + "ContentType": response.headers.get("content-type", "application/octet-stream"), + }, + ) + return f"{bucket}/{key}" + + def download_file(s3_path: str, local_file_path: str | Path, version_id: str | None = None) -> None: """Download an object from s3. @@ -335,6 +392,9 @@ def copy_object(current_s3_path: str, new_s3_path: str) -> dict[str, Any]: A successful copy operation will return a response where resp["ResponseMetadata"]["HTTPStatusCode"] == 200 + Errors (e.g., buckets or keys not existing, wrong credentials, etc.) are propagated + directly to the caller without being caught. + :param current_path: path to the file on s3, INCLUDING the bucket name :type current_path: str :param new_path: the desired new file path on s3, INCLUDING the bucket name @@ -360,6 +420,9 @@ def delete_object(s3_path: str) -> dict[str, Any]: A successful deletion will return a response where resp["ResponseMetadata"]["HTTPStatusCode"] == 204. + Errors (e.g., buckets or keys not existing, wrong credentials, etc.) are propagated + directly to the caller without being caught. + :param s3_path: path to the file on s3, INCLUDING the bucket name :type s3_path: str :return: dictionary containing response diff --git a/tests/conftest.py b/tests/conftest.py index 37d704b9..fc3b3909 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,7 @@ """Global configuration settings for tests.""" import datetime +import logging import shutil from collections.abc import Generator from copy import deepcopy @@ -41,6 +42,21 @@ ALT_PIPELINE_RUN = {RUN_ID: "9876-5432-10", PIPELINE: "KeystoneXXXL", SOURCE: "/path/to/dir"} +@pytest.fixture(autouse=True) +def logging_setup(caplog: pytest.LogCaptureFixture) -> None: + """Fiddle with the loggers used in the tests for a better experience. + + N.b. this is overridden by the conftest in the pipelines directory, which uses the dlt logger. 
+ """ + vcr_logger = logging.getLogger("vcr") + vcr_logger.setLevel("ERROR") + # turn on log propagation for the dlt logger + dlt_logger = logging.getLogger("dlt") + dlt_logger.propagate = True + caplog.set_level(logging.INFO) + caplog.clear() + + @pytest.fixture def spark(tmp_path: Path) -> Generator[SparkSession, Any]: """Generate a spark session with spark.sql.warehouse.dir set to the pytest temporary directory.""" diff --git a/tests/data/atb/assembly_bakta_metadata_exact.tsv b/tests/data/atb/assembly_bakta_metadata_exact.tsv new file mode 100644 index 00000000..8b8c6ae8 --- /dev/null +++ b/tests/data/atb/assembly_bakta_metadata_exact.tsv @@ -0,0 +1,7 @@ +project project_id filename url md5 size(MB) +AllTheBacteria/Annotation/Bakta zt57s File_Lists/atb.bakta.incr_release.202408.status.tsv.gz https://osf.io/download/2skzy/ 7da48fce8e310916a072786872f475b1 14.19 +AllTheBacteria/Annotation/Bakta zt57s File_Lists/atb.bakta.r0.2.status.tsv.gz https://osf.io/download/rxfks/ b7255867206dc66f2c26db31921a67a6 53.87 +AllTheBacteria/Assembly zxfmy File_Lists/file_list.all.20240805.tsv.gz https://osf.io/download/dw3h7/ 5fc5c88a0593785341e466143a97f126 17.04 +AllTheBacteria/Assembly zxfmy File_Lists/file_list.all.202505.tsv.gz https://osf.io/download/69a03f582574717cb3643d62/ 0a9ee0b1efaf42b3ea9e89ce91d2b9e1 22.13 +AllTheBacteria/Metadata h7wzy 0.1/HQ_set_0.1/hq_dataset.accessions.txt.gz https://osf.io/download/3znhb/ 6f001b72101779b5ff6a556f46c9ddab 4.11 +AllTheBacteria/Metadata h7wzy 0.1/HQ_set_0.1/hq_dataset.counts_bar_plot.pdf https://osf.io/download/p5vkt/ a147e807f6a661b6f7f6dc856aa0465c 0.02 diff --git a/tests/data/atb/assembly_bakta_star_metadata.tsv b/tests/data/atb/assembly_bakta_star_metadata.tsv new file mode 100644 index 00000000..06098902 --- /dev/null +++ b/tests/data/atb/assembly_bakta_star_metadata.tsv @@ -0,0 +1,15 @@ +project project_id filename url md5 size(MB) +AllTheBacteria/Annotation/Bakta zt57s File_Lists/atb.bakta.incr_release.202408.status.tsv.gz https://osf.io/download/2skzy/ 7da48fce8e310916a072786872f475b1 14.19 +AllTheBacteria/Annotation/Bakta zt57s File_Lists/atb.bakta.r0.2.status.tsv.gz https://osf.io/download/rxfks/ b7255867206dc66f2c26db31921a67a6 53.87 +AllTheBacteria/Annotation/Bakta/Incr_release.202408_Set_1 kdnwp atb.bakta.incr_release.202408.batch.1.tar.xz https://osf.io/download/r84xg/ a3ba0148c435f31b4de3f0b72e01075d 1378.51 +AllTheBacteria/Annotation/Bakta/Incr_release.202408_Set_1 kdnwp atb.bakta.incr_release.202408.batch.10.tar.xz https://osf.io/download/qxye9/ e0724b0a605da7298b802f01145d8b49 388.76 +AllTheBacteria/Annotation/Bakta/Incr_release.202408_Set_2 p4kvy atb.bakta.incr_release.202408.batch.49.tar.xz https://osf.io/download/nyua7/ c8871eab5ee9071b101ba7bbafa7983b 2320.01 +AllTheBacteria/Annotation/Bakta/Incr_release.202408_Set_2 p4kvy atb.bakta.incr_release.202408.batch.50.tar.xz https://osf.io/download/tqrs2/ 887db770a7e176e644a2b413de4b07e5 2169.25 +AllTheBacteria/Annotation/Bakta/Release_0.2_Set_1 tyw72 atb.bakta.r0.2.batch.1.tar.xz https://osf.io/download/vpg3d/ 5e6ccd68f5c6a69a14dafaf599f9377e 293.86 +AllTheBacteria/Annotation/Bakta/Release_0.2_Set_1 tyw72 atb.bakta.r0.2.batch.10.tar.xz https://osf.io/download/gbzev/ 8fab65bbeadc99201a1df83f32b8c779 132.76 +AllTheBacteria/Annotation/Bakta/Release_0.2_Set_9 evurw atb.bakta.r0.2.batch.228.tar.xz https://osf.io/download/679b9dd3bc1fdea8290eb046/ dc061bcc2e1bb9c30887a4dfefb1b6c2 3841.85 +AllTheBacteria/Annotation/Bakta/Release_0.2_Set_9 evurw atb.bakta.r0.2.batch.229.tar.xz 
https://osf.io/download/679ba14cae64ecefd50eaea0/ db0081059366d08e8c3cd8c19857ef42 3902.73 +AllTheBacteria/Assembly zxfmy File_Lists/file_list.all.20240805.tsv.gz https://osf.io/download/dw3h7/ 5fc5c88a0593785341e466143a97f126 17.04 +AllTheBacteria/Assembly zxfmy File_Lists/file_list.all.202505.tsv.gz https://osf.io/download/69a03f582574717cb3643d62/ 0a9ee0b1efaf42b3ea9e89ce91d2b9e1 22.13 +AllTheBacteria/Metadata h7wzy 0.1/HQ_set_0.1/hq_dataset.accessions.txt.gz https://osf.io/download/3znhb/ 6f001b72101779b5ff6a556f46c9ddab 4.11 +AllTheBacteria/Metadata h7wzy 0.1/HQ_set_0.1/hq_dataset.counts_bar_plot.pdf https://osf.io/download/p5vkt/ a147e807f6a661b6f7f6dc856aa0465c 0.02 diff --git a/tests/data/atb/bakta_exact.tsv b/tests/data/atb/bakta_exact.tsv new file mode 100644 index 00000000..5ca71a80 --- /dev/null +++ b/tests/data/atb/bakta_exact.tsv @@ -0,0 +1,3 @@ +project project_id filename url md5 size(MB) +AllTheBacteria/Annotation/Bakta zt57s File_Lists/atb.bakta.incr_release.202408.status.tsv.gz https://osf.io/download/2skzy/ 7da48fce8e310916a072786872f475b1 14.19 +AllTheBacteria/Annotation/Bakta zt57s File_Lists/atb.bakta.r0.2.status.tsv.gz https://osf.io/download/rxfks/ b7255867206dc66f2c26db31921a67a6 53.87 diff --git a/tests/data/atb/bakta_star.tsv b/tests/data/atb/bakta_star.tsv new file mode 100644 index 00000000..4f5d5517 --- /dev/null +++ b/tests/data/atb/bakta_star.tsv @@ -0,0 +1,11 @@ +project project_id filename url md5 size(MB) +AllTheBacteria/Annotation/Bakta zt57s File_Lists/atb.bakta.incr_release.202408.status.tsv.gz https://osf.io/download/2skzy/ 7da48fce8e310916a072786872f475b1 14.19 +AllTheBacteria/Annotation/Bakta zt57s File_Lists/atb.bakta.r0.2.status.tsv.gz https://osf.io/download/rxfks/ b7255867206dc66f2c26db31921a67a6 53.87 +AllTheBacteria/Annotation/Bakta/Incr_release.202408_Set_1 kdnwp atb.bakta.incr_release.202408.batch.1.tar.xz https://osf.io/download/r84xg/ a3ba0148c435f31b4de3f0b72e01075d 1378.51 +AllTheBacteria/Annotation/Bakta/Incr_release.202408_Set_1 kdnwp atb.bakta.incr_release.202408.batch.10.tar.xz https://osf.io/download/qxye9/ e0724b0a605da7298b802f01145d8b49 388.76 +AllTheBacteria/Annotation/Bakta/Incr_release.202408_Set_2 p4kvy atb.bakta.incr_release.202408.batch.49.tar.xz https://osf.io/download/nyua7/ c8871eab5ee9071b101ba7bbafa7983b 2320.01 +AllTheBacteria/Annotation/Bakta/Incr_release.202408_Set_2 p4kvy atb.bakta.incr_release.202408.batch.50.tar.xz https://osf.io/download/tqrs2/ 887db770a7e176e644a2b413de4b07e5 2169.25 +AllTheBacteria/Annotation/Bakta/Release_0.2_Set_1 tyw72 atb.bakta.r0.2.batch.1.tar.xz https://osf.io/download/vpg3d/ 5e6ccd68f5c6a69a14dafaf599f9377e 293.86 +AllTheBacteria/Annotation/Bakta/Release_0.2_Set_1 tyw72 atb.bakta.r0.2.batch.10.tar.xz https://osf.io/download/gbzev/ 8fab65bbeadc99201a1df83f32b8c779 132.76 +AllTheBacteria/Annotation/Bakta/Release_0.2_Set_9 evurw atb.bakta.r0.2.batch.228.tar.xz https://osf.io/download/679b9dd3bc1fdea8290eb046/ dc061bcc2e1bb9c30887a4dfefb1b6c2 3841.85 +AllTheBacteria/Annotation/Bakta/Release_0.2_Set_9 evurw atb.bakta.r0.2.batch.229.tar.xz https://osf.io/download/679ba14cae64ecefd50eaea0/ db0081059366d08e8c3cd8c19857ef42 3902.73 diff --git a/tests/pipelines/conftest.py b/tests/pipelines/conftest.py index 4fefeae0..203038a1 100644 --- a/tests/pipelines/conftest.py +++ b/tests/pipelines/conftest.py @@ -16,7 +16,6 @@ from cdm_data_loaders.pipelines.cts_defaults import ( DEFAULT_CTS_SETTINGS, DEFAULT_START_AT, - VALID_DESTINATIONS, BatchedFileInputSettings, CtsSettings, ) @@ -29,18 +28,38 @@ 
START_AT_VALUE = 50 START_AT_STRING = "50" +CONFIG_BUCKET = {"local_fs": "/output_dir", "s3": "s3://some/s3/bucket"} + TEST_DLT_CONFIG = frozendict( { - "destination.local_fs.bucket_url": "/output_dir", + "destination.local_fs.bucket_url": CONFIG_BUCKET["local_fs"], "destination.local_fs.destination_type": "filesystem", - "destination.s3.bucket_url": "s3://some/s3/bucket", + "destination.s3.bucket_url": CONFIG_BUCKET["s3"], "destination.s3.destination_type": "filesystem", "normalize.data_writer.disable_compression": False, } ) -DESTINATION_OUTPUT = TEST_DLT_CONFIG[f"destination.{DEFAULT_CTS_SETTINGS['use_destination']}.bucket_url"] +def _generate_dlt_config() -> dict[str, Any]: + """Return a fresh DLT config dict (same shape as the conftest fixture).""" + return { + "destination": { + "local_fs": {"bucket_url": CONFIG_BUCKET["local_fs"]}, + "s3": {"bucket_url": CONFIG_BUCKET["s3"]}, + }, + "destination.local_fs.bucket_url": CONFIG_BUCKET["local_fs"], + "destination.s3.bucket_url": CONFIG_BUCKET["s3"], + "normalize.data_writer.disable_compression": False, + } + + +DESTINATION_TO_OUTPUT = { + "local_fs": TEST_DLT_CONFIG["destination.local_fs.bucket_url"], + "s3": TEST_DLT_CONFIG["destination.s3.bucket_url"], +} + +DESTINATION_OUTPUT = DESTINATION_TO_OUTPUT[DEFAULT_CTS_SETTINGS["use_destination"]] DEFAULT_CTS_SETTINGS_RECONCILED = frozendict( { @@ -58,34 +77,33 @@ "dev_mode": "false", "input_dir": "/dir/path", "output": "/some/dir", - "use_destination": VALID_DESTINATIONS[1], + "use_destination": "local_fs", "use_output_dir_for_pipeline_metadata": "true", } ) -TEST_CTS_SETTINGS_EXPECTED = frozendict( +TEST_CTS_SETTINGS_RECONCILED = frozendict( { **TEST_CTS_SETTINGS, "dev_mode": False, "use_output_dir_for_pipeline_metadata": True, + "pipeline_dir": "/some/dir/.dlt_conf", + "raw_data_dir": "/some/dir/raw_data", } ) -TEST_CTS_SETTINGS_RECONCILED = frozendict( - {**TEST_CTS_SETTINGS_EXPECTED, "pipeline_dir": "/some/dir/.dlt_conf", "raw_data_dir": "/some/dir/raw_data"} -) - TEST_BATCH_FILE_SETTINGS = frozendict( **TEST_CTS_SETTINGS, start_at=START_AT_STRING, ) -TEST_BATCH_FILE_SETTINGS_EXPECTED = frozendict( - **TEST_CTS_SETTINGS_EXPECTED, - start_at=START_AT_VALUE, -) TEST_BATCH_FILE_SETTINGS_RECONCILED = frozendict( - {**TEST_BATCH_FILE_SETTINGS_EXPECTED, "pipeline_dir": "/some/dir/.dlt_conf", "raw_data_dir": "/some/dir/raw_data"} + { + **TEST_CTS_SETTINGS_RECONCILED, + "start_at": START_AT_VALUE, + "pipeline_dir": "/some/dir/.dlt_conf", + "raw_data_dir": "/some/dir/raw_data", + } ) @@ -121,16 +139,6 @@ def make_batcher(files: list[Path], batch_size: int = 5) -> MagicMock: return mock_batcher -def _generate_dlt_config() -> dict[str, Any]: - """Return a fresh DLT config dict (same shape as the conftest fixture).""" - return { - "destination": {"local_fs": {"bucket_url": "/output_dir"}, "s3": {"bucket_url": "s3://my-bucket/output"}}, - "destination.local_fs.bucket_url": "/output_dir", - "destination.s3.bucket_url": "s3://my-bucket/output", - "normalize.data_writer.disable_compression": False, - } - - def make_settings( settings_cls: type[CtsSettings], dlt_config: dict[str, Any] | None = None, diff --git a/tests/pipelines/test_all_the_bacteria.py b/tests/pipelines/test_all_the_bacteria.py index cec1f4e2..06c5f458 100644 --- a/tests/pipelines/test_all_the_bacteria.py +++ b/tests/pipelines/test_all_the_bacteria.py @@ -2,21 +2,24 @@ import csv import logging -from collections.abc import Callable +import re from pathlib import Path from typing import Any from unittest.mock import MagicMock, call, patch 
import pytest +from frozendict import frozendict from requests.exceptions import HTTPError from cdm_data_loaders.pipelines import all_the_bacteria, core from cdm_data_loaders.pipelines.all_the_bacteria import ( + ALL_ATB_FILE_NAME, DATASET_NAME, AtbSettings, cli, download_atb_index_tsv, get_file_download_links, + load_patterns, osf_file_downloader, run_atb_pipeline, ) @@ -50,8 +53,25 @@ def test_settings(tmp_path: Path, dlt_config: dict[str, Any]) -> AtbSettings: return AtbSettings(dlt_config=dlt_config, output=str(tmp_path)) -def test_cli_calls_run_ncbi_pipeline(monkeypatch: pytest.MonkeyPatch, dlt_config: dict[str, Any]) -> None: - """Ensure that cli() calls run_ncbi_pipeline with the settings.""" +@pytest.fixture +def test_s3_settings(dlt_config: dict[str, Any]) -> AtbSettings: + """Generate fake settings that use s3.""" + return AtbSettings(dlt_config=dlt_config, use_destination="s3") + + +@pytest.fixture +def pattern_file(tmp_path: Path) -> Path: + """Pattern file for testing load_patterns.""" + p = tmp_path / "patterns.txt" + p.write_text( + "hello world\nfoo.bar\nstarts with*\n2+2=4\n", + encoding="utf-8", + ) + return p + + +def test_cli_calls_run_atb_pipeline(monkeypatch: pytest.MonkeyPatch, dlt_config: dict[str, Any]) -> None: + """Ensure that cli() calls run_atb_pipeline with the settings.""" mock_settings_instance = MagicMock() mock_settings_cls = MagicMock(return_value=mock_settings_instance) mock_run_atb_pipeline = MagicMock() @@ -65,6 +85,92 @@ def test_cli_calls_run_ncbi_pipeline(monkeypatch: pytest.MonkeyPatch, dlt_config mock_run_atb_pipeline.assert_called_once_with(mock_settings_instance) +def test_load_patterns_returns_compiled_pattern(pattern_file: Path) -> None: + """load_patterns() should return a compiled re.Pattern object.""" + assert isinstance(load_patterns(pattern_file), re.Pattern) + + +def test_load_patterns_exact_match(pattern_file: Path) -> None: + """Plain text lines should match exactly against their literal content.""" + pattern = load_patterns(pattern_file) + assert isinstance(pattern, re.Pattern) + assert pattern.match("hello world") + assert pattern.match("foo.bar") + assert pattern.match("2+2=4") + + # patterns are anchored with ^ and $, so partial matches should not succeed + assert not pattern.match("hello world!!!") + assert not pattern.match("well, hello world") + + # dot is literal, not a wildcard + assert not pattern.match("fooXbar") + # + is literal, not a quantifier + assert not pattern.match("22=4") + + # line ending with * should match the prefix followed by any suffix + assert pattern.match("starts with") + assert pattern.match("starts with anything") + assert pattern.match("starts with 123!@#") + + # line ending with * should not match strings with a different prefix + assert not pattern.match("ends with") + + +def test_load_patterns_blank_lines_are_ignored(tmp_path: Path) -> None: + """Blank lines in the file should be silently skipped.""" + p = tmp_path / "patterns.txt" + p.write_text("\nhello\n\nworld\n", encoding="utf-8") + pattern = load_patterns(p) + assert isinstance(pattern, re.Pattern) + assert pattern.match("hello") + assert pattern.match("world") + assert not pattern.match("") + + +def test_load_patterns_only_wildcard_matches_anything(tmp_path: Path) -> None: + """A file containing only '*' should produce a pattern that matches any string.""" + p = tmp_path / "patterns.txt" + p.write_text("*\n", encoding="utf-8") + pattern = load_patterns(p) + assert isinstance(pattern, re.Pattern) + assert pattern.match("") + assert 
pattern.match("anything at all") + + +def test_load_patterns_alternation(tmp_path: Path) -> None: + """Each line in the file should become an alternative in the combined pattern.""" + p = tmp_path / "patterns.txt" + p.write_text("cat\ndog\nbird\n", encoding="utf-8") + pattern = load_patterns(p) + assert isinstance(pattern, re.Pattern) + assert pattern.match("cat") + assert pattern.match("dog") + assert pattern.match("bird") + assert not pattern.match("fish") + + +def test_load_patterns_no_file_returns_none(tmp_path: Path) -> None: + """Ensure that loading a non-existent file returns None.""" + pattern = load_patterns(tmp_path / "some" / "path") + assert pattern is None + + +def test_load_patterns_touched_file_returns_none(tmp_path: Path) -> None: + """Ensure that loading an empty file returns None.""" + p = tmp_path / "patterns.txt" + p.touch() + pattern = load_patterns(p) + assert pattern is None + + +def test_load_patterns_empty_file_returns_none(tmp_path: Path) -> None: + """Ensure that loading an empty file returns None.""" + p = tmp_path / "patterns.txt" + p.write_text("\n\n \t\n \n \n\t\t\n", encoding="utf-8") + pattern = load_patterns(p) + assert pattern is None + + @pytest.mark.vcr def test_download_atb_index_tsv_vcr(test_settings: AtbSettings) -> None: """Ensure that the download_atb_index function fetches the correct file.""" @@ -75,6 +181,27 @@ def test_download_atb_index_tsv_vcr(test_settings: AtbSettings) -> None: assert output_file.parent == raw_data_dir +@pytest.mark.default_cassette("test_download_atb_index_tsv_vcr.yaml") +def test_download_atb_index_tsv_vcr_destination_s3(test_s3_settings: AtbSettings) -> None: + """Ensure that the download_atb_index function fetches the correct file.""" + mock_download_client = MagicMock() + mock_stream_to_s3 = MagicMock() + mock_requests = MagicMock() + with ( + patch("cdm_data_loaders.pipelines.all_the_bacteria.FileDownloader", return_value=mock_download_client), + patch("cdm_data_loaders.pipelines.all_the_bacteria.stream_to_s3", mock_stream_to_s3), + patch("cdm_data_loaders.pipelines.all_the_bacteria.requests", mock_requests), + ): + output_file = download_atb_index_tsv(test_s3_settings) + + download_url = "https://osf.io/download/r6gcp/" + mock_stream_to_s3.assert_called_once_with( + url=download_url, s3_path=f"{test_s3_settings.raw_data_dir}/{ALL_ATB_FILE_NAME}", requests=mock_requests + ) + mock_download_client.download.assert_called_once_with(url=download_url, destination=Path(ALL_ATB_FILE_NAME)) + assert output_file == Path(ALL_ATB_FILE_NAME) + + @pytest.mark.vcr def test_download_atb_index_tsv_error_404(test_settings: AtbSettings) -> None: """Ensure that a 404 response causes an error and the function to die.""" @@ -82,6 +209,31 @@ def test_download_atb_index_tsv_error_404(test_settings: AtbSettings) -> None: download_atb_index_tsv(test_settings) +@pytest.mark.default_cassette("test_download_atb_index_tsv_vcr.yaml") +def test_download_atb_index_s3_error_boom(test_s3_settings: AtbSettings, caplog: pytest.LogCaptureFixture) -> None: + """Ensure that an error in the s3 upload causes things to die unpleasantly.""" + mock_download_client = MagicMock() + mock_stream_to_s3 = MagicMock(side_effect=ValueError("ZOMG!")) + mock_requests = MagicMock() + with ( + patch("cdm_data_loaders.pipelines.all_the_bacteria.FileDownloader", return_value=mock_download_client), + patch("cdm_data_loaders.pipelines.all_the_bacteria.stream_to_s3", mock_stream_to_s3), + patch("cdm_data_loaders.pipelines.all_the_bacteria.requests", mock_requests), + 
pytest.raises(ValueError, match="ZOMG!"), + ): + download_atb_index_tsv(test_s3_settings) + + download_url = "https://osf.io/download/r6gcp/" + mock_stream_to_s3.assert_called_once_with( + url=download_url, s3_path=f"{test_s3_settings.raw_data_dir}/{ALL_ATB_FILE_NAME}", requests=mock_requests + ) + mock_download_client.assert_not_called() + mock_download_client.download.assert_not_called() + last_log_record = caplog.records.pop() + assert last_log_record.levelno == logging.ERROR + assert last_log_record.message == f"Could not transfer {ALL_ATB_FILE_NAME} to s3" + + @pytest.mark.vcr def test_download_atv_index_tsv_error_missing_key(test_settings: AtbSettings) -> None: """Ensure that the lack of a download link in the response throws an error.""" @@ -102,10 +254,10 @@ def test_download_atv_index_tsv_error_cannot_download_tsv(test_settings: AtbSett download_atb_index_tsv(test_settings) -def test_get_file_download_links() -> None: - """Ensure that the appropriate files are picked out of the ATB file index TSV file.""" +def test_get_file_download_links(test_settings: AtbSettings) -> None: + """Ensure that the appropriate files are picked out of the ATB file index TSV file using the default matcher.""" file_path = Path("tests") / "data" / "atb" / "all_atb_files.tsv" - filtered_files = list(get_file_download_links(file_path)) + filtered_files = list(get_file_download_links(test_settings, file_path)) # load the expected results expected = Path("tests") / "data" / "atb" / "filtered_files.tsv" with expected.open() as fh: @@ -115,11 +267,41 @@ def test_get_file_download_links() -> None: assert filtered_files[0] == expected_files -def test_get_file_download_links_invalid_file(caplog: pytest.LogCaptureFixture) -> None: +EXPECTED_LINES = { + "*": "all_atb_files.tsv", + "AllTheBacteria/Annotation/Bakta\nAllTheBacteria/Assembly\nAllTheBacteria/Metadata\n": "assembly_bakta_metadata_exact.tsv", + "AllTheBacteria/Annotation/Bakta*\nAllTheBacteria/Assembly\nAllTheBacteria/Metadata\n": "assembly_bakta_star_metadata.tsv", + "AllTheBacteria/Annotation/Bakta": "bakta_exact.tsv", + "AllTheBacteria/Annotation/Bakta*": "bakta_star.tsv", +} + + +@pytest.mark.parametrize("pattern_lines", EXPECTED_LINES) +def test_get_file_download_links_use_pattern_file( + tmp_path: Path, dlt_config: dict[str, Any], pattern_lines: str +) -> None: + """Generate a pattern file from EXPECTED_LINES and check that the output from get_file_download_links is correct.""" + # create the pattern file + p = tmp_path / "patterns.txt" + p.write_text(f"{pattern_lines}\n", encoding="utf-8") + + settings = AtbSettings(dlt_config=dlt_config, input_dir=str(tmp_path), pattern_file="patterns.txt") + file_path = Path("tests") / "data" / "atb" / "all_atb_files.tsv" + filtered_files = list(get_file_download_links(settings, file_path)) + # load the expected results + expected = Path("tests") / "data" / "atb" / EXPECTED_LINES[pattern_lines] + with expected.open() as fh: + reader = csv.DictReader(fh, delimiter="\t") + expected_files = list(reader) + assert len(filtered_files[0]) > 1 + assert filtered_files[0] == expected_files + + +def test_get_file_download_links_invalid_file(test_settings: AtbSettings, caplog: pytest.LogCaptureFixture) -> None: """Ensure that the correct fields are present in the ATB TSV file and throw an error if not.""" file_path = Path("tests") / "data" / "atb" / "invalid_atb_files.tsv" with pytest.raises(RuntimeError, match="Missing required ATB file index TSV headers"): - list(get_file_download_links(file_path)) + 
list(get_file_download_links(test_settings, file_path)) records = caplog.records assert records[-1].levelno == logging.ERROR assert records[-1].message.startswith( @@ -129,143 +311,244 @@ def test_get_file_download_links_invalid_file(caplog: pytest.LogCaptureFixture) assert records[-2].message.startswith("ATB file index TSV headers have changed.") -def test_get_file_download_links_empty_file(caplog: pytest.LogCaptureFixture, tmp_path: Path) -> None: +def test_get_file_download_links_empty_file( + test_settings: AtbSettings, caplog: pytest.LogCaptureFixture, tmp_path: Path +) -> None: """Ensure that an empty file causes a runtime error.""" file_path = tmp_path / "fake_file.tsv" file_path.touch() with pytest.raises(RuntimeError, match=f"No valid TSV data found in {file_path!s}"): - list(get_file_download_links(file_path)) + list(get_file_download_links(test_settings, file_path)) records = caplog.records assert records[-1].levelno == logging.ERROR assert records[-1].message == f"No valid TSV data found in {file_path!s}" -def test_get_file_download_links_no_file() -> None: +def test_get_file_download_links_no_file(test_settings: AtbSettings) -> None: """Ensure an error is thrown if the file cannot be found.""" file_path = Path("/path") / "to" / "file" with pytest.raises(FileNotFoundError, match="No such file or directory"): - list(get_file_download_links(file_path)) + list(get_file_download_links(test_settings, file_path)) + + +FILE_DOWNLOADER_OUTPUT = [ + frozendict( + { + "filename": "file1.txt", + "url": "https://osf.io/file1", + "md5": "md5sum1", + "project": "AllTheBacteria/Annotation/Project", + "path": "Annotation/Project/file1.txt", + } + ), + frozendict( + { + "filename": "file2.txt", + "url": "https://osf.io/file2", + "md5": "md5sum2", + "project": "AllTheBacteria/Side/Project/", + "path": "Side/Project/file2.txt", + } + ), + frozendict( + { + "filename": "some/path/to/file3.txt", + "url": "https://osf.io/file3", + "md5": "md5sum3", + "project": "AllTheBacteria", + "path": "some/path/to/file3.txt", + } + ), + frozendict( + { + "filename": "not/least/file4.txt", + "url": "https://osf.io/file4", + "md5": "md5sum4", + "project": "AllTheBacteria/last/but", + "path": "last/but/not/least/file4.txt", + } + ), +] # osf_file_downloader tests @pytest.mark.parametrize( - ("atb_file_list", "expected_calls", "expected_paths"), + "atb_input", [ - ( - [ - {"filename": "file1.txt", "url": "https://osf.io/file1", "md5": "md5sum1"}, - {"filename": "file2.txt", "url": "https://osf.io/file2", "md5": "md5sum2"}, - ], - [ - ( - "https://osf.io/file1", - "file1.txt", - "md5sum1", - ), - ( - "https://osf.io/file2", - "file2.txt", - "md5sum2", - ), - ], - ["file1.txt", "file2.txt"], - ), + # each of the files singly and then the whole lot as a batch + *[[f] for f in FILE_DOWNLOADER_OUTPUT], + FILE_DOWNLOADER_OUTPUT, ], ) +@pytest.mark.parametrize("use_destination", VALID_DESTINATIONS) def test_osf_file_downloader_success( - test_settings: AtbSettings, - atb_file_list: list[dict[str, Any]], - expected_calls: list[tuple[str, str, str]], - expected_paths: list[str], + request: pytest.FixtureRequest, + atb_input: list[dict[str, Any]], + use_destination: str, caplog: pytest.LogCaptureFixture, ) -> None: """Ensure that the osf_file_downloader function correctly calls the download client for each file.""" + settings = request.getfixturevalue("test_s3_settings" if use_destination == "s3" else "test_settings") + + atb_file_list = [{k: v for k, v in f.items() if k != "path"} for f in atb_input] + + # expected_output 
path needs the raw data dir adding to it + expected_output = [dict(f.items()) for f in atb_input] + for f in expected_output: + f["path"] = f"{settings.raw_data_dir}/{f['path']}" + mock_download_client = MagicMock() - mock_logger = MagicMock() + mock_stream_to_s3 = MagicMock() + mock_requests = MagicMock() with ( patch("cdm_data_loaders.pipelines.all_the_bacteria.FileDownloader", return_value=mock_download_client), - patch("cdm_data_loaders.pipelines.all_the_bacteria.logger", mock_logger), + patch("cdm_data_loaders.pipelines.all_the_bacteria.stream_to_s3", mock_stream_to_s3), + patch("cdm_data_loaders.pipelines.all_the_bacteria.requests", mock_requests), ): - output = list(osf_file_downloader(test_settings, atb_file_list)) - - for item, filename in zip(atb_file_list, expected_paths, strict=True): - assert item["path"] == str(Path(test_settings.raw_data_dir) / filename) - - assert output[0].data == [ - {**f, "path": str(Path(test_settings.raw_data_dir) / f["filename"])} for f in atb_file_list - ] - - assert mock_download_client.download.call_count == len(expected_calls) - - for url, filename, checksum in expected_calls: - mock_download_client.download.assert_any_call( - url, - Path(test_settings.raw_data_dir) / filename, - expected_checksum=checksum, - checksum_fn="md5", - ) + # get the output from the generator + output = list(osf_file_downloader(settings, atb_file_list)) + + # should have a separate call for each file downloaded + if use_destination == "s3": + mock_download_client.download.assert_not_called() + call_args = [c.kwargs for c in mock_stream_to_s3.call_args_list] + assert call_args == [ + { + "url": f["url"], + "s3_path": f["path"], + "requests": mock_requests, + } + for f in atb_file_list + ] + else: + assert mock_download_client.download.call_count == len(atb_file_list) + + call_list = [c.kwargs for c in mock_download_client.download.call_args_list] + expected_calls = [ + { + "url": f["url"], + "destination": Path(settings.raw_data_dir) / f["path"], + "expected_checksum": f["md5"], + "checksum_fn": "md5", + } + for f in atb_input + ] + assert call_list == expected_calls + + # output from dlt.mark.with_table_name + assert output[0].data == expected_output + # the input args are mutated in place + assert atb_file_list == expected_output - mock_logger.assert_not_called() # no logs should be emitted for successful downloads assert caplog.records == [] @pytest.mark.parametrize( - ("atb_file_list", "download_side_effect", "expected_exceptions", "expected_paths"), + ("atb_file_list", "expected_exceptions", "expected_paths"), [ ( [ - {"filename": "good_file.txt", "url": "https://osf.io/good", "md5": "md5sum1"}, - {"filename": "great_file.txt", "url": "https://osf.io/great", "md5": "md5sum2"}, - {"filename": "bad_file.txt", "url": "https://osf.io/bad", "md5": "badmd5"}, + { + "project": "AllTheBacteria/One", + "filename": "Two/good_file.txt", + "url": "https://osf.io/good", + "md5": "md5sum1", + }, + { + "project": "AllTheBacteria/One/Two", + "filename": "great_file.txt", + "url": "https://osf.io/great", + "md5": "md5sum2", + }, + { + "project": "AllTheBacteria/One", + "filename": "fail.txt", + "url": "https://osf.io/fail", + "md5": "badmd5", + }, ], - lambda url, _save_path, **_kwargs: ( - (_ for _ in ()).throw(RuntimeError("download failed")) if url == "https://osf.io/bad" else None - ), - ["Could not download file from https://osf.io/bad: download failed"], - {"good_file.txt": True, "great_file.txt": True, "bad_file.txt": False}, + ["Could not download file from 
https://osf.io/fail: Loser!"], + {"Two/good_file.txt": True, "great_file.txt": True, "fail.txt": False}, ), ( [ - {"filename": "bad_file.txt", "url": "https://osf.io/bad", "md5": "badmd5"}, - {"filename": "even_worse.txt", "url": "https://osf.io/even_worse", "md5": "badmd5"}, + {"project": "Dud", "filename": "bad_file.txt", "url": "https://osf.io/bad", "md5": "badmd5"}, + { + "project": "Dud", + "filename": "also_very_bad.txt", + "url": "https://osf.io/also_very_bad", + "md5": "badmd5", + }, ], - lambda _url, _save_path, **_kwargs: (_ for _ in ()).throw(Exception("Boom!")), [ - "Could not download file from https://osf.io/bad: Boom!", - "Could not download file from https://osf.io/even_worse: Boom!", + "Could not download file from https://osf.io/bad: BOOM!", + "Could not download file from https://osf.io/also_very_bad: BOOM!", ], - {"bad_file.txt": False, "even_worse.txt": False}, + {"bad_file.txt": False, "also_very_bad.txt": False}, ), ], ) -def test_osf_file_downloader_error_handling( - test_settings: AtbSettings, +@pytest.mark.parametrize("use_destination", VALID_DESTINATIONS) +def test_osf_file_downloader_error_handling( # noqa: PLR0913 atb_file_list: list[dict[str, Any]], - download_side_effect: Callable, expected_exceptions: list[str], expected_paths: dict[str, bool], + use_destination: str, + caplog: pytest.LogCaptureFixture, + request: pytest.FixtureRequest, ) -> None: """Ensure that errors during file download are handled correctly.""" + settings = request.getfixturevalue("test_s3_settings" if use_destination == "s3" else "test_settings") + + def file_downloader_boom(**args) -> None: # noqa: ANN003 + if "url" in args: + if "bad" in args["url"]: + msg = "BOOM!" + raise ValueError(msg) + if "fail" in args["url"]: + msg_0 = "Loser!" + raise RuntimeError(msg_0) + + mock_requests = MagicMock() mock_download_client = MagicMock() - mock_download_client.download.side_effect = download_side_effect - mock_logger = MagicMock() + mock_download_client.download.side_effect = file_downloader_boom + mock_stream_to_s3 = MagicMock(side_effect=file_downloader_boom) with ( patch("cdm_data_loaders.pipelines.all_the_bacteria.FileDownloader", return_value=mock_download_client), - patch("cdm_data_loaders.pipelines.all_the_bacteria.logger", mock_logger), + patch("cdm_data_loaders.pipelines.all_the_bacteria.stream_to_s3", mock_stream_to_s3), + patch("cdm_data_loaders.pipelines.all_the_bacteria.requests", mock_requests), ): - list(osf_file_downloader(test_settings, atb_file_list)) + list(osf_file_downloader(settings, atb_file_list)) + + if use_destination == "s3": + mock_download_client.download.assert_not_called() + call_args = [c.kwargs for c in mock_stream_to_s3.call_args_list] + assert call_args == [ + { + "url": f["url"], + "s3_path": f"{settings.raw_data_dir}/{f['project'].replace('AllTheBacteria/', '')}/{f['filename']}", + "requests": mock_requests, + } + for f in atb_file_list + ] + else: + mock_stream_to_s3.assert_not_called() + + expected_file_names = { + "Two/good_file.txt": f"{settings.raw_data_dir}/One/Two/good_file.txt", + "great_file.txt": f"{settings.raw_data_dir}/One/Two/great_file.txt", + } for item in atb_file_list: if item["filename"] in expected_paths and expected_paths[item["filename"]]: - assert item["path"] == str(Path(test_settings.raw_data_dir) / item["filename"]) + assert item["path"] == expected_file_names[item["filename"]] else: assert "path" not in item - # FIXME: why is caplog not working here? Ideally this should use caplog instead of a mock logger. 
- exception_call_args = [call.args[0] for call in mock_logger.exception.call_args_list] - assert exception_call_args == expected_exceptions + log_messages = [r.message for r in caplog.records] + assert expected_exceptions == log_messages def test_run_atb_pipeline( diff --git a/tests/pipelines/test_core.py b/tests/pipelines/test_core.py index c6d57a75..0cc15b48 100644 --- a/tests/pipelines/test_core.py +++ b/tests/pipelines/test_core.py @@ -66,7 +66,7 @@ def test_cts_settings() -> CtsSettings: params=[ pytest.param({"input_dir": "/fake/input"}, id="default"), pytest.param( - {"input_dir": "/path/to/dir", "use_destination": "s3", "start_at": 15, "output": "/some/dir"}, + {"input_dir": "/path/to/dir", "use_destination": "local_fs", "start_at": 15, "output": "/some/dir"}, id="alt", ), ] @@ -372,9 +372,8 @@ def test_run_cli_no_slack_env_var_when_vars_missing( # dlt.config state after successful run @pytest.mark.parametrize("settings_cls", SETTINGS_CLASSES) -@pytest.mark.parametrize("use_destination", VALID_DESTINATIONS) @pytest.mark.parametrize("dev_mode", [True, False]) -@pytest.mark.parametrize("output", ["/some/path", "s3://bucket/whatever"]) +@pytest.mark.parametrize(("use_destination", "output"), [("local_fs", "/some/path"), ("s3", "s3://bucket/whatever")]) def test_run_cli_dlt_config_updated_after_success( dlt_config: dict[str, Any], settings_cls: type[CtsSettings], dev_mode: bool, use_destination: str, output: str ) -> None: @@ -413,7 +412,7 @@ def test_run_pipeline_destination_pipeline_pipeline_run_kwargs_set( pipeline_run_kwargs: dict[str, Any] | None, ) -> None: """Ensure a non-empty output sets the correct dlt.config bucket_url key.""" - settings = make_batched_settings(input_dir="/i", output="/custom/output", use_destination="s3") + settings = make_batched_settings(input_dir="/i", output="/custom/output", use_destination="local_fs") fake_resource = MagicMock() run_pipeline( settings, @@ -425,7 +424,7 @@ def test_run_pipeline_destination_pipeline_pipeline_run_kwargs_set( assert_pipeline_run_correctly( mock_dlt, fake_resource, - "s3", + "local_fs", destination_kwargs, pipeline_kwargs, pipeline_run_kwargs, @@ -794,7 +793,7 @@ def test_integration_empty_input_pipeline_run_still_called( # test run_cli + stream_xml_file_resource + run_pipeline -@pytest.mark.use_fixtures("patched_io_empty_batcher", "test_bfi_settings") +@pytest.mark.usefixtures("patched_io_empty_batcher", "test_bfi_settings") def test_integration_run_cli_calls_pipeline_fn_with_config(mock_dlt: MagicMock) -> None: """The exact config produced by run_cli reaches stream_xml_file_resource unchanged.""" received: list[CtsSettings] = [] diff --git a/tests/pipelines/test_cts_defaults.py b/tests/pipelines/test_cts_defaults.py index 035b81d6..26428a3d 100644 --- a/tests/pipelines/test_cts_defaults.py +++ b/tests/pipelines/test_cts_defaults.py @@ -1,6 +1,5 @@ """Tests for the Settings objects used by DLT pipelines.""" -from pathlib import Path from typing import Any import pytest @@ -17,12 +16,14 @@ from tests.pipelines.conftest import ( DEFAULT_BATCH_FILE_SETTINGS_RECONCILED, DEFAULT_CTS_SETTINGS_RECONCILED, + DESTINATION_TO_OUTPUT, TEST_BATCH_FILE_SETTINGS, TEST_BATCH_FILE_SETTINGS_RECONCILED, TEST_CTS_SETTINGS, TEST_CTS_SETTINGS_RECONCILED, check_settings, make_settings, + make_settings_autofill_config, ) SETTINGS_CLASSES = [CtsSettings, BatchedFileInputSettings] @@ -30,6 +31,31 @@ INVALID_DESTINATIONS = ["gcs", "filesystem", "", "LocalFs", "S3"] INVALID_BOOLEAN_VALUES = ["what", "yep", "nope", "2", -1, "", " ", "wtf", None] +S3 
= "is_s3" +OUT = "output" +RAW = "raw_data_dir" +PIPE = "pipeline_dir" + + +# manually specify to avoid recapitulating logic +OUTPUT_PATHS: dict[str, dict[str, Any]] = { + "": {S3: False, OUT: "", RAW: "raw_data", PIPE: ".dlt_conf"}, + "/": {S3: False, OUT: "/", RAW: "/raw_data", PIPE: "/.dlt_conf"}, + # from destination.local_fs + "/output_dir": {S3: False, OUT: "/output_dir", RAW: "/output_dir/raw_data", PIPE: "/output_dir/.dlt_conf"}, + "/output/dir": {S3: False, OUT: "/output/dir", RAW: "/output/dir/raw_data", PIPE: "/output/dir/.dlt_conf"}, + "s3/some/path/": {S3: False, OUT: "s3/some/path", RAW: "s3/some/path/raw_data", PIPE: "s3/some/path/.dlt_conf"}, + # normalised form of the above + "s3/some/path": {S3: False, OUT: "s3/some/path", RAW: "s3/some/path/raw_data", PIPE: "s3/some/path/.dlt_conf"}, + "s3a://bucket/key": {S3: True, OUT: "s3a://bucket/key", RAW: "s3a://bucket/key/raw_data", PIPE: None}, + "s3://test/bucket/": {S3: True, OUT: "s3://test/bucket", RAW: "s3://test/bucket/raw_data", PIPE: None}, + # normalised from above + "s3://test/bucket": {S3: True, OUT: "s3://test/bucket", RAW: "s3://test/bucket/raw_data", PIPE: None}, + # from destination.s3 + "s3://some/s3/bucket": {S3: True, OUT: "s3://some/s3/bucket", RAW: "s3://some/s3/bucket/raw_data", PIPE: None}, +} + + # a whole load of values that Pydantic will coerce to a boolean TRUE_FALSE_VALUES = [ ("0", False), @@ -344,48 +370,130 @@ def test_settings_reconcile_with_dlt_config_output_resolved_from_dlt_config_buck assert s.output == dlt_config[f"destination.{use_destination}.bucket_url"] -# properties derived from self.output +# properties derived from self.output: pipeline_dir and raw_data_dir @pytest.mark.parametrize("settings_cls", SETTINGS_CLASSES) -@pytest.mark.parametrize("output", ["", "/output/dir", "some/convoluted/path/to/dir/"]) +@pytest.mark.parametrize( + "output", + list(OUTPUT_PATHS.keys()), +) @pytest.mark.parametrize("use_output_dir_for_pipeline_metadata", [True, False]) @pytest.mark.parametrize("use_destination", VALID_DESTINATIONS) def test_settings_generate_pipeline_raw_data_dirs( settings_cls: type[CtsSettings], output: str, use_output_dir_for_pipeline_metadata: bool, - dlt_config: dict[str, Any], use_destination: str, ) -> None: - """Ensure that the correct paths are generated for pipeline and raw data directories.""" - s = make_settings( - settings_cls, - dlt_config=dlt_config, - output=output, - use_destination=use_destination, - use_output_dir_for_pipeline_metadata=use_output_dir_for_pipeline_metadata, - ) + """Ensure that the correct paths are generated for pipeline and raw data directories. + + Ensure that the destination set in `use_destination` concurs with any output path set. + + Ensure that pipeline directories cannot be set if the output is set to s3. 
+ """ + make_settings_args = { + "output": output, + "use_destination": use_destination, + "use_output_dir_for_pipeline_metadata": use_output_dir_for_pipeline_metadata, + } expected = { **DEFAULT_CTS_SETTINGS_RECONCILED, "use_destination": use_destination, "use_output_dir_for_pipeline_metadata": use_output_dir_for_pipeline_metadata, - "output": output.rstrip("/") or dlt_config[f"destination.{use_destination}.bucket_url"], + "output": DESTINATION_TO_OUTPUT[use_destination] if output == "" else OUTPUT_PATHS[output][OUT], } - if settings_cls == BatchedFileInputSettings: expected["start_at"] = DEFAULT_START_AT - # list containing the projected raw_data_dir and pipeline_dir - expected_properties = { - "": [str(Path(expected["output"]) / "raw_data"), str(Path(expected["output"]) / ".dlt_conf")], - "/output/dir": [str(Path("/output/dir") / "raw_data"), str(Path("/output/dir") / ".dlt_conf")], - "some/convoluted/path/to/dir/": [ - str(Path("some/convoluted/path/to/dir") / "raw_data"), - str(Path("some/convoluted/path/to/dir") / ".dlt_conf"), - ], + if (OUTPUT_PATHS[expected["output"]][S3] and use_destination == "local_fs") or ( + OUTPUT_PATHS[expected["output"]][S3] is False and use_destination == "s3" + ): + with pytest.raises(ValueError, match="Mismatch between output location and use_destination"): + make_settings_autofill_config(settings_cls, **make_settings_args) + return + + if use_output_dir_for_pipeline_metadata and OUTPUT_PATHS[expected["output"]][S3] is True: + # can't have pipeline dir on s3 + with pytest.raises(ValueError, match="It is not currently possible to have the pipeline directory on s3"): + make_settings_autofill_config(settings_cls, **make_settings_args) + return + + s = make_settings_autofill_config(settings_cls, **make_settings_args) + + # get the pipeline and raw data dirs from OUTPUT_PATHS + expected["raw_data_dir"] = OUTPUT_PATHS[expected["output"]][RAW] + # No pipeline_dir if use_output_dir_for_pipeline_metadata is not set + expected["pipeline_dir"] = OUTPUT_PATHS[expected["output"]][PIPE] if use_output_dir_for_pipeline_metadata else None + check_settings(s, expected) + + +@pytest.mark.parametrize("settings_cls", SETTINGS_CLASSES) +@pytest.mark.parametrize( + "output", + list(OUTPUT_PATHS.keys()), +) +@pytest.mark.parametrize("use_output_dir_for_pipeline_metadata", [True, False]) +@pytest.mark.parametrize("use_destination", VALID_DESTINATIONS) +def test_cli_app_run_generate_pipeline_raw_data_dirs( + settings_cls: type[CtsSettings], + output: str, + use_output_dir_for_pipeline_metadata: bool, + use_destination: str, + dlt_config: dict[str, Any], +) -> None: + """Ensure that the correct paths are generated for pipeline and raw data directories. + + Ensure that the destination set in `use_destination` concurs with any output path set. + + Ensure that pipeline directories cannot be set if the output is set to s3. 
+ """ + make_settings_args = [ + "--output", + output, + "--use_destination", + use_destination, + "--use_output_dir_for_pipeline_metadata", + str(use_output_dir_for_pipeline_metadata), + ] + + expected = { + **DEFAULT_CTS_SETTINGS_RECONCILED, + "use_destination": use_destination, + "use_output_dir_for_pipeline_metadata": use_output_dir_for_pipeline_metadata, + "output": DESTINATION_TO_OUTPUT[use_destination] if output == "" else OUTPUT_PATHS[output][OUT], } + if settings_cls == BatchedFileInputSettings: + expected["start_at"] = DEFAULT_START_AT + + if (OUTPUT_PATHS[expected["output"]][S3] and use_destination == "local_fs") or ( + OUTPUT_PATHS[expected["output"]][S3] is False and use_destination == "s3" + ): + with pytest.raises(ValueError, match="Mismatch between output location and use_destination"): + CliApp.run( + settings_cls, + dlt_config=dlt_config, + cli_args=make_settings_args, + ) + return + + if use_output_dir_for_pipeline_metadata and OUTPUT_PATHS[expected["output"]][S3] is True: + # can't have pipeline dir on s3 + with pytest.raises(ValueError, match="It is not currently possible to have the pipeline directory on s3"): + CliApp.run( + settings_cls, + dlt_config=dlt_config, + cli_args=make_settings_args, + ) + return - expected["raw_data_dir"] = expected_properties[output][0] - expected["pipeline_dir"] = expected_properties[output][1] if use_output_dir_for_pipeline_metadata else None + s = CliApp.run( + settings_cls, + dlt_config=dlt_config, + cli_args=make_settings_args, + ) + # get the pipeline and raw data dirs from OUTPUT_PATHS + expected["raw_data_dir"] = OUTPUT_PATHS[expected["output"]][RAW] + # No pipeline_dir if use_output_dir_for_pipeline_metadata is not set + expected["pipeline_dir"] = OUTPUT_PATHS[expected["output"]][PIPE] if use_output_dir_for_pipeline_metadata else None check_settings(s, expected) diff --git a/tests/utils/test_cdm_logger.py b/tests/utils/test_cdm_logger.py new file mode 100644 index 00000000..852f7be5 --- /dev/null +++ b/tests/utils/test_cdm_logger.py @@ -0,0 +1,391 @@ +"""Tests for cdm_data_loaders/utils/cdm_logger.py.""" + +import json +import logging +import logging.handlers +from collections.abc import Generator +from copy import deepcopy +from pathlib import Path +from typing import Any + +import pytest +from frozendict import frozendict + +import cdm_data_loaders.utils.cdm_logger as cdm_logger_module +from cdm_data_loaders.utils.cdm_logger import ( + DEFAULT_LOGGER_NAME, + JSON_LOG_CONFIG, + LOG_FILENAME, + LOGGING_CONFIG, + MAX_LOG_BACKUPS, + MAX_LOG_FILE_SIZE, + _attach_file_handler, + _load_logging_config, + _set_level_safe, + get_cdm_logger, + init_logger, +) + +# Add near the top of the test file, alongside the other imports +MODULE_LOGGER_NAME = "cdm_data_loaders.utils.cdm_logger" + + +VALID_JSON_CONFIG = frozendict( + { + "version": 1, + "disable_existing_loggers": False, + "handlers": { + "CONFIGURE_HANDLER_NAME": { + "class": "logging.StreamHandler", + "formatter": "json", + "level": "INFO", + "stream": "ext://sys.stdout", + } + }, + "formatters": {"json": {"format": JSON_LOG_CONFIG}}, + "loggers": {DEFAULT_LOGGER_NAME: {"level": "INFO", "handlers": ["CONFIGURE_HANDLER_NAME"]}}, + } +) + + +def _write_config(path: Path, test_name: str, config: dict[str, Any] | None = None) -> Path: + """Write a JSON logging config to path, using VALID_JSON_CONFIG by default.""" + # edit the config to ensure that it is recognisable as being from a specific source + if not config: + config = deepcopy(dict(VALID_JSON_CONFIG)) + # switch out the handler 
name for a test-specific name + config["loggers"][DEFAULT_LOGGER_NAME]["handlers"] = [test_name] + config["handlers"][test_name] = config["handlers"].pop("CONFIGURE_HANDLER_NAME") + + path.write_text(json.dumps(config)) + return path + + +@pytest.fixture(autouse=True) +def reset_logging() -> Generator[None, Any]: + """Remove CDM and dlt loggers from the manager and clear their handlers before and after tests.""" + + def _clean() -> None: + for name in (DEFAULT_LOGGER_NAME, "dlt"): + logger = logging.root.manager.loggerDict.pop(name, None) + if isinstance(logger, logging.Logger): + logger.handlers.clear() + + _clean() + yield + _clean() + + +@pytest.fixture +def clean_env(monkeypatch: pytest.MonkeyPatch) -> None: + """Remove logging-related env vars during tests.""" + monkeypatch.delenv("LOG_LEVEL", raising=False) + monkeypatch.delenv("LOG_CONFIG_FILE", raising=False) + monkeypatch.delenv("ENABLE_FILE_LOGGING", raising=False) + + +@pytest.fixture +def empty_cwd(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + """Remove the LOG_CONFIG_FILE env var and chdir to an empty temporary directory with no config file.""" + monkeypatch.chdir(tmp_path) + monkeypatch.delenv("LOG_CONFIG_FILE", raising=False) + + +@pytest.fixture +def cdm_logger(clean_env: None, empty_cwd: None) -> logging.Logger: # noqa: ARG001 + """Return a CDM logger initialised with LOGGING_CONFIG.""" + return init_logger() + + +@pytest.fixture +def dlt_logger() -> logging.Logger: + """Register a pre-configured dlt logger in the logging manager.""" + logger = logging.getLogger("dlt") + logger.setLevel(logging.WARNING) + return logger + + +@pytest.fixture +def config_in_cwd(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + """Write a valid config file in a temporary dir and chdir into it, simulating a config found in the CWD.""" + path = _write_config(tmp_path / cdm_logger_module.LOGGING_CONFIG_FILENAME, "config_in_cwd") + monkeypatch.chdir(tmp_path) + monkeypatch.delenv("LOG_CONFIG_FILE", raising=False) + return path + + +@pytest.fixture +def config_at_explicit_path(tmp_path: Path) -> Path: + """Write a valid config file in a temporary directory.""" + return _write_config(tmp_path / "custom_logging_config.json", "config_at_explicit_path") + + +# _load_logging_config — resolution order +@pytest.mark.usefixtures("config_in_cwd") +def test_load_logging_config_uses_explicit_path_first(config_at_explicit_path: Path) -> None: + """Ensure that the config file argument is used in preference to other choices.""" + config = _load_logging_config(config_file=config_at_explicit_path) + assert config["disable_existing_loggers"] is False + assert config["loggers"][DEFAULT_LOGGER_NAME]["handlers"] == ["config_at_explicit_path"] + assert set(config["handlers"]) == {"config_at_explicit_path"} + + +@pytest.mark.usefixtures("config_in_cwd") +def test_load_logging_config_uses_env_var_over_cwd( + config_at_explicit_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure that the LOG_CONFIG_FILE env var is used in preference to the config file in the CWD.""" + monkeypatch.setenv("LOG_CONFIG_FILE", str(config_at_explicit_path)) + config = _load_logging_config() + assert config["loggers"][DEFAULT_LOGGER_NAME]["handlers"] == ["config_at_explicit_path"] + assert set(config["handlers"]) == {"config_at_explicit_path"} + + +@pytest.mark.usefixtures("config_in_cwd") +def test_load_logging_config_uses_cwd_when_no_arg_or_env(monkeypatch: pytest.MonkeyPatch) -> None: + """When no argument or
env var is provided, the logging_config.json in the current working directory is loaded.""" + monkeypatch.delenv("LOG_CONFIG_FILE", raising=False) + config = _load_logging_config() + assert config["disable_existing_loggers"] is False + assert config["loggers"][DEFAULT_LOGGER_NAME]["handlers"] == ["config_in_cwd"] + assert set(config["handlers"]) == {"config_in_cwd"} + + +@pytest.mark.usefixtures("empty_cwd") +def test_load_logging_config_falls_back_to_frozendict_when_all_sources_fail(caplog: pytest.LogCaptureFixture) -> None: + """Ensure that logging falls back to the default frozendict if no other sources are found.""" + with caplog.at_level(logging.WARNING, logger=MODULE_LOGGER_NAME): + config = _load_logging_config() + assert config == {**LOGGING_CONFIG, "disable_existing_loggers": False} + log_message = caplog.records[-1] + assert log_message.message == "No logging config file found. Falling back to built-in config." + + +@pytest.mark.usefixtures("config_in_cwd") +def test_load_logging_config_skips_bad_argument_path_and_tries_next( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +) -> None: + """Ensure that a non-existent or invalid config file is ignored and the next source tried.""" + monkeypatch.delenv("LOG_CONFIG_FILE", raising=False) + with caplog.at_level(logging.DEBUG, logger=MODULE_LOGGER_NAME): + config = _load_logging_config(config_file=tmp_path / "nonexistent.json") + assert config["disable_existing_loggers"] is False + assert any("nonexistent.json" in m for m in caplog.messages) + + +@pytest.mark.usefixtures("config_in_cwd") +def test_load_logging_config_skips_bad_env_var_path_and_tries_next( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +) -> None: + """Ensure that a non-existent or invalid config file is ignored and the next source tried.""" + monkeypatch.setenv("LOG_CONFIG_FILE", str(tmp_path / "nonexistent.json")) + with caplog.at_level(logging.WARNING, logger=MODULE_LOGGER_NAME): + config = _load_logging_config() + assert config["disable_existing_loggers"] is False + assert any("nonexistent.json" in m for m in caplog.messages) + + +@pytest.mark.usefixtures("config_in_cwd") +def test_load_logging_config_skips_invalid_json_and_tries_next( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +) -> None: + """Ensure that a non-existent or invalid config file is ignored and the next source tried.""" + bad_path = tmp_path / "bad_config.json" + bad_path.write_text("this is not json {{{") + monkeypatch.delenv("LOG_CONFIG_FILE", raising=False) + with caplog.at_level(logging.WARNING, logger=MODULE_LOGGER_NAME): + config = _load_logging_config(config_file=bad_path) + assert config["disable_existing_loggers"] is False + assert any("bad_config.json" in m for m in caplog.messages) + + +def test_load_logging_config_overrides_disable_existing_loggers_when_true_in_file( + tmp_path: Path, caplog: pytest.LogCaptureFixture, request: pytest.FixtureRequest +) -> None: + """If a config file sets disable_existing_loggers to True, _load_logging_config should override it.""" + path = _write_config( + tmp_path / "bad.json", request.node.originalname, {**VALID_JSON_CONFIG, "disable_existing_loggers": True} + ) + with caplog.at_level(logging.WARNING, logger=MODULE_LOGGER_NAME): + config = _load_logging_config(config_file=path) + assert config["disable_existing_loggers"] is False + assert any( + "sets disable_existing_loggers to True. 
Overriding to prevent existing loggers being silently disabled" in m + for m in caplog.messages + ) + + +@pytest.mark.usefixtures("clean_env") +def test_init_logger_accepts_explicit_config_file(config_at_explicit_path: Path) -> None: + """init_logger should accept a config_file argument and pass it through to _load_logging_config without error.""" + logger = init_logger(config_file=config_at_explicit_path) + assert isinstance(logger, logging.Logger) + + +@pytest.mark.usefixtures("empty_cwd") +def test_init_logger_uses_log_config_file_env_var( + config_at_explicit_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """When LOG_CONFIG_FILE is set in the environment and no config_file argument is passed, init_logger should load config from that path.""" + monkeypatch.setenv("LOG_CONFIG_FILE", str(config_at_explicit_path)) + monkeypatch.delenv("LOG_LEVEL", raising=False) + logger = init_logger() + assert isinstance(logger, logging.Logger) + + +def test_init_logger_happy_path(cdm_logger: logging.Logger) -> None: + """init_logger should return a Logger with the CDM default name.""" + assert isinstance(cdm_logger, logging.Logger) + assert cdm_logger.name == DEFAULT_LOGGER_NAME + # calling init_logger more than once returns the same object + assert init_logger() is init_logger() + assert cdm_logger.level == logging.INFO + + +@pytest.mark.parametrize("level", ["DEBUG", "Info", "warning"]) +@pytest.mark.usefixtures("clean_env", "empty_cwd") +def test_init_logger_explicit_level_argument(level: str) -> None: + """Ensure that the log level can be set explicitly.""" + logger = init_logger(log_level=level) + assert logger.level == getattr(logging, level.upper()) + + +@pytest.mark.usefixtures("clean_env", "empty_cwd") +def test_init_logger_env_var_sets_level(monkeypatch: pytest.MonkeyPatch) -> None: + """When LOG_LEVEL is set in the environment, init_logger should apply it to the logger level.""" + monkeypatch.setenv("LOG_LEVEL", "ERROR") + assert init_logger().level == logging.ERROR + + +@pytest.mark.usefixtures("clean_env", "empty_cwd") +def test_init_logger_argument_takes_priority_over_env_var(monkeypatch: pytest.MonkeyPatch) -> None: + """An explicit log_level argument should take precedence over the LOG_LEVEL env var when both are set.""" + monkeypatch.setenv("LOG_LEVEL", "ERROR") + assert init_logger(log_level="DEBUG").level == logging.DEBUG + + +def test_init_logger_log_level_gates_emission_correctly( + cdm_logger: logging.Logger, caplog: pytest.LogCaptureFixture +) -> None: + """Messages at or above the configured level should be captured; messages below it should be suppressed.""" + with caplog.at_level(logging.INFO, logger=DEFAULT_LOGGER_NAME): + cdm_logger.info("should appear") + cdm_logger.debug("should not appear") + + assert "should appear" in caplog.messages + assert "should not appear" not in caplog.messages + + +# console +def test_init_logger_console_handler_configured_correctly(cdm_logger: logging.Logger) -> None: + """The logger should have a StreamHandler at INFO level by default. + + No RotatingFileHandler should be attached unless explicitly requested. 
+ """ + stream_handlers = [h for h in cdm_logger.handlers if type(h) is logging.StreamHandler] + assert len(stream_handlers) == 1 + assert stream_handlers[0].level == logging.INFO + rotating_handlers = [h for h in cdm_logger.handlers if isinstance(h, logging.handlers.RotatingFileHandler)] + assert rotating_handlers == [] + + +@pytest.mark.usefixtures("clean_env", "empty_cwd") +def test_get_cdm_logger_creates_cdm_logger_when_none_exists() -> None: + """When neither a dlt nor a CDM logger exists, get_cdm_logger should initialise and return a new CDM logger.""" + logger = get_cdm_logger() + assert isinstance(logger, logging.Logger) + assert logger.name == DEFAULT_LOGGER_NAME + assert DEFAULT_LOGGER_NAME in logging.root.manager.loggerDict + # get_cdm_logger should returns the existing logger rather than creating a new one. + assert get_cdm_logger() is init_logger() + + +@pytest.mark.usefixtures("clean_env", "empty_cwd", "dlt_logger") +def test_get_cdm_logger_prefers_dlt_over_cdm_logger() -> None: + """When a dlt logger is present in the logging manager, get_cdm_logger should return it even if a CDM logger also exists.""" + init_logger() + assert get_cdm_logger().name == "dlt" + + +@pytest.mark.parametrize("level", ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]) +def test_set_level_safe_accepts_all_valid_levels(cdm_logger: logging.Logger, level: str) -> None: + """_set_level_safe should accept all standard logging level strings.""" + _set_level_safe(cdm_logger, level) + assert cdm_logger.level == getattr(logging, level) + + +@pytest.mark.parametrize("level", ["debug", "Info", "wARNING"]) +def test_set_level_safe_is_case_insensitive(cdm_logger: logging.Logger, level: str) -> None: + """Ensure that _set_level_safe normalises case issues.""" + _set_level_safe(cdm_logger, level) + assert cdm_logger.level == getattr(logging, level.upper()) + + +def test_set_level_safe_raises_descriptive_error_for_invalid_level(cdm_logger: logging.Logger) -> None: + """_set_level_safe should raise a ValueError for an unrecognised level string. + + The error message should echo back the bad value and list the valid options. + """ + with pytest.raises(ValueError, match=r"Invalid log level 'VERBOS'\. 
Must be one of"): + _set_level_safe(cdm_logger, "VERBOS") + + +# file handler +def test_attach_file_handler_adds_correctly_configured_handler(cdm_logger: logging.Logger, tmp_path: Path) -> None: + """_attach_file_handler should add a single RotatingFileHandler with a log file at the specified path.""" + log_path = tmp_path / LOG_FILENAME + _attach_file_handler(cdm_logger, log_path) + + rotating_handlers = [h for h in cdm_logger.handlers if isinstance(h, logging.handlers.RotatingFileHandler)] + assert len(rotating_handlers) == 1 + + handler = rotating_handlers[0] + assert handler.maxBytes == MAX_LOG_FILE_SIZE + assert handler.backupCount == MAX_LOG_BACKUPS + assert handler.formatter._fmt == JSON_LOG_CONFIG # noqa: SLF001 + + assert log_path.exists() + + # try attaching a second handler + _attach_file_handler(cdm_logger, log_path) + + assert len([h for h in cdm_logger.handlers if isinstance(h, logging.handlers.RotatingFileHandler)]) == 1 + + +def test_attach_file_handler_creates_missing_parent_directory(cdm_logger: logging.Logger, tmp_path: Path) -> None: + """_attach_file_handler should create the parent directory of the log file path if it does not exist.""" + log_path = tmp_path / "nested" / "dirs" / LOG_FILENAME + assert not log_path.parent.exists() + _attach_file_handler(cdm_logger, log_path) + assert log_path.parent.exists() + assert log_path.exists() + + +def test_attach_file_handler_detects_any_file_handler_type(cdm_logger: logging.Logger, tmp_path: Path) -> None: + """_attach_file_handler should not add a second file handler.""" + existing = logging.FileHandler(tmp_path / "other.log") + cdm_logger.addHandler(existing) + + _attach_file_handler(cdm_logger, tmp_path / LOG_FILENAME) + + file_handlers = [h for h in cdm_logger.handlers if "FileHandler" in type(h).__name__] + assert len(file_handlers) == 1 + + +@pytest.mark.parametrize("env_value", ["true", "TRUE", "True"]) +@pytest.mark.usefixtures("clean_env") +def test_init_logger_file_handler_added_when_requested( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, env_value: str +) -> None: + """A file handler should be attached when enable_file_logging=True or ENABLE_FILE_LOGGING env var is set.""" + log_path = tmp_path / LOG_FILENAME + + logger_arg = init_logger(enable_file_logging=True, log_file=log_path) + assert any("FileHandler" in type(h).__name__ for h in logger_arg.handlers) + + logger_arg.handlers.clear() + logging.root.manager.loggerDict.pop(DEFAULT_LOGGER_NAME, None) + + monkeypatch.setenv("ENABLE_FILE_LOGGING", env_value) + logger_env = init_logger(log_file=log_path) + assert any("FileHandler" in type(h).__name__ for h in logger_env.handlers) diff --git a/tests/utils/test_s3.py b/tests/utils/test_s3.py index 7d6398d2..0effb71a 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -1,17 +1,20 @@ """Tests for s3_utils.py using moto to mock AWS S3.""" import functools +import io from collections.abc import Callable, Generator from pathlib import Path from typing import Any -from unittest.mock import patch +from unittest.mock import MagicMock, patch import boto3 -import botocore import pytest +from botocore.exceptions import ClientError from moto import mock_aws +from requests.exceptions import ConnectionError as ConnError +from requests.exceptions import HTTPError -import cdm_data_loaders.utils.s3 as s3_utils # adjust to match your module name +import cdm_data_loaders.utils.s3 as s3_utils from cdm_data_loaders.utils.s3 import ( CDM_LAKE_BUCKET, DEFAULT_EXTRA_ARGS, @@ -19,10 +22,12 @@ delete_object, 
download_file, get_s3_client, + head_object, list_matching_objects, object_exists, reset_s3_client, split_s3_path, + stream_to_s3, upload_dir, upload_file, ) @@ -184,6 +189,13 @@ def test_get_s3_client_returns_same_instance() -> None: assert s3_utils._s3_client is None # noqa: SLF001 +@pytest.mark.skip("TODO: add test(s)") +@pytest.mark.s3 +def test_get_s3_client_populates_from_environment() -> None: + """Ensure that get_s3_client picks up its configuration from the environment.""" + # TODO: set up the environment and assert on the resulting client + + # split_s3_path PATH = "path" @@ -324,13 +336,20 @@ # object_exists @pytest.mark.parametrize("protocol", ["", "s3://", "s3a://"]) @pytest.mark.s3 -def test_object_exists_returns_true_when_present(mock_s3_client: Any, protocol: str) -> None: +def test_head_object_and_object_exists_true_and_false(mock_s3_client: Any, protocol: str) -> None: """Verify that object_exists returns True for an object that exists in the bucket.""" populate_mock_s3(mock_s3_client, FILES_IN_BUCKETS) for bucket, file_list in FILES_IN_BUCKETS.items(): for f in file_list: + output = head_object(f"{protocol}{bucket}/{f}") + assert output.get("ResponseMetadata", {}).get("HTTPStatusCode") == 200 assert object_exists(f"{protocol}{bucket}/{f}") is True + nonexistent_file = f"{protocol}{bucket}/a-file-i-just-made-up.txt" + assert object_exists(nonexistent_file) is False + with pytest.raises(ClientError, match=r"An error occurred \(404\) when calling the HeadObject operation"): + head_object(nonexistent_file) + @pytest.mark.parametrize("s3_path", ["absent", "dir_one", "dir_one/", "dir_one/file1.tnt"]) @pytest.mark.parametrize("bucket", BUCKETS) @@ -394,6 +413,128 @@ def test_upload_file_error(sample_file: Path) -> None: upload_file(sample_file, "") + +# TODO: Missing tests +# - Upload failure (S3 error) - returns False + + +def make_mock_requests( + content: bytes = b"hello world", + status_code: int = 200, + content_type: str = "application/octet-stream", +) -> tuple[MagicMock, MagicMock]: + """Build a mock requests module whose .get() returns a mock response.""" + mock_response = MagicMock() + mock_response.status_code = status_code + mock_response.raw = io.BytesIO(content) + mock_response.raw.decode_content = True + mock_response.headers = { + "content-type": content_type, + } + mock_response.raise_for_status = MagicMock() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + + mock_requests = MagicMock() + mock_requests.get.return_value = mock_response + + return mock_requests, mock_response + + +UPLOAD_TEST_KEY = "uploads/test-file.pdf" +UPLOAD_BUCKET_KEY = f"{ALT_BUCKET}/{UPLOAD_TEST_KEY}" +TEST_URL = "https://example.com/test-file.pdf" + + +def test_stream_to_s3_happy_path(mock_s3_client: Any) -> None: + """File content from the HTTP response is stored correctly in S3.""" + content = b"hello world" + mock_requests, _ = make_mock_requests(content=content) + + saved_path = stream_to_s3(TEST_URL, UPLOAD_BUCKET_KEY, mock_requests) + + mock_requests.get.assert_called_once_with(TEST_URL, stream=True) + + # s3 path including bucket returned + assert saved_path == UPLOAD_BUCKET_KEY + + result = mock_s3_client.get_object(Bucket=ALT_BUCKET, Key=UPLOAD_TEST_KEY) + # check the content is correct + assert result["Body"].read() == content + + # new file shows up in list_objects + objects = mock_s3_client.list_objects_v2(Bucket=ALT_BUCKET)["Contents"] + keys = [obj["Key"] for obj in objects] + assert UPLOAD_TEST_KEY in keys + + +@pytest.mark.parametrize("content_type", [None, "application/json", "application/pdf", "text"]) +def
test_stream_to_s3_sets_content_type_from_response_headers(mock_s3_client: Any, content_type: str | None) -> None: + """ContentType metadata on the S3 object matches the HTTP response header.""" + content_type_args = {} + if content_type: + content_type_args["content_type"] = content_type + mock_requests, _ = make_mock_requests(**content_type_args) + + stream_to_s3(TEST_URL, UPLOAD_BUCKET_KEY, mock_requests) + + head = mock_s3_client.head_object(Bucket=ALT_BUCKET, Key=UPLOAD_TEST_KEY) + assert head["ContentType"] == (content_type or "application/octet-stream") + + +def test_stream_to_s3_raises_on_http_error_status(mock_s3_client: Any) -> None: + """An HTTP error status causes raise_for_status() to propagate an exception.""" + mock_requests, mock_response = make_mock_requests(status_code=404) + mock_response.raise_for_status.side_effect = HTTPError("404 Not Found") + + with ( + pytest.raises(HTTPError, match="404 Not Found"), + ): + stream_to_s3(TEST_URL, UPLOAD_BUCKET_KEY, mock_requests) + + with pytest.raises(ClientError, match="Not Found"): + mock_s3_client.head_object(Bucket=ALT_BUCKET, Key=UPLOAD_TEST_KEY) + + +def test_stream_to_s3_raises_on_connection_error(mock_s3_client: Any) -> None: + """A network-level failure raises a ConnectionError.""" + mock_requests, _ = make_mock_requests(status_code=404) + mock_requests.get.side_effect = ConnError("Network unreachable") + + with pytest.raises(ConnError, match="Network unreachable"): + stream_to_s3(TEST_URL, UPLOAD_BUCKET_KEY, mock_requests) + + with pytest.raises(ClientError, match="Not Found"): + mock_s3_client.head_object(Bucket=ALT_BUCKET, Key=UPLOAD_TEST_KEY) + + +# FIXME: don't upload if there is nothing there? +def test_stream_to_s3_uploads_empty_file(mock_s3_client: Any) -> None: + """An empty HTTP response body results in an empty S3 object.""" + mock_requests, _ = make_mock_requests(content=b"") + + stream_to_s3(TEST_URL, UPLOAD_BUCKET_KEY, mock_requests) + + result = mock_s3_client.get_object(Bucket=ALT_BUCKET, Key=UPLOAD_TEST_KEY) + assert result["Body"].read() == b"" + + +def test_stream_to_s3_uploads_large_file(mock_s3_client: Any) -> None: + """A large payload (>5MB) is uploaded correctly via multipart.""" + content = b"x" * (6 * 1024 * 1024) # 6 MB + mock_requests, _ = make_mock_requests(content=content) + + stream_to_s3(TEST_URL, UPLOAD_BUCKET_KEY, mock_requests) + + result = mock_s3_client.get_object(Bucket=ALT_BUCKET, Key=UPLOAD_TEST_KEY) + assert result["Body"].read() == content + + +@pytest.mark.skip("TODO: add test(s)") +def test_accepts_custom_requests_implementation() -> None: + """A subclassed or alternate requests module works as a drop-in.""" + # TODO: add test here? (a hedged sketch follows below)
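+
+
+# A minimal sketch of what such a test might look like, offered as an illustration only:
+# it assumes "drop-in" means any object exposing a requests-compatible `get(url, stream=True)`
+# that returns the same kind of context-manager response built by make_mock_requests above.
+# The MinimalRequests class and test name below are hypothetical, not existing project code.
+class MinimalRequests:
+    """Duck-typed stand-in for the requests module."""
+
+    def get(self, url: str, *, stream: bool = False) -> MagicMock:  # noqa: ARG002
+        # hand back a canned mock response; url and stream are accepted but unused
+        _, response = make_mock_requests(content=b"payload")
+        return response
+
+
+def test_stream_to_s3_accepts_duck_typed_requests(mock_s3_client: Any) -> None:
+    """stream_to_s3 should only require a working .get(), so a minimal stand-in suffices."""
+    stream_to_s3(TEST_URL, UPLOAD_BUCKET_KEY, MinimalRequests())
+    result = mock_s3_client.get_object(Bucket=ALT_BUCKET, Key=UPLOAD_TEST_KEY)
+    assert result["Body"].read() == b"payload"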
+ + @pytest.mark.parametrize("bucket", BUCKETS) @pytest.mark.parametrize("protocol", ["", "s3://", "s3a://"]) @pytest.mark.s3 @@ -475,14 +616,15 @@ def test_download_file_does_not_clobber_existing_file_to_mkdir(mock_s3_client: A @pytest.mark.s3 -def test_download_file_does_not_exist(mock_s3_client: Any, tmp_path: Path, capsys: pytest.CaptureFixture) -> None: +@pytest.mark.usefixtures("mock_s3_client") +def test_download_file_does_not_exist(tmp_path: Path, capsys: pytest.CaptureFixture) -> None: """Ensure that attempting to download a file that does not exist raises an error.""" bucket = BUCKETS[0] key = "to/the/door.txt" assert not object_exists(f"{bucket}/{key}") with pytest.raises( - botocore.exceptions.ClientError, + ClientError, match=r"An error occurred \(404\) when calling the HeadObject", ): download_file(f"{bucket}/{key}", tmp_path / "file.txt") @@ -490,6 +632,12 @@ assert "File not found" in capsys.readouterr().out + +# TODO: Missing tests +# - Non-404 S3 error during head +# - Error during directory creation (other than FileExistsError)? +# - version_id parameter behavior + + # upload_dir @pytest.mark.parametrize("bucket", [CDM_LAKE_BUCKET, ALT_BUCKET]) @pytest.mark.s3 @@ -577,6 +725,26 @@ def test_copy_file(mocked_s3_client_no_checksum: Any, destination: str) -> None: assert response["ResponseMetadata"]["HTTPStatusCode"] == 200 + +@pytest.mark.s3 +@pytest.mark.usefixtures("mock_s3_client") +def test_copy_file_source_object_nonexistent() -> None: + """Ensure that the code throws an error if the source object does not exist.""" + s3_path = f"{CDM_LAKE_BUCKET}/some/path/to/file" + assert object_exists(s3_path) is False + with pytest.raises(Exception, match="The specified key does not exist"): + copy_object(s3_path, f"{CDM_LAKE_BUCKET}/a/different/path/to/file") + + +@pytest.mark.s3 +@pytest.mark.usefixtures("mock_s3_client") +def test_copy_file_source_bucket_nonexistent() -> None: + """Ensure that the code throws an error if the bucket does not exist.""" + s3_path = "some-bucket/some/path/to/file" + assert object_exists(s3_path) is False + with pytest.raises(Exception, match="The specified bucket does not exist"): + copy_object(s3_path, f"{CDM_LAKE_BUCKET}/a/different/path/to/file") + + # delete_object @pytest.mark.parametrize("bucket", BUCKETS) @pytest.mark.parametrize("protocol", ["", "s3://", "s3a://"]) @@ -595,3 +763,14 @@ def test_delete_object_removes_object(mock_s3_client: Any, bucket: str, protocol resp = delete_object(s3_path) assert object_exists(s3_path) is False assert resp.get("ResponseMetadata", {}).get("HTTPStatusCode") == 204 + + +# delete_object - bucket does not exist +@pytest.mark.s3 +@pytest.mark.usefixtures("mock_s3_client") +def test_delete_object_no_such_bucket() -> None: + """Ensure that delete_object raises an error if the bucket does not exist.""" + s3_path = "fake-bucket/to/delete.txt" + assert object_exists(s3_path) is False + with pytest.raises(Exception, match="The specified bucket does not exist"): + delete_object(s3_path) diff --git a/uv.lock b/uv.lock index 243a2da8..5e485d34 100644 --- a/uv.lock +++ b/uv.lock @@ -393,7 +393,7 @@ [[package]] name = "cdm-data-loaders" -version = "0.1.7" +version = "0.1.8" source = { editable = "."
} dependencies = [ { name = "bioregistry" }, @@ -440,7 +440,7 @@ requires-dist = [ { name = "dlt", extras = ["deltalake", "duckdb", "filesystem", "parquet"], specifier = ">=1.22.2" }, { name = "frictionless", extras = ["aws"], specifier = ">=5.18.1" }, { name = "frozendict", specifier = ">=2.4.7" }, - { name = "lxml", specifier = ">=6.0.2" }, + { name = "lxml", specifier = ">=6.1.0" }, { name = "pydantic", specifier = ">=2.12.5" }, { name = "pydantic-settings", specifier = ">=2.12.0" }, { name = "tqdm", specifier = ">=4.67.3" }, @@ -1725,64 +1725,64 @@ wheels = [ [[package]] name = "lxml" -version = "6.0.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ce/08/1217ca4043f55c3c92993b283a7dbfa456a2058d8b57bbb416cc96b6efff/lxml-6.0.4.tar.gz", hash = "sha256:4137516be2a90775f99d8ef80ec0283f8d78b5d8bd4630ff20163b72e7e9abf2", size = 4237780, upload-time = "2026-04-12T16:28:24.182Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/78/f6/550a1ed9afde66e24bfcf9892446ea9779152df336062c6df0f7733151a2/lxml-6.0.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ecc3d55ed756ee6c3447748862a97e1f5392d2c5d7f474bace9382345e4fc274", size = 8559522, upload-time = "2026-04-12T16:24:51.563Z" }, - { url = "https://files.pythonhosted.org/packages/11/93/3f687c14d2b4d24b60fe13fd5482c8853f82a10bb87f2b577123e342ed1a/lxml-6.0.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a7d5a627a368a0e861350ccc567a70ec675d2bc4d8b3b54f48995ae78d8d530e", size = 4617380, upload-time = "2026-04-12T16:24:54.042Z" }, - { url = "https://files.pythonhosted.org/packages/b5/ed/91e443366063d3fb7640ae2badd5d7b65be4095ac6d849788e39c043baae/lxml-6.0.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d385141b186cc39ebe4863c1e41936282c65df19b2d06a701dedc2a898877d6a", size = 4922791, upload-time = "2026-04-12T16:24:56.381Z" }, - { url = "https://files.pythonhosted.org/packages/30/4b/2243260b70974aca9ba0cc71bd668c0c3a79644d80ddcabbfbdb4b131848/lxml-6.0.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0132bb040e9bb5a199302e12bf942741defbc52922a2a06ce9ff7be0d0046483", size = 5080972, upload-time = "2026-04-12T16:24:58.823Z" }, - { url = "https://files.pythonhosted.org/packages/f8/c3/54c53c4f772341bc12331557f8b0882a426f53133926306cbe6d7f0ee7e4/lxml-6.0.4-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:26aee5321e4aa1f07c9090a35f6ab8b703903fb415c6c823cfdb20ee0d779855", size = 4992236, upload-time = "2026-04-12T16:25:01.099Z" }, - { url = "https://files.pythonhosted.org/packages/be/0f/416de42e22f287585abee610eb0d1c2638c9fe24cee7e15136e0b5e138f8/lxml-6.0.4-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b5652455de198ff76e02cfa57d5efc5f834fa45521aaf3fcc13d6b5a88bde23d", size = 5612398, upload-time = "2026-04-12T16:25:03.517Z" }, - { url = "https://files.pythonhosted.org/packages/7d/63/29a3fa79b8a182f5bd5b5bdcb6f625f49f08f41d60a26ca25482820a1b99/lxml-6.0.4-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:75842801fb48aea73f4c281b923a010dfb39bad75edf8ceb2198ec30c27f01cc", size = 5227480, upload-time = "2026-04-12T16:25:06.119Z" }, - { url = "https://files.pythonhosted.org/packages/7c/4a/44d1843de599b1c6dbe578e4248c2f15e7fac90c5c86eb26775eaeac0fe0/lxml-6.0.4-cp313-cp313-manylinux_2_28_i686.whl", hash = "sha256:94a1f74607a5a049ff6ff8de429fec922e643e32b5b08ec7a4fe49e8de76e17c", size = 5341001, upload-time = 
"2026-04-12T16:25:08.563Z" }, - { url = "https://files.pythonhosted.org/packages/0d/52/c8aebde49f169e4e3452e7756be35be1cb2903e30d961cb57aa65a27055f/lxml-6.0.4-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:173cc246d3d3b6d3b6491f0b3aaf22ebdf2eed616879482acad8bd84d73eb231", size = 4699105, upload-time = "2026-04-12T16:25:10.757Z" }, - { url = "https://files.pythonhosted.org/packages/78/60/76fc3735c31c28b70220d99452fb72052e84b618693ca2524da96f0131d8/lxml-6.0.4-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f0f2ee1be1b72e9890da87e4e422f2f703ff4638fd5ec5383055db431e8e30e9", size = 5231095, upload-time = "2026-04-12T16:25:13.305Z" }, - { url = "https://files.pythonhosted.org/packages/e5/60/448f01c52110102f23df5f07b3f4fde57c8e13e497e182a743d125324c0b/lxml-6.0.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c51a274b7e8b9ce394c3f8b471eb0b23c1914eec64fdccf674e082daf72abf11", size = 5042411, upload-time = "2026-04-12T16:25:15.541Z" }, - { url = "https://files.pythonhosted.org/packages/4a/2a/90612a001fa4fa0ff0443ebb0256a542670fe35473734c559720293e7aff/lxml-6.0.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:210ea934cba1a1ec42f88c4190c4d5c67b2d14321a8faed9b39e8378198ff99d", size = 4768431, upload-time = "2026-04-12T16:25:17.581Z" }, - { url = "https://files.pythonhosted.org/packages/84/d8/572845a7d741c8a8ffeaf928185263e14d97fbd355de164677340951d7a5/lxml-6.0.4-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:14fe654a59eebe16368c51778caeb0c8fda6f897adcd9afe828d87d13b5d5e51", size = 5634972, upload-time = "2026-04-12T16:25:20.111Z" }, - { url = "https://files.pythonhosted.org/packages/d7/1d/392b8c9f8cf1d502bbec50dee137c7af3dd5def5e5cd84572fbf0ba0541c/lxml-6.0.4-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:ec160a2b7e2b3cb71ec35010b19a1adea05785d19ba5c9c5f986b64b78fef564", size = 5222909, upload-time = "2026-04-12T16:25:22.243Z" }, - { url = "https://files.pythonhosted.org/packages/21/ab/949fc96f825cf083612aee65d5a02eacc5eaeb2815561220e33e1e160677/lxml-6.0.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d305b86ef10b23cf3a6d62a2ad23fa296f76495183ee623f64d2600f65ffe09c", size = 5249096, upload-time = "2026-04-12T16:25:24.781Z" }, - { url = "https://files.pythonhosted.org/packages/56/e8/fbe44df79ede5ff760401cc3c49c4204f49f0f529cc6b27d0af7b63f5472/lxml-6.0.4-cp313-cp313-win32.whl", hash = "sha256:a2f31380aa9a9b52591e79f1c1d3ac907688fbeb9d883ba28be70f2eb5db2277", size = 3595808, upload-time = "2026-04-12T16:25:26.747Z" }, - { url = "https://files.pythonhosted.org/packages/f8/df/e873abb881092256520edf0d67d686e36f3c86b3cf289f01b6458272dede/lxml-6.0.4-cp313-cp313-win_amd64.whl", hash = "sha256:b8efa9f681f15043e497293d58a4a63199564b253ed2291887d92bb3f74f59ab", size = 3994635, upload-time = "2026-04-12T16:25:28.828Z" }, - { url = "https://files.pythonhosted.org/packages/23/a8/9c56c8914b9b18d89face5a7472445002baf309167f7af65d988842129fd/lxml-6.0.4-cp313-cp313-win_arm64.whl", hash = "sha256:905abe6a5888129be18f85f2aea51f0c9863fa0722fb8530dfbb687d2841d221", size = 3657374, upload-time = "2026-04-12T16:25:30.901Z" }, - { url = "https://files.pythonhosted.org/packages/10/18/36e28a809c509a67496202771f545219ac5a2f1cd61aae325991fcf5ab91/lxml-6.0.4-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:569d3b18340863f603582d2124e742a68e85755eff5e47c26a55e298521e3a01", size = 8575045, upload-time = "2026-04-12T16:25:33.57Z" }, - { url = 
"https://files.pythonhosted.org/packages/11/38/a168c820e3b08d3b4fa0f4e6b53b3930086b36cc11e428106d38c36778cd/lxml-6.0.4-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:3b6245ee5241342d45e1a54a4a8bc52ef322333ada74f24aa335c4ab36f20161", size = 4622963, upload-time = "2026-04-12T16:25:36.818Z" }, - { url = "https://files.pythonhosted.org/packages/53/e0/2c9d6abdd82358cea3c0d8d6ca272a6af0f38156abce7827efb6d5b62d17/lxml-6.0.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:79a1173ba3213a3693889a435417d4e9f3c07d96e30dc7cc3a712ed7361015fe", size = 4948832, upload-time = "2026-04-12T16:25:39.104Z" }, - { url = "https://files.pythonhosted.org/packages/96/d7/f2202852e91d7baf3a317f4523a9c14834145301e5b0f2e80c01c4bfbd49/lxml-6.0.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dc18bb975666b443ba23aedd2fcf57e9d0d97546b52a1de97a447c4061ba4110", size = 5085865, upload-time = "2026-04-12T16:25:41.226Z" }, - { url = "https://files.pythonhosted.org/packages/09/57/abee549324496e92708f71391c6060a164d3c95369656a1a15e9f20d8162/lxml-6.0.4-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2079f5dc83291ac190a52f8354b78648f221ecac19fb2972a2d056b555824de7", size = 5030001, upload-time = "2026-04-12T16:25:43.695Z" }, - { url = "https://files.pythonhosted.org/packages/c2/f8/432da7178c5917a16468af6c5da68fef7cf3357d4bd0e6f50272ec9a59b5/lxml-6.0.4-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3eda02da4ca16e9ca22bbe5654470c17fa1abcd967a52e4c2e50ff278221e351", size = 5646303, upload-time = "2026-04-12T16:25:46.577Z" }, - { url = "https://files.pythonhosted.org/packages/82/f9/e1c04ef667a6bf9c9dbd3bf04c50fa51d7ee25b258485bb748b27eb9a1c7/lxml-6.0.4-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c3787cdc3832b70e21ac2efafea2a82a8ccb5e85bec110dc68b26023e9d3caae", size = 5237940, upload-time = "2026-04-12T16:25:49.157Z" }, - { url = "https://files.pythonhosted.org/packages/d0/f0/cdea60d92df731725fc3c4f33e387b100f210acd45c92969e42d2ba993fa/lxml-6.0.4-cp314-cp314-manylinux_2_28_i686.whl", hash = "sha256:3f276d49c23103565d39440b9b3f4fc08fa22f5a96395ea4b4d4fea4458b1505", size = 5350050, upload-time = "2026-04-12T16:25:52.027Z" }, - { url = "https://files.pythonhosted.org/packages/2e/15/bf52c7a70b6081bb9e00d37cc90fcf60aa84468d9d173ad2fade38ec34c5/lxml-6.0.4-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:fdfdad73736402375b11b3a137e48cd09634177516baf5fc0bd80d1ca85f3cda", size = 4696409, upload-time = "2026-04-12T16:25:55.141Z" }, - { url = "https://files.pythonhosted.org/packages/c5/69/9bade267332cc06f9a9aa773b5a11bdfb249af485df9e142993009ea1fc4/lxml-6.0.4-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:75912421456946931daba0ec3cedfa824c756585d05bde97813a17992bfbd013", size = 5249072, upload-time = "2026-04-12T16:25:57.362Z" }, - { url = "https://files.pythonhosted.org/packages/14/ca/043bcacb096d6ed291cbbc58724e9625a453069d6edeb840b0bf18038d05/lxml-6.0.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:48cd5a88da67233fd82f2920db344503c2818255217cd6ea462c9bb8254ba7cb", size = 5083779, upload-time = "2026-04-12T16:26:00.018Z" }, - { url = "https://files.pythonhosted.org/packages/04/89/f5fb18d76985969e84af13682e489acabee399bb54738a363925ea6e7390/lxml-6.0.4-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:87af86a8fa55b9ff1e6ee4233d762296f2ce641ba948af783fb995c5a8a3371b", size = 4736953, upload-time = "2026-04-12T16:26:02.289Z" }, - { url = 
"https://files.pythonhosted.org/packages/84/ba/d1d7284bb4ba951f188c3fc0455943c1fcbd1c33d1324d6d57b7d4a45be6/lxml-6.0.4-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:a743714cd656ba7ccb29d199783906064c7b5ba3c0e2a79f0244ea0badc6a98c", size = 5669605, upload-time = "2026-04-12T16:26:04.694Z" }, - { url = "https://files.pythonhosted.org/packages/72/05/1463e55f2de27bb60feddc894dd7c0833bd501f8861392ed416291b38db5/lxml-6.0.4-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:e31c76bd066fb4f81d9a32e5843bffdf939ab27afb1ffc1c924e749bfbdb00e3", size = 5236886, upload-time = "2026-04-12T16:26:07.659Z" }, - { url = "https://files.pythonhosted.org/packages/fe/fb/0b6ee9194ce3ac49db4cadaa8a9158f04779fc768b6c27c4e2945d71a99d/lxml-6.0.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:f185fd6e7d550e9917d7103dccf51be589aba953e15994fb04646c1730019685", size = 5263382, upload-time = "2026-04-12T16:26:10.067Z" }, - { url = "https://files.pythonhosted.org/packages/9a/93/ec18a08e98dd82cac39f1d2511ee2bed5affb94d228356d8ef165a4ec3b9/lxml-6.0.4-cp314-cp314-win32.whl", hash = "sha256:774660028f8722a598400430d2746fb0075949f84a9a5cd9767d9152e3baaac5", size = 3656164, upload-time = "2026-04-12T16:26:59.568Z" }, - { url = "https://files.pythonhosted.org/packages/15/86/52507316abfc7150bf6bb191e39a12e301ee80334610a493884ae2f9d20d/lxml-6.0.4-cp314-cp314-win_amd64.whl", hash = "sha256:fbd7d14349413f5609c0b537b1a48117d6ccef1af37986af6b03766ad05bf43e", size = 4062512, upload-time = "2026-04-12T16:27:02.212Z" }, - { url = "https://files.pythonhosted.org/packages/f1/d5/09c593a2ef2234b8cd6cf059e2dc212e0654bf05c503f0ef2daf05adb680/lxml-6.0.4-cp314-cp314-win_arm64.whl", hash = "sha256:a61a01ec3fbfd5b73a69a7bf513271051fd6c5795d82fc5daa0255934cd8db3d", size = 3740745, upload-time = "2026-04-12T16:27:04.444Z" }, - { url = "https://files.pythonhosted.org/packages/4a/3c/42a98bf6693938bf7b285ec7f70ba2ae9d785d0e5b2cdb85d2ee29e287eb/lxml-6.0.4-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:504edb62df33cea502ea6e73847c647ba228623ca3f80a228be5723a70984dd5", size = 8826437, upload-time = "2026-04-12T16:26:12.911Z" }, - { url = "https://files.pythonhosted.org/packages/c2/c2/ad13f39b2db8709788aa2dcb6e90b81da76db3b5b2e7d35e0946cf984960/lxml-6.0.4-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:f01b7b0316d4c0926d49a7f003b2d30539f392b140a3374bb788bad180bc8478", size = 4734892, upload-time = "2026-04-12T16:26:15.871Z" }, - { url = "https://files.pythonhosted.org/packages/2c/6d/c559d7b5922c5b0380fc2cb5ac134b6a3f9d79d368347a624ee5d68b0816/lxml-6.0.4-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ab999933e662501efe4b16e6cfb7c9f9deca7d072cd1788b99c8defde78c0dfb", size = 4969173, upload-time = "2026-04-12T16:26:18.335Z" }, - { url = "https://files.pythonhosted.org/packages/c7/78/ca521e36157f38e3e1a29276855cdf48d213138fc0c8365693ff5c876ca7/lxml-6.0.4-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:67c3f084389fe75932c39b6869a377f6c8e21e818f31ae8a30c71dd2e59360e2", size = 5103134, upload-time = "2026-04-12T16:26:20.612Z" }, - { url = "https://files.pythonhosted.org/packages/28/a7/7d62d023bacaa0aaf60af8c0a77c6c05f84327396d755f3aa64b788678a9/lxml-6.0.4-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:377ea1d654f76ed6205c87d14920f829c9f4d31df83374d3cbcbdaae804d37b2", size = 5027205, upload-time = "2026-04-12T16:26:22.981Z" }, - { url = 
"https://files.pythonhosted.org/packages/34/be/51b194b81684f2e85e5d992771c45d70cb22ac6f7291ac6bc7b255830afe/lxml-6.0.4-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e60cd0bcacbfd1a96d63516b622183fb2e3f202300df9eb5533391a8a939dbfa", size = 5594461, upload-time = "2026-04-12T16:26:25.316Z" }, - { url = "https://files.pythonhosted.org/packages/39/24/8850f38fbf89dd072ff31ba22f9e40347aeada7cadf710ecb04b8d9f32d4/lxml-6.0.4-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6e9e30fd63d41dd0bbdb020af5cdfffd5d9b554d907cb210f18e8fcdc8eac013", size = 5223378, upload-time = "2026-04-12T16:26:28.68Z" }, - { url = "https://files.pythonhosted.org/packages/2a/9b/595239ba8c719b0fdc7bc9ebdb7564459c9a6b24b8b363df4a02674aeece/lxml-6.0.4-cp314-cp314t-manylinux_2_28_i686.whl", hash = "sha256:1fb4a1606bb68c533002e7ed50d7e55e58f0ef1696330670281cb79d5ab2050d", size = 5311415, upload-time = "2026-04-12T16:26:31.513Z" }, - { url = "https://files.pythonhosted.org/packages/be/cb/aa27ac8d041acf34691577838494ad08df78e83fdfdb66948d2903e9291e/lxml-6.0.4-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:695c7708438e449d57f404db8cc1b769e77ad5b50655f32f8175686ba752f293", size = 4637953, upload-time = "2026-04-12T16:26:33.806Z" }, - { url = "https://files.pythonhosted.org/packages/f6/f2/f19114fd86825c2d1ce41cd99daad218d30cfdd2093d4de9273986fb4d68/lxml-6.0.4-cp314-cp314t-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d49c35ae1e35ee9b569892cf8f8f88db9524f28d66e9daee547a5ef9f3c5f468", size = 5231532, upload-time = "2026-04-12T16:26:36.518Z" }, - { url = "https://files.pythonhosted.org/packages/9a/0e/c3fa354039ec0b6b09f40fbe1129efc572ac6239faa4906de42d5ce87c0a/lxml-6.0.4-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5801072f8967625e6249d162065d0d6011ef8ce3d0efb8754496b5246b81a74b", size = 5083767, upload-time = "2026-04-12T16:26:39.332Z" }, - { url = "https://files.pythonhosted.org/packages/b3/4b/1a0dbb6d6ffae16e54a8a3796ded0ad2f9c3bc1ff3728bde33456f4e1d63/lxml-6.0.4-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:cbf768541526eba5ef1a49f991122e41b39781eafd0445a5a110fc09947a20b5", size = 4758079, upload-time = "2026-04-12T16:26:42.138Z" }, - { url = "https://files.pythonhosted.org/packages/a9/01/a246cf5f80f96766051de4b305d6552f80bdaefb37f04e019e42af0aba69/lxml-6.0.4-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:eecce87cc09233786fc31c230268183bf6375126cfec1c8b3673fcdc8767b560", size = 5618686, upload-time = "2026-04-12T16:26:44.507Z" }, - { url = "https://files.pythonhosted.org/packages/eb/1f/b072a92369039ebef11b0a654be5134fcf3ed04c0f437faf9435ac9ba845/lxml-6.0.4-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:07dce892881179e11053066faca2da17b0eeb0bb7298f11bcf842a86db207dbd", size = 5227259, upload-time = "2026-04-12T16:26:47.083Z" }, - { url = "https://files.pythonhosted.org/packages/d5/a0/dc97034f9d4c0c4d30875147d81fd2c0c7f3d261b109db36ed746bf8ab1d/lxml-6.0.4-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e4f97aee337b947e6699e5574c90d087d3e2ce517016241c07e7e98a28dca885", size = 5246190, upload-time = "2026-04-12T16:26:49.468Z" }, - { url = "https://files.pythonhosted.org/packages/f2/ef/85cb69835113583c2516fee07d0ffb4d824b557424b06ba5872c20ba6078/lxml-6.0.4-cp314-cp314t-win32.whl", hash = "sha256:064477c0d4c695aa1ea4b9c1c4ee9043ab740d12135b74c458cc658350adcd86", size = 3896005, upload-time = "2026-04-12T16:26:52.163Z" }, - { url = 
"https://files.pythonhosted.org/packages/3d/5e/2231f34cc54b8422b793593138d86d3fa4588fb2297d4ea0472390f25627/lxml-6.0.4-cp314-cp314t-win_amd64.whl", hash = "sha256:25bad2d8438f4ef5a7ad4a8d8bcaadde20c0daced8bdb56d46236b0a7d1cbdd0", size = 4391037, upload-time = "2026-04-12T16:26:54.398Z" }, - { url = "https://files.pythonhosted.org/packages/39/53/8ba3cd5984f8363635450c93f63e541a0721b362bb32ae0d8237d9674aee/lxml-6.0.4-cp314-cp314t-win_arm64.whl", hash = "sha256:1dcd9e6cb9b7df808ea33daebd1801f37a8f50e8c075013ed2a2343246727838", size = 3816184, upload-time = "2026-04-12T16:26:57.011Z" }, +version = "6.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/28/30/9abc9e34c657c33834eaf6cd02124c61bdf5944d802aa48e69be8da3585d/lxml-6.1.0.tar.gz", hash = "sha256:bfd57d8008c4965709a919c3e9a98f76c2c7cb319086b3d26858250620023b13", size = 4197006, upload-time = "2026-04-18T04:32:51.613Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/03/69347590f1cf4a6d5a4944bb6099e6d37f334784f16062234e1f892fdb1d/lxml-6.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a0092f2b107b69601adf562a57c956fbb596e05e3e6651cabd3054113b007e45", size = 8559689, upload-time = "2026-04-18T04:31:57.785Z" }, + { url = "https://files.pythonhosted.org/packages/3f/58/25e00bb40b185c974cfe156c110474d9a8a8390d5f7c92a4e328189bb60e/lxml-6.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fc7140d7a7386e6b545d41b7358f4d02b656d4053f5fa6859f92f4b9c2572c4d", size = 4617892, upload-time = "2026-04-18T04:32:01.78Z" }, + { url = "https://files.pythonhosted.org/packages/f5/54/92ad98a94ac318dc4f97aaac22ff8d1b94212b2ae8af5b6e9b354bf825f7/lxml-6.1.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:419c58fc92cc3a2c3fa5f78c63dbf5da70c1fa9c1b25f25727ecee89a96c7de2", size = 4923489, upload-time = "2026-04-18T04:33:31.401Z" }, + { url = "https://files.pythonhosted.org/packages/15/3b/a20aecfab42bdf4f9b390590d345857ad3ffd7c51988d1c89c53a0c73faf/lxml-6.1.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:37fabd1452852636cf38ecdcc9dd5ca4bba7a35d6c53fa09725deeb894a87491", size = 5082162, upload-time = "2026-04-18T04:33:34.262Z" }, + { url = "https://files.pythonhosted.org/packages/45/26/2cdb3d281ac1bd175603e290cbe4bad6eff127c0f8de90bafd6f8548f0fd/lxml-6.1.0-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a2853c8b2170cc6cd54a6b4d50d2c1a8a7aeca201f23804b4898525c7a152cfc", size = 4993247, upload-time = "2026-04-18T04:33:36.674Z" }, + { url = "https://files.pythonhosted.org/packages/f6/05/d735aef963740022a08185c84821f689fc903acb3d50326e6b1e9886cc22/lxml-6.1.0-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8e369cbd690e788c8d15e56222d91a09c6a417f49cbc543040cba0fe2e25a79e", size = 5613042, upload-time = "2026-04-18T04:33:39.205Z" }, + { url = "https://files.pythonhosted.org/packages/ee/b8/ead7c10efff731738c72e59ed6eb5791854879fbed7ae98781a12006263a/lxml-6.1.0-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e69aa6805905807186eb00e66c6d97a935c928275182eb02ee40ba00da9623b2", size = 5228304, upload-time = "2026-04-18T04:33:41.647Z" }, + { url = "https://files.pythonhosted.org/packages/6b/10/e9842d2ec322ea65f0a7270aa0315a53abed06058b88ef1b027f620e7a5f/lxml-6.1.0-cp313-cp313-manylinux_2_28_i686.whl", hash = "sha256:4bd1bdb8a9e0e2dd229de19b5f8aebac80e916921b4b2c6ef8a52bc131d0c1f9", size = 5341578, upload-time = "2026-04-18T04:33:44.596Z" 
}, + { url = "https://files.pythonhosted.org/packages/89/54/40d9403d7c2775fa7301d3ddd3464689bfe9ba71acc17dfff777071b4fdc/lxml-6.1.0-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:cbd7b79cdcb4986ad78a2662625882747f09db5e4cd7b2ae178a88c9c51b3dfe", size = 4700209, upload-time = "2026-04-18T04:33:47.552Z" }, + { url = "https://files.pythonhosted.org/packages/85/b2/bbdcc2cf45dfc7dfffef4fd97e5c47b15919b6a365247d95d6f684ef5e82/lxml-6.1.0-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:43e4d297f11080ec9d64a4b1ad7ac02b4484c9f0e2179d9c4ef78e886e747b88", size = 5232365, upload-time = "2026-04-18T04:33:50.249Z" }, + { url = "https://files.pythonhosted.org/packages/48/5a/b06875665e53aaba7127611a7bed3b7b9658e20b22bc2dd217a0b7ab0091/lxml-6.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cc16682cc987a3da00aa56a3aa3075b08edb10d9b1e476938cfdbee8f3b67181", size = 5043654, upload-time = "2026-04-18T04:33:52.71Z" }, + { url = "https://files.pythonhosted.org/packages/e9/9c/e71a069d09641c1a7abeb30e693f828c7c90a41cbe3d650b2d734d876f85/lxml-6.1.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:d6d8efe71429635f0559579092bb5e60560d7b9115ee38c4adbea35632e7fa24", size = 4769326, upload-time = "2026-04-18T04:33:55.244Z" }, + { url = "https://files.pythonhosted.org/packages/cc/06/7a9cd84b3d4ed79adf35f874750abb697dec0b4a81a836037b36e47c091a/lxml-6.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:7e39ab3a28af7784e206d8606ec0e4bcad0190f63a492bca95e94e5a4aef7f6e", size = 5635879, upload-time = "2026-04-18T04:33:58.509Z" }, + { url = "https://files.pythonhosted.org/packages/cc/f0/9d57916befc1e54c451712c7ee48e9e74e80ae4d03bdce49914e0aee42cd/lxml-6.1.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:9eb667bf50856c4a58145f8ca2d5e5be160191e79eb9e30855a476191b3c3495", size = 5224048, upload-time = "2026-04-18T04:34:00.943Z" }, + { url = "https://files.pythonhosted.org/packages/99/75/90c4eefda0c08c92221fe0753db2d6699a4c628f76ff4465ec20dea84cc1/lxml-6.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7f4a77d6f7edf9230cee3e1f7f6764722a41604ee5681844f18db9a81ea0ec33", size = 5250241, upload-time = "2026-04-18T04:34:03.365Z" }, + { url = "https://files.pythonhosted.org/packages/5e/73/16596f7e4e38fa33084b9ccbccc22a15f82a290a055126f2c1541236d2ff/lxml-6.1.0-cp313-cp313-win32.whl", hash = "sha256:28902146ffbe5222df411c5d19e5352490122e14447e98cd118907ee3fd6ee62", size = 3596938, upload-time = "2026-04-18T04:31:56.206Z" }, + { url = "https://files.pythonhosted.org/packages/8e/63/981401c5680c1eb30893f00a19641ac80db5d1e7086c62cb4b13ed813038/lxml-6.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:4a1503c56e4e2b38dc76f2f2da7bae69670c0f1933e27cfa34b2fa5876410b16", size = 3995728, upload-time = "2026-04-18T04:31:58.763Z" }, + { url = "https://files.pythonhosted.org/packages/e7/e8/c358a38ac3e541d16a1b527e4e9cb78c0419b0506a070ace11777e5e8404/lxml-6.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:e0af85773850417d994d019741239b901b22c6680206f46a34766926e466141d", size = 3658372, upload-time = "2026-04-18T04:32:03.629Z" }, + { url = "https://files.pythonhosted.org/packages/eb/45/cee4cf203ef0bab5c52afc118da61d6b460c928f2893d40023cfa27e0b80/lxml-6.1.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:ab863fd37458fed6456525f297d21239d987800c46e67da5ef04fc6b3dd93ac8", size = 8576713, upload-time = "2026-04-18T04:32:06.831Z" }, + { url = 
"https://files.pythonhosted.org/packages/8a/a7/eda05babeb7e046839204eaf254cd4d7c9130ce2bbf0d9e90ea41af5654d/lxml-6.1.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:6fd8b1df8254ff4fd93fd31da1fc15770bde23ac045be9bb1f87425702f61cc9", size = 4623874, upload-time = "2026-04-18T04:32:10.755Z" }, + { url = "https://files.pythonhosted.org/packages/e7/e9/db5846de9b436b91890a62f29d80cd849ea17948a49bf532d5278ee69a9e/lxml-6.1.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:47024feaae386a92a146af0d2aeed65229bf6fff738e6a11dda6b0015fb8fd03", size = 4949535, upload-time = "2026-04-18T04:34:06.657Z" }, + { url = "https://files.pythonhosted.org/packages/5a/ba/0d3593373dcae1d68f40dc3c41a5a92f2544e68115eb2f62319a4c2a6500/lxml-6.1.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3f00972f84450204cd5d93a5395965e348956aaceaadec693a22ec743f8ae3eb", size = 5086881, upload-time = "2026-04-18T04:34:09.556Z" }, + { url = "https://files.pythonhosted.org/packages/43/76/759a7484539ad1af0d125a9afe9c3fb5f82a8779fd1f5f56319d9e4ea2fd/lxml-6.1.0-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97faa0860e13b05b15a51fb4986421ef7a30f0b3334061c416e0981e9450ca4c", size = 5031305, upload-time = "2026-04-18T04:34:12.336Z" }, + { url = "https://files.pythonhosted.org/packages/dc/b9/c1f0daf981a11e47636126901fd4ab82429e18c57aeb0fc3ad2940b42d8b/lxml-6.1.0-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:972a6451204798675407beaad97b868d0c733d9a74dafefc63120b81b8c2de28", size = 5647522, upload-time = "2026-04-18T04:34:14.89Z" }, + { url = "https://files.pythonhosted.org/packages/31/e6/1f533dcd205275363d9ba3511bcec52fa2df86abf8abe6a5f2c599f0dc31/lxml-6.1.0-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fe022f20bc4569ec66b63b3fb275a3d628d9d32da6326b2982584104db6d3086", size = 5239310, upload-time = "2026-04-18T04:34:17.652Z" }, + { url = "https://files.pythonhosted.org/packages/c3/8c/4175fb709c78a6e315ed814ed33be3defd8b8721067e70419a6cf6f971da/lxml-6.1.0-cp314-cp314-manylinux_2_28_i686.whl", hash = "sha256:75c4c7c619a744f972f4451bf5adf6d0fb00992a1ffc9fd78e13b0bc817cc99f", size = 5350799, upload-time = "2026-04-18T04:34:20.529Z" }, + { url = "https://files.pythonhosted.org/packages/fd/77/6ffdebc5994975f0dde4acb59761902bd9d9bb84422b9a0bd239a7da9ca8/lxml-6.1.0-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:3648f20d25102a22b6061c688beb3a805099ea4beb0a01ce62975d926944d292", size = 4697693, upload-time = "2026-04-18T04:34:23.541Z" }, + { url = "https://files.pythonhosted.org/packages/f8/f1/565f36bd5c73294602d48e04d23f81ff4c8736be6ba5e1d1ec670ac9be80/lxml-6.1.0-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:77b9f99b17cbf14026d1e618035077060fc7195dd940d025149f3e2e830fbfcb", size = 5250708, upload-time = "2026-04-18T04:34:26.001Z" }, + { url = "https://files.pythonhosted.org/packages/5a/11/a68ab9dd18c5c499404deb4005f4bc4e0e88e5b72cd755ad96efec81d18d/lxml-6.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:32662519149fd7a9db354175aa5e417d83485a8039b8aaa62f873ceee7ea4cad", size = 5084737, upload-time = "2026-04-18T04:34:28.32Z" }, + { url = "https://files.pythonhosted.org/packages/ab/78/e8f41e2c74f4af564e6a0348aea69fb6daaefa64bc071ef469823d22cc18/lxml-6.1.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:73d658216fc173cf2c939e90e07b941c5e12736b0bf6a99e7af95459cfe8eabb", size = 4737817, upload-time = "2026-04-18T04:34:30.784Z" }, + { url = 
"https://files.pythonhosted.org/packages/06/2d/aa4e117aa2ce2f3b35d9ff246be74a2f8e853baba5d2a92c64744474603a/lxml-6.1.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ac4db068889f8772a4a698c5980ec302771bb545e10c4b095d4c8be26749616f", size = 5670753, upload-time = "2026-04-18T04:34:33.675Z" }, + { url = "https://files.pythonhosted.org/packages/08/f5/dd745d50c0409031dbfcc4881740542a01e54d6f0110bd420fa7782110b8/lxml-6.1.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:45e9dfbd1b661eb64ba0d4dbe762bd210c42d86dd1e5bd2bdf89d634231beb43", size = 5238071, upload-time = "2026-04-18T04:34:36.12Z" }, + { url = "https://files.pythonhosted.org/packages/3e/74/ad424f36d0340a904665867dab310a3f1f4c96ff4039698de83b77f44c1f/lxml-6.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:89e8d73d09ac696a5ba42ec69787913d53284f12092f651506779314f10ba585", size = 5264319, upload-time = "2026-04-18T04:34:39.035Z" }, + { url = "https://files.pythonhosted.org/packages/53/36/a15d8b3514ec889bfd6aa3609107fcb6c9189f8dc347f1c0b81eded8d87c/lxml-6.1.0-cp314-cp314-win32.whl", hash = "sha256:ebe33f4ec1b2de38ceb225a1749a2965855bffeef435ba93cd2d5d540783bf2f", size = 3657139, upload-time = "2026-04-18T04:32:20.006Z" }, + { url = "https://files.pythonhosted.org/packages/1a/a4/263ebb0710851a3c6c937180a9a86df1206fdfe53cc43005aa2237fd7736/lxml-6.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:398443df51c538bd578529aa7e5f7afc6c292644174b47961f3bf87fe5741120", size = 4064195, upload-time = "2026-04-18T04:32:23.876Z" }, + { url = "https://files.pythonhosted.org/packages/80/68/2000f29d323b6c286de077ad20b429fc52272e44eae6d295467043e56012/lxml-6.1.0-cp314-cp314-win_arm64.whl", hash = "sha256:8c8984e1d8c4b3949e419158fda14d921ff703a9ed8a47236c6eb7a2b6cb4946", size = 3741870, upload-time = "2026-04-18T04:32:27.922Z" }, + { url = "https://files.pythonhosted.org/packages/30/e9/21383c7c8d43799f0da90224c0d7c921870d476ec9b3e01e1b2c0b8237c5/lxml-6.1.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:1081dd10bc6fa437db2500e13993abf7cc30716d0a2f40e65abb935f02ec559c", size = 8827548, upload-time = "2026-04-18T04:32:15.094Z" }, + { url = "https://files.pythonhosted.org/packages/a5/01/c6bc11cd587030dd4f719f65c5657960649fe3e19196c844c75bf32cd0d6/lxml-6.1.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:dabecc48db5f42ba348d1f5d5afdc54c6c4cc758e676926c7cd327045749517d", size = 4735866, upload-time = "2026-04-18T04:32:18.924Z" }, + { url = "https://files.pythonhosted.org/packages/f3/01/757132fff5f4acf25463b5298f1a46099f3a94480b806547b29ce5e385de/lxml-6.1.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e3dd5fe19c9e0ac818a9c7f132a5e43c1339ec1cbbfecb1a938bd3a47875b7c9", size = 4969476, upload-time = "2026-04-18T04:34:41.889Z" }, + { url = "https://files.pythonhosted.org/packages/fd/fb/1bc8b9d27ed64be7c8903db6c89e74dc8c2cd9ec630a7462e4654316dc5b/lxml-6.1.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9e7b0a4ca6dcc007a4cef00a761bba2dea959de4bd2df98f926b33c92ca5dfb9", size = 5103719, upload-time = "2026-04-18T04:34:44.797Z" }, + { url = "https://files.pythonhosted.org/packages/d5/e7/5bf82fa28133536a54601aae633b14988e89ed61d4c1eb6b899b023233aa/lxml-6.1.0-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d27bbe326c6b539c64b42638b18bc6003a8d88f76213a97ac9ed4f885efeab7", size = 5027890, upload-time = "2026-04-18T04:34:47.634Z" }, + { url = 
"https://files.pythonhosted.org/packages/2d/20/e048db5d4b4ea0366648aa595f26bb764b2670903fc585b87436d0a5032c/lxml-6.1.0-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4e425db0c5445ef0ad56b0eec54f89b88b2d884656e536a90b2f52aecb4ca86", size = 5596008, upload-time = "2026-04-18T04:34:51.503Z" }, + { url = "https://files.pythonhosted.org/packages/9a/c2/d10807bc8da4824b39e5bd01b5d05c077b6fd01bd91584167edf6b269d22/lxml-6.1.0-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4b89b098105b8599dc57adac95d1813409ac476d3c948a498775d3d0c6124bfb", size = 5224451, upload-time = "2026-04-18T04:34:54.263Z" }, + { url = "https://files.pythonhosted.org/packages/3c/15/2ebea45bea427e7f0057e9ce7b2d62c5aba20c6b001cca89ed0aadb3ad41/lxml-6.1.0-cp314-cp314t-manylinux_2_28_i686.whl", hash = "sha256:c4a699432846df86cc3de502ee85f445ebad748a1c6021d445f3e514d2cd4b1c", size = 5312135, upload-time = "2026-04-18T04:34:56.818Z" }, + { url = "https://files.pythonhosted.org/packages/31/e2/87eeae151b0be2a308d49a7ec444ff3eb192b14251e62addb29d0bf3778f/lxml-6.1.0-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:30e7b2ed63b6c8e97cca8af048589a788ab5c9c905f36d9cf1c2bb549f450d2f", size = 4639126, upload-time = "2026-04-18T04:34:59.704Z" }, + { url = "https://files.pythonhosted.org/packages/a3/51/8a3f6a20902ad604dd746ec7b4000311b240d389dac5e9d95adefd349e0c/lxml-6.1.0-cp314-cp314t-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:022981127642fe19866d2907d76241bb07ed21749601f727d5d5dd1ce5d1b773", size = 5232579, upload-time = "2026-04-18T04:35:02.658Z" }, + { url = "https://files.pythonhosted.org/packages/6d/d2/650d619bdbe048d2c3f2c31edb00e35670a5e2d65b4fe3b61bce37b19121/lxml-6.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:23cad0cc86046d4222f7f418910e46b89971c5a45d3c8abfad0f64b7b05e4a9b", size = 5084206, upload-time = "2026-04-18T04:35:05.175Z" }, + { url = "https://files.pythonhosted.org/packages/dd/8a/672ca1a3cbeabd1f511ca275a916c0514b747f4b85bdaae103b8fa92f307/lxml-6.1.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:21c3302068f50d1e8728c67c87ba92aa87043abee517aa2576cca1855326b405", size = 4758906, upload-time = "2026-04-18T04:35:08.098Z" }, + { url = "https://files.pythonhosted.org/packages/be/f1/ef4b691da85c916cb2feb1eec7414f678162798ac85e042fa164419ac05c/lxml-6.1.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:be10838781cb3be19251e276910cd508fe127e27c3242e50521521a0f3781690", size = 5620553, upload-time = "2026-04-18T04:35:11.23Z" }, + { url = "https://files.pythonhosted.org/packages/59/17/94e81def74107809755ac2782fdad4404420f1c92ca83433d117a6d5acf0/lxml-6.1.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:2173a7bffe97667bbf0767f8a99e587740a8c56fdf3befac4b09cb29a80276fd", size = 5229458, upload-time = "2026-04-18T04:35:14.254Z" }, + { url = "https://files.pythonhosted.org/packages/21/55/c4be91b0f830a871fc1b0d730943d56013b683d4671d5198260e2eae722b/lxml-6.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:c6854e9cf99c84beb004eecd7d3a3868ef1109bf2b1df92d7bc11e96a36c2180", size = 5247861, upload-time = "2026-04-18T04:35:17.006Z" }, + { url = "https://files.pythonhosted.org/packages/c2/ca/77123e4d77df3cb1e968ade7b1f808f5d3a5c1c96b18a33895397de292c1/lxml-6.1.0-cp314-cp314t-win32.whl", hash = "sha256:00750d63ef0031a05331b9223463b1c7c02b9004cef2346a5b2877f0f9494dd2", size = 3897377, upload-time = "2026-04-18T04:32:07.656Z" }, + { url = 
"https://files.pythonhosted.org/packages/64/ce/3554833989d074267c063209bae8b09815e5656456a2d332b947806b05ff/lxml-6.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:80410c3a7e3c617af04de17caa9f9f20adaa817093293d69eae7d7d0522836f5", size = 4392701, upload-time = "2026-04-18T04:32:12.113Z" }, + { url = "https://files.pythonhosted.org/packages/2b/a0/9b916c68c0e57752c07f8f64b30138d9d4059dbeb27b90274dedbea128ff/lxml-6.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:26dd9f57ee3bd41e7d35b4c98a2ffd89ed11591649f421f0ec19f67d50ec67ac", size = 3817120, upload-time = "2026-04-18T04:32:15.803Z" }, ] [[package]]