diff --git a/README.md b/README.md index 354c732..2e55ae9 100644 --- a/README.md +++ b/README.md @@ -174,10 +174,10 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOAD Note: Vault tokens are only required for certain protected Databus hosts (for example: `data.dbpedia.io`, `data.dev.dbpedia.link`). The client now detects those hosts and will fail early with a clear message if a token is required but not provided. Do not pass `--vault-token` for public downloads. - `--databus-key` - If the databus is protected and needs API key authentication, you can provide the API key with `--databus-key YOUR_API_KEY`. -- `--convert-to` - - Enables on-the-fly compression format conversion during download. Supported formats: `bz2`, `gz`, `xz`. Downloaded files will be automatically decompressed and recompressed to the target format. Example: `--convert-to gz` converts all downloaded compressed files to gzip format. -- `--convert-from` - - Optional filter to specify which source compression format should be converted. Use with `--convert-to` to convert only files with a specific compression format. Example: `--convert-to gz --convert-from bz2` converts only `.bz2` files to `.gz`, leaving other formats unchanged. +- `--compression` + - Enables on-the-fly compression format conversion during download. Supported formats: `bz2`, `gz`, `xz`. The source compression is auto-detected from the file extension. Example: `--compression gz` converts all downloaded compressed files to gzip format. +- `--format` + - Enables on-the-fly RDF and tabular format conversion during download (Layer 2). Supported formats: `ntriples` (`nt`), `turtle` (`ttl`), `rdf-xml` (`rdf`, `xml`), `nquads` (`nq`), `trig`, `trix`, `json-ld` (`jsonld`), `csv`, `tsv`. Short aliases shown in brackets. Only the converted output file is kept — the original is deleted after successful conversion. Example: `--format turtle` converts all downloaded RDF triple files to Turtle format. - `--validate-checksum` - Validates the checksums of downloaded files against the checksums provided by the Databus. If a checksum does not match, an error is raised and the file is deleted. @@ -272,16 +272,28 @@ databusclient download 'PREFIX dcat: SELECT ?x WHER docker run --rm -v $(pwd):/data dbpedia/databus-python-client download 'PREFIX dcat: SELECT ?x WHERE { ?sub dcat:downloadURL ?x . } LIMIT 10' --databus https://databus.dbpedia.org/sparql ``` -**Download with Compression Conversion**: download files and convert them to a different compression format on-the-fly +**Download with Compression Conversion**: download files and convert compression format on-the-fly. Source compression is auto-detected from the file extension. ```bash # Convert all compressed files to gzip format -databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 --convert-to gz - -# Convert only bz2 files to xz format, leaving other compressions unchanged -databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals --convert-to xz --convert-from bz2 +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 --compression gz # Download a collection and unify all files to bz2 format -databusclient download https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12 --convert-to bz2 +databusclient download https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12 --compression bz2 +``` + +**Download with Format Conversion**: download files and convert RDF or tabular format on-the-fly. Only the converted output file is kept. +```bash +# Convert RDF/XML to Turtle +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format turtle + +# Convert N-Quads to TriG (within quad equivalence class) +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 --format trig + +# Convert RDF to CSV (cross-class, produces companion .meta.json) +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format csv + +# Combine format conversion and compression +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format ntriples --compression gz ``` diff --git a/databusclient/api/convert.py b/databusclient/api/convert.py new file mode 100644 index 0000000..b095eaf --- /dev/null +++ b/databusclient/api/convert.py @@ -0,0 +1,50 @@ +from databusclient.filehandling.format import convert_file, get_converted_filename +from databusclient.filehandling import mapping as _mapping + +from databusclient.filehandling.format import ( # noqa: F401 + ALL_FORMATS, + EXTENSION_TO_FORMAT, + FORMAT_TO_EXTENSION, + RDF_QUAD_FORMATS, + RDF_TRIPLE_FORMATS, + TABULAR_FORMATS, + QuadHandler, + TSDHandler, + TripleHandler, + _quad_handler, + _tsd_handler, + _triple_handler, + detect_format_from_filename, + get_format_class, +) + +__all__ = ["convert_file", "get_converted_filename"] + +convert_rdf_to_csv = _mapping.convert_rdf_to_csv + + +def convert_rdf_triple_format( + source: str, + target: str, + input_format: str, + output_format: str, +) -> None: + _triple_handler.convert(source, target, input_format, output_format) + + +def convert_rdf_quad_format( + source: str, + target: str, + input_format: str, + output_format: str, +) -> None: + _quad_handler.convert(source, target, input_format, output_format) + + +def convert_tabular_format( + source: str, + target: str, + input_format: str, + output_format: str, +) -> None: + _tsd_handler.convert(source, target, input_format, output_format) \ No newline at end of file diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 312af45..41b1f2a 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -5,6 +5,8 @@ import lzma from typing import List, Optional, Tuple import re +import shutil +import tempfile from urllib.parse import urlparse import requests @@ -16,6 +18,7 @@ get_databus_id_parts_from_file_url, compute_sha256_and_length, ) +from databusclient.filehandling.format import convert_file, get_converted_filename # Compression format mappings COMPRESSION_EXTENSIONS = { @@ -47,20 +50,23 @@ def _detect_compression_format(filename: str) -> Optional[str]: return None -def _should_convert_file( - filename: str, convert_to: Optional[str], convert_from: Optional[str] +def _should_convert_compression( + filename: str, compression: Optional[str] ) -> Tuple[bool, Optional[str]]: - """Determine if a file should be converted and what the source format is. + """Determine if a file should have its compression format converted. + + Source compression is detected automatically from the file extension. + All compressed files will be converted to the target format regardless + of their source compression format. Args: filename: Name of the file. - convert_to: Target compression format ('bz2', 'gz', 'xz'). - convert_from: Optional source compression format filter. + compression: Target compression format ('bz2', 'gz', 'xz') or None. Returns: Tuple of (should_convert: bool, source_format: Optional[str]). """ - if not convert_to: + if not compression: return False, None source_format = _detect_compression_format(filename) @@ -70,11 +76,7 @@ def _should_convert_file( return False, None # If source and target are the same, skip conversion - if source_format == convert_to: - return False, None - - # If convert_from is specified, only convert matching formats - if convert_from and source_format != convert_from: + if source_format == compression: return False, None return True, source_format @@ -311,8 +313,8 @@ def _download_file( databus_key=None, auth_url=None, client_id=None, - convert_to=None, - convert_from=None, + compression=None, + convert_format=None, validate_checksum: bool = False, expected_checksum: str | None = None, ) -> None: @@ -325,8 +327,9 @@ def _download_file( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. + Source compression is auto-detected from the file extension. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. expected_checksum: The expected checksum of the file. """ @@ -349,6 +352,7 @@ def _download_file( dirpath = os.path.dirname(filename) if dirpath: os.makedirs(dirpath, exist_ok=True) # Create the necessary directories + # --- 1. Get redirect URL by requesting HEAD --- headers = {} @@ -505,14 +509,98 @@ def _download_file( f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}" ) - # --- 7. Convert compression format if requested (AFTER validation) --- - should_convert, source_format = _should_convert_file(file, convert_to, convert_from) - if should_convert and source_format: - target_filename = _get_converted_filename(file, source_format, convert_to) - target_filepath = os.path.join(localDir, target_filename) - _convert_compression_format( - filename, target_filepath, source_format, convert_to - ) + # --- 7. Unified compression/format conversion pass --- + source_compression = _detect_compression_format(file) + should_convert_compression, source_fmt = _should_convert_compression( + file, compression + ) + needs_format_conversion = convert_format is not None + + if not should_convert_compression and not needs_format_conversion: + return + + temp_paths: list[str] = [] + try: + # Compression-only path: convert directly from the downloaded file. + # _convert_compression_format deletes the source after success, + # so the original downloaded file is removed automatically. + if should_convert_compression and not needs_format_conversion: + target_filename = _get_converted_filename(file, source_fmt, compression) + target_filepath = os.path.join(localDir, target_filename) + _convert_compression_format( + filename, + target_filepath, + source_fmt, + compression, + ) + return + + # Determine input for format conversion. + # If source is compressed, decompress once to a safe temporary file. + conversion_input_path = filename + if source_compression is not None: + source_ext = COMPRESSION_EXTENSIONS[source_compression] + stripped_name = file + if stripped_name.lower().endswith(source_ext): + stripped_name = stripped_name[: -len(source_ext)] + _, format_ext = os.path.splitext(stripped_name) + + with tempfile.NamedTemporaryFile( + delete=False, + suffix=format_ext, + dir=localDir, + ) as temp_decompressed: + temp_decompressed_path = temp_decompressed.name + temp_paths.append(temp_decompressed_path) + + print(f"Decompressing {file}...") + with COMPRESSION_MODULES[source_compression].open(filename, "rb") as sf: + with open(temp_decompressed_path, "wb") as tf: + shutil.copyfileobj(sf, tf) + + conversion_input_path = temp_decompressed_path + + # Convert format on uncompressed input. + converted_basename = get_converted_filename(file, convert_format) + converted_uncompressed_path = os.path.join(localDir, converted_basename) + convert_file(conversion_input_path, converted_uncompressed_path, convert_format) + + # Delete the original downloaded file after successful format conversion, + # unless the converted output is the same file (same format, same path). + if os.path.abspath(filename) != os.path.abspath(converted_uncompressed_path): + if os.path.exists(filename): + os.remove(filename) + print(f"Removed original file: {os.path.basename(filename)}") + + # Recompress converted output when needed. + if source_compression is not None: + if should_convert_compression and compression: + final_compression = compression + else: + final_compression = source_compression + elif should_convert_compression and compression: + final_compression = compression + else: + final_compression = None + + if final_compression is not None: + recompressed_path = ( + converted_uncompressed_path + COMPRESSION_EXTENSIONS[final_compression] + ) + print( + f"Recompressing {os.path.basename(converted_uncompressed_path)} -> {os.path.basename(recompressed_path)}..." + ) + with open(converted_uncompressed_path, "rb") as sf: + with COMPRESSION_MODULES[final_compression].open( + recompressed_path, "wb" + ) as tf: + shutil.copyfileobj(sf, tf) + + os.remove(converted_uncompressed_path) + finally: + for temp_path in temp_paths: + if os.path.exists(temp_path): + os.remove(temp_path) def _download_files( @@ -522,8 +610,8 @@ def _download_files( databus_key: str = None, auth_url: str = None, client_id: str = None, - convert_to: str = None, - convert_from: str = None, + compression: str = None, + convert_format: str = None, validate_checksum: bool = False, checksums: dict | None = None, ) -> None: @@ -536,8 +624,8 @@ def _download_files( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. checksums: Dictionary mapping URLs to their expected checksums. """ @@ -552,8 +640,8 @@ def _download_files( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, + convert_format=convert_format, validate_checksum=validate_checksum, expected_checksum=expected, ) @@ -700,8 +788,8 @@ def _download_collection( databus_key: str = None, auth_url: str = None, client_id: str = None, - convert_to: str = None, - convert_from: str = None, + compression: str = None, + convert_format: str = None, validate_checksum: bool = False, ) -> None: """Download all files in a databus collection. @@ -714,8 +802,8 @@ def _download_collection( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ query = _get_sparql_query_of_collection(uri, databus_key=databus_key) @@ -735,8 +823,8 @@ def _download_collection( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, + convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums if checksums else None, ) @@ -749,8 +837,8 @@ def _download_version( databus_key: str = None, auth_url: str = None, client_id: str = None, - convert_to: str = None, - convert_from: str = None, + compression: str = None, + convert_format: str = None, validate_checksum: bool = False, ) -> None: """Download all files in a databus artifact version. @@ -762,8 +850,8 @@ def _download_version( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) @@ -782,8 +870,8 @@ def _download_version( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, + convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums, ) @@ -797,8 +885,8 @@ def _download_artifact( databus_key: str = None, auth_url: str = None, client_id: str = None, - convert_to: str = None, - convert_from: str = None, + compression: str = None, + convert_format: str = None, validate_checksum: bool = False, ) -> None: """Download files in a databus artifact. @@ -811,8 +899,8 @@ def _download_artifact( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) @@ -837,8 +925,8 @@ def _download_artifact( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, + convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums, ) @@ -913,8 +1001,8 @@ def _download_group( databus_key: str = None, auth_url: str = None, client_id: str = None, - convert_to: str = None, - convert_from: str = None, + compression: str = None, + convert_format: str = None, validate_checksum: bool = False, ) -> None: """Download files in a databus group. @@ -927,8 +1015,8 @@ def _download_group( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) @@ -943,8 +1031,8 @@ def _download_group( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, + convert_format=convert_format, validate_checksum=validate_checksum, ) @@ -992,8 +1080,8 @@ def download( all_versions=None, auth_url="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", client_id="vault-token-exchange", - convert_to=None, - convert_from=None, + compression=None, + convert_format=None, validate_checksum: bool = False, ) -> None: """Download datasets from databus. @@ -1008,8 +1096,9 @@ def download( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. Default is "https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token". client_id: Client ID for token exchange. Default is "vault-token-exchange". - convert_to: Target compression format for on-the-fly conversion (supported: bz2, gz, xz). - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion (supported: bz2, gz, xz). + Source compression is auto-detected from the file extension. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ for databusURI in databusURIs: @@ -1037,8 +1126,8 @@ def download( databus_key, auth_url, client_id, - convert_to, - convert_from, + compression, + convert_format, validate_checksum=validate_checksum, ) elif file is not None: @@ -1058,8 +1147,8 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, + convert_format=convert_format, validate_checksum=validate_checksum, expected_checksum=expected, ) @@ -1072,8 +1161,8 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, + convert_format=convert_format, validate_checksum=validate_checksum, ) elif artifact is not None: @@ -1088,8 +1177,8 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, + convert_format=convert_format, validate_checksum=validate_checksum, ) elif group is not None and group != "collections": @@ -1104,8 +1193,8 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, + convert_format=convert_format, validate_checksum=validate_checksum, ) elif account is not None: @@ -1142,8 +1231,8 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, + convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums if checksums else None, - ) + ) \ No newline at end of file diff --git a/databusclient/cli.py b/databusclient/cli.py index c3bd8f2..50f0766 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -180,14 +180,33 @@ def deploy( help="Client ID for token exchange", ) @click.option( - "--convert-to", + "--compression", + "compression", type=click.Choice(["bz2", "gz", "xz"], case_sensitive=False), - help="Target compression format for on-the-fly conversion during download (supported: bz2, gz, xz)", + help="Target compression format for on-the-fly conversion during download. " + "Source compression is detected automatically from the file extension. " + "All compressed files will be converted to the target format (bz2, gz, xz).", ) @click.option( - "--convert-from", - type=click.Choice(["bz2", "gz", "xz"], case_sensitive=False), - help="Source compression format to convert from (optional filter). Only files with this compression will be converted.", + "--format", + "convert_format", + type=click.Choice( + [ + "ntriples", "nt", + "turtle", "ttl", + "rdf-xml", "rdf", "xml", + "nquads", "nq", + "trig", + "trix", + "json-ld", "jsonld", + "csv", + "tsv", + ], + case_sensitive=False, + ), + help="Target format for on-the-fly format conversion during download (Layer 2 and Layer 3). " + "Accepts full names (ntriples, turtle, rdf-xml, nquads, trig, trix, json-ld, csv, tsv) " + "or short aliases (nt, ttl, rdf, xml, nq, jsonld).", ) @click.option( "--validate-checksum", is_flag=True, help="Validate checksums of downloaded files" @@ -201,8 +220,8 @@ def download( all_versions, authurl, clientid, - convert_to, - convert_from, + compression, + convert_format, validate_checksum, ): """ @@ -219,8 +238,8 @@ def download( all_versions=all_versions, auth_url=authurl, client_id=clientid, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, + convert_format=convert_format, validate_checksum=validate_checksum, ) except DownloadAuthError as e: diff --git a/databusclient/filehandling/format.py b/databusclient/filehandling/format.py new file mode 100644 index 0000000..7c40109 --- /dev/null +++ b/databusclient/filehandling/format.py @@ -0,0 +1,556 @@ +"""Format and Mapping Conversion Layer. + +This module implements the format conversion pipeline for the Databus Python Client + +Layer 2: Within-class format conversion (lossless). + - TripleHandler: RDF triple formats (turtle, ntriples, rdf-xml) + - QuadHandler: RDF quad formats (nquads, trig, trix, json-ld) + - TSDHandler: Tabular formats (csv, tsv) + +Each handler provides read() -> IR, write(IR) -> file, convert() -> chains both. +The IR (intermediate representation) returned by read() is designed to be passed +to future mapping classes (TripleToQuadMapper, TripleToTSDMapper, etc.). +""" + +import csv +import os +import shutil +import warnings +from typing import Optional + +from rdflib import Dataset, Graph + +# Suppress rdflib internal DeprecationWarning for Dataset API. +# rdflib is mid-migration from ConjunctiveGraph to Dataset in 7.x. +# These warnings originate from rdflib internals, not our code. +# Can be removed when rdflib completes their Dataset API migration. +warnings.filterwarnings("ignore", category=DeprecationWarning, module="rdflib") +warnings.filterwarnings("ignore", category=UserWarning, module="rdflib") + + +# --------------------------------------------------------------------------- +# Format registries +# --------------------------------------------------------------------------- + +# Maps CLI format name -> rdflib format string +RDF_TRIPLE_FORMATS = { + "ntriples": "ntriples", + "turtle": "turtle", + "rdf-xml": "xml", +} + +RDF_QUAD_FORMATS = { + "nquads": "nquads", + "trig": "trig", + "trix": "trix", + "json-ld": "json-ld", +} + +TABULAR_FORMATS = { + "csv": ",", + "tsv": "\t", +} + +ALL_FORMATS = ( + list(RDF_TRIPLE_FORMATS) + + list(RDF_QUAD_FORMATS) + + list(TABULAR_FORMATS) +) + +# Maps short CLI aliases -> canonical format name +FORMAT_ALIASES = { + "nt": "ntriples", + "ttl": "turtle", + "rdf": "rdf-xml", + "xml": "rdf-xml", + "nq": "nquads", + "jsonld": "json-ld", +} + +def normalize_format(fmt: str) -> str: + """Normalize a format name or alias to its canonical form. + + Accepts both full names (e.g. 'ntriples') and short aliases (e.g. 'nt'). + Canonical names pass through unchanged. Unknown values raise ValueError. + + Args: + fmt: Format name or alias string (case-insensitive). + + Returns: + Canonical format name string. + + Raises: + ValueError: If fmt is not a recognised format name or alias. + """ + fmt_lower = fmt.lower() + # Resolve alias first + canonical = FORMAT_ALIASES.get(fmt_lower, fmt_lower) + if canonical not in ALL_FORMATS: + raise ValueError( + f"Unknown format: '{fmt}'. " + f"Supported formats: {ALL_FORMATS}. " + f"Supported aliases: {list(FORMAT_ALIASES.keys())}" + ) + return canonical + +# Maps file extension -> CLI format name +EXTENSION_TO_FORMAT = { + ".ttl": "turtle", + ".nt": "ntriples", + ".rdf": "rdf-xml", + ".xml": "rdf-xml", + ".owl": "rdf-xml", + ".nq": "nquads", + ".trig": "trig", + ".trix": "trix", + ".jsonld": "json-ld", + ".json": "json-ld", + ".csv": "csv", + ".tsv": "tsv", +} + +# Maps format name -> file extension +FORMAT_TO_EXTENSION = { + "ntriples": ".nt", + "turtle": ".ttl", + "rdf-xml": ".rdf", + "nquads": ".nq", + "trig": ".trig", + "trix": ".trix", + "json-ld": ".jsonld", + "csv": ".csv", + "tsv": ".tsv", +} + + +# --------------------------------------------------------------------------- +# Format detection helpers +# --------------------------------------------------------------------------- + +def detect_format_from_filename(filename: str) -> Optional[str]: + """Detect format from file extension, ignoring compression extensions. + + Args: + filename: File name or path. + + Returns: + Format name string or None if not detectable. + """ + name = filename.lower() + + # strip compression extension first + for ext in (".bz2", ".gz", ".xz"): + if name.endswith(ext): + name = name[: -len(ext)] + break + + # match longest extension first to avoid .json matching before .jsonld + for ext in sorted(EXTENSION_TO_FORMAT.keys(), key=len, reverse=True): + if name.endswith(ext): + return EXTENSION_TO_FORMAT[ext] + + return None + + +def get_format_class(fmt: str) -> str: + """Return equivalence class for a format name. + + Args: + fmt: Format name (e.g. 'turtle', 'nquads', 'csv'). + + Returns: + 'triples', 'quads', or 'tabular'. + + Raises: + ValueError: If format is not recognised. + """ + if fmt in RDF_TRIPLE_FORMATS: + return "triples" + if fmt in RDF_QUAD_FORMATS: + return "quads" + if fmt in TABULAR_FORMATS: + return "tabular" + raise ValueError( + f"Unknown format: '{fmt}'. Supported formats: {ALL_FORMATS}" + ) + + +def get_converted_filename(original_filename: str, convert_format: str) -> str: + """Generate output filename after format conversion. + + Strips compression extension if present, then replaces the format + extension with the target format extension. Accepts format aliases. + + Args: + original_filename: Original file name (basename only, not full path). + convert_format: Target format name or alias. + + Returns: + New filename with updated extension. + """ + # Normalize alias to canonical name + convert_format = normalize_format(convert_format) + + name = original_filename + + # strip compression extension + for ext in (".bz2", ".gz", ".xz"): + if name.lower().endswith(ext): + name = name[: -len(ext)] + break + + # strip existing format extension (longest first) + for old_ext in sorted(FORMAT_TO_EXTENSION.values(), key=len, reverse=True): + if name.lower().endswith(old_ext): + name = name[: -len(old_ext)] + break + + target_ext = FORMAT_TO_EXTENSION.get(convert_format, f".{convert_format}") + return name + target_ext + + +# --------------------------------------------------------------------------- +# Layer 2 Handlers +# --------------------------------------------------------------------------- + +class TripleHandler: + """Handler for RDF triple formats (Layer 2). + + Uses rdflib.Graph as the intermediate representation (IR). + Supports: ntriples, turtle, rdf-xml. + + The IR returned by read() can be passed to future mapping classes + such as TripleToQuadMapper or TripleToTSDMapper for Layer 3 conversions. + """ + + def read(self, source: str, input_format: str) -> Graph: + """Parse an RDF triples file into a Graph (IR). + + Args: + source: Path to input file. + input_format: Source format name (e.g. 'turtle', 'ntriples', 'rdf-xml'). + + Returns: + rdflib.Graph containing all parsed triples. + + Raises: + ValueError: If input_format is not a recognised triple format. + """ + if input_format not in RDF_TRIPLE_FORMATS: + raise ValueError( + f"'{input_format}' is not a triple format. " + f"Supported: {list(RDF_TRIPLE_FORMATS)}" + ) + g = Graph() + g.parse(source, format=RDF_TRIPLE_FORMATS[input_format]) + return g + + def write(self, data: Graph, target: str, output_format: str) -> None: + """Serialize a Graph (IR) to a file. + + Args: + data: rdflib.Graph to serialize. + target: Path to output file. + output_format: Target format name (e.g. 'ntriples', 'turtle'). + + Raises: + ValueError: If output_format is not a recognised triple format. + """ + if output_format not in RDF_TRIPLE_FORMATS: + raise ValueError( + f"'{output_format}' is not a triple format. " + f"Supported: {list(RDF_TRIPLE_FORMATS)}" + ) + parent = os.path.dirname(target) + if parent: + os.makedirs(parent, exist_ok=True) + # Explicitly specify utf-8 encoding to avoid NTSerializer warning + data.serialize( + destination=target, + format=RDF_TRIPLE_FORMATS[output_format], + encoding="utf-8", + ) + + def convert( + self, + source: str, + target: str, + input_format: str, + output_format: str, + ) -> None: + """Convert between RDF triple formats (Layer 2, lossless). + + Chains read() -> write(). Both formats must be in the same + equivalence class (RDF triples). + + Args: + source: Path to input file. + target: Path to output file. + input_format: Source format name. + output_format: Target format name. + """ + graph = self.read(source, input_format) + self.write(graph, target, output_format) + print( + f"Converted {input_format} -> {output_format}: " + f"{os.path.basename(target)}" + ) + + +class QuadHandler: + """Handler for RDF quad formats (Layer 2). + + Uses rdflib.Dataset as the intermediate representation (IR). + Supports: nquads, trig, trix, json-ld. + + Named graph information is preserved through the Dataset IR. + The IR returned by read() can be passed to future mapping classes + such as QuadToTripleMapper or QuadToTSDMapper for Layer 3 conversions. + """ + + def read(self, source: str, input_format: str) -> Dataset: + """Parse an RDF quads file into a Dataset (IR). + + Args: + source: Path to input file. + input_format: Source format name (e.g. 'nquads', 'trig', 'trix', 'json-ld'). + + Returns: + rdflib.Dataset containing all parsed quads with named graphs. + + Raises: + ValueError: If input_format is not a recognised quad format. + """ + if input_format not in RDF_QUAD_FORMATS: + raise ValueError( + f"'{input_format}' is not a quad format. " + f"Supported: {list(RDF_QUAD_FORMATS)}" + ) + d = Dataset() + d.parse(source, format=RDF_QUAD_FORMATS[input_format]) + return d + + def write(self, data: Dataset, target: str, output_format: str) -> None: + """Serialize a Dataset (IR) to a file. + + Args: + data: rdflib.Dataset to serialize. + target: Path to output file. + output_format: Target format name. + + Raises: + ValueError: If output_format is not a recognised quad format. + """ + if output_format not in RDF_QUAD_FORMATS: + raise ValueError( + f"'{output_format}' is not a quad format. " + f"Supported: {list(RDF_QUAD_FORMATS)}" + ) + parent = os.path.dirname(target) + if parent: + os.makedirs(parent, exist_ok=True) + data.serialize( + destination=target, + format=RDF_QUAD_FORMATS[output_format], + ) + + def convert( + self, + source: str, + target: str, + input_format: str, + output_format: str, + ) -> None: + """Convert between RDF quad formats (Layer 2, lossless). + + Chains read() -> write(). Both formats must be in the same + equivalence class (RDF quads). Named graph information is preserved. + + Args: + source: Path to input file. + target: Path to output file. + input_format: Source format name. + output_format: Target format name. + """ + dataset = self.read(source, input_format) + self.write(dataset, target, output_format) + print( + f"Converted {input_format} -> {output_format}: " + f"{os.path.basename(target)}" + ) + + +class TSDHandler: + """Handler for tabular structured data formats (Layer 2). + + Uses list[list[str]] as the intermediate representation (IR). + Supports: csv, tsv. + + The IR returned by read() can be passed to future mapping classes + such as TSDToTripleMapper for Layer 3 conversions. + """ + + def read(self, source: str, input_format: str) -> list: + """Parse a tabular file into a list of rows (IR). + + Each row is a list of string values. First row is the header. + + Args: + source: Path to input file. + input_format: Source format name ('csv' or 'tsv'). + + Returns: + list[list[str]] where first element is the header row. + + Raises: + ValueError: If input_format is not a recognised tabular format. + """ + if input_format not in TABULAR_FORMATS: + raise ValueError( + f"'{input_format}' is not a tabular format. " + f"Supported: {list(TABULAR_FORMATS)}" + ) + delimiter = TABULAR_FORMATS[input_format] + with open(source, "r", newline="", encoding="utf-8") as f: + reader = csv.reader(f, delimiter=delimiter) + return list(reader) + + def write(self, data: list, target: str, output_format: str) -> None: + """Serialize a list of rows (IR) to a tabular file. + + Args: + data: list[list[str]] to write. + target: Path to output file. + output_format: Target format name ('csv' or 'tsv'). + + Raises: + ValueError: If output_format is not a recognised tabular format. + """ + if output_format not in TABULAR_FORMATS: + raise ValueError( + f"'{output_format}' is not a tabular format. " + f"Supported: {list(TABULAR_FORMATS)}" + ) + parent = os.path.dirname(target) + if parent: + os.makedirs(parent, exist_ok=True) + delimiter = TABULAR_FORMATS[output_format] + with open(target, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f, delimiter=delimiter) + writer.writerows(data) + + def convert( + self, + source: str, + target: str, + input_format: str, + output_format: str, + ) -> None: + """Convert between tabular formats (Layer 2, lossless). + + Chains read() -> write(). Both formats must be in the same + equivalence class (tabular). + + Args: + source: Path to input file. + target: Path to output file. + input_format: Source format name. + output_format: Target format name. + """ + rows = self.read(source, input_format) + self.write(rows, target, output_format) + print( + f"Converted {input_format} -> {output_format}: " + f"{os.path.basename(target)}" + ) + + +# --------------------------------------------------------------------------- +# Main dispatcher — called from download pipeline +# --------------------------------------------------------------------------- + +# Handler instances — created once, reused +_triple_handler = TripleHandler() +_quad_handler = QuadHandler() +_tsd_handler = TSDHandler() + + +def convert_file( + input_file: str, + output_file: str, + convert_format: str, +) -> None: + """Main conversion dispatcher called from the download pipeline. + + Detects the input format from the file extension, determines whether + this is a Layer 2 (within-class) or Layer 3 (cross-class) conversion, + and delegates to the appropriate handler. + + Accepts both canonical format names and short aliases (e.g. 'nt' for + 'ntriples', 'ttl' for 'turtle'). See normalize_format() for full list. + + Args: + input_file: Path to the input file (must be decompressed). + output_file: Path to write the converted output file. + convert_format: Target format name or alias (CLI format string). + + Raises: + ValueError: If input format cannot be detected or conversion + is not supported. + """ + # Normalize alias to canonical name before any processing + convert_format = normalize_format(convert_format) + + input_format = detect_format_from_filename(input_file) + + if input_format is None: + raise ValueError( + f"Could not detect input format from filename: " + f"'{os.path.basename(input_file)}'. " + f"Supported extensions: {list(EXTENSION_TO_FORMAT.keys())}" + ) + + if input_format == convert_format: + # Input and target format are identical. + # Copy input to output path so the caller always receives an output file. + # This is important for the download pipeline which expects an output + # file to exist after convert_file() returns — e.g. for recompression. + if input_file != output_file: + shutil.copy2(input_file, output_file) + print( + f"Input and target format are both '{input_format}'. " + f"Copied to output path: {os.path.basename(output_file)}" + ) + return + + input_class = get_format_class(input_format) + output_class = get_format_class(convert_format) + + # --- Layer 2: within-class --- + if input_class == output_class: + if input_class == "triples": + _triple_handler.convert( + input_file, output_file, input_format, convert_format + ) + elif input_class == "quads": + _quad_handler.convert( + input_file, output_file, input_format, convert_format + ) + elif input_class == "tabular": + _tsd_handler.convert( + input_file, output_file, input_format, convert_format + ) + return + + # --- Layer 3: cross-class (prototype only) --- + if input_class == "triples" and output_class == "tabular": + from databusclient.filehandling.mapping import convert_rdf_to_csv + + convert_rdf_to_csv(input_file, output_file, input_format) + return + + raise ValueError( + f"Conversion from '{input_format}' ({input_class}) to " + f"'{convert_format}' ({output_class}) is not yet implemented. " + f"Supported Layer 3 conversions: RDF Triples -> CSV/TSV." + ) diff --git a/databusclient/filehandling/mapping.py b/databusclient/filehandling/mapping.py new file mode 100644 index 0000000..93b5a00 --- /dev/null +++ b/databusclient/filehandling/mapping.py @@ -0,0 +1,68 @@ +"""Layer 3 prototype mapping handlers.""" + +import json +import os + +from databusclient.filehandling.format import TSDHandler, TripleHandler + + +def convert_rdf_to_csv( + input_file: str, + output_file: str, + input_format: str, +) -> None: + """Map RDF triples to a wide CSV table (Layer 3 prototype). + + Each unique subject becomes a row. Each unique predicate becomes a column. + Multi-valued predicates are pipe-separated. + A companion .meta.json file is generated to preserve RDF datatype and + language tag information for lossless round trips. + + NOTE: This is a Layer 3 prototype. It is not yet tested and will be + properly implemented in the Layer 3 issue. + + Args: + input_file: Path to input RDF triples file. + output_file: Path to write output CSV file. + input_format: Source triple format name (must be in RDF_TRIPLE_FORMATS). + """ + handler = TripleHandler() + g = handler.read(input_file, input_format) + + predicates = sorted(set(str(p) for s, p, o in g)) + + subjects: dict = {} + column_metadata: dict = {} + + for s, p, o in g: + subj = str(s) + pred = str(p) + + if hasattr(o, "datatype") and o.datatype: + column_metadata[pred] = {"datatype": str(o.datatype)} + elif hasattr(o, "language") and o.language: + column_metadata[pred] = {"language": str(o.language)} + + if subj not in subjects: + subjects[subj] = {} + if pred not in subjects[subj]: + subjects[subj][pred] = [] + subjects[subj][pred].append(str(o)) + + tsd_handler = TSDHandler() + rows = [["resource"] + predicates] + for subj, pred_map in subjects.items(): + row = [subj] + for pred in predicates: + values = pred_map.get(pred, []) + row.append("|".join(values)) + rows.append(row) + + tsd_handler.write(rows, output_file, "csv") + + companion_file = output_file + ".meta.json" + with open(companion_file, "w", encoding="utf-8") as f: + json.dump({"columns": column_metadata}, f, indent=2) + + print(f"Converted RDF -> CSV: {os.path.basename(output_file)}") + print(f"Companion metadata: {os.path.basename(companion_file)}") diff --git a/pyproject.toml b/pyproject.toml index 92f479b..9759c07 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,15 @@ databusclient = "databusclient.cli:app" target-version = "py311" src = ["databusclient", "tests"] +[tool.ruff.lint.per-file-ignores] +"tests/test_format_round_trips.py" = ["F841"] + [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +filterwarnings = [ + "ignore::DeprecationWarning:rdflib", + "ignore::UserWarning:rdflib", +] diff --git a/run_all_conversion_tests.py b/run_all_conversion_tests.py new file mode 100644 index 0000000..98f9bbb --- /dev/null +++ b/run_all_conversion_tests.py @@ -0,0 +1,342 @@ +""" +Layer 2 Conversion Testing Script +Tests every conversion combination systematically. +Outputs go to test_outputs/ folder. +Test file for testing with real datasets from databus. +""" + +# TODO: This script is a temporary manual integration test artifact. +# It must be removed or rewritten as proper pytest integration tests +# before the final PR. Do not commit this file to the upstream repo. + +import os +from databusclient.api.convert import ( + convert_rdf_triple_format, + convert_rdf_quad_format, + convert_tabular_format, +) + +# --------------------------------------------------------------------------- +# Setup output folders +# --------------------------------------------------------------------------- + +folders = [ + "test_outputs/triples/T1_turtle_to_ntriples", + "test_outputs/triples/T2_turtle_to_rdfxml", + "test_outputs/triples/T3_ntriples_to_turtle", + "test_outputs/triples/T4_ntriples_to_rdfxml", + "test_outputs/triples/T5_rdfxml_to_turtle", + "test_outputs/triples/T6_rdfxml_to_ntriples", + "test_outputs/quads/Q1_nquads_to_trig", + "test_outputs/quads/Q2_nquads_to_trix", + "test_outputs/quads/Q3_nquads_to_jsonld", + "test_outputs/quads/Q4_trig_to_nquads", + "test_outputs/quads/Q5_trig_to_trix", + "test_outputs/quads/Q6_trig_to_jsonld", + "test_outputs/quads/Q7_trix_to_nquads", + "test_outputs/quads/Q8_trix_to_trig", + "test_outputs/quads/Q9_trix_to_jsonld", + "test_outputs/quads/Q10_jsonld_to_nquads", + "test_outputs/quads/Q11_jsonld_to_trig", + "test_outputs/quads/Q12_jsonld_to_trix", + "test_outputs/tabular/TAB1_csv_to_tsv", + "test_outputs/tabular/TAB2_tsv_to_csv", +] + +for folder in folders: + os.makedirs(folder, exist_ok=True) + +results = [] + + +def run_test(test_id, description, func, input_file, output_file, *args): + """Run one conversion test and record the result.""" + try: + func(input_file, output_file, *args) + size = os.path.getsize(output_file) + results.append(f"PASS {test_id}: {description} -> {os.path.basename(output_file)} ({size} bytes)") + return output_file + except Exception as e: + results.append(f"FAIL {test_id}: {description} -> ERROR: {e}") + return None + + +# --------------------------------------------------------------------------- +# GROUP 1: RDF Triple Format Conversions +# 6 combinations: each format -> every other format +# Base file: test_outputs/base/base.ttl (real DBpedia Turtle data) +# Chain: turtle -> ntriples -> rdfxml -> back to turtle +# --------------------------------------------------------------------------- + +print("\n=== GROUP 1: RDF TRIPLE FORMAT CONVERSIONS ===\n") + +BASE_TTL = "test_outputs/base/base.ttl" + +# T1: turtle -> ntriples (from base turtle file) +t1_out = "test_outputs/triples/T1_turtle_to_ntriples/output.nt" +run_test( + "T1", "turtle -> ntriples", + convert_rdf_triple_format, + BASE_TTL, t1_out, "turtle", "ntriples" +) + +# T2: turtle -> rdf-xml (from base turtle file) +t2_out = "test_outputs/triples/T2_turtle_to_rdfxml/output.rdf" +run_test( + "T2", "turtle -> rdf-xml", + convert_rdf_triple_format, + BASE_TTL, t2_out, "turtle", "rdf-xml" +) + +# T3: ntriples -> turtle (uses T1 output) +t3_out = "test_outputs/triples/T3_ntriples_to_turtle/output.ttl" +if t1_out and os.path.exists(t1_out): + run_test( + "T3", "ntriples -> turtle", + convert_rdf_triple_format, + t1_out, t3_out, "ntriples", "turtle" + ) +else: + results.append("SKIP T3: ntriples -> turtle (T1 output not available)") + +# T4: ntriples -> rdf-xml (uses T1 output) +t4_out = "test_outputs/triples/T4_ntriples_to_rdfxml/output.rdf" +if t1_out and os.path.exists(t1_out): + run_test( + "T4", "ntriples -> rdf-xml", + convert_rdf_triple_format, + t1_out, t4_out, "ntriples", "rdf-xml" + ) +else: + results.append("SKIP T4: ntriples -> rdf-xml (T1 output not available)") + +# T5: rdf-xml -> turtle (uses T2 output) +t5_out = "test_outputs/triples/T5_rdfxml_to_turtle/output.ttl" +if t2_out and os.path.exists(t2_out): + run_test( + "T5", "rdf-xml -> turtle", + convert_rdf_triple_format, + t2_out, t5_out, "rdf-xml", "turtle" + ) +else: + results.append("SKIP T5: rdf-xml -> turtle (T2 output not available)") + +# T6: rdf-xml -> ntriples (uses T2 output) +t6_out = "test_outputs/triples/T6_rdfxml_to_ntriples/output.nt" +if t2_out and os.path.exists(t2_out): + run_test( + "T6", "rdf-xml -> ntriples", + convert_rdf_triple_format, + t2_out, t6_out, "rdf-xml", "ntriples" + ) +else: + results.append("SKIP T6: rdf-xml -> ntriples (T2 output not available)") + + +# --------------------------------------------------------------------------- +# GROUP 2: RDF Quad Format Conversions +# 12 combinations: each of 4 formats -> every other format (4*3=12) +# Base file: test_outputs/base/base.nq +# Chain: nquads -> trig -> trix -> jsonld -> back to nquads +# --------------------------------------------------------------------------- + +print("\n=== GROUP 2: RDF QUAD FORMAT CONVERSIONS ===\n") + +BASE_NQ = "test_outputs/base/base.nq" + +# Q1: nquads -> trig +q1_out = "test_outputs/quads/Q1_nquads_to_trig/output.trig" +run_test( + "Q1", "nquads -> trig", + convert_rdf_quad_format, + BASE_NQ, q1_out, "nquads", "trig" +) + +# Q2: nquads -> trix +q2_out = "test_outputs/quads/Q2_nquads_to_trix/output.trix" +run_test( + "Q2", "nquads -> trix", + convert_rdf_quad_format, + BASE_NQ, q2_out, "nquads", "trix" +) + +# Q3: nquads -> json-ld +q3_out = "test_outputs/quads/Q3_nquads_to_jsonld/output.jsonld" +run_test( + "Q3", "nquads -> json-ld", + convert_rdf_quad_format, + BASE_NQ, q3_out, "nquads", "json-ld" +) + +# Q4: trig -> nquads (uses Q1 output) +q4_out = "test_outputs/quads/Q4_trig_to_nquads/output.nq" +if q1_out and os.path.exists(q1_out): + run_test( + "Q4", "trig -> nquads", + convert_rdf_quad_format, + q1_out, q4_out, "trig", "nquads" + ) +else: + results.append("SKIP Q4: trig -> nquads (Q1 output not available)") + +# Q5: trig -> trix (uses Q1 output) +q5_out = "test_outputs/quads/Q5_trig_to_trix/output.trix" +if q1_out and os.path.exists(q1_out): + run_test( + "Q5", "trig -> trix", + convert_rdf_quad_format, + q1_out, q5_out, "trig", "trix" + ) +else: + results.append("SKIP Q5: trig -> trix (Q1 output not available)") + +# Q6: trig -> json-ld (uses Q1 output) +q6_out = "test_outputs/quads/Q6_trig_to_jsonld/output.jsonld" +if q1_out and os.path.exists(q1_out): + run_test( + "Q6", "trig -> json-ld", + convert_rdf_quad_format, + q1_out, q6_out, "trig", "json-ld" + ) +else: + results.append("SKIP Q6: trig -> json-ld (Q1 output not available)") + +# Q7: trix -> nquads (uses Q2 output) +q7_out = "test_outputs/quads/Q7_trix_to_nquads/output.nq" +if q2_out and os.path.exists(q2_out): + run_test( + "Q7", "trix -> nquads", + convert_rdf_quad_format, + q2_out, q7_out, "trix", "nquads" + ) +else: + results.append("SKIP Q7: trix -> nquads (Q2 output not available)") + +# Q8: trix -> trig (uses Q2 output) +q8_out = "test_outputs/quads/Q8_trix_to_trig/output.trig" +if q2_out and os.path.exists(q2_out): + run_test( + "Q8", "trix -> trig", + convert_rdf_quad_format, + q2_out, q8_out, "trix", "trig" + ) +else: + results.append("SKIP Q8: trix -> trig (Q2 output not available)") + +# Q9: trix -> json-ld (uses Q2 output) +q9_out = "test_outputs/quads/Q9_trix_to_jsonld/output.jsonld" +if q2_out and os.path.exists(q2_out): + run_test( + "Q9", "trix -> json-ld", + convert_rdf_quad_format, + q2_out, q9_out, "trix", "json-ld" + ) +else: + results.append("SKIP Q9: trix -> json-ld (Q2 output not available)") + +# Q10: json-ld -> nquads (uses Q3 output) +q10_out = "test_outputs/quads/Q10_jsonld_to_nquads/output.nq" +if q3_out and os.path.exists(q3_out): + run_test( + "Q10", "json-ld -> nquads", + convert_rdf_quad_format, + q3_out, q10_out, "json-ld", "nquads" + ) +else: + results.append("SKIP Q10: json-ld -> nquads (Q3 output not available)") + +# Q11: json-ld -> trig (uses Q3 output) +q11_out = "test_outputs/quads/Q11_jsonld_to_trig/output.trig" +if q3_out and os.path.exists(q3_out): + run_test( + "Q11", "json-ld -> trig", + convert_rdf_quad_format, + q3_out, q11_out, "json-ld", "trig" + ) +else: + results.append("SKIP Q11: json-ld -> trig (Q3 output not available)") + +# Q12: json-ld -> trix (uses Q3 output) +q12_out = "test_outputs/quads/Q12_jsonld_to_trix/output.trix" +if q3_out and os.path.exists(q3_out): + run_test( + "Q12", "json-ld -> trix", + convert_rdf_quad_format, + q3_out, q12_out, "json-ld", "trix" + ) +else: + results.append("SKIP Q12: json-ld -> trix (Q3 output not available)") + + +# --------------------------------------------------------------------------- +# GROUP 3: Tabular Format Conversions +# 2 combinations: csv->tsv and tsv->csv +# --------------------------------------------------------------------------- + +print("\n=== GROUP 3: TABULAR FORMAT CONVERSIONS ===\n") + +BASE_CSV = "test_outputs/base/base.csv" +BASE_TSV = "test_outputs/base/base.tsv" + +# TAB1: csv -> tsv +tab1_out = "test_outputs/tabular/TAB1_csv_to_tsv/output.tsv" +run_test( + "TAB1", "csv -> tsv", + convert_tabular_format, + BASE_CSV, tab1_out, "csv", "tsv" +) + +# TAB2: tsv -> csv (uses TAB1 output) +tab2_out = "test_outputs/tabular/TAB2_tsv_to_csv/output.csv" +if tab1_out and os.path.exists(tab1_out): + run_test( + "TAB2", "tsv -> csv", + convert_tabular_format, + tab1_out, tab2_out, "tsv", "csv" + ) +else: + results.append("SKIP TAB2: tsv -> csv (TAB1 output not available)") + + +# --------------------------------------------------------------------------- +# GROUP 4: CLI End-to-End Tests (compressed real Databus file) +# These test the full pipeline including download.py wiring +# --------------------------------------------------------------------------- + +print("\n=== GROUP 4: CLI END-TO-END (run these manually) ===\n") +cli_tests = [ + "CLI1: turtle->ntriples from compressed Databus file", + " poetry run databusclient download \"https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=cy.ttl.bz2\" --convert-format ntriples --localdir ./test_outputs/cli/CLI1", + "", + "CLI2: turtle->rdf-xml from compressed Databus file", + " poetry run databusclient download \"https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=cy.ttl.bz2\" --convert-format rdf-xml --localdir ./test_outputs/cli/CLI2", + "", + "CLI3: turtle->ntriples + compression bz2->gz", + " poetry run databusclient download \"https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=cy.ttl.bz2\" --convert-format ntriples --convert-to gz --localdir ./test_outputs/cli/CLI3", + "", + "CLI4: turtle->ntriples + compression bz2->xz", + " poetry run databusclient download \"https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=cy.ttl.bz2\" --convert-format ntriples --convert-to xz --localdir ./test_outputs/cli/CLI4", + "", + "CLI5: unsupported cross-class error (expect ValueError)", + " poetry run databusclient download \"https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=cy.ttl.bz2\" --convert-format nquads --localdir ./test_outputs/cli/CLI5", +] +for line in cli_tests: + print(line) + + +# --------------------------------------------------------------------------- +# Print summary +# --------------------------------------------------------------------------- + +print("\n" + "="*60) +print("LAYER 2 CONVERSION TEST SUMMARY") +print("="*60) +for result in results: + print(result) + +passed = sum(1 for r in results if r.startswith("PASS")) +failed = sum(1 for r in results if r.startswith("FAIL")) +skipped = sum(1 for r in results if r.startswith("SKIP")) + +print(f"\nTotal: {passed} passed, {failed} failed, {skipped} skipped") +print("="*60) \ No newline at end of file diff --git a/tests/resources/sample.csv b/tests/resources/sample.csv new file mode 100644 index 0000000..50dda4c --- /dev/null +++ b/tests/resources/sample.csv @@ -0,0 +1,11 @@ +subject,predicate,object,graph +https://example.org/data/alice,http://xmlns.com/foaf/0.1/name,Alice,https://example.org/graph/people +https://example.org/data/alice,https://example.org/vocab/age,29,https://example.org/graph/people +https://example.org/data/alice,https://example.org/vocab/livesAt,_:address1,https://example.org/graph/people +_:address1,https://example.org/vocab/city,Leipzig,https://example.org/graph/people +_:address1,https://example.org/vocab/country,Germany,https://example.org/graph/people +https://example.org/data/bob,http://xmlns.com/foaf/0.1/name,Bob,https://example.org/graph/people +https://example.org/data/bob,https://example.org/vocab/age,34,https://example.org/graph/people +https://example.org/data/bob,https://example.org/vocab/knows,https://example.org/data/alice,https://example.org/graph/people +https://example.org/data/project1,https://example.org/vocab/title,Databus Example Project,https://example.org/graph/projects +https://example.org/data/project1,https://example.org/vocab/member,https://example.org/data/alice,https://example.org/graph/projects \ No newline at end of file diff --git a/tests/resources/sample.jsonld b/tests/resources/sample.jsonld new file mode 100644 index 0000000..af80f31 --- /dev/null +++ b/tests/resources/sample.jsonld @@ -0,0 +1,62 @@ +{ + "@context": { + "@base": "https://example.org/data/", + "ex": "https://example.org/vocab/", + "foaf": "http://xmlns.com/foaf/0.1/", + "xsd": "http://www.w3.org/2001/XMLSchema#", + "name": "foaf:name", + "age": { + "@id": "ex:age", + "@type": "xsd:integer" + }, + "livesAt": { + "@id": "ex:livesAt", + "@type": "@id" + }, + "city": "ex:city", + "country": "ex:country", + "knows": { + "@id": "ex:knows", + "@type": "@id" + }, + "title": "ex:title", + "member": { + "@id": "ex:member", + "@type": "@id" + } + }, + "@graph": [ + { + "@id": "https://example.org/graph/people", + "@graph": [ + { + "@id": "alice", + "name": "Alice", + "age": 29, + "livesAt": "_:address1" + }, + { + "@id": "_:address1", + "city": "Leipzig", + "country": "Germany" + }, + { + "@id": "bob", + "name": "Bob", + "age": 34, + "knows": "alice" + } + ] + }, + { + "@id": "https://example.org/graph/projects", + "@graph": [ + { + "@id": "project1", + "title": "Databus Example Project", + "member": "alice" + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/resources/sample.nq b/tests/resources/sample.nq new file mode 100644 index 0000000..a111652 --- /dev/null +++ b/tests/resources/sample.nq @@ -0,0 +1,10 @@ + "Alice" . + "29"^^ . + _:address1 . +_:address1 "Leipzig" . +_:address1 "Germany" . + "Bob" . + "34"^^ . + . + "Databus Example Project" . + . \ No newline at end of file diff --git a/tests/resources/sample.nt b/tests/resources/sample.nt new file mode 100644 index 0000000..f6b8488 --- /dev/null +++ b/tests/resources/sample.nt @@ -0,0 +1,10 @@ + "Alice" . + "29"^^ . + _:address1 . +_:address1 "Leipzig" . +_:address1 "Germany" . + "Bob" . + "34"^^ . + . + "Databus Example Project" . + . \ No newline at end of file diff --git a/tests/resources/sample.rdf b/tests/resources/sample.rdf new file mode 100644 index 0000000..c8bb09a --- /dev/null +++ b/tests/resources/sample.rdf @@ -0,0 +1,30 @@ + + + + + Alice + 29 + + + + + Leipzig + Germany + + + + Bob + 34 + + + + + Databus Example Project + + + + \ No newline at end of file diff --git a/tests/resources/sample.trig b/tests/resources/sample.trig new file mode 100644 index 0000000..e4abc3f --- /dev/null +++ b/tests/resources/sample.trig @@ -0,0 +1,22 @@ +@base . +@prefix ex: . +@prefix foaf: . +@prefix xsd: . + + { + foaf:name "Alice" ; + ex:age 29 ; + ex:livesAt _:address1 . + + _:address1 ex:city "Leipzig" ; + ex:country "Germany" . + + foaf:name "Bob" ; + ex:age 34 ; + ex:knows . +} + + { + ex:title "Databus Example Project" ; + ex:member . +} \ No newline at end of file diff --git a/tests/resources/sample.trix b/tests/resources/sample.trix new file mode 100644 index 0000000..d8edb13 --- /dev/null +++ b/tests/resources/sample.trix @@ -0,0 +1,72 @@ + + + + + https://example.org/graph/people + + + https://example.org/data/alice + http://xmlns.com/foaf/0.1/name + Alice + + + + https://example.org/data/alice + https://example.org/vocab/age + 29 + + + + https://example.org/data/alice + https://example.org/vocab/livesAt + address1 + + + + address1 + https://example.org/vocab/city + Leipzig + + + + address1 + https://example.org/vocab/country + Germany + + + + https://example.org/data/bob + http://xmlns.com/foaf/0.1/name + Bob + + + + https://example.org/data/bob + https://example.org/vocab/age + 34 + + + + https://example.org/data/bob + https://example.org/vocab/knows + https://example.org/data/alice + + + + + https://example.org/graph/projects + + + https://example.org/data/project1 + https://example.org/vocab/title + Databus Example Project + + + + https://example.org/data/project1 + https://example.org/vocab/member + https://example.org/data/alice + + + + \ No newline at end of file diff --git a/tests/resources/sample.tsv b/tests/resources/sample.tsv new file mode 100644 index 0000000..c23af40 --- /dev/null +++ b/tests/resources/sample.tsv @@ -0,0 +1,11 @@ +subject predicate object graph +https://example.org/data/alice http://xmlns.com/foaf/0.1/name Alice https://example.org/graph/people +https://example.org/data/alice https://example.org/vocab/age 29 https://example.org/graph/people +https://example.org/data/alice https://example.org/vocab/livesAt _:address1 https://example.org/graph/people +_:address1 https://example.org/vocab/city Leipzig https://example.org/graph/people +_:address1 https://example.org/vocab/country Germany https://example.org/graph/people +https://example.org/data/bob http://xmlns.com/foaf/0.1/name Bob https://example.org/graph/people +https://example.org/data/bob https://example.org/vocab/age 34 https://example.org/graph/people +https://example.org/data/bob https://example.org/vocab/knows https://example.org/data/alice https://example.org/graph/people +https://example.org/data/project1 https://example.org/vocab/title Databus Example Project https://example.org/graph/projects +https://example.org/data/project1 https://example.org/vocab/member https://example.org/data/alice https://example.org/graph/projects \ No newline at end of file diff --git a/tests/resources/sample.ttl b/tests/resources/sample.ttl new file mode 100644 index 0000000..a8eb198 --- /dev/null +++ b/tests/resources/sample.ttl @@ -0,0 +1,18 @@ +@base . +@prefix ex: . +@prefix foaf: . +@prefix xsd: . + + foaf:name "Alice" ; + ex:age 29 ; + ex:livesAt _:address1 . + +_:address1 ex:city "Leipzig" ; + ex:country "Germany" . + + foaf:name "Bob" ; + ex:age 34 ; + ex:knows . + + ex:title "Databus Example Project" ; + ex:member . \ No newline at end of file diff --git a/tests/test_compression_conversion.py b/tests/test_compression_conversion.py index 71ada16..8effa1b 100644 --- a/tests/test_compression_conversion.py +++ b/tests/test_compression_conversion.py @@ -8,7 +8,7 @@ import pytest from databusclient.api.download import ( _detect_compression_format, - _should_convert_file, + _should_convert_compression, _get_converted_filename, _convert_compression_format, ) @@ -23,37 +23,42 @@ def test_detect_compression_format(): assert _detect_compression_format("FILE.TXT.GZ") == "gz" # case insensitive -def test_should_convert_file(): - """Test file conversion decision logic""" +def test_should_convert_compression(): + """Test file compression conversion decision logic. + + With --compression, source format is auto-detected from the file extension. + All compressed files are converted to the target format regardless of their + source compression format (no convert_from filter). + """ # No conversion target specified - should_convert, source = _should_convert_file("file.txt.bz2", None, None) + should_convert, source = _should_convert_compression("file.txt.bz2", None) assert should_convert is False assert source is None - # Uncompressed file - should_convert, source = _should_convert_file("file.txt", "gz", None) + # Uncompressed file — never converted + should_convert, source = _should_convert_compression("file.txt", "gz") assert should_convert is False assert source is None - # Same source and target - should_convert, source = _should_convert_file("file.txt.gz", "gz", None) + # Same source and target — skip (no-op) + should_convert, source = _should_convert_compression("file.txt.gz", "gz") assert should_convert is False assert source is None - # Valid conversion - should_convert, source = _should_convert_file("file.txt.bz2", "gz", None) + # bz2 -> gz: should convert, source auto-detected + should_convert, source = _should_convert_compression("file.txt.bz2", "gz") assert should_convert is True assert source == "bz2" - # With convert_from filter matching - should_convert, source = _should_convert_file("file.txt.bz2", "gz", "bz2") + # xz -> gz: should convert regardless of source format (no filter) + should_convert, source = _should_convert_compression("file.txt.xz", "gz") assert should_convert is True - assert source == "bz2" + assert source == "xz" - # With convert_from filter not matching - should_convert, source = _should_convert_file("file.txt.bz2", "gz", "xz") - assert should_convert is False - assert source is None + # gz -> bz2: should convert + should_convert, source = _should_convert_compression("file.txt.gz", "bz2") + assert should_convert is True + assert source == "gz" def test_get_converted_filename(): @@ -195,4 +200,4 @@ def test_corrupted_file_handling(): if __name__ == "__main__": - pytest.main([__file__, "-v"]) + pytest.main([__file__, "-v"]) \ No newline at end of file diff --git a/tests/test_format_round_trips.py b/tests/test_format_round_trips.py new file mode 100644 index 0000000..ebe9f76 --- /dev/null +++ b/tests/test_format_round_trips.py @@ -0,0 +1,256 @@ +"""Round trip tests for Layer 2 format conversion. + +Following the strategy from Frey et al., each test validates that +reading a format and writing it back produces semantically identical output. + +The key validation pattern using handlers and IR: + 1. Read original file into IR (Graph/Dataset/rows) BEFORE any conversion + 2. Convert the file through the handler (read -> write cycle) + 3. Read the converted output back into IR + 4. Compare both IRs — if conversion lost data, IRs will differ + +This correctly catches information loss because g_original is captured +BEFORE serialization, not after. Both IRs use the same rdflib internal +representation, making comparison meaningful at the data level. + +Test data lives in tests/resources/ — one sample file per format. +These files are semantically consistent (same cities dataset across +all formats) and are shared across Layer 2 and future Layer 3 tests. + +9 round trip tests total: + Triple formats: turtle, ntriples, rdf-xml (3 tests) + Quad formats: nquads, trig, trix, json-ld (4 tests) + Tabular formats: csv, tsv (2 tests) +""" + +import os +import tempfile + +from databusclient.api.convert import ( + QuadHandler, + TSDHandler, + TripleHandler, +) + +# --------------------------------------------------------------------------- +# Path to shared test resources +# --------------------------------------------------------------------------- + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def resource(filename: str) -> str: + """Return absolute path to a file in tests/resources/.""" + return os.path.join(RESOURCES, filename) + + +# --------------------------------------------------------------------------- +# Handler instances shared across tests +# --------------------------------------------------------------------------- + +triple_handler = TripleHandler() +quad_handler = QuadHandler() +tsd_handler = TSDHandler() + + +# --------------------------------------------------------------------------- +# Triple format round trip tests (Layer 2) +# --------------------------------------------------------------------------- + +def test_round_trip_turtle(): + """Turtle -> Turtle: read into IR before conversion, compare after.""" + source = resource("sample.ttl") + g_original = triple_handler.read(source, "turtle") + + with tempfile.NamedTemporaryFile(suffix=".ttl", delete=False) as f: + output = f.name + try: + triple_handler.convert(source, output, "turtle", "turtle") + g_roundtrip = triple_handler.read(output, "turtle") + assert g_original.isomorphic(g_roundtrip), ( + "Turtle round trip failed: graphs are not isomorphic" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_ntriples(): + """N-Triples -> N-Triples: read into IR before conversion, compare after.""" + source = resource("sample.nt") + g_original = triple_handler.read(source, "ntriples") + + with tempfile.NamedTemporaryFile(suffix=".nt", delete=False) as f: + output = f.name + try: + triple_handler.convert(source, output, "ntriples", "ntriples") + g_roundtrip = triple_handler.read(output, "ntriples") + assert g_original.isomorphic(g_roundtrip), ( + "N-Triples round trip failed: graphs are not isomorphic" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_rdf_xml(): + """RDF/XML -> RDF/XML: read into IR before conversion, compare after.""" + source = resource("sample.rdf") + g_original = triple_handler.read(source, "rdf-xml") + + with tempfile.NamedTemporaryFile(suffix=".rdf", delete=False) as f: + output = f.name + try: + triple_handler.convert(source, output, "rdf-xml", "rdf-xml") + g_roundtrip = triple_handler.read(output, "rdf-xml") + assert g_original.isomorphic(g_roundtrip), ( + "RDF/XML round trip failed: graphs are not isomorphic" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +# --------------------------------------------------------------------------- +# Quad format round trip tests (Layer 2) +# --------------------------------------------------------------------------- + +def _datasets_equal(d1, d2) -> bool: + """Check semantic equivalence of two Datasets. + + Compares total triple count, named graph identifiers, and + performs isomorphism check on each named graph to correctly + handle blank node renaming during serialization. + """ + if len(d1) != len(d2): + return False + + graphs1 = {str(g.identifier) for g in d1.graphs()} + graphs2 = {str(g.identifier) for g in d2.graphs()} + if graphs1 != graphs2: + return False + + # Compare triples inside each named graph using isomorphism + # to correctly handle blank nodes that may be renamed during + # serialization/deserialization + for g1 in d1.graphs(): + graph_id = str(g1.identifier) + g2 = d2.get_context(g1.identifier) + if g2 is None: + return False + if not g1.isomorphic(g2): + return False + + return True + + +def test_round_trip_nquads(): + """N-Quads -> N-Quads: read into IR before conversion, compare after.""" + source = resource("sample.nq") + d_original = quad_handler.read(source, "nquads") + + with tempfile.NamedTemporaryFile(suffix=".nq", delete=False) as f: + output = f.name + try: + quad_handler.convert(source, output, "nquads", "nquads") + d_roundtrip = quad_handler.read(output, "nquads") + assert _datasets_equal(d_original, d_roundtrip), ( + "N-Quads round trip failed: datasets are not equal" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_trig(): + """TriG -> TriG: read into IR before conversion, compare after.""" + source = resource("sample.trig") + d_original = quad_handler.read(source, "trig") + + with tempfile.NamedTemporaryFile(suffix=".trig", delete=False) as f: + output = f.name + try: + quad_handler.convert(source, output, "trig", "trig") + d_roundtrip = quad_handler.read(output, "trig") + assert _datasets_equal(d_original, d_roundtrip), ( + "TriG round trip failed: datasets are not equal" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_trix(): + """TriX -> TriX: read into IR before conversion, compare after.""" + source = resource("sample.trix") + d_original = quad_handler.read(source, "trix") + + with tempfile.NamedTemporaryFile(suffix=".trix", delete=False) as f: + output = f.name + try: + quad_handler.convert(source, output, "trix", "trix") + d_roundtrip = quad_handler.read(output, "trix") + assert _datasets_equal(d_original, d_roundtrip), ( + "TriX round trip failed: datasets are not equal" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_json_ld(): + """JSON-LD -> JSON-LD: read into IR before conversion, compare after.""" + source = resource("sample.jsonld") + d_original = quad_handler.read(source, "json-ld") + + with tempfile.NamedTemporaryFile(suffix=".jsonld", delete=False) as f: + output = f.name + try: + quad_handler.convert(source, output, "json-ld", "json-ld") + d_roundtrip = quad_handler.read(output, "json-ld") + assert _datasets_equal(d_original, d_roundtrip), ( + "JSON-LD round trip failed: datasets are not equal" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +# --------------------------------------------------------------------------- +# Tabular format round trip tests (Layer 2) +# --------------------------------------------------------------------------- + +def test_round_trip_csv(): + """CSV -> CSV: read into IR before conversion, compare after.""" + source = resource("sample.csv") + rows_original = tsd_handler.read(source, "csv") + + with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f: + output = f.name + try: + tsd_handler.convert(source, output, "csv", "csv") + rows_roundtrip = tsd_handler.read(output, "csv") + assert rows_original == rows_roundtrip, ( + "CSV round trip failed: rows do not match" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_tsv(): + """TSV -> TSV: read into IR before conversion, compare after.""" + source = resource("sample.tsv") + rows_original = tsd_handler.read(source, "tsv") + + with tempfile.NamedTemporaryFile(suffix=".tsv", delete=False) as f: + output = f.name + try: + tsd_handler.convert(source, output, "tsv", "tsv") + rows_roundtrip = tsd_handler.read(output, "tsv") + assert rows_original == rows_roundtrip, ( + "TSV round trip failed: rows do not match" + ) + finally: + if os.path.exists(output): + os.remove(output) \ No newline at end of file