From 7b4c635dd97300c11855db56a44856d1a90a5ff6 Mon Sep 17 00:00:00 2001
From: DhanashreePetare
Date: Fri, 5 Jun 2026 15:28:17 +0530
Subject: [PATCH 1/7] gsoc26: Layer 2 with tests initial commit

---
 databusclient/api/convert.py  | 383 ++++++++++++++++++++++++++++++++++
 databusclient/api/download.py |  77 +++++++
 databusclient/cli.py          |  11 +
 run_all_conversion_tests.py   | 338 ++++++++++++++++++++++++++++++
 tests/test_conversion.py      | 309 +++++++++++++++++++++++++++
 5 files changed, 1118 insertions(+)
 create mode 100644 databusclient/api/convert.py
 create mode 100644 run_all_conversion_tests.py
 create mode 100644 tests/test_conversion.py

diff --git a/databusclient/api/convert.py b/databusclient/api/convert.py
new file mode 100644
index 0000000..8d28fd8
--- /dev/null
+++ b/databusclient/api/convert.py
@@ -0,0 +1,383 @@
+"""Format and Mapping Conversion Layer.
+
+Layer 2: Within-class format conversion (lossless).
+Layer 3: Cross-class mapping conversion (quasi-equal for RDF <-> Tabular).
+"""
+
+import csv
+import json
+import os
+from typing import Optional
+
+from rdflib import Dataset, Graph
+
+
+# ---------------------------------------------------------------------------
+# Format registries
+# ---------------------------------------------------------------------------
+
+# Maps CLI format name -> rdflib format string
+RDF_TRIPLE_FORMATS = {
+    "ntriples": "ntriples",
+    "turtle": "turtle",
+    "rdf-xml": "xml",
+}
+
+RDF_QUAD_FORMATS = {
+    "nquads": "nquads",
+    "trig": "trig",
+    "trix": "trix",
+    "json-ld": "json-ld",
+}
+
+TABULAR_FORMATS = {
+    "csv": ",",
+    "tsv": "\t",
+}
+
+ALL_FORMATS = (
+    list(RDF_TRIPLE_FORMATS)
+    + list(RDF_QUAD_FORMATS)
+    + list(TABULAR_FORMATS)
+)
+
+# Maps file extension -> CLI format name
+EXTENSION_TO_FORMAT = {
+    ".ttl": "turtle",
+    ".nt": "ntriples",
+    ".rdf": "rdf-xml",
+    ".xml": "rdf-xml",
+    ".owl": "rdf-xml",
+    ".nq": "nquads",
+    ".trig": "trig",
+    ".trix": "trix",
+    ".jsonld": "json-ld",
+    ".json": "json-ld",
+    ".csv": "csv",
+    ".tsv": "tsv",
+}
+
+
+# ---------------------------------------------------------------------------
+# Format detection
+# ---------------------------------------------------------------------------
+
+def detect_format_from_filename(filename: str) -> Optional[str]:
+    """Detect format from file extension, ignoring compression extensions.
+
+    Args:
+        filename: File name or path.
+
+    Returns:
+        Format name string or None if not detectable.
+    """
+    name = filename.lower()
+
+    # strip compression extension first
+    for ext in (".bz2", ".gz", ".xz"):
+        if name.endswith(ext):
+            name = name[: -len(ext)]
+            break
+
+    # match longest extension first to avoid .json matching before .jsonld
+    for ext in sorted(EXTENSION_TO_FORMAT.keys(), key=len, reverse=True):
+        if name.endswith(ext):
+            return EXTENSION_TO_FORMAT[ext]
+
+    return None
+
+
+def get_format_class(fmt: str) -> str:
+    """Return equivalence class for a format name.
+
+    Args:
+        fmt: Format name (e.g. 'turtle', 'nquads', 'csv').
+
+    Returns:
+        'triples', 'quads', or 'tabular'.
+
+    Raises:
+        ValueError: If format is not recognised.
+    """
+    if fmt in RDF_TRIPLE_FORMATS:
+        return "triples"
+    if fmt in RDF_QUAD_FORMATS:
+        return "quads"
+    if fmt in TABULAR_FORMATS:
+        return "tabular"
+    raise ValueError(
+        f"Unknown format: '{fmt}'. Supported formats: {ALL_FORMATS}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Output filename helper
+# ---------------------------------------------------------------------------
+
+# Maps format name -> file extension
+FORMAT_TO_EXTENSION = {
+    "ntriples": ".nt",
+    "turtle": ".ttl",
+    "rdf-xml": ".rdf",
+    "nquads": ".nq",
+    "trig": ".trig",
+    "trix": ".trix",
+    "json-ld": ".jsonld",
+    "csv": ".csv",
+    "tsv": ".tsv",
+}
+
+
+def get_converted_filename(original_filename: str, convert_format: str) -> str:
+    """Generate output filename after format conversion.
+
+    Strips compression extension if present, then replaces any recognised
+    format extension (see EXTENSION_TO_FORMAT) with the target extension.
+
+    Args:
+        original_filename: Original file name (basename only, not full path).
+        convert_format: Target format name.
+
+    Returns:
+        New filename with updated extension.
+    """
+    name = original_filename
+
+    # strip compression extension
+    for ext in (".bz2", ".gz", ".xz"):
+        if name.lower().endswith(ext):
+            name = name[: -len(ext)]
+            break
+
+    # strip existing format extension (any detectable one, e.g. .owl, .json)
+    for old_ext in sorted(EXTENSION_TO_FORMAT, key=len, reverse=True):
+        if name.lower().endswith(old_ext):
+            name = name[: -len(old_ext)]
+            break
+
+    target_ext = FORMAT_TO_EXTENSION.get(convert_format, f".{convert_format}")
+    return name + target_ext
+
+
+# ---------------------------------------------------------------------------
+# Layer 2 — within-class format conversion
+# ---------------------------------------------------------------------------
+
+def convert_rdf_triple_format(
+    input_file: str,
+    output_file: str,
+    input_format: str,
+    output_format: str,
+) -> None:
+    """Convert between RDF triple serialization formats (Layer 2).
+
+    Handles: ntriples, turtle, rdf-xml.
+    Uses rdflib Graph as internal representation.
+
+    Args:
+        input_file: Path to input file.
+        output_file: Path to write converted output.
+        input_format: Source format name (must be in RDF_TRIPLE_FORMATS).
+        output_format: Target format name (must be in RDF_TRIPLE_FORMATS).
+    """
+    g = Graph()
+    g.parse(input_file, format=RDF_TRIPLE_FORMATS[input_format])
+    g.serialize(destination=output_file, format=RDF_TRIPLE_FORMATS[output_format])
+    print(
+        f"Converted {input_format} -> {output_format}: {os.path.basename(output_file)}"
+    )
+
+
+def convert_rdf_quad_format(
+    input_file: str,
+    output_file: str,
+    input_format: str,
+    output_format: str,
+) -> None:
+    """Convert between RDF quad serialization formats (Layer 2).
+
+    Handles: nquads, trig, trix, json-ld.
+    Uses rdflib Dataset as internal representation
+    to preserve named graph information.
+
+    Args:
+        input_file: Path to input file.
+        output_file: Path to write converted output.
+        input_format: Source format name (must be in RDF_QUAD_FORMATS).
+        output_format: Target format name (must be in RDF_QUAD_FORMATS).
+    """
+    g = Dataset()
+    g.parse(input_file, format=RDF_QUAD_FORMATS[input_format])
+    g.serialize(destination=output_file, format=RDF_QUAD_FORMATS[output_format])
+    print(
+        f"Converted {input_format} -> {output_format}: {os.path.basename(output_file)}"
+    )
+
+
+def convert_tabular_format(
+    input_file: str,
+    output_file: str,
+    input_format: str,
+    output_format: str,
+) -> None:
+    """Convert between tabular formats (Layer 2).
+
+    Handles: csv <-> tsv.
+    Uses Python built-in csv module.
+
+    Args:
+        input_file: Path to input file.
+        output_file: Path to write converted output.
+        input_format: Source format name ('csv' or 'tsv').
+        output_format: Target format name ('csv' or 'tsv').
+    """
+    input_delimiter = TABULAR_FORMATS[input_format]
+    output_delimiter = TABULAR_FORMATS[output_format]
+
+    with open(input_file, "r", newline="", encoding="utf-8") as infile:
+        reader = csv.reader(infile, delimiter=input_delimiter)
+        rows = list(reader)
+
+    with open(output_file, "w", newline="", encoding="utf-8") as outfile:
+        writer = csv.writer(outfile, delimiter=output_delimiter)
+        writer.writerows(rows)
+
+    print(
+        f"Converted {input_format} -> {output_format}: {os.path.basename(output_file)}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Layer 3 — cross-class mapping conversion
+# ---------------------------------------------------------------------------
+
+def convert_rdf_to_csv(
+    input_file: str,
+    output_file: str,
+    input_format: str,
+    output_format: str = "csv",
+) -> None:
+    """Map RDF triples to a wide CSV/TSV table (Layer 3).
+
+    Each unique subject becomes a row. Each unique predicate becomes a column.
+    Multi-valued predicates are pipe-separated.
+    A companion .meta.json file is generated alongside the table to record
+    the RDF datatype or language tag last seen for each predicate column.
+
+    Args:
+        input_file: Path to input RDF triples file.
+        output_file: Path to write the output table.
+        input_format: Source triple format name (must be in RDF_TRIPLE_FORMATS).
+        output_format: Target tabular format ('csv' or 'tsv'); picks the delimiter.
+    """
+    delimiter = TABULAR_FORMATS[output_format]
+
+    g = Graph()
+    g.parse(input_file, format=RDF_TRIPLE_FORMATS[input_format])
+
+    predicates = sorted(set(str(p) for s, p, o in g))
+
+    subjects: dict = {}
+    column_metadata: dict = {}
+
+    for s, p, o in g:
+        subj = str(s)
+        pred = str(p)
+
+        # capture datatype or language tag for companion file
+        if hasattr(o, "datatype") and o.datatype:
+            column_metadata[pred] = {"datatype": str(o.datatype)}
+        elif hasattr(o, "language") and o.language:
+            column_metadata[pred] = {"language": str(o.language)}
+
+        subjects.setdefault(subj, {}).setdefault(pred, []).append(str(o))
+
+    with open(output_file, "w", newline="", encoding="utf-8") as f:
+        writer = csv.writer(f, delimiter=delimiter)
+        writer.writerow(["resource"] + predicates)
+        for subj, pred_map in subjects.items():
+            row = [subj]
+            for pred in predicates:
+                values = pred_map.get(pred, [])
+                row.append("|".join(values))
+            writer.writerow(row)
+
+    companion_file = output_file + ".meta.json"
+    with open(companion_file, "w", encoding="utf-8") as f:
+        json.dump({"columns": column_metadata}, f, indent=2)
+
+    print(f"Converted RDF -> {output_format.upper()}: {os.path.basename(output_file)}")
+    print(f"Companion metadata: {os.path.basename(companion_file)}")
+
+
+# ---------------------------------------------------------------------------
+# Main dispatcher — called from download pipeline
+# ---------------------------------------------------------------------------
+
+def convert_file(
+    input_file: str,
+    output_file: str,
+    convert_format: str,
+) -> None:
+    """Main conversion dispatcher called from the download pipeline.
+
+    Detects the input format from the file extension, determines whether
+    this is a Layer 2 (within-class) or Layer 3 (cross-class) conversion,
+    and delegates to the appropriate conversion function.
+
+    For Layer 2: lossless, same equivalence class.
+    For Layer 3: quasi-equal for RDF <-> Tabular, lossless for Triples <-> Quads.
+
+    Args:
+        input_file: Path to the input file (must be decompressed).
+        output_file: Path to write the converted output file.
+        convert_format: Target format name (CLI format string).
+
+    Raises:
+        ValueError: If the input format cannot be detected or if the
+            requested conversion is not supported.
+    """
+    input_format = detect_format_from_filename(input_file)
+
+    if input_format is None:
+        raise ValueError(
+            f"Could not detect input format from filename: '{os.path.basename(input_file)}'. "
+            f"Supported extensions: {list(EXTENSION_TO_FORMAT.keys())}"
+        )
+
+    if input_format == convert_format:
+        print(
+            f"WARNING: Input and target format are both '{input_format}'. "
+            "Skipping conversion."
+        )
+        return
+
+    input_class = get_format_class(input_format)
+    output_class = get_format_class(convert_format)
+
+    # --- Layer 2: within-class ---
+    if input_class == output_class:
+        if input_class == "triples":
+            convert_rdf_triple_format(
+                input_file, output_file, input_format, convert_format
+            )
+        elif input_class == "quads":
+            convert_rdf_quad_format(
+                input_file, output_file, input_format, convert_format
+            )
+        elif input_class == "tabular":
+            convert_tabular_format(
+                input_file, output_file, input_format, convert_format
+            )
+        return
+
+    # --- Layer 3: cross-class ---
+    if input_class == "triples" and output_class == "tabular":
+        convert_rdf_to_csv(input_file, output_file, input_format, convert_format)
+        return
+
+    raise ValueError(
+        f"Conversion from '{input_format}' ({input_class}) to "
+        f"'{convert_format}' ({output_class}) is not yet implemented. "
+        f"Supported Layer 3 conversions: RDF Triples -> CSV/TSV."
+    )
diff --git a/databusclient/api/download.py b/databusclient/api/download.py
index 312af45..74bd2fd 100644
--- a/databusclient/api/download.py
+++ b/databusclient/api/download.py
@@ -16,6 +16,7 @@
     get_databus_id_parts_from_file_url,
     compute_sha256_and_length,
 )
+from databusclient.api.convert import convert_file, get_converted_filename
 
 # Compression format mappings
 COMPRESSION_EXTENSIONS = {
@@ -313,6 +314,7 @@ def _download_file(
     client_id=None,
     convert_to=None,
     convert_from=None,
+    convert_format=None,
     validate_checksum: bool = False,
     expected_checksum: str | None = None,
 ) -> None:
@@ -327,6 +329,7 @@ def _download_file(
         client_id: Client ID for token exchange.
         convert_to: Target compression format for on-the-fly conversion.
         convert_from: Optional source compression format filter.
+        convert_format: Target RDF/tabular format for on-the-fly conversion.
         validate_checksum: Whether to validate checksums after downloading.
         expected_checksum: The expected checksum of the file.
     """
@@ -507,12 +510,63 @@ def _download_file(
 
     # --- 7. Convert compression format if requested (AFTER validation) ---
     should_convert, source_format = _should_convert_file(file, convert_to, convert_from)
+    final_downloaded_file = filename
     if should_convert and source_format:
         target_filename = _get_converted_filename(file, source_format, convert_to)
         target_filepath = os.path.join(localDir, target_filename)
         _convert_compression_format(
             filename, target_filepath, source_format, convert_to
         )
+        final_downloaded_file = target_filepath
+
+    # --- 8. Convert file format if requested (AFTER compression conversion) ---
+    if convert_format:
+        final_basename = os.path.basename(final_downloaded_file)
+        compression_fmt = _detect_compression_format(final_basename)
+
+        if compression_fmt:
+            # File is still compressed — decompress to a temp file first,
+            # then convert format, then clean up the temp file.
+ # This follows the pipeline: Download -> Decompress -> Convert -> Save + import tempfile + + source_module = COMPRESSION_MODULES[compression_fmt] + # temp decompressed file sits next to the original + compression_ext = COMPRESSION_EXTENSIONS[compression_fmt] + if final_downloaded_file.lower().endswith(compression_ext): + temp_decompressed = final_downloaded_file[:-len(compression_ext)] + else: + temp_decompressed = final_downloaded_file + ".decompressed" + + try: + print( + f"Decompressing {final_basename} before format conversion..." + ) + with source_module.open(final_downloaded_file, "rb") as sf: + with open(temp_decompressed, "wb") as tf: + while True: + chunk = sf.read(8192) + if not chunk: + break + tf.write(chunk) + + # now convert the decompressed temp file + converted_filename = get_converted_filename( + final_basename, convert_format + ) + converted_filepath = os.path.join(localDir, converted_filename) + convert_file(temp_decompressed, converted_filepath, convert_format) + + finally: + # always clean up temp file even if conversion fails + if os.path.exists(temp_decompressed): + os.remove(temp_decompressed) + + else: + # file is already uncompressed — convert directly + converted_filename = get_converted_filename(final_basename, convert_format) + converted_filepath = os.path.join(localDir, converted_filename) + convert_file(final_downloaded_file, converted_filepath, convert_format) def _download_files( @@ -524,6 +578,7 @@ def _download_files( client_id: str = None, convert_to: str = None, convert_from: str = None, + convert_format: str = None, validate_checksum: bool = False, checksums: dict | None = None, ) -> None: @@ -538,6 +593,7 @@ def _download_files( client_id: Client ID for token exchange. convert_to: Target compression format for on-the-fly conversion. convert_from: Optional source compression format filter. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. 
checksums: Dictionary mapping URLs to their expected checksums. """ @@ -554,6 +610,7 @@ def _download_files( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, expected_checksum=expected, ) @@ -702,6 +759,7 @@ def _download_collection( client_id: str = None, convert_to: str = None, convert_from: str = None, + convert_format: str = None, validate_checksum: bool = False, ) -> None: """Download all files in a databus collection. @@ -716,6 +774,7 @@ def _download_collection( client_id: Client ID for token exchange. convert_to: Target compression format for on-the-fly conversion. convert_from: Optional source compression format filter. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ query = _get_sparql_query_of_collection(uri, databus_key=databus_key) @@ -737,6 +796,7 @@ def _download_collection( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums if checksums else None, ) @@ -751,6 +811,7 @@ def _download_version( client_id: str = None, convert_to: str = None, convert_from: str = None, + convert_format: str = None, validate_checksum: bool = False, ) -> None: """Download all files in a databus artifact version. @@ -764,6 +825,7 @@ def _download_version( client_id: Client ID for token exchange. convert_to: Target compression format for on-the-fly conversion. convert_from: Optional source compression format filter. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. 
""" json_str = fetch_databus_jsonld(uri, databus_key=databus_key) @@ -784,6 +846,7 @@ def _download_version( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums, ) @@ -799,6 +862,7 @@ def _download_artifact( client_id: str = None, convert_to: str = None, convert_from: str = None, + convert_format: str = None, validate_checksum: bool = False, ) -> None: """Download files in a databus artifact. @@ -813,6 +877,7 @@ def _download_artifact( client_id: Client ID for token exchange. convert_to: Target compression format for on-the-fly conversion. convert_from: Optional source compression format filter. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) @@ -839,6 +904,7 @@ def _download_artifact( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums, ) @@ -915,6 +981,7 @@ def _download_group( client_id: str = None, convert_to: str = None, convert_from: str = None, + convert_format: str = None, validate_checksum: bool = False, ) -> None: """Download files in a databus group. @@ -929,6 +996,7 @@ def _download_group( client_id: Client ID for token exchange. convert_to: Target compression format for on-the-fly conversion. convert_from: Optional source compression format filter. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. 
""" json_str = fetch_databus_jsonld(uri, databus_key=databus_key) @@ -945,6 +1013,7 @@ def _download_group( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, ) @@ -994,6 +1063,7 @@ def download( client_id="vault-token-exchange", convert_to=None, convert_from=None, + convert_format=None, validate_checksum: bool = False, ) -> None: """Download datasets from databus. @@ -1010,6 +1080,7 @@ def download( client_id: Client ID for token exchange. Default is "vault-token-exchange". convert_to: Target compression format for on-the-fly conversion (supported: bz2, gz, xz). convert_from: Optional source compression format filter. + convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ for databusURI in databusURIs: @@ -1039,6 +1110,7 @@ def download( client_id, convert_to, convert_from, + convert_format, validate_checksum=validate_checksum, ) elif file is not None: @@ -1060,6 +1132,7 @@ def download( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, expected_checksum=expected, ) @@ -1074,6 +1147,7 @@ def download( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, ) elif artifact is not None: @@ -1090,6 +1164,7 @@ def download( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, ) elif group is not None and group != "collections": @@ -1106,6 +1181,7 @@ def download( client_id=client_id, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, ) elif account is not None: @@ -1144,6 +1220,7 @@ def download( client_id=client_id, convert_to=convert_to, convert_from=convert_from, 
+ convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums if checksums else None, ) diff --git a/databusclient/cli.py b/databusclient/cli.py index c3bd8f2..c687616 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -189,6 +189,15 @@ def deploy( type=click.Choice(["bz2", "gz", "xz"], case_sensitive=False), help="Source compression format to convert from (optional filter). Only files with this compression will be converted.", ) +@click.option( + "--convert-format", + "convert_format", + type=click.Choice( + ["ntriples","turtle","rdf-xml","nquads","trig","trix","json-ld","csv","tsv"], + case_sensitive=False, + ), + help="Target format for on-the-fly format conversion during download (Layer 2 and Layer 3).", +) @click.option( "--validate-checksum", is_flag=True, help="Validate checksums of downloaded files" ) @@ -203,6 +212,7 @@ def download( clientid, convert_to, convert_from, + convert_format, validate_checksum, ): """ @@ -221,6 +231,7 @@ def download( client_id=clientid, convert_to=convert_to, convert_from=convert_from, + convert_format=convert_format, validate_checksum=validate_checksum, ) except DownloadAuthError as e: diff --git a/run_all_conversion_tests.py b/run_all_conversion_tests.py new file mode 100644 index 0000000..384e052 --- /dev/null +++ b/run_all_conversion_tests.py @@ -0,0 +1,338 @@ +""" +Layer 2 Conversion Testing Script +Tests every conversion combination systematically. +Outputs go to test_outputs/ folder. +Test file for testing with real datasets from databus. 
+""" + +import os +from databusclient.api.convert import ( + convert_rdf_triple_format, + convert_rdf_quad_format, + convert_tabular_format, +) + +# --------------------------------------------------------------------------- +# Setup output folders +# --------------------------------------------------------------------------- + +folders = [ + "test_outputs/triples/T1_turtle_to_ntriples", + "test_outputs/triples/T2_turtle_to_rdfxml", + "test_outputs/triples/T3_ntriples_to_turtle", + "test_outputs/triples/T4_ntriples_to_rdfxml", + "test_outputs/triples/T5_rdfxml_to_turtle", + "test_outputs/triples/T6_rdfxml_to_ntriples", + "test_outputs/quads/Q1_nquads_to_trig", + "test_outputs/quads/Q2_nquads_to_trix", + "test_outputs/quads/Q3_nquads_to_jsonld", + "test_outputs/quads/Q4_trig_to_nquads", + "test_outputs/quads/Q5_trig_to_trix", + "test_outputs/quads/Q6_trig_to_jsonld", + "test_outputs/quads/Q7_trix_to_nquads", + "test_outputs/quads/Q8_trix_to_trig", + "test_outputs/quads/Q9_trix_to_jsonld", + "test_outputs/quads/Q10_jsonld_to_nquads", + "test_outputs/quads/Q11_jsonld_to_trig", + "test_outputs/quads/Q12_jsonld_to_trix", + "test_outputs/tabular/TAB1_csv_to_tsv", + "test_outputs/tabular/TAB2_tsv_to_csv", +] + +for folder in folders: + os.makedirs(folder, exist_ok=True) + +results = [] + + +def run_test(test_id, description, func, input_file, output_file, *args): + """Run one conversion test and record the result.""" + try: + func(input_file, output_file, *args) + size = os.path.getsize(output_file) + results.append(f"PASS {test_id}: {description} -> {os.path.basename(output_file)} ({size} bytes)") + return output_file + except Exception as e: + results.append(f"FAIL {test_id}: {description} -> ERROR: {e}") + return None + + +# --------------------------------------------------------------------------- +# GROUP 1: RDF Triple Format Conversions +# 6 combinations: each format -> every other format +# Base file: test_outputs/base/base.ttl (real DBpedia Turtle data) +# Chain: 
turtle -> ntriples -> rdfxml -> back to turtle +# --------------------------------------------------------------------------- + +print("\n=== GROUP 1: RDF TRIPLE FORMAT CONVERSIONS ===\n") + +BASE_TTL = "test_outputs/base/base.ttl" + +# T1: turtle -> ntriples (from base turtle file) +t1_out = "test_outputs/triples/T1_turtle_to_ntriples/output.nt" +run_test( + "T1", "turtle -> ntriples", + convert_rdf_triple_format, + BASE_TTL, t1_out, "turtle", "ntriples" +) + +# T2: turtle -> rdf-xml (from base turtle file) +t2_out = "test_outputs/triples/T2_turtle_to_rdfxml/output.rdf" +run_test( + "T2", "turtle -> rdf-xml", + convert_rdf_triple_format, + BASE_TTL, t2_out, "turtle", "rdf-xml" +) + +# T3: ntriples -> turtle (uses T1 output) +t3_out = "test_outputs/triples/T3_ntriples_to_turtle/output.ttl" +if t1_out and os.path.exists(t1_out): + run_test( + "T3", "ntriples -> turtle", + convert_rdf_triple_format, + t1_out, t3_out, "ntriples", "turtle" + ) +else: + results.append("SKIP T3: ntriples -> turtle (T1 output not available)") + +# T4: ntriples -> rdf-xml (uses T1 output) +t4_out = "test_outputs/triples/T4_ntriples_to_rdfxml/output.rdf" +if t1_out and os.path.exists(t1_out): + run_test( + "T4", "ntriples -> rdf-xml", + convert_rdf_triple_format, + t1_out, t4_out, "ntriples", "rdf-xml" + ) +else: + results.append("SKIP T4: ntriples -> rdf-xml (T1 output not available)") + +# T5: rdf-xml -> turtle (uses T2 output) +t5_out = "test_outputs/triples/T5_rdfxml_to_turtle/output.ttl" +if t2_out and os.path.exists(t2_out): + run_test( + "T5", "rdf-xml -> turtle", + convert_rdf_triple_format, + t2_out, t5_out, "rdf-xml", "turtle" + ) +else: + results.append("SKIP T5: rdf-xml -> turtle (T2 output not available)") + +# T6: rdf-xml -> ntriples (uses T2 output) +t6_out = "test_outputs/triples/T6_rdfxml_to_ntriples/output.nt" +if t2_out and os.path.exists(t2_out): + run_test( + "T6", "rdf-xml -> ntriples", + convert_rdf_triple_format, + t2_out, t6_out, "rdf-xml", "ntriples" + ) +else: + 
results.append("SKIP T6: rdf-xml -> ntriples (T2 output not available)") + + +# --------------------------------------------------------------------------- +# GROUP 2: RDF Quad Format Conversions +# 12 combinations: each of 4 formats -> every other format (4*3=12) +# Base file: test_outputs/base/base.nq +# Chain: nquads -> trig -> trix -> jsonld -> back to nquads +# --------------------------------------------------------------------------- + +print("\n=== GROUP 2: RDF QUAD FORMAT CONVERSIONS ===\n") + +BASE_NQ = "test_outputs/base/base.nq" + +# Q1: nquads -> trig +q1_out = "test_outputs/quads/Q1_nquads_to_trig/output.trig" +run_test( + "Q1", "nquads -> trig", + convert_rdf_quad_format, + BASE_NQ, q1_out, "nquads", "trig" +) + +# Q2: nquads -> trix +q2_out = "test_outputs/quads/Q2_nquads_to_trix/output.trix" +run_test( + "Q2", "nquads -> trix", + convert_rdf_quad_format, + BASE_NQ, q2_out, "nquads", "trix" +) + +# Q3: nquads -> json-ld +q3_out = "test_outputs/quads/Q3_nquads_to_jsonld/output.jsonld" +run_test( + "Q3", "nquads -> json-ld", + convert_rdf_quad_format, + BASE_NQ, q3_out, "nquads", "json-ld" +) + +# Q4: trig -> nquads (uses Q1 output) +q4_out = "test_outputs/quads/Q4_trig_to_nquads/output.nq" +if q1_out and os.path.exists(q1_out): + run_test( + "Q4", "trig -> nquads", + convert_rdf_quad_format, + q1_out, q4_out, "trig", "nquads" + ) +else: + results.append("SKIP Q4: trig -> nquads (Q1 output not available)") + +# Q5: trig -> trix (uses Q1 output) +q5_out = "test_outputs/quads/Q5_trig_to_trix/output.trix" +if q1_out and os.path.exists(q1_out): + run_test( + "Q5", "trig -> trix", + convert_rdf_quad_format, + q1_out, q5_out, "trig", "trix" + ) +else: + results.append("SKIP Q5: trig -> trix (Q1 output not available)") + +# Q6: trig -> json-ld (uses Q1 output) +q6_out = "test_outputs/quads/Q6_trig_to_jsonld/output.jsonld" +if q1_out and os.path.exists(q1_out): + run_test( + "Q6", "trig -> json-ld", + convert_rdf_quad_format, + q1_out, q6_out, "trig", 
"json-ld" + ) +else: + results.append("SKIP Q6: trig -> json-ld (Q1 output not available)") + +# Q7: trix -> nquads (uses Q2 output) +q7_out = "test_outputs/quads/Q7_trix_to_nquads/output.nq" +if q2_out and os.path.exists(q2_out): + run_test( + "Q7", "trix -> nquads", + convert_rdf_quad_format, + q2_out, q7_out, "trix", "nquads" + ) +else: + results.append("SKIP Q7: trix -> nquads (Q2 output not available)") + +# Q8: trix -> trig (uses Q2 output) +q8_out = "test_outputs/quads/Q8_trix_to_trig/output.trig" +if q2_out and os.path.exists(q2_out): + run_test( + "Q8", "trix -> trig", + convert_rdf_quad_format, + q2_out, q8_out, "trix", "trig" + ) +else: + results.append("SKIP Q8: trix -> trig (Q2 output not available)") + +# Q9: trix -> json-ld (uses Q2 output) +q9_out = "test_outputs/quads/Q9_trix_to_jsonld/output.jsonld" +if q2_out and os.path.exists(q2_out): + run_test( + "Q9", "trix -> json-ld", + convert_rdf_quad_format, + q2_out, q9_out, "trix", "json-ld" + ) +else: + results.append("SKIP Q9: trix -> json-ld (Q2 output not available)") + +# Q10: json-ld -> nquads (uses Q3 output) +q10_out = "test_outputs/quads/Q10_jsonld_to_nquads/output.nq" +if q3_out and os.path.exists(q3_out): + run_test( + "Q10", "json-ld -> nquads", + convert_rdf_quad_format, + q3_out, q10_out, "json-ld", "nquads" + ) +else: + results.append("SKIP Q10: json-ld -> nquads (Q3 output not available)") + +# Q11: json-ld -> trig (uses Q3 output) +q11_out = "test_outputs/quads/Q11_jsonld_to_trig/output.trig" +if q3_out and os.path.exists(q3_out): + run_test( + "Q11", "json-ld -> trig", + convert_rdf_quad_format, + q3_out, q11_out, "json-ld", "trig" + ) +else: + results.append("SKIP Q11: json-ld -> trig (Q3 output not available)") + +# Q12: json-ld -> trix (uses Q3 output) +q12_out = "test_outputs/quads/Q12_jsonld_to_trix/output.trix" +if q3_out and os.path.exists(q3_out): + run_test( + "Q12", "json-ld -> trix", + convert_rdf_quad_format, + q3_out, q12_out, "json-ld", "trix" + ) +else: + 
results.append("SKIP Q12: json-ld -> trix (Q3 output not available)") + + +# --------------------------------------------------------------------------- +# GROUP 3: Tabular Format Conversions +# 2 combinations: csv->tsv and tsv->csv +# --------------------------------------------------------------------------- + +print("\n=== GROUP 3: TABULAR FORMAT CONVERSIONS ===\n") + +BASE_CSV = "test_outputs/base/base.csv" +BASE_TSV = "test_outputs/base/base.tsv" + +# TAB1: csv -> tsv +tab1_out = "test_outputs/tabular/TAB1_csv_to_tsv/output.tsv" +run_test( + "TAB1", "csv -> tsv", + convert_tabular_format, + BASE_CSV, tab1_out, "csv", "tsv" +) + +# TAB2: tsv -> csv (uses TAB1 output) +tab2_out = "test_outputs/tabular/TAB2_tsv_to_csv/output.csv" +if tab1_out and os.path.exists(tab1_out): + run_test( + "TAB2", "tsv -> csv", + convert_tabular_format, + tab1_out, tab2_out, "tsv", "csv" + ) +else: + results.append("SKIP TAB2: tsv -> csv (TAB1 output not available)") + + +# --------------------------------------------------------------------------- +# GROUP 4: CLI End-to-End Tests (compressed real Databus file) +# These test the full pipeline including download.py wiring +# --------------------------------------------------------------------------- + +print("\n=== GROUP 4: CLI END-TO-END (run these manually) ===\n") +cli_tests = [ + "CLI1: turtle->ntriples from compressed Databus file", + " poetry run databusclient download \"https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=cy.ttl.bz2\" --convert-format ntriples --localdir ./test_outputs/cli/CLI1", + "", + "CLI2: turtle->rdf-xml from compressed Databus file", + " poetry run databusclient download \"https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=cy.ttl.bz2\" --convert-format rdf-xml --localdir ./test_outputs/cli/CLI2", + "", + "CLI3: turtle->ntriples + compression bz2->gz", + " poetry run databusclient download 
\"https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=cy.ttl.bz2\" --convert-format ntriples --convert-to gz --localdir ./test_outputs/cli/CLI3", + "", + "CLI4: turtle->ntriples + compression bz2->xz", + " poetry run databusclient download \"https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=cy.ttl.bz2\" --convert-format ntriples --convert-to xz --localdir ./test_outputs/cli/CLI4", + "", + "CLI5: unsupported cross-class error (expect ValueError)", + " poetry run databusclient download \"https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=cy.ttl.bz2\" --convert-format nquads --localdir ./test_outputs/cli/CLI5", +] +for line in cli_tests: + print(line) + + +# --------------------------------------------------------------------------- +# Print summary +# --------------------------------------------------------------------------- + +print("\n" + "="*60) +print("LAYER 2 CONVERSION TEST SUMMARY") +print("="*60) +for result in results: + print(result) + +passed = sum(1 for r in results if r.startswith("PASS")) +failed = sum(1 for r in results if r.startswith("FAIL")) +skipped = sum(1 for r in results if r.startswith("SKIP")) + +print(f"\nTotal: {passed} passed, {failed} failed, {skipped} skipped") +print("="*60) \ No newline at end of file diff --git a/tests/test_conversion.py b/tests/test_conversion.py new file mode 100644 index 0000000..7ce710c --- /dev/null +++ b/tests/test_conversion.py @@ -0,0 +1,309 @@ +"""Round trip tests for Layer 2 format conversion. + +Following the strategy from Frey et al., each test validates that +reading a format and writing it back produces semantically identical output. +Pattern: parse(format X) -> serialize(format X) -> parse again -> compare. 
+ +9 tests total: +- Triple formats: ntriples, turtle, rdf-xml (3 tests) +- Quad formats: nquads, trig, trix, json-ld (4 tests) +- Tabular formats: csv, tsv (2 tests) +""" + +import csv +import os +import tempfile + +from rdflib import Dataset, Graph + +from databusclient.api.convert import ( + convert_rdf_quad_format, + convert_rdf_triple_format, + convert_tabular_format, +) + +# --------------------------------------------------------------------------- +# Sample RDF data used across all RDF tests +# --------------------------------------------------------------------------- + +SAMPLE_TURTLE = """ +@prefix ex: . +@prefix schema: . +@prefix xsd: . + +ex:Paris schema:isCapitalOf ex:France ; + schema:population "2161000"^^xsd:integer . + +ex:Berlin schema:isCapitalOf ex:Germany ; + schema:population "3645000"^^xsd:integer . +""" + +SAMPLE_NQUADS = """ + . + . + . +""" + +SAMPLE_CSV = """resource,name,population +http://example.org/Paris,Paris,2161000 +http://example.org/Berlin,Berlin,3645000 +""" + +SAMPLE_TSV = "resource\tname\tpopulation\nhttp://example.org/Paris\tParis\t2161000\nhttp://example.org/Berlin\tBerlin\t3645000\n" + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _write_temp(content: str, suffix: str) -> str: + """Write content to a named temp file and return its path.""" + fd, path = tempfile.mkstemp(suffix=suffix) + with os.fdopen(fd, "w", encoding="utf-8") as f: + f.write(content) + return path + + +def _graphs_are_isomorphic(g1: Graph, g2: Graph) -> bool: + """Check semantic equivalence of two rdflib Graphs.""" + return g1.isomorphic(g2) + + +def _datasets_equal(g1: Dataset, g2: Dataset) -> bool: + """Check semantic equivalence of two Datasets by triple count and graph names.""" + if len(g1) != len(g2): + return False + graphs1 = {str(c.identifier) for c in g1.graphs()} + graphs2 = {str(c.identifier) for c in g2.graphs()} 
+ return graphs1 == graphs2 + + +# --------------------------------------------------------------------------- +# Triple format round trip tests (Layer 2) +# --------------------------------------------------------------------------- + +def test_round_trip_turtle(): + """Turtle -> Turtle: parse, serialize, reparse, compare.""" + input_path = _write_temp(SAMPLE_TURTLE, ".ttl") + output_path = input_path + ".rt.ttl" + + try: + convert_rdf_triple_format(input_path, output_path, "turtle", "turtle") + + g_original = Graph() + g_original.parse(input_path, format="turtle") + + g_roundtrip = Graph() + g_roundtrip.parse(output_path, format="turtle") + + assert _graphs_are_isomorphic(g_original, g_roundtrip), ( + "Turtle round trip failed: graphs are not isomorphic" + ) + finally: + for p in (input_path, output_path): + if os.path.exists(p): + os.remove(p) + + +def test_round_trip_ntriples(): + """N-Triples -> N-Triples: parse, serialize, reparse, compare.""" + # first produce an ntriples file from turtle + turtle_path = _write_temp(SAMPLE_TURTLE, ".ttl") + nt_path = turtle_path + ".nt" + output_path = nt_path + ".rt.nt" + + try: + convert_rdf_triple_format(turtle_path, nt_path, "turtle", "ntriples") + convert_rdf_triple_format(nt_path, output_path, "ntriples", "ntriples") + + g_original = Graph() + g_original.parse(nt_path, format="ntriples") + + g_roundtrip = Graph() + g_roundtrip.parse(output_path, format="ntriples") + + assert _graphs_are_isomorphic(g_original, g_roundtrip), ( + "N-Triples round trip failed: graphs are not isomorphic" + ) + finally: + for p in (turtle_path, nt_path, output_path): + if os.path.exists(p): + os.remove(p) + + +def test_round_trip_rdf_xml(): + """RDF/XML -> RDF/XML: parse, serialize, reparse, compare.""" + turtle_path = _write_temp(SAMPLE_TURTLE, ".ttl") + rdf_path = turtle_path + ".rdf" + output_path = rdf_path + ".rt.rdf" + + try: + convert_rdf_triple_format(turtle_path, rdf_path, "turtle", "rdf-xml") + convert_rdf_triple_format(rdf_path, 
output_path, "rdf-xml", "rdf-xml") + + g_original = Graph() + g_original.parse(rdf_path, format="xml") + + g_roundtrip = Graph() + g_roundtrip.parse(output_path, format="xml") + + assert _graphs_are_isomorphic(g_original, g_roundtrip), ( + "RDF/XML round trip failed: graphs are not isomorphic" + ) + finally: + for p in (turtle_path, rdf_path, output_path): + if os.path.exists(p): + os.remove(p) + + +# --------------------------------------------------------------------------- +# Quad format round trip tests (Layer 2) +# --------------------------------------------------------------------------- + +def test_round_trip_nquads(): + """N-Quads -> N-Quads: parse, serialize, reparse, compare.""" + input_path = _write_temp(SAMPLE_NQUADS, ".nq") + output_path = input_path + ".rt.nq" + + try: + convert_rdf_quad_format(input_path, output_path, "nquads", "nquads") + + g_original = Dataset() + g_original.parse(input_path, format="nquads") + + g_roundtrip = Dataset() + g_roundtrip.parse(output_path, format="nquads") + + assert _datasets_equal(g_original, g_roundtrip), ( + "N-Quads round trip failed: graphs are not equal" + ) + finally: + for p in (input_path, output_path): + if os.path.exists(p): + os.remove(p) + + +def test_round_trip_trig(): + """TriG -> TriG: parse, serialize, reparse, compare.""" + # produce trig from nquads + nq_path = _write_temp(SAMPLE_NQUADS, ".nq") + trig_path = nq_path + ".trig" + output_path = trig_path + ".rt.trig" + + try: + convert_rdf_quad_format(nq_path, trig_path, "nquads", "trig") + convert_rdf_quad_format(trig_path, output_path, "trig", "trig") + + g_original = Dataset() + g_original.parse(trig_path, format="trig") + + g_roundtrip = Dataset() + g_roundtrip.parse(output_path, format="trig") + + assert _datasets_equal(g_original, g_roundtrip), ( + "TriG round trip failed: graphs are not equal" + ) + finally: + for p in (nq_path, trig_path, output_path): + if os.path.exists(p): + os.remove(p) + + +def test_round_trip_trix(): + """TriX -> TriX: 
parse, serialize, reparse, compare.""" + nq_path = _write_temp(SAMPLE_NQUADS, ".nq") + trix_path = nq_path + ".trix" + output_path = trix_path + ".rt.trix" + + try: + convert_rdf_quad_format(nq_path, trix_path, "nquads", "trix") + convert_rdf_quad_format(trix_path, output_path, "trix", "trix") + + g_original = Dataset() + g_original.parse(trix_path, format="trix") + + g_roundtrip = Dataset() + g_roundtrip.parse(output_path, format="trix") + + assert _datasets_equal(g_original, g_roundtrip), ( + "TriX round trip failed: graphs are not equal" + ) + finally: + for p in (nq_path, trix_path, output_path): + if os.path.exists(p): + os.remove(p) + + +def test_round_trip_json_ld(): + """JSON-LD -> JSON-LD: parse, serialize, reparse, compare.""" + nq_path = _write_temp(SAMPLE_NQUADS, ".nq") + jsonld_path = nq_path + ".jsonld" + output_path = jsonld_path + ".rt.jsonld" + + try: + convert_rdf_quad_format(nq_path, jsonld_path, "nquads", "json-ld") + convert_rdf_quad_format(jsonld_path, output_path, "json-ld", "json-ld") + + g_original = Dataset() + g_original.parse(jsonld_path, format="json-ld") + + g_roundtrip = Dataset() + g_roundtrip.parse(output_path, format="json-ld") + + assert _datasets_equal(g_original, g_roundtrip), ( + "JSON-LD round trip failed: graphs are not equal" + ) + finally: + for p in (nq_path, jsonld_path, output_path): + if os.path.exists(p): + os.remove(p) + + +# --------------------------------------------------------------------------- +# Tabular format round trip tests (Layer 2) +# --------------------------------------------------------------------------- + +def test_round_trip_csv(): + """CSV -> CSV: read, write, reread, compare rows.""" + input_path = _write_temp(SAMPLE_CSV, ".csv") + output_path = input_path + ".rt.csv" + + try: + convert_tabular_format(input_path, output_path, "csv", "csv") + + with open(input_path, newline="", encoding="utf-8") as f: + original_rows = list(csv.reader(f)) + + with open(output_path, newline="", encoding="utf-8") as 
f: + roundtrip_rows = list(csv.reader(f)) + + assert original_rows == roundtrip_rows, ( + "CSV round trip failed: rows do not match" + ) + finally: + for p in (input_path, output_path): + if os.path.exists(p): + os.remove(p) + + +def test_round_trip_tsv(): + """TSV -> TSV: read, write, reread, compare rows.""" + input_path = _write_temp(SAMPLE_TSV, ".tsv") + output_path = input_path + ".rt.tsv" + + try: + convert_tabular_format(input_path, output_path, "tsv", "tsv") + + with open(input_path, newline="", encoding="utf-8") as f: + original_rows = list(csv.reader(f, delimiter="\t")) + + with open(output_path, newline="", encoding="utf-8") as f: + roundtrip_rows = list(csv.reader(f, delimiter="\t")) + + assert original_rows == roundtrip_rows, ( + "TSV round trip failed: rows do not match" + ) + finally: + for p in (input_path, output_path): + if os.path.exists(p): + os.remove(p) \ No newline at end of file From 5d224f691897580e52bab5f82f26639a1ce69f6b Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Sat, 6 Jun 2026 19:53:50 +0530 Subject: [PATCH 2/7] gsoc26: Refactor Layer 2 with handler architecture and improved tests --- databusclient/api/convert.py | 410 ++++++++++++++++++++++--------- databusclient/api/download.py | 46 +++- pyproject.toml | 6 + tests/resources/sample.csv | 11 + tests/resources/sample.jsonld | 62 +++++ tests/resources/sample.nq | 10 + tests/resources/sample.nt | 10 + tests/resources/sample.rdf | 30 +++ tests/resources/sample.trig | 22 ++ tests/resources/sample.trix | 72 ++++++ tests/resources/sample.tsv | 11 + tests/resources/sample.ttl | 18 ++ tests/test_conversion.py | 309 ----------------------- tests/test_format_round_trips.py | 256 +++++++++++++++++++ 14 files changed, 837 insertions(+), 436 deletions(-) create mode 100644 tests/resources/sample.csv create mode 100644 tests/resources/sample.jsonld create mode 100644 tests/resources/sample.nq create mode 100644 tests/resources/sample.nt create mode 100644 tests/resources/sample.rdf create 
mode 100644 tests/resources/sample.trig create mode 100644 tests/resources/sample.trix create mode 100644 tests/resources/sample.tsv create mode 100644 tests/resources/sample.ttl delete mode 100644 tests/test_conversion.py create mode 100644 tests/test_format_round_trips.py diff --git a/databusclient/api/convert.py b/databusclient/api/convert.py index 8d28fd8..8bd6dbb 100644 --- a/databusclient/api/convert.py +++ b/databusclient/api/convert.py @@ -1,14 +1,33 @@ """Format and Mapping Conversion Layer. +This module implements the format conversion pipeline for the Databus Python Client + Layer 2: Within-class format conversion (lossless). -Layer 3: Cross-class mapping conversion (quasi-equal for RDF <-> Tabular). + - TripleHandler: RDF triple formats (turtle, ntriples, rdf-xml) + - QuadHandler: RDF quad formats (nquads, trig, trix, json-ld) + - TSDHandler: Tabular formats (csv, tsv) + +Layer 3 (prototype, not yet fully implemented): + - RDF triples -> CSV/TSV (quasi-equal, companion metadata generated) + +Each handler provides read() -> IR, write(IR) -> file, convert() -> chains both. +The IR (intermediate representation) returned by read() is designed to be passed +to future mapping classes (TripleToQuadMapper, TripleToTSDMapper, etc.). """ import csv import json import os +import warnings from typing import Optional +# Suppress rdflib internal DeprecationWarning for Dataset API. +# rdflib is mid-migration from ConjunctiveGraph to Dataset in 7.x. +# These warnings originate from rdflib internals, not our code. +# Can be removed when rdflib completes their Dataset API migration. 
+warnings.filterwarnings("ignore", category=DeprecationWarning, module="rdflib") +warnings.filterwarnings("ignore", category=UserWarning, module="rdflib") + from rdflib import Dataset, Graph @@ -57,9 +76,22 @@ ".tsv": "tsv", } +# Maps format name -> file extension +FORMAT_TO_EXTENSION = { + "ntriples": ".nt", + "turtle": ".ttl", + "rdf-xml": ".rdf", + "nquads": ".nq", + "trig": ".trig", + "trix": ".trix", + "json-ld": ".jsonld", + "csv": ".csv", + "tsv": ".tsv", +} + # --------------------------------------------------------------------------- -# Format detection +# Format detection helpers # --------------------------------------------------------------------------- def detect_format_from_filename(filename: str) -> Optional[str]: @@ -110,24 +142,6 @@ def get_format_class(fmt: str) -> str: ) -# --------------------------------------------------------------------------- -# Output filename helper -# --------------------------------------------------------------------------- - -# Maps format name -> file extension -FORMAT_TO_EXTENSION = { - "ntriples": ".nt", - "turtle": ".ttl", - "rdf-xml": ".rdf", - "nquads": ".nq", - "trig": ".trig", - "trix": ".trix", - "json-ld": ".jsonld", - "csv": ".csv", - "tsv": ".tsv", -} - - def get_converted_filename(original_filename: str, convert_format: str) -> str: """Generate output filename after format conversion. 
@@ -149,7 +163,7 @@ def get_converted_filename(original_filename: str, convert_format: str) -> str: name = name[: -len(ext)] break - # strip existing format extension + # strip existing format extension (longest first) for old_ext in sorted(FORMAT_TO_EXTENSION.values(), key=len, reverse=True): if name.lower().endswith(old_ext): name = name[: -len(old_ext)] @@ -160,95 +174,254 @@ def get_converted_filename(original_filename: str, convert_format: str) -> str: # --------------------------------------------------------------------------- -# Layer 2 — within-class format conversion +# Layer 2 Handlers # --------------------------------------------------------------------------- -def convert_rdf_triple_format( - input_file: str, - output_file: str, - input_format: str, - output_format: str, -) -> None: - """Convert between RDF triple serialization formats (Layer 2). +class TripleHandler: + """Handler for RDF triple formats (Layer 2). - Handles: ntriples, turtle, rdf-xml. - Uses rdflib Graph as internal representation. + Uses rdflib.Graph as the intermediate representation (IR). + Supports: ntriples, turtle, rdf-xml. - Args: - input_file: Path to input file. - output_file: Path to write converted output. - input_format: Source format name (must be in RDF_TRIPLE_FORMATS). - output_format: Target format name (must be in RDF_TRIPLE_FORMATS). + The IR returned by read() can be passed to future mapping classes + such as TripleToQuadMapper or TripleToTSDMapper for Layer 3 conversions. """ - g = Graph() - g.parse(input_file, format=RDF_TRIPLE_FORMATS[input_format]) - g.serialize(destination=output_file, format=RDF_TRIPLE_FORMATS[output_format]) - print( - f"Converted {input_format} -> {output_format}: {os.path.basename(output_file)}" - ) + def read(self, source: str, input_format: str) -> Graph: + """Parse an RDF triples file into a Graph (IR). 
-def convert_rdf_quad_format( - input_file: str, - output_file: str, - input_format: str, - output_format: str, -) -> None: - """Convert between RDF quad serialization formats (Layer 2). + Args: + source: Path to input file. + input_format: Source format name (e.g. 'turtle', 'ntriples', 'rdf-xml'). - Handles: nquads, trig, trix, json-ld. - Uses rdflib Dataset as internal representation - to preserve named graph information. + Returns: + rdflib.Graph containing all parsed triples. - Args: - input_file: Path to input file. - output_file: Path to write converted output. - input_format: Source format name (must be in RDF_QUAD_FORMATS). - output_format: Target format name (must be in RDF_QUAD_FORMATS). + Raises: + ValueError: If input_format is not a recognised triple format. + """ + if input_format not in RDF_TRIPLE_FORMATS: + raise ValueError( + f"'{input_format}' is not a triple format. " + f"Supported: {list(RDF_TRIPLE_FORMATS)}" + ) + g = Graph() + g.parse(source, format=RDF_TRIPLE_FORMATS[input_format]) + return g + + def write(self, data: Graph, target: str, output_format: str) -> None: + """Serialize a Graph (IR) to a file. + + Args: + data: rdflib.Graph to serialize. + target: Path to output file. + output_format: Target format name (e.g. 'ntriples', 'turtle'). + + Raises: + ValueError: If output_format is not a recognised triple format. + """ + if output_format not in RDF_TRIPLE_FORMATS: + raise ValueError( + f"'{output_format}' is not a triple format. " + f"Supported: {list(RDF_TRIPLE_FORMATS)}" + ) + # Explicitly specify utf-8 encoding to avoid NTSerializer warning + data.serialize( + destination=target, + format=RDF_TRIPLE_FORMATS[output_format], + encoding="utf-8", + ) + + def convert( + self, + source: str, + target: str, + input_format: str, + output_format: str, + ) -> None: + """Convert between RDF triple formats (Layer 2, lossless). + + Chains read() -> write(). Both formats must be in the same + equivalence class (RDF triples). 
+ + Args: + source: Path to input file. + target: Path to output file. + input_format: Source format name. + output_format: Target format name. + """ + graph = self.read(source, input_format) + self.write(graph, target, output_format) + print( + f"Converted {input_format} -> {output_format}: " + f"{os.path.basename(target)}" + ) + + +class QuadHandler: + """Handler for RDF quad formats (Layer 2). + + Uses rdflib.Dataset as the intermediate representation (IR). + Supports: nquads, trig, trix, json-ld. + + Named graph information is preserved through the Dataset IR. + The IR returned by read() can be passed to future mapping classes + such as QuadToTripleMapper or QuadToTSDMapper for Layer 3 conversions. """ - g = Dataset() - g.parse(input_file, format=RDF_QUAD_FORMATS[input_format]) - g.serialize(destination=output_file, format=RDF_QUAD_FORMATS[output_format]) - print( - f"Converted {input_format} -> {output_format}: {os.path.basename(output_file)}" - ) + def read(self, source: str, input_format: str) -> Dataset: + """Parse an RDF quads file into a Dataset (IR). -def convert_tabular_format( - input_file: str, - output_file: str, - input_format: str, - output_format: str, -) -> None: - """Convert between tabular formats (Layer 2). + Args: + source: Path to input file. + input_format: Source format name (e.g. 'nquads', 'trig', 'trix', 'json-ld'). - Handles: csv <-> tsv. - Uses Python built-in csv module. + Returns: + rdflib.Dataset containing all parsed quads with named graphs. - Args: - input_file: Path to input file. - output_file: Path to write converted output. - input_format: Source format name ('csv' or 'tsv'). - output_format: Target format name ('csv' or 'tsv'). + Raises: + ValueError: If input_format is not a recognised quad format. + """ + if input_format not in RDF_QUAD_FORMATS: + raise ValueError( + f"'{input_format}' is not a quad format. 
" + f"Supported: {list(RDF_QUAD_FORMATS)}" + ) + d = Dataset() + d.parse(source, format=RDF_QUAD_FORMATS[input_format]) + return d + + def write(self, data: Dataset, target: str, output_format: str) -> None: + """Serialize a Dataset (IR) to a file. + + Args: + data: rdflib.Dataset to serialize. + target: Path to output file. + output_format: Target format name. + + Raises: + ValueError: If output_format is not a recognised quad format. + """ + if output_format not in RDF_QUAD_FORMATS: + raise ValueError( + f"'{output_format}' is not a quad format. " + f"Supported: {list(RDF_QUAD_FORMATS)}" + ) + data.serialize( + destination=target, + format=RDF_QUAD_FORMATS[output_format], + ) + + def convert( + self, + source: str, + target: str, + input_format: str, + output_format: str, + ) -> None: + """Convert between RDF quad formats (Layer 2, lossless). + + Chains read() -> write(). Both formats must be in the same + equivalence class (RDF quads). Named graph information is preserved. + + Args: + source: Path to input file. + target: Path to output file. + input_format: Source format name. + output_format: Target format name. + """ + dataset = self.read(source, input_format) + self.write(dataset, target, output_format) + print( + f"Converted {input_format} -> {output_format}: " + f"{os.path.basename(target)}" + ) + + +class TSDHandler: + """Handler for tabular structured data formats (Layer 2). + + Uses list[list[str]] as the intermediate representation (IR). + Supports: csv, tsv. + + The IR returned by read() can be passed to future mapping classes + such as TSDToTripleMapper for Layer 3 conversions. """ - input_delimiter = TABULAR_FORMATS[input_format] - output_delimiter = TABULAR_FORMATS[output_format] - with open(input_file, "r", newline="", encoding="utf-8") as infile: - reader = csv.reader(infile, delimiter=input_delimiter) - rows = list(reader) + def read(self, source: str, input_format: str) -> list: + """Parse a tabular file into a list of rows (IR). 
- with open(output_file, "w", newline="", encoding="utf-8") as outfile: - writer = csv.writer(outfile, delimiter=output_delimiter) - writer.writerows(rows) + Each row is a list of string values. First row is the header. - print( - f"Converted {input_format} -> {output_format}: {os.path.basename(output_file)}" - ) + Args: + source: Path to input file. + input_format: Source format name ('csv' or 'tsv'). + + Returns: + list[list[str]] where first element is the header row. + + Raises: + ValueError: If input_format is not a recognised tabular format. + """ + if input_format not in TABULAR_FORMATS: + raise ValueError( + f"'{input_format}' is not a tabular format. " + f"Supported: {list(TABULAR_FORMATS)}" + ) + delimiter = TABULAR_FORMATS[input_format] + with open(source, "r", newline="", encoding="utf-8") as f: + reader = csv.reader(f, delimiter=delimiter) + return list(reader) + + def write(self, data: list, target: str, output_format: str) -> None: + """Serialize a list of rows (IR) to a tabular file. + + Args: + data: list[list[str]] to write. + target: Path to output file. + output_format: Target format name ('csv' or 'tsv'). + + Raises: + ValueError: If output_format is not a recognised tabular format. + """ + if output_format not in TABULAR_FORMATS: + raise ValueError( + f"'{output_format}' is not a tabular format. " + f"Supported: {list(TABULAR_FORMATS)}" + ) + delimiter = TABULAR_FORMATS[output_format] + with open(target, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f, delimiter=delimiter) + writer.writerows(data) + + def convert( + self, + source: str, + target: str, + input_format: str, + output_format: str, + ) -> None: + """Convert between tabular formats (Layer 2, lossless). + + Chains read() -> write(). Both formats must be in the same + equivalence class (tabular). + + Args: + source: Path to input file. + target: Path to output file. + input_format: Source format name. + output_format: Target format name. 
+ """ + rows = self.read(source, input_format) + self.write(rows, target, output_format) + print( + f"Converted {input_format} -> {output_format}: " + f"{os.path.basename(target)}" + ) # --------------------------------------------------------------------------- -# Layer 3 — cross-class mapping conversion +# Layer 3 prototype — RDF triples to CSV (not yet fully implemented) # --------------------------------------------------------------------------- def convert_rdf_to_csv( @@ -256,20 +429,23 @@ def convert_rdf_to_csv( output_file: str, input_format: str, ) -> None: - """Map RDF triples to a wide CSV table (Layer 3). + """Map RDF triples to a wide CSV table (Layer 3 prototype). Each unique subject becomes a row. Each unique predicate becomes a column. Multi-valued predicates are pipe-separated. - A companion .meta.json file is generated alongside the CSV to preserve - RDF datatype and language tag information for lossless round trips. + A companion .meta.json file is generated to preserve RDF datatype and + language tag information for lossless round trips. + + NOTE: This is a Layer 3 prototype. It is not yet tested and will be + properly implemented in the Layer 3 issue. Args: input_file: Path to input RDF triples file. output_file: Path to write output CSV file. input_format: Source triple format name (must be in RDF_TRIPLE_FORMATS). 
""" - g = Graph() - g.parse(input_file, format=RDF_TRIPLE_FORMATS[input_format]) + handler = TripleHandler() + g = handler.read(input_file, input_format) predicates = sorted(set(str(p) for s, p, o in g)) @@ -280,7 +456,6 @@ def convert_rdf_to_csv( subj = str(s) pred = str(p) - # capture datatype or language tag for companion file if hasattr(o, "datatype") and o.datatype: column_metadata[pred] = {"datatype": str(o.datatype)} elif hasattr(o, "language") and o.language: @@ -292,15 +467,16 @@ def convert_rdf_to_csv( subjects[subj][pred] = [] subjects[subj][pred].append(str(o)) - with open(output_file, "w", newline="", encoding="utf-8") as f: - writer = csv.writer(f) - writer.writerow(["resource"] + predicates) - for subj, pred_map in subjects.items(): - row = [subj] - for pred in predicates: - values = pred_map.get(pred, []) - row.append("|".join(values)) - writer.writerow(row) + tsd_handler = TSDHandler() + rows = [["resource"] + predicates] + for subj, pred_map in subjects.items(): + row = [subj] + for pred in predicates: + values = pred_map.get(pred, []) + row.append("|".join(values)) + rows.append(row) + + tsd_handler.write(rows, output_file, "csv") companion_file = output_file + ".meta.json" with open(companion_file, "w", encoding="utf-8") as f: @@ -314,6 +490,12 @@ def convert_rdf_to_csv( # Main dispatcher — called from download pipeline # --------------------------------------------------------------------------- +# Handler instances — created once, reused +_triple_handler = TripleHandler() +_quad_handler = QuadHandler() +_tsd_handler = TSDHandler() + + def convert_file( input_file: str, output_file: str, @@ -323,10 +505,7 @@ def convert_file( Detects the input format from the file extension, determines whether this is a Layer 2 (within-class) or Layer 3 (cross-class) conversion, - and delegates to the appropriate conversion function. - - For Layer 2: lossless, same equivalence class. 
- For Layer 3: quasi-equal for RDF <-> Tabular, lossless for Triples <-> Quads. + and delegates to the appropriate handler. Args: input_file: Path to the input file (must be decompressed). @@ -334,14 +513,15 @@ def convert_file( convert_format: Target format name (CLI format string). Raises: - ValueError: If the input format cannot be detected or if the - requested conversion is not supported. + ValueError: If input format cannot be detected or conversion + is not supported. """ input_format = detect_format_from_filename(input_file) if input_format is None: raise ValueError( - f"Could not detect input format from filename: '{os.path.basename(input_file)}'. " + f"Could not detect input format from filename: " + f"'{os.path.basename(input_file)}'. " f"Supported extensions: {list(EXTENSION_TO_FORMAT.keys())}" ) @@ -358,20 +538,20 @@ def convert_file( # --- Layer 2: within-class --- if input_class == output_class: if input_class == "triples": - convert_rdf_triple_format( + _triple_handler.convert( input_file, output_file, input_format, convert_format ) elif input_class == "quads": - convert_rdf_quad_format( + _quad_handler.convert( input_file, output_file, input_format, convert_format ) elif input_class == "tabular": - convert_tabular_format( + _tsd_handler.convert( input_file, output_file, input_format, convert_format ) return - # --- Layer 3: cross-class --- + # --- Layer 3: cross-class (prototype only) --- if input_class == "triples" and output_class == "tabular": convert_rdf_to_csv(input_file, output_file, input_format) return diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 74bd2fd..7c33fac 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -520,18 +520,16 @@ def _download_file( final_downloaded_file = target_filepath # --- 8. 
Convert file format if requested (AFTER compression conversion) --- + # Pipeline follows :decompress -> convert format -> recompress + # If the source was compressed, the converted output is recompressed: + # - to the format specified by --convert-to if provided + # - to the original compression format otherwise if convert_format: final_basename = os.path.basename(final_downloaded_file) compression_fmt = _detect_compression_format(final_basename) if compression_fmt: - # File is still compressed — decompress to a temp file first, - # then convert format, then clean up the temp file. - # This follows the pipeline: Download -> Decompress -> Convert -> Save - import tempfile - - source_module = COMPRESSION_MODULES[compression_fmt] - # temp decompressed file sits next to the original + # File is still compressed — decompress to temp, convert, recompress compression_ext = COMPRESSION_EXTENSIONS[compression_fmt] if final_downloaded_file.lower().endswith(compression_ext): temp_decompressed = final_downloaded_file[:-len(compression_ext)] @@ -542,6 +540,7 @@ def _download_file( print( f"Decompressing {final_basename} before format conversion..." ) + source_module = COMPRESSION_MODULES[compression_fmt] with source_module.open(final_downloaded_file, "rb") as sf: with open(temp_decompressed, "wb") as tf: while True: @@ -550,20 +549,43 @@ def _download_file( break tf.write(chunk) - # now convert the decompressed temp file - converted_filename = get_converted_filename( + # Convert format on the decompressed temp file + converted_basename = get_converted_filename( final_basename, convert_format ) - converted_filepath = os.path.join(localDir, converted_filename) + converted_filepath = os.path.join(localDir, converted_basename) convert_file(temp_decompressed, converted_filepath, convert_format) + # Recompress the converted output. + # Use --convert-to format if specified, otherwise use original compression. 
+ recompress_fmt = convert_to if convert_to else compression_fmt + recompress_ext = COMPRESSION_EXTENSIONS[recompress_fmt] + recompressed_filepath = converted_filepath + recompress_ext + recompress_module = COMPRESSION_MODULES[recompress_fmt] + + print( + f"Recompressing converted file to {recompress_fmt}: " + f"{os.path.basename(recompressed_filepath)}" + ) + with open(converted_filepath, "rb") as sf: + with recompress_module.open(recompressed_filepath, "wb") as tf: + while True: + chunk = sf.read(8192) + if not chunk: + break + tf.write(chunk) + + # Remove the uncompressed converted file — keep only recompressed + if os.path.exists(converted_filepath): + os.remove(converted_filepath) + finally: - # always clean up temp file even if conversion fails + # Always clean up temp decompressed file if os.path.exists(temp_decompressed): os.remove(temp_decompressed) else: - # file is already uncompressed — convert directly + # File is already uncompressed — convert directly, no recompression needed converted_filename = get_converted_filename(final_basename, convert_format) converted_filepath = os.path.join(localDir, converted_filename) convert_file(final_downloaded_file, converted_filepath, convert_format) diff --git a/pyproject.toml b/pyproject.toml index 92f479b..72179cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,3 +29,9 @@ src = ["databusclient", "tests"] [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +filterwarnings = [ + "ignore::DeprecationWarning:rdflib", + "ignore::UserWarning:rdflib", +] diff --git a/tests/resources/sample.csv b/tests/resources/sample.csv new file mode 100644 index 0000000..50dda4c --- /dev/null +++ b/tests/resources/sample.csv @@ -0,0 +1,11 @@ +subject,predicate,object,graph +https://example.org/data/alice,http://xmlns.com/foaf/0.1/name,Alice,https://example.org/graph/people 
+https://example.org/data/alice,https://example.org/vocab/age,29,https://example.org/graph/people +https://example.org/data/alice,https://example.org/vocab/livesAt,_:address1,https://example.org/graph/people +_:address1,https://example.org/vocab/city,Leipzig,https://example.org/graph/people +_:address1,https://example.org/vocab/country,Germany,https://example.org/graph/people +https://example.org/data/bob,http://xmlns.com/foaf/0.1/name,Bob,https://example.org/graph/people +https://example.org/data/bob,https://example.org/vocab/age,34,https://example.org/graph/people +https://example.org/data/bob,https://example.org/vocab/knows,https://example.org/data/alice,https://example.org/graph/people +https://example.org/data/project1,https://example.org/vocab/title,Databus Example Project,https://example.org/graph/projects +https://example.org/data/project1,https://example.org/vocab/member,https://example.org/data/alice,https://example.org/graph/projects \ No newline at end of file diff --git a/tests/resources/sample.jsonld b/tests/resources/sample.jsonld new file mode 100644 index 0000000..af80f31 --- /dev/null +++ b/tests/resources/sample.jsonld @@ -0,0 +1,62 @@ +{ + "@context": { + "@base": "https://example.org/data/", + "ex": "https://example.org/vocab/", + "foaf": "http://xmlns.com/foaf/0.1/", + "xsd": "http://www.w3.org/2001/XMLSchema#", + "name": "foaf:name", + "age": { + "@id": "ex:age", + "@type": "xsd:integer" + }, + "livesAt": { + "@id": "ex:livesAt", + "@type": "@id" + }, + "city": "ex:city", + "country": "ex:country", + "knows": { + "@id": "ex:knows", + "@type": "@id" + }, + "title": "ex:title", + "member": { + "@id": "ex:member", + "@type": "@id" + } + }, + "@graph": [ + { + "@id": "https://example.org/graph/people", + "@graph": [ + { + "@id": "alice", + "name": "Alice", + "age": 29, + "livesAt": "_:address1" + }, + { + "@id": "_:address1", + "city": "Leipzig", + "country": "Germany" + }, + { + "@id": "bob", + "name": "Bob", + "age": 34, + "knows": "alice" + } 
+ ] + }, + { + "@id": "https://example.org/graph/projects", + "@graph": [ + { + "@id": "project1", + "title": "Databus Example Project", + "member": "alice" + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/resources/sample.nq b/tests/resources/sample.nq new file mode 100644 index 0000000..a111652 --- /dev/null +++ b/tests/resources/sample.nq @@ -0,0 +1,10 @@ + "Alice" . + "29"^^ . + _:address1 . +_:address1 "Leipzig" . +_:address1 "Germany" . + "Bob" . + "34"^^ . + . + "Databus Example Project" . + . \ No newline at end of file diff --git a/tests/resources/sample.nt b/tests/resources/sample.nt new file mode 100644 index 0000000..f6b8488 --- /dev/null +++ b/tests/resources/sample.nt @@ -0,0 +1,10 @@ + "Alice" . + "29"^^ . + _:address1 . +_:address1 "Leipzig" . +_:address1 "Germany" . + "Bob" . + "34"^^ . + . + "Databus Example Project" . + . \ No newline at end of file diff --git a/tests/resources/sample.rdf b/tests/resources/sample.rdf new file mode 100644 index 0000000..c8bb09a --- /dev/null +++ b/tests/resources/sample.rdf @@ -0,0 +1,30 @@ + + + + + Alice + 29 + + + + + Leipzig + Germany + + + + Bob + 34 + + + + + Databus Example Project + + + + \ No newline at end of file diff --git a/tests/resources/sample.trig b/tests/resources/sample.trig new file mode 100644 index 0000000..e4abc3f --- /dev/null +++ b/tests/resources/sample.trig @@ -0,0 +1,22 @@ +@base . +@prefix ex: . +@prefix foaf: . +@prefix xsd: . + + { + foaf:name "Alice" ; + ex:age 29 ; + ex:livesAt _:address1 . + + _:address1 ex:city "Leipzig" ; + ex:country "Germany" . + + foaf:name "Bob" ; + ex:age 34 ; + ex:knows . +} + + { + ex:title "Databus Example Project" ; + ex:member . 
+} \ No newline at end of file diff --git a/tests/resources/sample.trix b/tests/resources/sample.trix new file mode 100644 index 0000000..d8edb13 --- /dev/null +++ b/tests/resources/sample.trix @@ -0,0 +1,72 @@ + + + + + https://example.org/graph/people + + + https://example.org/data/alice + http://xmlns.com/foaf/0.1/name + Alice + + + + https://example.org/data/alice + https://example.org/vocab/age + 29 + + + + https://example.org/data/alice + https://example.org/vocab/livesAt + address1 + + + + address1 + https://example.org/vocab/city + Leipzig + + + + address1 + https://example.org/vocab/country + Germany + + + + https://example.org/data/bob + http://xmlns.com/foaf/0.1/name + Bob + + + + https://example.org/data/bob + https://example.org/vocab/age + 34 + + + + https://example.org/data/bob + https://example.org/vocab/knows + https://example.org/data/alice + + + + + https://example.org/graph/projects + + + https://example.org/data/project1 + https://example.org/vocab/title + Databus Example Project + + + + https://example.org/data/project1 + https://example.org/vocab/member + https://example.org/data/alice + + + + \ No newline at end of file diff --git a/tests/resources/sample.tsv b/tests/resources/sample.tsv new file mode 100644 index 0000000..c23af40 --- /dev/null +++ b/tests/resources/sample.tsv @@ -0,0 +1,11 @@ +subject predicate object graph +https://example.org/data/alice http://xmlns.com/foaf/0.1/name Alice https://example.org/graph/people +https://example.org/data/alice https://example.org/vocab/age 29 https://example.org/graph/people +https://example.org/data/alice https://example.org/vocab/livesAt _:address1 https://example.org/graph/people +_:address1 https://example.org/vocab/city Leipzig https://example.org/graph/people +_:address1 https://example.org/vocab/country Germany https://example.org/graph/people +https://example.org/data/bob http://xmlns.com/foaf/0.1/name Bob https://example.org/graph/people +https://example.org/data/bob 
https://example.org/vocab/age 34 https://example.org/graph/people +https://example.org/data/bob https://example.org/vocab/knows https://example.org/data/alice https://example.org/graph/people +https://example.org/data/project1 https://example.org/vocab/title Databus Example Project https://example.org/graph/projects +https://example.org/data/project1 https://example.org/vocab/member https://example.org/data/alice https://example.org/graph/projects \ No newline at end of file diff --git a/tests/resources/sample.ttl b/tests/resources/sample.ttl new file mode 100644 index 0000000..a8eb198 --- /dev/null +++ b/tests/resources/sample.ttl @@ -0,0 +1,18 @@ +@base . +@prefix ex: . +@prefix foaf: . +@prefix xsd: . + + foaf:name "Alice" ; + ex:age 29 ; + ex:livesAt _:address1 . + +_:address1 ex:city "Leipzig" ; + ex:country "Germany" . + + foaf:name "Bob" ; + ex:age 34 ; + ex:knows . + + ex:title "Databus Example Project" ; + ex:member . \ No newline at end of file diff --git a/tests/test_conversion.py b/tests/test_conversion.py deleted file mode 100644 index 7ce710c..0000000 --- a/tests/test_conversion.py +++ /dev/null @@ -1,309 +0,0 @@ -"""Round trip tests for Layer 2 format conversion. - -Following the strategy from Frey et al., each test validates that -reading a format and writing it back produces semantically identical output. -Pattern: parse(format X) -> serialize(format X) -> parse again -> compare. 
- -9 tests total: -- Triple formats: ntriples, turtle, rdf-xml (3 tests) -- Quad formats: nquads, trig, trix, json-ld (4 tests) -- Tabular formats: csv, tsv (2 tests) -""" - -import csv -import os -import tempfile - -from rdflib import Dataset, Graph - -from databusclient.api.convert import ( - convert_rdf_quad_format, - convert_rdf_triple_format, - convert_tabular_format, -) - -# --------------------------------------------------------------------------- -# Sample RDF data used across all RDF tests -# --------------------------------------------------------------------------- - -SAMPLE_TURTLE = """ -@prefix ex: . -@prefix schema: . -@prefix xsd: . - -ex:Paris schema:isCapitalOf ex:France ; - schema:population "2161000"^^xsd:integer . - -ex:Berlin schema:isCapitalOf ex:Germany ; - schema:population "3645000"^^xsd:integer . -""" - -SAMPLE_NQUADS = """ - . - . - . -""" - -SAMPLE_CSV = """resource,name,population -http://example.org/Paris,Paris,2161000 -http://example.org/Berlin,Berlin,3645000 -""" - -SAMPLE_TSV = "resource\tname\tpopulation\nhttp://example.org/Paris\tParis\t2161000\nhttp://example.org/Berlin\tBerlin\t3645000\n" - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - -def _write_temp(content: str, suffix: str) -> str: - """Write content to a named temp file and return its path.""" - fd, path = tempfile.mkstemp(suffix=suffix) - with os.fdopen(fd, "w", encoding="utf-8") as f: - f.write(content) - return path - - -def _graphs_are_isomorphic(g1: Graph, g2: Graph) -> bool: - """Check semantic equivalence of two rdflib Graphs.""" - return g1.isomorphic(g2) - - -def _datasets_equal(g1: Dataset, g2: Dataset) -> bool: - """Check semantic equivalence of two Datasets by triple count and graph names.""" - if len(g1) != len(g2): - return False - graphs1 = {str(c.identifier) for c in g1.graphs()} - graphs2 = {str(c.identifier) for c in g2.graphs()} 
- return graphs1 == graphs2 - - -# --------------------------------------------------------------------------- -# Triple format round trip tests (Layer 2) -# --------------------------------------------------------------------------- - -def test_round_trip_turtle(): - """Turtle -> Turtle: parse, serialize, reparse, compare.""" - input_path = _write_temp(SAMPLE_TURTLE, ".ttl") - output_path = input_path + ".rt.ttl" - - try: - convert_rdf_triple_format(input_path, output_path, "turtle", "turtle") - - g_original = Graph() - g_original.parse(input_path, format="turtle") - - g_roundtrip = Graph() - g_roundtrip.parse(output_path, format="turtle") - - assert _graphs_are_isomorphic(g_original, g_roundtrip), ( - "Turtle round trip failed: graphs are not isomorphic" - ) - finally: - for p in (input_path, output_path): - if os.path.exists(p): - os.remove(p) - - -def test_round_trip_ntriples(): - """N-Triples -> N-Triples: parse, serialize, reparse, compare.""" - # first produce an ntriples file from turtle - turtle_path = _write_temp(SAMPLE_TURTLE, ".ttl") - nt_path = turtle_path + ".nt" - output_path = nt_path + ".rt.nt" - - try: - convert_rdf_triple_format(turtle_path, nt_path, "turtle", "ntriples") - convert_rdf_triple_format(nt_path, output_path, "ntriples", "ntriples") - - g_original = Graph() - g_original.parse(nt_path, format="ntriples") - - g_roundtrip = Graph() - g_roundtrip.parse(output_path, format="ntriples") - - assert _graphs_are_isomorphic(g_original, g_roundtrip), ( - "N-Triples round trip failed: graphs are not isomorphic" - ) - finally: - for p in (turtle_path, nt_path, output_path): - if os.path.exists(p): - os.remove(p) - - -def test_round_trip_rdf_xml(): - """RDF/XML -> RDF/XML: parse, serialize, reparse, compare.""" - turtle_path = _write_temp(SAMPLE_TURTLE, ".ttl") - rdf_path = turtle_path + ".rdf" - output_path = rdf_path + ".rt.rdf" - - try: - convert_rdf_triple_format(turtle_path, rdf_path, "turtle", "rdf-xml") - convert_rdf_triple_format(rdf_path, 
output_path, "rdf-xml", "rdf-xml") - - g_original = Graph() - g_original.parse(rdf_path, format="xml") - - g_roundtrip = Graph() - g_roundtrip.parse(output_path, format="xml") - - assert _graphs_are_isomorphic(g_original, g_roundtrip), ( - "RDF/XML round trip failed: graphs are not isomorphic" - ) - finally: - for p in (turtle_path, rdf_path, output_path): - if os.path.exists(p): - os.remove(p) - - -# --------------------------------------------------------------------------- -# Quad format round trip tests (Layer 2) -# --------------------------------------------------------------------------- - -def test_round_trip_nquads(): - """N-Quads -> N-Quads: parse, serialize, reparse, compare.""" - input_path = _write_temp(SAMPLE_NQUADS, ".nq") - output_path = input_path + ".rt.nq" - - try: - convert_rdf_quad_format(input_path, output_path, "nquads", "nquads") - - g_original = Dataset() - g_original.parse(input_path, format="nquads") - - g_roundtrip = Dataset() - g_roundtrip.parse(output_path, format="nquads") - - assert _datasets_equal(g_original, g_roundtrip), ( - "N-Quads round trip failed: graphs are not equal" - ) - finally: - for p in (input_path, output_path): - if os.path.exists(p): - os.remove(p) - - -def test_round_trip_trig(): - """TriG -> TriG: parse, serialize, reparse, compare.""" - # produce trig from nquads - nq_path = _write_temp(SAMPLE_NQUADS, ".nq") - trig_path = nq_path + ".trig" - output_path = trig_path + ".rt.trig" - - try: - convert_rdf_quad_format(nq_path, trig_path, "nquads", "trig") - convert_rdf_quad_format(trig_path, output_path, "trig", "trig") - - g_original = Dataset() - g_original.parse(trig_path, format="trig") - - g_roundtrip = Dataset() - g_roundtrip.parse(output_path, format="trig") - - assert _datasets_equal(g_original, g_roundtrip), ( - "TriG round trip failed: graphs are not equal" - ) - finally: - for p in (nq_path, trig_path, output_path): - if os.path.exists(p): - os.remove(p) - - -def test_round_trip_trix(): - """TriX -> TriX: 
parse, serialize, reparse, compare.""" - nq_path = _write_temp(SAMPLE_NQUADS, ".nq") - trix_path = nq_path + ".trix" - output_path = trix_path + ".rt.trix" - - try: - convert_rdf_quad_format(nq_path, trix_path, "nquads", "trix") - convert_rdf_quad_format(trix_path, output_path, "trix", "trix") - - g_original = Dataset() - g_original.parse(trix_path, format="trix") - - g_roundtrip = Dataset() - g_roundtrip.parse(output_path, format="trix") - - assert _datasets_equal(g_original, g_roundtrip), ( - "TriX round trip failed: graphs are not equal" - ) - finally: - for p in (nq_path, trix_path, output_path): - if os.path.exists(p): - os.remove(p) - - -def test_round_trip_json_ld(): - """JSON-LD -> JSON-LD: parse, serialize, reparse, compare.""" - nq_path = _write_temp(SAMPLE_NQUADS, ".nq") - jsonld_path = nq_path + ".jsonld" - output_path = jsonld_path + ".rt.jsonld" - - try: - convert_rdf_quad_format(nq_path, jsonld_path, "nquads", "json-ld") - convert_rdf_quad_format(jsonld_path, output_path, "json-ld", "json-ld") - - g_original = Dataset() - g_original.parse(jsonld_path, format="json-ld") - - g_roundtrip = Dataset() - g_roundtrip.parse(output_path, format="json-ld") - - assert _datasets_equal(g_original, g_roundtrip), ( - "JSON-LD round trip failed: graphs are not equal" - ) - finally: - for p in (nq_path, jsonld_path, output_path): - if os.path.exists(p): - os.remove(p) - - -# --------------------------------------------------------------------------- -# Tabular format round trip tests (Layer 2) -# --------------------------------------------------------------------------- - -def test_round_trip_csv(): - """CSV -> CSV: read, write, reread, compare rows.""" - input_path = _write_temp(SAMPLE_CSV, ".csv") - output_path = input_path + ".rt.csv" - - try: - convert_tabular_format(input_path, output_path, "csv", "csv") - - with open(input_path, newline="", encoding="utf-8") as f: - original_rows = list(csv.reader(f)) - - with open(output_path, newline="", encoding="utf-8") as 
f: - roundtrip_rows = list(csv.reader(f)) - - assert original_rows == roundtrip_rows, ( - "CSV round trip failed: rows do not match" - ) - finally: - for p in (input_path, output_path): - if os.path.exists(p): - os.remove(p) - - -def test_round_trip_tsv(): - """TSV -> TSV: read, write, reread, compare rows.""" - input_path = _write_temp(SAMPLE_TSV, ".tsv") - output_path = input_path + ".rt.tsv" - - try: - convert_tabular_format(input_path, output_path, "tsv", "tsv") - - with open(input_path, newline="", encoding="utf-8") as f: - original_rows = list(csv.reader(f, delimiter="\t")) - - with open(output_path, newline="", encoding="utf-8") as f: - roundtrip_rows = list(csv.reader(f, delimiter="\t")) - - assert original_rows == roundtrip_rows, ( - "TSV round trip failed: rows do not match" - ) - finally: - for p in (input_path, output_path): - if os.path.exists(p): - os.remove(p) \ No newline at end of file diff --git a/tests/test_format_round_trips.py b/tests/test_format_round_trips.py new file mode 100644 index 0000000..ebe9f76 --- /dev/null +++ b/tests/test_format_round_trips.py @@ -0,0 +1,256 @@ +"""Round trip tests for Layer 2 format conversion. + +Following the strategy from Frey et al., each test validates that +reading a format and writing it back produces semantically identical output. + +The key validation pattern using handlers and IR: + 1. Read original file into IR (Graph/Dataset/rows) BEFORE any conversion + 2. Convert the file through the handler (read -> write cycle) + 3. Read the converted output back into IR + 4. Compare both IRs — if conversion lost data, IRs will differ + +This correctly catches information loss because g_original is captured +BEFORE serialization, not after. Both IRs use the same rdflib internal +representation, making comparison meaningful at the data level. + +Test data lives in tests/resources/ — one sample file per format. 
+These files are semantically consistent (same people/projects dataset across +all formats) and are shared across Layer 2 and future Layer 3 tests. + +9 round trip tests total: + Triple formats: turtle, ntriples, rdf-xml (3 tests) + Quad formats: nquads, trig, trix, json-ld (4 tests) + Tabular formats: csv, tsv (2 tests) +""" + +import os +import tempfile + +from databusclient.api.convert import ( + QuadHandler, + TSDHandler, + TripleHandler, +) + +# --------------------------------------------------------------------------- +# Path to shared test resources +# --------------------------------------------------------------------------- + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def resource(filename: str) -> str: + """Return absolute path to a file in tests/resources/.""" + return os.path.join(RESOURCES, filename) + + +# --------------------------------------------------------------------------- +# Handler instances shared across tests +# --------------------------------------------------------------------------- + +triple_handler = TripleHandler() +quad_handler = QuadHandler() +tsd_handler = TSDHandler() + + +# --------------------------------------------------------------------------- +# Triple format round trip tests (Layer 2) +# --------------------------------------------------------------------------- + +def test_round_trip_turtle(): + """Turtle -> Turtle: read into IR before conversion, compare after.""" + source = resource("sample.ttl") + g_original = triple_handler.read(source, "turtle") + + with tempfile.NamedTemporaryFile(suffix=".ttl", delete=False) as f: + output = f.name + try: + triple_handler.convert(source, output, "turtle", "turtle") + g_roundtrip = triple_handler.read(output, "turtle") + assert g_original.isomorphic(g_roundtrip), ( + "Turtle round trip failed: graphs are not isomorphic" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_ntriples(): + """N-Triples -> N-Triples: read into IR
before conversion, compare after.""" + source = resource("sample.nt") + g_original = triple_handler.read(source, "ntriples") + + with tempfile.NamedTemporaryFile(suffix=".nt", delete=False) as f: + output = f.name + try: + triple_handler.convert(source, output, "ntriples", "ntriples") + g_roundtrip = triple_handler.read(output, "ntriples") + assert g_original.isomorphic(g_roundtrip), ( + "N-Triples round trip failed: graphs are not isomorphic" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_rdf_xml(): + """RDF/XML -> RDF/XML: read into IR before conversion, compare after.""" + source = resource("sample.rdf") + g_original = triple_handler.read(source, "rdf-xml") + + with tempfile.NamedTemporaryFile(suffix=".rdf", delete=False) as f: + output = f.name + try: + triple_handler.convert(source, output, "rdf-xml", "rdf-xml") + g_roundtrip = triple_handler.read(output, "rdf-xml") + assert g_original.isomorphic(g_roundtrip), ( + "RDF/XML round trip failed: graphs are not isomorphic" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +# --------------------------------------------------------------------------- +# Quad format round trip tests (Layer 2) +# --------------------------------------------------------------------------- + +def _datasets_equal(d1, d2) -> bool: + """Check semantic equivalence of two Datasets. + + Compares total triple count, named graph identifiers, and + performs isomorphism check on each named graph to correctly + handle blank node renaming during serialization. 
+ """ + if len(d1) != len(d2): + return False + + graphs1 = {str(g.identifier) for g in d1.graphs()} + graphs2 = {str(g.identifier) for g in d2.graphs()} + if graphs1 != graphs2: + return False + + # Compare triples inside each named graph using isomorphism + # to correctly handle blank nodes that may be renamed during + # serialization/deserialization + for g1 in d1.graphs(): + graph_id = str(g1.identifier) + g2 = d2.get_context(g1.identifier) + if g2 is None: + return False + if not g1.isomorphic(g2): + return False + + return True + + +def test_round_trip_nquads(): + """N-Quads -> N-Quads: read into IR before conversion, compare after.""" + source = resource("sample.nq") + d_original = quad_handler.read(source, "nquads") + + with tempfile.NamedTemporaryFile(suffix=".nq", delete=False) as f: + output = f.name + try: + quad_handler.convert(source, output, "nquads", "nquads") + d_roundtrip = quad_handler.read(output, "nquads") + assert _datasets_equal(d_original, d_roundtrip), ( + "N-Quads round trip failed: datasets are not equal" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_trig(): + """TriG -> TriG: read into IR before conversion, compare after.""" + source = resource("sample.trig") + d_original = quad_handler.read(source, "trig") + + with tempfile.NamedTemporaryFile(suffix=".trig", delete=False) as f: + output = f.name + try: + quad_handler.convert(source, output, "trig", "trig") + d_roundtrip = quad_handler.read(output, "trig") + assert _datasets_equal(d_original, d_roundtrip), ( + "TriG round trip failed: datasets are not equal" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_trix(): + """TriX -> TriX: read into IR before conversion, compare after.""" + source = resource("sample.trix") + d_original = quad_handler.read(source, "trix") + + with tempfile.NamedTemporaryFile(suffix=".trix", delete=False) as f: + output = f.name + try: + quad_handler.convert(source, output, "trix", 
"trix") + d_roundtrip = quad_handler.read(output, "trix") + assert _datasets_equal(d_original, d_roundtrip), ( + "TriX round trip failed: datasets are not equal" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_json_ld(): + """JSON-LD -> JSON-LD: read into IR before conversion, compare after.""" + source = resource("sample.jsonld") + d_original = quad_handler.read(source, "json-ld") + + with tempfile.NamedTemporaryFile(suffix=".jsonld", delete=False) as f: + output = f.name + try: + quad_handler.convert(source, output, "json-ld", "json-ld") + d_roundtrip = quad_handler.read(output, "json-ld") + assert _datasets_equal(d_original, d_roundtrip), ( + "JSON-LD round trip failed: datasets are not equal" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +# --------------------------------------------------------------------------- +# Tabular format round trip tests (Layer 2) +# --------------------------------------------------------------------------- + +def test_round_trip_csv(): + """CSV -> CSV: read into IR before conversion, compare after.""" + source = resource("sample.csv") + rows_original = tsd_handler.read(source, "csv") + + with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f: + output = f.name + try: + tsd_handler.convert(source, output, "csv", "csv") + rows_roundtrip = tsd_handler.read(output, "csv") + assert rows_original == rows_roundtrip, ( + "CSV round trip failed: rows do not match" + ) + finally: + if os.path.exists(output): + os.remove(output) + + +def test_round_trip_tsv(): + """TSV -> TSV: read into IR before conversion, compare after.""" + source = resource("sample.tsv") + rows_original = tsd_handler.read(source, "tsv") + + with tempfile.NamedTemporaryFile(suffix=".tsv", delete=False) as f: + output = f.name + try: + tsd_handler.convert(source, output, "tsv", "tsv") + rows_roundtrip = tsd_handler.read(output, "tsv") + assert rows_original == rows_roundtrip, ( + "TSV round trip 
failed: rows do not match" + ) + finally: + if os.path.exists(output): + os.remove(output) \ No newline at end of file From f2fe92eddf6540ca8880865230020a70b660f0ec Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Thu, 11 Jun 2026 01:42:48 +0530 Subject: [PATCH 3/7] Improvements to Format layer implementation --- databusclient/api/convert.py | 589 ++------------------------ databusclient/api/download.py | 173 ++++---- databusclient/filehandling/format.py | 511 ++++++++++++++++++++++ databusclient/filehandling/mapping.py | 68 +++ pyproject.toml | 3 + run_all_conversion_tests.py | 4 + 6 files changed, 719 insertions(+), 629 deletions(-) create mode 100644 databusclient/filehandling/format.py create mode 100644 databusclient/filehandling/mapping.py diff --git a/databusclient/api/convert.py b/databusclient/api/convert.py index 8bd6dbb..b095eaf 100644 --- a/databusclient/api/convert.py +++ b/databusclient/api/convert.py @@ -1,563 +1,50 @@ -"""Format and Mapping Conversion Layer. - -This module implements the format conversion pipeline for the Databus Python Client - -Layer 2: Within-class format conversion (lossless). - - TripleHandler: RDF triple formats (turtle, ntriples, rdf-xml) - - QuadHandler: RDF quad formats (nquads, trig, trix, json-ld) - - TSDHandler: Tabular formats (csv, tsv) - -Layer 3 (prototype, not yet fully implemented): - - RDF triples -> CSV/TSV (quasi-equal, companion metadata generated) - -Each handler provides read() -> IR, write(IR) -> file, convert() -> chains both. -The IR (intermediate representation) returned by read() is designed to be passed -to future mapping classes (TripleToQuadMapper, TripleToTSDMapper, etc.). -""" - -import csv -import json -import os -import warnings -from typing import Optional - -# Suppress rdflib internal DeprecationWarning for Dataset API. -# rdflib is mid-migration from ConjunctiveGraph to Dataset in 7.x. -# These warnings originate from rdflib internals, not our code. 
-# Can be removed when rdflib completes their Dataset API migration. -warnings.filterwarnings("ignore", category=DeprecationWarning, module="rdflib") -warnings.filterwarnings("ignore", category=UserWarning, module="rdflib") - -from rdflib import Dataset, Graph - - -# --------------------------------------------------------------------------- -# Format registries -# --------------------------------------------------------------------------- - -# Maps CLI format name -> rdflib format string -RDF_TRIPLE_FORMATS = { - "ntriples": "ntriples", - "turtle": "turtle", - "rdf-xml": "xml", -} - -RDF_QUAD_FORMATS = { - "nquads": "nquads", - "trig": "trig", - "trix": "trix", - "json-ld": "json-ld", -} - -TABULAR_FORMATS = { - "csv": ",", - "tsv": "\t", -} - -ALL_FORMATS = ( - list(RDF_TRIPLE_FORMATS) - + list(RDF_QUAD_FORMATS) - + list(TABULAR_FORMATS) +from databusclient.filehandling.format import convert_file, get_converted_filename +from databusclient.filehandling import mapping as _mapping + +from databusclient.filehandling.format import ( # noqa: F401 + ALL_FORMATS, + EXTENSION_TO_FORMAT, + FORMAT_TO_EXTENSION, + RDF_QUAD_FORMATS, + RDF_TRIPLE_FORMATS, + TABULAR_FORMATS, + QuadHandler, + TSDHandler, + TripleHandler, + _quad_handler, + _tsd_handler, + _triple_handler, + detect_format_from_filename, + get_format_class, ) -# Maps file extension -> CLI format name -EXTENSION_TO_FORMAT = { - ".ttl": "turtle", - ".nt": "ntriples", - ".rdf": "rdf-xml", - ".xml": "rdf-xml", - ".owl": "rdf-xml", - ".nq": "nquads", - ".trig": "trig", - ".trix": "trix", - ".jsonld": "json-ld", - ".json": "json-ld", - ".csv": "csv", - ".tsv": "tsv", -} - -# Maps format name -> file extension -FORMAT_TO_EXTENSION = { - "ntriples": ".nt", - "turtle": ".ttl", - "rdf-xml": ".rdf", - "nquads": ".nq", - "trig": ".trig", - "trix": ".trix", - "json-ld": ".jsonld", - "csv": ".csv", - "tsv": ".tsv", -} - - -# --------------------------------------------------------------------------- -# Format detection helpers 
-# --------------------------------------------------------------------------- - -def detect_format_from_filename(filename: str) -> Optional[str]: - """Detect format from file extension, ignoring compression extensions. - - Args: - filename: File name or path. - - Returns: - Format name string or None if not detectable. - """ - name = filename.lower() - - # strip compression extension first - for ext in (".bz2", ".gz", ".xz"): - if name.endswith(ext): - name = name[: -len(ext)] - break - - # match longest extension first to avoid .json matching before .jsonld - for ext in sorted(EXTENSION_TO_FORMAT.keys(), key=len, reverse=True): - if name.endswith(ext): - return EXTENSION_TO_FORMAT[ext] - - return None - - -def get_format_class(fmt: str) -> str: - """Return equivalence class for a format name. - - Args: - fmt: Format name (e.g. 'turtle', 'nquads', 'csv'). - - Returns: - 'triples', 'quads', or 'tabular'. - - Raises: - ValueError: If format is not recognised. - """ - if fmt in RDF_TRIPLE_FORMATS: - return "triples" - if fmt in RDF_QUAD_FORMATS: - return "quads" - if fmt in TABULAR_FORMATS: - return "tabular" - raise ValueError( - f"Unknown format: '{fmt}'. Supported formats: {ALL_FORMATS}" - ) - - -def get_converted_filename(original_filename: str, convert_format: str) -> str: - """Generate output filename after format conversion. - - Strips compression extension if present, then replaces the format - extension with the target format extension. - - Args: - original_filename: Original file name (basename only, not full path). - convert_format: Target format name. - - Returns: - New filename with updated extension. 
- """ - name = original_filename - - # strip compression extension - for ext in (".bz2", ".gz", ".xz"): - if name.lower().endswith(ext): - name = name[: -len(ext)] - break - - # strip existing format extension (longest first) - for old_ext in sorted(FORMAT_TO_EXTENSION.values(), key=len, reverse=True): - if name.lower().endswith(old_ext): - name = name[: -len(old_ext)] - break - - target_ext = FORMAT_TO_EXTENSION.get(convert_format, f".{convert_format}") - return name + target_ext - - -# --------------------------------------------------------------------------- -# Layer 2 Handlers -# --------------------------------------------------------------------------- - -class TripleHandler: - """Handler for RDF triple formats (Layer 2). - - Uses rdflib.Graph as the intermediate representation (IR). - Supports: ntriples, turtle, rdf-xml. - - The IR returned by read() can be passed to future mapping classes - such as TripleToQuadMapper or TripleToTSDMapper for Layer 3 conversions. - """ - - def read(self, source: str, input_format: str) -> Graph: - """Parse an RDF triples file into a Graph (IR). - - Args: - source: Path to input file. - input_format: Source format name (e.g. 'turtle', 'ntriples', 'rdf-xml'). - - Returns: - rdflib.Graph containing all parsed triples. - - Raises: - ValueError: If input_format is not a recognised triple format. - """ - if input_format not in RDF_TRIPLE_FORMATS: - raise ValueError( - f"'{input_format}' is not a triple format. " - f"Supported: {list(RDF_TRIPLE_FORMATS)}" - ) - g = Graph() - g.parse(source, format=RDF_TRIPLE_FORMATS[input_format]) - return g - - def write(self, data: Graph, target: str, output_format: str) -> None: - """Serialize a Graph (IR) to a file. - - Args: - data: rdflib.Graph to serialize. - target: Path to output file. - output_format: Target format name (e.g. 'ntriples', 'turtle'). - - Raises: - ValueError: If output_format is not a recognised triple format. 
- """ - if output_format not in RDF_TRIPLE_FORMATS: - raise ValueError( - f"'{output_format}' is not a triple format. " - f"Supported: {list(RDF_TRIPLE_FORMATS)}" - ) - # Explicitly specify utf-8 encoding to avoid NTSerializer warning - data.serialize( - destination=target, - format=RDF_TRIPLE_FORMATS[output_format], - encoding="utf-8", - ) - - def convert( - self, - source: str, - target: str, - input_format: str, - output_format: str, - ) -> None: - """Convert between RDF triple formats (Layer 2, lossless). - - Chains read() -> write(). Both formats must be in the same - equivalence class (RDF triples). - - Args: - source: Path to input file. - target: Path to output file. - input_format: Source format name. - output_format: Target format name. - """ - graph = self.read(source, input_format) - self.write(graph, target, output_format) - print( - f"Converted {input_format} -> {output_format}: " - f"{os.path.basename(target)}" - ) - - -class QuadHandler: - """Handler for RDF quad formats (Layer 2). +__all__ = ["convert_file", "get_converted_filename"] - Uses rdflib.Dataset as the intermediate representation (IR). - Supports: nquads, trig, trix, json-ld. +convert_rdf_to_csv = _mapping.convert_rdf_to_csv - Named graph information is preserved through the Dataset IR. - The IR returned by read() can be passed to future mapping classes - such as QuadToTripleMapper or QuadToTSDMapper for Layer 3 conversions. - """ - def read(self, source: str, input_format: str) -> Dataset: - """Parse an RDF quads file into a Dataset (IR). - - Args: - source: Path to input file. - input_format: Source format name (e.g. 'nquads', 'trig', 'trix', 'json-ld'). - - Returns: - rdflib.Dataset containing all parsed quads with named graphs. - - Raises: - ValueError: If input_format is not a recognised quad format. - """ - if input_format not in RDF_QUAD_FORMATS: - raise ValueError( - f"'{input_format}' is not a quad format. 
" - f"Supported: {list(RDF_QUAD_FORMATS)}" - ) - d = Dataset() - d.parse(source, format=RDF_QUAD_FORMATS[input_format]) - return d - - def write(self, data: Dataset, target: str, output_format: str) -> None: - """Serialize a Dataset (IR) to a file. - - Args: - data: rdflib.Dataset to serialize. - target: Path to output file. - output_format: Target format name. - - Raises: - ValueError: If output_format is not a recognised quad format. - """ - if output_format not in RDF_QUAD_FORMATS: - raise ValueError( - f"'{output_format}' is not a quad format. " - f"Supported: {list(RDF_QUAD_FORMATS)}" - ) - data.serialize( - destination=target, - format=RDF_QUAD_FORMATS[output_format], - ) - - def convert( - self, - source: str, - target: str, - input_format: str, - output_format: str, - ) -> None: - """Convert between RDF quad formats (Layer 2, lossless). - - Chains read() -> write(). Both formats must be in the same - equivalence class (RDF quads). Named graph information is preserved. - - Args: - source: Path to input file. - target: Path to output file. - input_format: Source format name. - output_format: Target format name. - """ - dataset = self.read(source, input_format) - self.write(dataset, target, output_format) - print( - f"Converted {input_format} -> {output_format}: " - f"{os.path.basename(target)}" - ) - - -class TSDHandler: - """Handler for tabular structured data formats (Layer 2). - - Uses list[list[str]] as the intermediate representation (IR). - Supports: csv, tsv. - - The IR returned by read() can be passed to future mapping classes - such as TSDToTripleMapper for Layer 3 conversions. - """ - - def read(self, source: str, input_format: str) -> list: - """Parse a tabular file into a list of rows (IR). - - Each row is a list of string values. First row is the header. - - Args: - source: Path to input file. - input_format: Source format name ('csv' or 'tsv'). - - Returns: - list[list[str]] where first element is the header row. 
- - Raises: - ValueError: If input_format is not a recognised tabular format. - """ - if input_format not in TABULAR_FORMATS: - raise ValueError( - f"'{input_format}' is not a tabular format. " - f"Supported: {list(TABULAR_FORMATS)}" - ) - delimiter = TABULAR_FORMATS[input_format] - with open(source, "r", newline="", encoding="utf-8") as f: - reader = csv.reader(f, delimiter=delimiter) - return list(reader) - - def write(self, data: list, target: str, output_format: str) -> None: - """Serialize a list of rows (IR) to a tabular file. - - Args: - data: list[list[str]] to write. - target: Path to output file. - output_format: Target format name ('csv' or 'tsv'). - - Raises: - ValueError: If output_format is not a recognised tabular format. - """ - if output_format not in TABULAR_FORMATS: - raise ValueError( - f"'{output_format}' is not a tabular format. " - f"Supported: {list(TABULAR_FORMATS)}" - ) - delimiter = TABULAR_FORMATS[output_format] - with open(target, "w", newline="", encoding="utf-8") as f: - writer = csv.writer(f, delimiter=delimiter) - writer.writerows(data) - - def convert( - self, - source: str, - target: str, - input_format: str, - output_format: str, - ) -> None: - """Convert between tabular formats (Layer 2, lossless). - - Chains read() -> write(). Both formats must be in the same - equivalence class (tabular). - - Args: - source: Path to input file. - target: Path to output file. - input_format: Source format name. - output_format: Target format name. 
- """ - rows = self.read(source, input_format) - self.write(rows, target, output_format) - print( - f"Converted {input_format} -> {output_format}: " - f"{os.path.basename(target)}" - ) - - -# --------------------------------------------------------------------------- -# Layer 3 prototype — RDF triples to CSV (not yet fully implemented) -# --------------------------------------------------------------------------- - -def convert_rdf_to_csv( - input_file: str, - output_file: str, +def convert_rdf_triple_format( + source: str, + target: str, input_format: str, + output_format: str, ) -> None: - """Map RDF triples to a wide CSV table (Layer 3 prototype). - - Each unique subject becomes a row. Each unique predicate becomes a column. - Multi-valued predicates are pipe-separated. - A companion .meta.json file is generated to preserve RDF datatype and - language tag information for lossless round trips. - - NOTE: This is a Layer 3 prototype. It is not yet tested and will be - properly implemented in the Layer 3 issue. - - Args: - input_file: Path to input RDF triples file. - output_file: Path to write output CSV file. - input_format: Source triple format name (must be in RDF_TRIPLE_FORMATS). 
- """ - handler = TripleHandler() - g = handler.read(input_file, input_format) - - predicates = sorted(set(str(p) for s, p, o in g)) - - subjects: dict = {} - column_metadata: dict = {} - - for s, p, o in g: - subj = str(s) - pred = str(p) - - if hasattr(o, "datatype") and o.datatype: - column_metadata[pred] = {"datatype": str(o.datatype)} - elif hasattr(o, "language") and o.language: - column_metadata[pred] = {"language": str(o.language)} - - if subj not in subjects: - subjects[subj] = {} - if pred not in subjects[subj]: - subjects[subj][pred] = [] - subjects[subj][pred].append(str(o)) - - tsd_handler = TSDHandler() - rows = [["resource"] + predicates] - for subj, pred_map in subjects.items(): - row = [subj] - for pred in predicates: - values = pred_map.get(pred, []) - row.append("|".join(values)) - rows.append(row) - - tsd_handler.write(rows, output_file, "csv") - - companion_file = output_file + ".meta.json" - with open(companion_file, "w", encoding="utf-8") as f: - json.dump({"columns": column_metadata}, f, indent=2) - - print(f"Converted RDF -> CSV: {os.path.basename(output_file)}") - print(f"Companion metadata: {os.path.basename(companion_file)}") + _triple_handler.convert(source, target, input_format, output_format) -# --------------------------------------------------------------------------- -# Main dispatcher — called from download pipeline -# --------------------------------------------------------------------------- - -# Handler instances — created once, reused -_triple_handler = TripleHandler() -_quad_handler = QuadHandler() -_tsd_handler = TSDHandler() - - -def convert_file( - input_file: str, - output_file: str, - convert_format: str, +def convert_rdf_quad_format( + source: str, + target: str, + input_format: str, + output_format: str, ) -> None: - """Main conversion dispatcher called from the download pipeline. 
- - Detects the input format from the file extension, determines whether - this is a Layer 2 (within-class) or Layer 3 (cross-class) conversion, - and delegates to the appropriate handler. - - Args: - input_file: Path to the input file (must be decompressed). - output_file: Path to write the converted output file. - convert_format: Target format name (CLI format string). - - Raises: - ValueError: If input format cannot be detected or conversion - is not supported. - """ - input_format = detect_format_from_filename(input_file) + _quad_handler.convert(source, target, input_format, output_format) - if input_format is None: - raise ValueError( - f"Could not detect input format from filename: " - f"'{os.path.basename(input_file)}'. " - f"Supported extensions: {list(EXTENSION_TO_FORMAT.keys())}" - ) - if input_format == convert_format: - print( - f"WARNING: Input and target format are both '{input_format}'. " - "Skipping conversion." - ) - return - - input_class = get_format_class(input_format) - output_class = get_format_class(convert_format) - - # --- Layer 2: within-class --- - if input_class == output_class: - if input_class == "triples": - _triple_handler.convert( - input_file, output_file, input_format, convert_format - ) - elif input_class == "quads": - _quad_handler.convert( - input_file, output_file, input_format, convert_format - ) - elif input_class == "tabular": - _tsd_handler.convert( - input_file, output_file, input_format, convert_format - ) - return - - # --- Layer 3: cross-class (prototype only) --- - if input_class == "triples" and output_class == "tabular": - convert_rdf_to_csv(input_file, output_file, input_format) - return - - raise ValueError( - f"Conversion from '{input_format}' ({input_class}) to " - f"'{convert_format}' ({output_class}) is not yet implemented. " - f"Supported Layer 3 conversions: RDF Triples -> CSV/TSV." 
- ) \ No newline at end of file +def convert_tabular_format( + source: str, + target: str, + input_format: str, + output_format: str, +) -> None: + _tsd_handler.convert(source, target, input_format, output_format) \ No newline at end of file diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 7c33fac..414511a 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -5,6 +5,8 @@ import lzma from typing import List, Optional, Tuple import re +import shutil +import tempfile from urllib.parse import urlparse import requests @@ -16,7 +18,7 @@ get_databus_id_parts_from_file_url, compute_sha256_and_length, ) -from databusclient.api.convert import convert_file, get_converted_filename +from databusclient.filehandling.format import convert_file, get_converted_filename # Compression format mappings COMPRESSION_EXTENSIONS = { @@ -508,87 +510,102 @@ def _download_file( f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}" ) - # --- 7. Convert compression format if requested (AFTER validation) --- - should_convert, source_format = _should_convert_file(file, convert_to, convert_from) - final_downloaded_file = filename - if should_convert and source_format: - target_filename = _get_converted_filename(file, source_format, convert_to) - target_filepath = os.path.join(localDir, target_filename) - _convert_compression_format( - filename, target_filepath, source_format, convert_to - ) - final_downloaded_file = target_filepath - - # --- 8. 
Convert file format if requested (AFTER compression conversion) --- - # Pipeline follows :decompress -> convert format -> recompress - # If the source was compressed, the converted output is recompressed: - # - to the format specified by --convert-to if provided - # - to the original compression format otherwise - if convert_format: - final_basename = os.path.basename(final_downloaded_file) - compression_fmt = _detect_compression_format(final_basename) - - if compression_fmt: - # File is still compressed — decompress to temp, convert, recompress - compression_ext = COMPRESSION_EXTENSIONS[compression_fmt] - if final_downloaded_file.lower().endswith(compression_ext): - temp_decompressed = final_downloaded_file[:-len(compression_ext)] - else: - temp_decompressed = final_downloaded_file + ".decompressed" - - try: - print( - f"Decompressing {final_basename} before format conversion..." - ) - source_module = COMPRESSION_MODULES[compression_fmt] - with source_module.open(final_downloaded_file, "rb") as sf: - with open(temp_decompressed, "wb") as tf: - while True: - chunk = sf.read(8192) - if not chunk: - break - tf.write(chunk) - - # Convert format on the decompressed temp file - converted_basename = get_converted_filename( - final_basename, convert_format - ) - converted_filepath = os.path.join(localDir, converted_basename) - convert_file(temp_decompressed, converted_filepath, convert_format) + # --- 7. Unified compression/format conversion pass --- + source_compression = _detect_compression_format(file) + should_convert_compression, source_format_for_convert_to = _should_convert_file( + file, convert_to, convert_from + ) + needs_format_conversion = convert_format is not None - # Recompress the converted output. - # Use --convert-to format if specified, otherwise use original compression. 
- recompress_fmt = convert_to if convert_to else compression_fmt - recompress_ext = COMPRESSION_EXTENSIONS[recompress_fmt] - recompressed_filepath = converted_filepath + recompress_ext - recompress_module = COMPRESSION_MODULES[recompress_fmt] + if not should_convert_compression and not needs_format_conversion: + return - print( - f"Recompressing converted file to {recompress_fmt}: " - f"{os.path.basename(recompressed_filepath)}" - ) - with open(converted_filepath, "rb") as sf: - with recompress_module.open(recompressed_filepath, "wb") as tf: - while True: - chunk = sf.read(8192) - if not chunk: - break - tf.write(chunk) - - # Remove the uncompressed converted file — keep only recompressed - if os.path.exists(converted_filepath): - os.remove(converted_filepath) - - finally: - # Always clean up temp decompressed file - if os.path.exists(temp_decompressed): - os.remove(temp_decompressed) + temp_paths: list[str] = [] + try: + # Compression-only path keeps existing conversion message behavior. + # Use a temp copy so the original downloaded file remains unchanged. + if should_convert_compression and not needs_format_conversion: + target_filename = _get_converted_filename( + file, source_format_for_convert_to, convert_to + ) + target_filepath = os.path.join(localDir, target_filename) + + with tempfile.NamedTemporaryFile( + delete=False, + suffix=COMPRESSION_EXTENSIONS[source_format_for_convert_to], + dir=localDir, + ) as temp_source_copy: + source_copy_path = temp_source_copy.name + temp_paths.append(source_copy_path) + + shutil.copyfile(filename, source_copy_path) + _convert_compression_format( + source_copy_path, + target_filepath, + source_format_for_convert_to, + convert_to, + ) + return + # Determine input for format conversion. + # If source is compressed, decompress once to a safe temporary file. 
+ conversion_input_path = filename + if source_compression is not None: + source_ext = COMPRESSION_EXTENSIONS[source_compression] + stripped_name = file + if stripped_name.lower().endswith(source_ext): + stripped_name = stripped_name[: -len(source_ext)] + _, format_ext = os.path.splitext(stripped_name) + + with tempfile.NamedTemporaryFile( + delete=False, + suffix=format_ext, + dir=localDir, + ) as temp_decompressed: + temp_decompressed_path = temp_decompressed.name + temp_paths.append(temp_decompressed_path) + + print(f"Decompressing {file}...") + with COMPRESSION_MODULES[source_compression].open(filename, "rb") as sf: + with open(temp_decompressed_path, "wb") as tf: + shutil.copyfileobj(sf, tf) + + conversion_input_path = temp_decompressed_path + + # Convert format on uncompressed input. + converted_basename = get_converted_filename(file, convert_format) + converted_uncompressed_path = os.path.join(localDir, converted_basename) + convert_file(conversion_input_path, converted_uncompressed_path, convert_format) + + # Recompress converted output when needed. + if source_compression is not None: + if should_convert_compression and convert_to: + final_compression = convert_to + else: + final_compression = source_compression + elif should_convert_compression and convert_to: + final_compression = convert_to else: - # File is already uncompressed — convert directly, no recompression needed - converted_filename = get_converted_filename(final_basename, convert_format) - converted_filepath = os.path.join(localDir, converted_filename) - convert_file(final_downloaded_file, converted_filepath, convert_format) + final_compression = None + + if final_compression is not None: + recompressed_path = ( + converted_uncompressed_path + COMPRESSION_EXTENSIONS[final_compression] + ) + print( + f"Recompressing {os.path.basename(converted_uncompressed_path)} -> {os.path.basename(recompressed_path)}..." 
+ ) + with open(converted_uncompressed_path, "rb") as sf: + with COMPRESSION_MODULES[final_compression].open( + recompressed_path, "wb" + ) as tf: + shutil.copyfileobj(sf, tf) + + os.remove(converted_uncompressed_path) + finally: + for temp_path in temp_paths: + if os.path.exists(temp_path): + os.remove(temp_path) def _download_files( diff --git a/databusclient/filehandling/format.py b/databusclient/filehandling/format.py new file mode 100644 index 0000000..1b625b8 --- /dev/null +++ b/databusclient/filehandling/format.py @@ -0,0 +1,511 @@ +"""Format and Mapping Conversion Layer. + +This module implements the format conversion pipeline for the Databus Python Client + +Layer 2: Within-class format conversion (lossless). + - TripleHandler: RDF triple formats (turtle, ntriples, rdf-xml) + - QuadHandler: RDF quad formats (nquads, trig, trix, json-ld) + - TSDHandler: Tabular formats (csv, tsv) + +Each handler provides read() -> IR, write(IR) -> file, convert() -> chains both. +The IR (intermediate representation) returned by read() is designed to be passed +to future mapping classes (TripleToQuadMapper, TripleToTSDMapper, etc.). +""" + +import csv +import os +import shutil +import warnings +from typing import Optional + +from rdflib import Dataset, Graph + +# Suppress rdflib internal DeprecationWarning for Dataset API. +# rdflib is mid-migration from ConjunctiveGraph to Dataset in 7.x. +# These warnings originate from rdflib internals, not our code. +# Can be removed when rdflib completes their Dataset API migration. 
+warnings.filterwarnings("ignore", category=DeprecationWarning, module="rdflib") +warnings.filterwarnings("ignore", category=UserWarning, module="rdflib") + + +# --------------------------------------------------------------------------- +# Format registries +# --------------------------------------------------------------------------- + +# Maps CLI format name -> rdflib format string +RDF_TRIPLE_FORMATS = { + "ntriples": "ntriples", + "turtle": "turtle", + "rdf-xml": "xml", +} + +RDF_QUAD_FORMATS = { + "nquads": "nquads", + "trig": "trig", + "trix": "trix", + "json-ld": "json-ld", +} + +TABULAR_FORMATS = { + "csv": ",", + "tsv": "\t", +} + +ALL_FORMATS = ( + list(RDF_TRIPLE_FORMATS) + + list(RDF_QUAD_FORMATS) + + list(TABULAR_FORMATS) +) + +# Maps file extension -> CLI format name +EXTENSION_TO_FORMAT = { + ".ttl": "turtle", + ".nt": "ntriples", + ".rdf": "rdf-xml", + ".xml": "rdf-xml", + ".owl": "rdf-xml", + ".nq": "nquads", + ".trig": "trig", + ".trix": "trix", + ".jsonld": "json-ld", + ".json": "json-ld", + ".csv": "csv", + ".tsv": "tsv", +} + +# Maps format name -> file extension +FORMAT_TO_EXTENSION = { + "ntriples": ".nt", + "turtle": ".ttl", + "rdf-xml": ".rdf", + "nquads": ".nq", + "trig": ".trig", + "trix": ".trix", + "json-ld": ".jsonld", + "csv": ".csv", + "tsv": ".tsv", +} + + +# --------------------------------------------------------------------------- +# Format detection helpers +# --------------------------------------------------------------------------- + +def detect_format_from_filename(filename: str) -> Optional[str]: + """Detect format from file extension, ignoring compression extensions. + + Args: + filename: File name or path. + + Returns: + Format name string or None if not detectable. 
+ """ + name = filename.lower() + + # strip compression extension first + for ext in (".bz2", ".gz", ".xz"): + if name.endswith(ext): + name = name[: -len(ext)] + break + + # match longest extension first to avoid .json matching before .jsonld + for ext in sorted(EXTENSION_TO_FORMAT.keys(), key=len, reverse=True): + if name.endswith(ext): + return EXTENSION_TO_FORMAT[ext] + + return None + + +def get_format_class(fmt: str) -> str: + """Return equivalence class for a format name. + + Args: + fmt: Format name (e.g. 'turtle', 'nquads', 'csv'). + + Returns: + 'triples', 'quads', or 'tabular'. + + Raises: + ValueError: If format is not recognised. + """ + if fmt in RDF_TRIPLE_FORMATS: + return "triples" + if fmt in RDF_QUAD_FORMATS: + return "quads" + if fmt in TABULAR_FORMATS: + return "tabular" + raise ValueError( + f"Unknown format: '{fmt}'. Supported formats: {ALL_FORMATS}" + ) + + +def get_converted_filename(original_filename: str, convert_format: str) -> str: + """Generate output filename after format conversion. + + Strips compression extension if present, then replaces the format + extension with the target format extension. + + Args: + original_filename: Original file name (basename only, not full path). + convert_format: Target format name. + + Returns: + New filename with updated extension. 
+ """ + name = original_filename + + # strip compression extension + for ext in (".bz2", ".gz", ".xz"): + if name.lower().endswith(ext): + name = name[: -len(ext)] + break + + # strip existing format extension (longest first) + for old_ext in sorted(FORMAT_TO_EXTENSION.values(), key=len, reverse=True): + if name.lower().endswith(old_ext): + name = name[: -len(old_ext)] + break + + target_ext = FORMAT_TO_EXTENSION.get(convert_format, f".{convert_format}") + return name + target_ext + + +# --------------------------------------------------------------------------- +# Layer 2 Handlers +# --------------------------------------------------------------------------- + +class TripleHandler: + """Handler for RDF triple formats (Layer 2). + + Uses rdflib.Graph as the intermediate representation (IR). + Supports: ntriples, turtle, rdf-xml. + + The IR returned by read() can be passed to future mapping classes + such as TripleToQuadMapper or TripleToTSDMapper for Layer 3 conversions. + """ + + def read(self, source: str, input_format: str) -> Graph: + """Parse an RDF triples file into a Graph (IR). + + Args: + source: Path to input file. + input_format: Source format name (e.g. 'turtle', 'ntriples', 'rdf-xml'). + + Returns: + rdflib.Graph containing all parsed triples. + + Raises: + ValueError: If input_format is not a recognised triple format. + """ + if input_format not in RDF_TRIPLE_FORMATS: + raise ValueError( + f"'{input_format}' is not a triple format. " + f"Supported: {list(RDF_TRIPLE_FORMATS)}" + ) + g = Graph() + g.parse(source, format=RDF_TRIPLE_FORMATS[input_format]) + return g + + def write(self, data: Graph, target: str, output_format: str) -> None: + """Serialize a Graph (IR) to a file. + + Args: + data: rdflib.Graph to serialize. + target: Path to output file. + output_format: Target format name (e.g. 'ntriples', 'turtle'). + + Raises: + ValueError: If output_format is not a recognised triple format. 
+ """ + if output_format not in RDF_TRIPLE_FORMATS: + raise ValueError( + f"'{output_format}' is not a triple format. " + f"Supported: {list(RDF_TRIPLE_FORMATS)}" + ) + parent = os.path.dirname(target) + if parent: + os.makedirs(parent, exist_ok=True) + # Explicitly specify utf-8 encoding to avoid NTSerializer warning + data.serialize( + destination=target, + format=RDF_TRIPLE_FORMATS[output_format], + encoding="utf-8", + ) + + def convert( + self, + source: str, + target: str, + input_format: str, + output_format: str, + ) -> None: + """Convert between RDF triple formats (Layer 2, lossless). + + Chains read() -> write(). Both formats must be in the same + equivalence class (RDF triples). + + Args: + source: Path to input file. + target: Path to output file. + input_format: Source format name. + output_format: Target format name. + """ + graph = self.read(source, input_format) + self.write(graph, target, output_format) + print( + f"Converted {input_format} -> {output_format}: " + f"{os.path.basename(target)}" + ) + + +class QuadHandler: + """Handler for RDF quad formats (Layer 2). + + Uses rdflib.Dataset as the intermediate representation (IR). + Supports: nquads, trig, trix, json-ld. + + Named graph information is preserved through the Dataset IR. + The IR returned by read() can be passed to future mapping classes + such as QuadToTripleMapper or QuadToTSDMapper for Layer 3 conversions. + """ + + def read(self, source: str, input_format: str) -> Dataset: + """Parse an RDF quads file into a Dataset (IR). + + Args: + source: Path to input file. + input_format: Source format name (e.g. 'nquads', 'trig', 'trix', 'json-ld'). + + Returns: + rdflib.Dataset containing all parsed quads with named graphs. + + Raises: + ValueError: If input_format is not a recognised quad format. + """ + if input_format not in RDF_QUAD_FORMATS: + raise ValueError( + f"'{input_format}' is not a quad format. 
" + f"Supported: {list(RDF_QUAD_FORMATS)}" + ) + d = Dataset() + d.parse(source, format=RDF_QUAD_FORMATS[input_format]) + return d + + def write(self, data: Dataset, target: str, output_format: str) -> None: + """Serialize a Dataset (IR) to a file. + + Args: + data: rdflib.Dataset to serialize. + target: Path to output file. + output_format: Target format name. + + Raises: + ValueError: If output_format is not a recognised quad format. + """ + if output_format not in RDF_QUAD_FORMATS: + raise ValueError( + f"'{output_format}' is not a quad format. " + f"Supported: {list(RDF_QUAD_FORMATS)}" + ) + parent = os.path.dirname(target) + if parent: + os.makedirs(parent, exist_ok=True) + data.serialize( + destination=target, + format=RDF_QUAD_FORMATS[output_format], + ) + + def convert( + self, + source: str, + target: str, + input_format: str, + output_format: str, + ) -> None: + """Convert between RDF quad formats (Layer 2, lossless). + + Chains read() -> write(). Both formats must be in the same + equivalence class (RDF quads). Named graph information is preserved. + + Args: + source: Path to input file. + target: Path to output file. + input_format: Source format name. + output_format: Target format name. + """ + dataset = self.read(source, input_format) + self.write(dataset, target, output_format) + print( + f"Converted {input_format} -> {output_format}: " + f"{os.path.basename(target)}" + ) + + +class TSDHandler: + """Handler for tabular structured data formats (Layer 2). + + Uses list[list[str]] as the intermediate representation (IR). + Supports: csv, tsv. + + The IR returned by read() can be passed to future mapping classes + such as TSDToTripleMapper for Layer 3 conversions. + """ + + def read(self, source: str, input_format: str) -> list: + """Parse a tabular file into a list of rows (IR). + + Each row is a list of string values. First row is the header. + + Args: + source: Path to input file. + input_format: Source format name ('csv' or 'tsv'). 
+ + Returns: + list[list[str]] where first element is the header row. + + Raises: + ValueError: If input_format is not a recognised tabular format. + """ + if input_format not in TABULAR_FORMATS: + raise ValueError( + f"'{input_format}' is not a tabular format. " + f"Supported: {list(TABULAR_FORMATS)}" + ) + delimiter = TABULAR_FORMATS[input_format] + with open(source, "r", newline="", encoding="utf-8") as f: + reader = csv.reader(f, delimiter=delimiter) + return list(reader) + + def write(self, data: list, target: str, output_format: str) -> None: + """Serialize a list of rows (IR) to a tabular file. + + Args: + data: list[list[str]] to write. + target: Path to output file. + output_format: Target format name ('csv' or 'tsv'). + + Raises: + ValueError: If output_format is not a recognised tabular format. + """ + if output_format not in TABULAR_FORMATS: + raise ValueError( + f"'{output_format}' is not a tabular format. " + f"Supported: {list(TABULAR_FORMATS)}" + ) + parent = os.path.dirname(target) + if parent: + os.makedirs(parent, exist_ok=True) + delimiter = TABULAR_FORMATS[output_format] + with open(target, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f, delimiter=delimiter) + writer.writerows(data) + + def convert( + self, + source: str, + target: str, + input_format: str, + output_format: str, + ) -> None: + """Convert between tabular formats (Layer 2, lossless). + + Chains read() -> write(). Both formats must be in the same + equivalence class (tabular). + + Args: + source: Path to input file. + target: Path to output file. + input_format: Source format name. + output_format: Target format name. 
+ """ + rows = self.read(source, input_format) + self.write(rows, target, output_format) + print( + f"Converted {input_format} -> {output_format}: " + f"{os.path.basename(target)}" + ) + + +# --------------------------------------------------------------------------- +# Main dispatcher — called from download pipeline +# --------------------------------------------------------------------------- + +# Handler instances — created once, reused +_triple_handler = TripleHandler() +_quad_handler = QuadHandler() +_tsd_handler = TSDHandler() + + +def convert_file( + input_file: str, + output_file: str, + convert_format: str, +) -> None: + """Main conversion dispatcher called from the download pipeline. + + Detects the input format from the file extension, determines whether + this is a Layer 2 (within-class) or Layer 3 (cross-class) conversion, + and delegates to the appropriate handler. + + Args: + input_file: Path to the input file (must be decompressed). + output_file: Path to write the converted output file. + convert_format: Target format name (CLI format string). + + Raises: + ValueError: If input format cannot be detected or conversion + is not supported. + """ + input_format = detect_format_from_filename(input_file) + + if input_format is None: + raise ValueError( + f"Could not detect input format from filename: " + f"'{os.path.basename(input_file)}'. " + f"Supported extensions: {list(EXTENSION_TO_FORMAT.keys())}" + ) + + if input_format == convert_format: + # Input and target format are identical. + # Copy input to output path so the caller always receives an output file. + # This is important for the download pipeline which expects an output + # file to exist after convert_file() returns — e.g. for recompression. + if input_file != output_file: + shutil.copy2(input_file, output_file) + print( + f"Input and target format are both '{input_format}'. 
" + f"Copied to output path: {os.path.basename(output_file)}" + ) + return + + input_class = get_format_class(input_format) + output_class = get_format_class(convert_format) + + # --- Layer 2: within-class --- + if input_class == output_class: + if input_class == "triples": + _triple_handler.convert( + input_file, output_file, input_format, convert_format + ) + elif input_class == "quads": + _quad_handler.convert( + input_file, output_file, input_format, convert_format + ) + elif input_class == "tabular": + _tsd_handler.convert( + input_file, output_file, input_format, convert_format + ) + return + + # --- Layer 3: cross-class (prototype only) --- + if input_class == "triples" and output_class == "tabular": + from databusclient.filehandling.mapping import convert_rdf_to_csv + + convert_rdf_to_csv(input_file, output_file, input_format) + return + + raise ValueError( + f"Conversion from '{input_format}' ({input_class}) to " + f"'{convert_format}' ({output_class}) is not yet implemented. " + f"Supported Layer 3 conversions: RDF Triples -> CSV/TSV." + ) diff --git a/databusclient/filehandling/mapping.py b/databusclient/filehandling/mapping.py new file mode 100644 index 0000000..93b5a00 --- /dev/null +++ b/databusclient/filehandling/mapping.py @@ -0,0 +1,68 @@ +"""Layer 3 prototype mapping handlers.""" + +import json +import os + +from databusclient.filehandling.format import TSDHandler, TripleHandler + + +def convert_rdf_to_csv( + input_file: str, + output_file: str, + input_format: str, +) -> None: + """Map RDF triples to a wide CSV table (Layer 3 prototype). + + Each unique subject becomes a row. Each unique predicate becomes a column. + Multi-valued predicates are pipe-separated. + A companion .meta.json file is generated to preserve RDF datatype and + language tag information for lossless round trips. + + NOTE: This is a Layer 3 prototype. It is not yet tested and will be + properly implemented in the Layer 3 issue. 
+ + Args: + input_file: Path to input RDF triples file. + output_file: Path to write output CSV file. + input_format: Source triple format name (must be in RDF_TRIPLE_FORMATS). + """ + handler = TripleHandler() + g = handler.read(input_file, input_format) + + predicates = sorted(set(str(p) for s, p, o in g)) + + subjects: dict = {} + column_metadata: dict = {} + + for s, p, o in g: + subj = str(s) + pred = str(p) + + if hasattr(o, "datatype") and o.datatype: + column_metadata[pred] = {"datatype": str(o.datatype)} + elif hasattr(o, "language") and o.language: + column_metadata[pred] = {"language": str(o.language)} + + if subj not in subjects: + subjects[subj] = {} + if pred not in subjects[subj]: + subjects[subj][pred] = [] + subjects[subj][pred].append(str(o)) + + tsd_handler = TSDHandler() + rows = [["resource"] + predicates] + for subj, pred_map in subjects.items(): + row = [subj] + for pred in predicates: + values = pred_map.get(pred, []) + row.append("|".join(values)) + rows.append(row) + + tsd_handler.write(rows, output_file, "csv") + + companion_file = output_file + ".meta.json" + with open(companion_file, "w", encoding="utf-8") as f: + json.dump({"columns": column_metadata}, f, indent=2) + + print(f"Converted RDF -> CSV: {os.path.basename(output_file)}") + print(f"Companion metadata: {os.path.basename(companion_file)}") diff --git a/pyproject.toml b/pyproject.toml index 72179cc..9759c07 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,9 @@ databusclient = "databusclient.cli:app" target-version = "py311" src = ["databusclient", "tests"] +[tool.ruff.lint.per-file-ignores] +"tests/test_format_round_trips.py" = ["F841"] + [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" diff --git a/run_all_conversion_tests.py b/run_all_conversion_tests.py index 384e052..98f9bbb 100644 --- a/run_all_conversion_tests.py +++ b/run_all_conversion_tests.py @@ -5,6 +5,10 @@ Test file for testing with real datasets from databus. 
""" +# TODO: This script is a temporary manual integration test artifact. +# It must be removed or rewritten as proper pytest integration tests +# before the final PR. Do not commit this file to the upstream repo. + import os from databusclient.api.convert import ( convert_rdf_triple_format, From 180c255ed3961f454d8af1b71bdff03101d8284f Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Fri, 12 Jun 2026 17:12:11 +0530 Subject: [PATCH 4/7] Review comments resolved under issue #59 --- databusclient/api/download.py | 7 ++++ databusclient/cli.py | 18 ++++++++-- databusclient/filehandling/format.py | 51 ++++++++++++++++++++++++++-- 3 files changed, 70 insertions(+), 6 deletions(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 414511a..d7ec030 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -577,6 +577,13 @@ def _download_file( converted_uncompressed_path = os.path.join(localDir, converted_basename) convert_file(conversion_input_path, converted_uncompressed_path, convert_format) + # Delete the original downloaded file after successful format conversion, + # unless the converted output is the same file (same format, same path). + if os.path.abspath(filename) != os.path.abspath(converted_uncompressed_path): + if os.path.exists(filename): + os.remove(filename) + print(f"Removed original file: {os.path.basename(filename)}") + # Recompress converted output when needed. if source_compression is not None: if should_convert_compression and convert_to: diff --git a/databusclient/cli.py b/databusclient/cli.py index c687616..e998d4e 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -190,13 +190,25 @@ def deploy( help="Source compression format to convert from (optional filter). 
Only files with this compression will be converted.", ) @click.option( - "--convert-format", + "--format", "convert_format", type=click.Choice( - ["ntriples","turtle","rdf-xml","nquads","trig","trix","json-ld","csv","tsv"], + [ + "ntriples", "nt", + "turtle", "ttl", + "rdf-xml", "rdf", "xml", + "nquads", "nq", + "trig", + "trix", + "json-ld", "jsonld", + "csv", + "tsv", + ], case_sensitive=False, ), - help="Target format for on-the-fly format conversion during download (Layer 2 and Layer 3).", + help="Target format for on-the-fly format conversion during download (Layer 2 and Layer 3). " + "Accepts full names (ntriples, turtle, rdf-xml, nquads, trig, trix, json-ld, csv, tsv) " + "or short aliases (nt, ttl, rdf, xml, nq, jsonld).", ) @click.option( "--validate-checksum", is_flag=True, help="Validate checksums of downloaded files" diff --git a/databusclient/filehandling/format.py b/databusclient/filehandling/format.py index 1b625b8..7c40109 100644 --- a/databusclient/filehandling/format.py +++ b/databusclient/filehandling/format.py @@ -57,6 +57,42 @@ + list(TABULAR_FORMATS) ) +# Maps short CLI aliases -> canonical format name +FORMAT_ALIASES = { + "nt": "ntriples", + "ttl": "turtle", + "rdf": "rdf-xml", + "xml": "rdf-xml", + "nq": "nquads", + "jsonld": "json-ld", +} + +def normalize_format(fmt: str) -> str: + """Normalize a format name or alias to its canonical form. + + Accepts both full names (e.g. 'ntriples') and short aliases (e.g. 'nt'). + Canonical names pass through unchanged. Unknown values raise ValueError. + + Args: + fmt: Format name or alias string (case-insensitive). + + Returns: + Canonical format name string. + + Raises: + ValueError: If fmt is not a recognised format name or alias. + """ + fmt_lower = fmt.lower() + # Resolve alias first + canonical = FORMAT_ALIASES.get(fmt_lower, fmt_lower) + if canonical not in ALL_FORMATS: + raise ValueError( + f"Unknown format: '{fmt}'. " + f"Supported formats: {ALL_FORMATS}. 
" + f"Supported aliases: {list(FORMAT_ALIASES.keys())}" + ) + return canonical + # Maps file extension -> CLI format name EXTENSION_TO_FORMAT = { ".ttl": "turtle", @@ -143,15 +179,18 @@ def get_converted_filename(original_filename: str, convert_format: str) -> str: """Generate output filename after format conversion. Strips compression extension if present, then replaces the format - extension with the target format extension. + extension with the target format extension. Accepts format aliases. Args: original_filename: Original file name (basename only, not full path). - convert_format: Target format name. + convert_format: Target format name or alias. Returns: New filename with updated extension. """ + # Normalize alias to canonical name + convert_format = normalize_format(convert_format) + name = original_filename # strip compression extension @@ -447,15 +486,21 @@ def convert_file( this is a Layer 2 (within-class) or Layer 3 (cross-class) conversion, and delegates to the appropriate handler. + Accepts both canonical format names and short aliases (e.g. 'nt' for + 'ntriples', 'ttl' for 'turtle'). See normalize_format() for full list. + Args: input_file: Path to the input file (must be decompressed). output_file: Path to write the converted output file. - convert_format: Target format name (CLI format string). + convert_format: Target format name or alias (CLI format string). Raises: ValueError: If input format cannot be detected or conversion is not supported. 
""" + # Normalize alias to canonical name before any processing + convert_format = normalize_format(convert_format) + input_format = detect_format_from_filename(input_file) if input_format is None: From 34039f23a616e2e9df2e3fa4e66b8888fc47a06e Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Sat, 13 Jun 2026 18:36:26 +0530 Subject: [PATCH 5/7] #61: replacing --convert_from & --convert_to with --compression --- databusclient/api/download.py | 123 +++++++++++---------------- databusclient/cli.py | 18 ++-- tests/test_compression_conversion.py | 41 +++++---- 3 files changed, 79 insertions(+), 103 deletions(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index d7ec030..56cf07c 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -50,20 +50,23 @@ def _detect_compression_format(filename: str) -> Optional[str]: return None -def _should_convert_file( - filename: str, convert_to: Optional[str], convert_from: Optional[str] +def _should_convert_compression( + filename: str, compression: Optional[str] ) -> Tuple[bool, Optional[str]]: - """Determine if a file should be converted and what the source format is. + """Determine if a file should have its compression format converted. + + Source compression is detected automatically from the file extension. + All compressed files will be converted to the target format regardless + of their source compression format. Args: filename: Name of the file. - convert_to: Target compression format ('bz2', 'gz', 'xz'). - convert_from: Optional source compression format filter. + compression: Target compression format ('bz2', 'gz', 'xz') or None. Returns: Tuple of (should_convert: bool, source_format: Optional[str]). 
""" - if not convert_to: + if not compression: return False, None source_format = _detect_compression_format(filename) @@ -73,11 +76,7 @@ def _should_convert_file( return False, None # If source and target are the same, skip conversion - if source_format == convert_to: - return False, None - - # If convert_from is specified, only convert matching formats - if convert_from and source_format != convert_from: + if source_format == compression: return False, None return True, source_format @@ -314,8 +313,7 @@ def _download_file( databus_key=None, auth_url=None, client_id=None, - convert_to=None, - convert_from=None, + compression=None, convert_format=None, validate_checksum: bool = False, expected_checksum: str | None = None, @@ -329,8 +327,8 @@ def _download_file( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. + Source compression is auto-detected from the file extension. convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. expected_checksum: The expected checksum of the file. @@ -354,6 +352,7 @@ def _download_file( dirpath = os.path.dirname(filename) if dirpath: os.makedirs(dirpath, exist_ok=True) # Create the necessary directories + # --- 1. Get redirect URL by requesting HEAD --- headers = {} @@ -512,8 +511,8 @@ def _download_file( # --- 7. 
Unified compression/format conversion pass --- source_compression = _detect_compression_format(file) - should_convert_compression, source_format_for_convert_to = _should_convert_file( - file, convert_to, convert_from + should_convert_compression, source_fmt = _should_convert_compression( + file, compression ) needs_format_conversion = convert_format is not None @@ -525,14 +524,12 @@ def _download_file( # Compression-only path keeps existing conversion message behavior. # Use a temp copy so the original downloaded file remains unchanged. if should_convert_compression and not needs_format_conversion: - target_filename = _get_converted_filename( - file, source_format_for_convert_to, convert_to - ) + target_filename = _get_converted_filename(file, source_fmt, compression) target_filepath = os.path.join(localDir, target_filename) with tempfile.NamedTemporaryFile( delete=False, - suffix=COMPRESSION_EXTENSIONS[source_format_for_convert_to], + suffix=COMPRESSION_EXTENSIONS[source_fmt], dir=localDir, ) as temp_source_copy: source_copy_path = temp_source_copy.name @@ -542,8 +539,8 @@ def _download_file( _convert_compression_format( source_copy_path, target_filepath, - source_format_for_convert_to, - convert_to, + source_fmt, + compression, ) return @@ -586,12 +583,12 @@ def _download_file( # Recompress converted output when needed. 
if source_compression is not None: - if should_convert_compression and convert_to: - final_compression = convert_to + if should_convert_compression and compression: + final_compression = compression else: final_compression = source_compression - elif should_convert_compression and convert_to: - final_compression = convert_to + elif should_convert_compression and compression: + final_compression = compression else: final_compression = None @@ -622,8 +619,7 @@ def _download_files( databus_key: str = None, auth_url: str = None, client_id: str = None, - convert_to: str = None, - convert_from: str = None, + compression: str = None, convert_format: str = None, validate_checksum: bool = False, checksums: dict | None = None, @@ -637,8 +633,7 @@ def _download_files( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. checksums: Dictionary mapping URLs to their expected checksums. @@ -654,8 +649,7 @@ def _download_files( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, expected_checksum=expected, @@ -803,8 +797,7 @@ def _download_collection( databus_key: str = None, auth_url: str = None, client_id: str = None, - convert_to: str = None, - convert_from: str = None, + compression: str = None, convert_format: str = None, validate_checksum: bool = False, ) -> None: @@ -818,8 +811,7 @@ def _download_collection( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. 
client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ @@ -840,8 +832,7 @@ def _download_collection( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums if checksums else None, @@ -855,8 +846,7 @@ def _download_version( databus_key: str = None, auth_url: str = None, client_id: str = None, - convert_to: str = None, - convert_from: str = None, + compression: str = None, convert_format: str = None, validate_checksum: bool = False, ) -> None: @@ -869,8 +859,7 @@ def _download_version( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. 
""" @@ -890,8 +879,7 @@ def _download_version( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums, @@ -906,8 +894,7 @@ def _download_artifact( databus_key: str = None, auth_url: str = None, client_id: str = None, - convert_to: str = None, - convert_from: str = None, + compression: str = None, convert_format: str = None, validate_checksum: bool = False, ) -> None: @@ -921,8 +908,7 @@ def _download_artifact( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ @@ -948,8 +934,7 @@ def _download_artifact( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums, @@ -1025,8 +1010,7 @@ def _download_group( databus_key: str = None, auth_url: str = None, client_id: str = None, - convert_to: str = None, - convert_from: str = None, + compression: str = None, convert_format: str = None, validate_checksum: bool = False, ) -> None: @@ -1040,8 +1024,7 @@ def _download_group( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. - convert_to: Target compression format for on-the-fly conversion. - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion. 
convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. """ @@ -1057,8 +1040,7 @@ def _download_group( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, ) @@ -1107,8 +1089,7 @@ def download( all_versions=None, auth_url="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", client_id="vault-token-exchange", - convert_to=None, - convert_from=None, + compression=None, convert_format=None, validate_checksum: bool = False, ) -> None: @@ -1124,8 +1105,8 @@ def download( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. Default is "https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token". client_id: Client ID for token exchange. Default is "vault-token-exchange". - convert_to: Target compression format for on-the-fly conversion (supported: bz2, gz, xz). - convert_from: Optional source compression format filter. + compression: Target compression format for on-the-fly conversion (supported: bz2, gz, xz). + Source compression is auto-detected from the file extension. convert_format: Target RDF/tabular format for on-the-fly conversion. validate_checksum: Whether to validate checksums after downloading. 
""" @@ -1154,8 +1135,7 @@ def download( databus_key, auth_url, client_id, - convert_to, - convert_from, + compression, convert_format, validate_checksum=validate_checksum, ) @@ -1176,8 +1156,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, expected_checksum=expected, @@ -1191,8 +1170,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, ) @@ -1208,8 +1186,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, ) @@ -1225,8 +1202,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, ) @@ -1264,9 +1240,8 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, checksums=checksums if checksums else None, - ) + ) \ No newline at end of file diff --git a/databusclient/cli.py b/databusclient/cli.py index e998d4e..50f0766 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -180,14 +180,12 @@ def deploy( help="Client ID for token exchange", ) @click.option( - "--convert-to", + "--compression", + "compression", type=click.Choice(["bz2", "gz", "xz"], case_sensitive=False), - help="Target compression format for on-the-fly conversion during download (supported: bz2, gz, xz)", -) 
-@click.option( - "--convert-from", - type=click.Choice(["bz2", "gz", "xz"], case_sensitive=False), - help="Source compression format to convert from (optional filter). Only files with this compression will be converted.", + help="Target compression format for on-the-fly conversion during download. " + "Source compression is detected automatically from the file extension. " + "All compressed files will be converted to the target format (bz2, gz, xz).", ) @click.option( "--format", @@ -222,8 +220,7 @@ def download( all_versions, authurl, clientid, - convert_to, - convert_from, + compression, convert_format, validate_checksum, ): @@ -241,8 +238,7 @@ def download( all_versions=all_versions, auth_url=authurl, client_id=clientid, - convert_to=convert_to, - convert_from=convert_from, + compression=compression, convert_format=convert_format, validate_checksum=validate_checksum, ) diff --git a/tests/test_compression_conversion.py b/tests/test_compression_conversion.py index 71ada16..8effa1b 100644 --- a/tests/test_compression_conversion.py +++ b/tests/test_compression_conversion.py @@ -8,7 +8,7 @@ import pytest from databusclient.api.download import ( _detect_compression_format, - _should_convert_file, + _should_convert_compression, _get_converted_filename, _convert_compression_format, ) @@ -23,37 +23,42 @@ def test_detect_compression_format(): assert _detect_compression_format("FILE.TXT.GZ") == "gz" # case insensitive -def test_should_convert_file(): - """Test file conversion decision logic""" +def test_should_convert_compression(): + """Test file compression conversion decision logic. + + With --compression, source format is auto-detected from the file extension. + All compressed files are converted to the target format regardless of their + source compression format (no convert_from filter). 
+ """ # No conversion target specified - should_convert, source = _should_convert_file("file.txt.bz2", None, None) + should_convert, source = _should_convert_compression("file.txt.bz2", None) assert should_convert is False assert source is None - # Uncompressed file - should_convert, source = _should_convert_file("file.txt", "gz", None) + # Uncompressed file — never converted + should_convert, source = _should_convert_compression("file.txt", "gz") assert should_convert is False assert source is None - # Same source and target - should_convert, source = _should_convert_file("file.txt.gz", "gz", None) + # Same source and target — skip (no-op) + should_convert, source = _should_convert_compression("file.txt.gz", "gz") assert should_convert is False assert source is None - # Valid conversion - should_convert, source = _should_convert_file("file.txt.bz2", "gz", None) + # bz2 -> gz: should convert, source auto-detected + should_convert, source = _should_convert_compression("file.txt.bz2", "gz") assert should_convert is True assert source == "bz2" - # With convert_from filter matching - should_convert, source = _should_convert_file("file.txt.bz2", "gz", "bz2") + # xz -> gz: should convert regardless of source format (no filter) + should_convert, source = _should_convert_compression("file.txt.xz", "gz") assert should_convert is True - assert source == "bz2" + assert source == "xz" - # With convert_from filter not matching - should_convert, source = _should_convert_file("file.txt.bz2", "gz", "xz") - assert should_convert is False - assert source is None + # gz -> bz2: should convert + should_convert, source = _should_convert_compression("file.txt.gz", "bz2") + assert should_convert is True + assert source == "gz" def test_get_converted_filename(): @@ -195,4 +200,4 @@ def test_corrupted_file_handling(): if __name__ == "__main__": - pytest.main([__file__, "-v"]) + pytest.main([__file__, "-v"]) \ No newline at end of file From cd5e990efe53b62aa744bcd110019e68284b285c Mon Sep 
17 00:00:00 2001 From: DhanashreePetare Date: Mon, 15 Jun 2026 01:06:58 +0530 Subject: [PATCH 6/7] gsoc26: layer2 complete + implementation for #61 --- databusclient/api/download.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 56cf07c..41b1f2a 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -521,23 +521,14 @@ def _download_file( temp_paths: list[str] = [] try: - # Compression-only path keeps existing conversion message behavior. - # Use a temp copy so the original downloaded file remains unchanged. + # Compression-only path: convert directly from the downloaded file. + # _convert_compression_format deletes the source after success, + # so the original downloaded file is removed automatically. if should_convert_compression and not needs_format_conversion: target_filename = _get_converted_filename(file, source_fmt, compression) target_filepath = os.path.join(localDir, target_filename) - - with tempfile.NamedTemporaryFile( - delete=False, - suffix=COMPRESSION_EXTENSIONS[source_fmt], - dir=localDir, - ) as temp_source_copy: - source_copy_path = temp_source_copy.name - temp_paths.append(source_copy_path) - - shutil.copyfile(filename, source_copy_path) _convert_compression_format( - source_copy_path, + filename, target_filepath, source_fmt, compression, From 549ea3bcc0f226eef39ba8123eb4410337a5b90e Mon Sep 17 00:00:00 2001 From: DhanashreePetare Date: Wed, 17 Jun 2026 01:54:43 +0530 Subject: [PATCH 7/7] docs: update README for --format and --compression flags --- README.md | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 354c732..2e55ae9 100644 --- a/README.md +++ b/README.md @@ -174,10 +174,10 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOAD Note: Vault tokens are only required for certain protected Databus hosts (for 
example: `data.dbpedia.io`, `data.dev.dbpedia.link`). The client now detects those hosts and will fail early with a clear message if a token is required but not provided. Do not pass `--vault-token` for public downloads. - `--databus-key` - If the databus is protected and needs API key authentication, you can provide the API key with `--databus-key YOUR_API_KEY`. -- `--convert-to` - - Enables on-the-fly compression format conversion during download. Supported formats: `bz2`, `gz`, `xz`. Downloaded files will be automatically decompressed and recompressed to the target format. Example: `--convert-to gz` converts all downloaded compressed files to gzip format. -- `--convert-from` - - Optional filter to specify which source compression format should be converted. Use with `--convert-to` to convert only files with a specific compression format. Example: `--convert-to gz --convert-from bz2` converts only `.bz2` files to `.gz`, leaving other formats unchanged. +- `--compression` + - Enables on-the-fly compression format conversion during download. Supported formats: `bz2`, `gz`, `xz`. The source compression is auto-detected from the file extension. Example: `--compression gz` converts all downloaded compressed files to gzip format. +- `--format` + - Enables on-the-fly RDF and tabular format conversion during download (Layer 2). Supported formats: `ntriples` (`nt`), `turtle` (`ttl`), `rdf-xml` (`rdf`, `xml`), `nquads` (`nq`), `trig`, `trix`, `json-ld` (`jsonld`), `csv`, `tsv`. Short aliases shown in brackets. Only the converted output file is kept — the original is deleted after successful conversion. Example: `--format turtle` converts all downloaded RDF triple files to Turtle format. - `--validate-checksum` - Validates the checksums of downloaded files against the checksums provided by the Databus. If a checksum does not match, an error is raised and the file is deleted. 
@@ -272,16 +272,28 @@ databusclient download 'PREFIX dcat: SELECT ?x WHER docker run --rm -v $(pwd):/data dbpedia/databus-python-client download 'PREFIX dcat: SELECT ?x WHERE { ?sub dcat:downloadURL ?x . } LIMIT 10' --databus https://databus.dbpedia.org/sparql ``` -**Download with Compression Conversion**: download files and convert them to a different compression format on-the-fly +**Download with Compression Conversion**: download files and convert compression format on-the-fly. Source compression is auto-detected from the file extension. ```bash # Convert all compressed files to gzip format -databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 --convert-to gz - -# Convert only bz2 files to xz format, leaving other compressions unchanged -databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals --convert-to xz --convert-from bz2 +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 --compression gz # Download a collection and unify all files to bz2 format -databusclient download https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12 --convert-to bz2 +databusclient download https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12 --compression bz2 +``` + +**Download with Format Conversion**: download files and convert RDF or tabular format on-the-fly. Only the converted output file is kept. 
+```bash +# Convert Turtle to RDF/XML (within triple equivalence class) +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format rdf-xml + +# Convert N-Quads to TriG (within quad equivalence class; the downloaded files must already be in a quad format) +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 --format trig + +# Convert RDF to CSV (cross-class, produces companion .meta.json) +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format csv + +# Combine format conversion and compression +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format ntriples --compression gz ```