From 7e123f58e2c9b9f339926a64e4984d1fa878870b Mon Sep 17 00:00:00 2001 From: YueWang Date: Thu, 12 Feb 2026 16:09:34 -0800 Subject: [PATCH 1/7] add RefSeq missing assembly pipeline --- notebooks/pangenome_refseq.py | 137 ++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 notebooks/pangenome_refseq.py diff --git a/notebooks/pangenome_refseq.py b/notebooks/pangenome_refseq.py new file mode 100644 index 00000000..f328ea53 --- /dev/null +++ b/notebooks/pangenome_refseq.py @@ -0,0 +1,137 @@ +import logging +import urllib.request +from pathlib import Path +import click +from typing import List, Optional +from pyspark.sql import SparkSession, DataFrame +from berdl_notebook_utils.setup_spark_session import get_spark_session + + +logger = logging.getLogger(__name__) + +REFSEQ_URL = "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt" + + +def download_refseq_summary(output_path: Path) -> Path: + logger.info("Downloading RefSeq assembly summary from %s", REFSEQ_URL) + urllib.request.urlretrieve(REFSEQ_URL, output_path) + return output_path + + +def parse_refseq_gcf_ids(file_path: Path) -> List[str]: + assembly_ids: List[str] = [] + + with open(file_path, encoding="utf-8") as f: + for line in f: + if line.startswith("#"): + continue + + accession = line.split("\t", 1)[0] + + if accession.startswith("GCF_"): + assembly_ids.append(accession) + + logger.info("Parsed %d GCF assemblies", len(assembly_ids)) + return assembly_ids + + +def build_refseq_df(spark: SparkSession, assembly_ids: List[str]) -> DataFrame: + return spark.createDataFrame( + [(x,) for x in assembly_ids], + ["assembly_id"], + ) + + +def compute_missing_refseq( + refseq_df: DataFrame, + existing_df: DataFrame, +) -> DataFrame: + return refseq_df.join( + existing_df.select("assembly_id"), + on="assembly_id", + how="left_anti", + ) + + +def read_existing_df( + spark: SparkSession, + existing_table: Optional[str], + existing_path: Optional[str], +) -> 
DataFrame: + if existing_table: + logger.info("Reading existing table from metastore: %s", existing_table) + return spark.table(existing_table) + + if existing_path: + logger.info("Reading existing Delta table from path: %s", existing_path) + return spark.read.format("delta").load(existing_path) + + raise ValueError("Either --existing-table or --existing-path must be provided.") + + +def write_output(df: DataFrame, output_path: str) -> None: + logger.info("Writing missing assemblies to %s", output_path) + + (df.coalesce(1).write.format("delta").mode("overwrite").save(output_path)) + + +# ------------------------------------------------------------------------- +# Pipeline Orchestration +# ------------------------------------------------------------------------- +def run_pipeline( + spark: SparkSession, + existing_table: Optional[str], + existing_path: Optional[str], + output_path: str, +) -> None: + summary_path = Path("assembly_summary_refseq.txt") + + download_refseq_summary(summary_path) + + assembly_ids = parse_refseq_gcf_ids(summary_path) + + refseq_df = build_refseq_df(spark, assembly_ids) + + existing_df = read_existing_df( + spark, + existing_table=existing_table, + existing_path=existing_path, + ) + + missing_df = compute_missing_refseq(refseq_df, existing_df) + + missing_count = missing_df.count() + logger.info("Missing RefSeq assemblies: %d", missing_count) + + write_output(missing_df, output_path) + + +# ------------------------------------------------------------------------- +# CLI +# ------------------------------------------------------------------------- +@click.command() +@click.option("--existing-table", help="Existing table in metastore containing RefSeq assemblies") +@click.option("--existing-path", help="Existing Delta path containing RefSeq assemblies") +@click.option("--output-path", required=True, help="Output path for missing RefSeq assemblies (Delta format)") +def main( + existing_table: Optional[str], + existing_path: Optional[str], + 
output_path: str, +) -> None: + spark = get_spark_session() + + run_pipeline( + spark=spark, + existing_table=existing_table, + existing_path=existing_path, + output_path=output_path, + ) + + +def cli(): + logging.basicConfig(level=logging.INFO) + main(standalone_mode=False) + + +if __name__ == "__main__": + cli() From ff38bb655bba32bd6c99a4d714ed50f3529c4016 Mon Sep 17 00:00:00 2001 From: YueWang Date: Fri, 13 Feb 2026 09:24:45 -0800 Subject: [PATCH 2/7] change the output format --- notebooks/pangenome_refseq.py | 148 ++++++++++++++-------------------- 1 file changed, 61 insertions(+), 87 deletions(-) diff --git a/notebooks/pangenome_refseq.py b/notebooks/pangenome_refseq.py index f328ea53..11914245 100644 --- a/notebooks/pangenome_refseq.py +++ b/notebooks/pangenome_refseq.py @@ -1,9 +1,11 @@ import logging import urllib.request from pathlib import Path + import click -from typing import List, Optional -from pyspark.sql import SparkSession, DataFrame +from pyspark.sql import SparkSession +from pyspark.sql.functions import regexp_replace + from berdl_notebook_utils.setup_spark_session import get_spark_session @@ -14,12 +16,12 @@ def download_refseq_summary(output_path: Path) -> Path: logger.info("Downloading RefSeq assembly summary from %s", REFSEQ_URL) - urllib.request.urlretrieve(REFSEQ_URL, output_path) + urllib.request.urlretrieve(REFSEQ_URL, output_path) # noqa: S310 return output_path -def parse_refseq_gcf_ids(file_path: Path) -> List[str]: - assembly_ids: List[str] = [] +def parse_refseq_gcf_ids(file_path: Path) -> list[str]: + assembly_ids: list[str] = [] with open(file_path, encoding="utf-8") as f: for line in f: @@ -27,111 +29,83 @@ def parse_refseq_gcf_ids(file_path: Path) -> List[str]: continue accession = line.split("\t", 1)[0] - if accession.startswith("GCF_"): assembly_ids.append(accession) - logger.info("Parsed %d GCF assemblies", len(assembly_ids)) return assembly_ids -def build_refseq_df(spark: SparkSession, assembly_ids: List[str]) -> DataFrame: 
- return spark.createDataFrame( - [(x,) for x in assembly_ids], - ["assembly_id"], - ) - - -def compute_missing_refseq( - refseq_df: DataFrame, - existing_df: DataFrame, -) -> DataFrame: - return refseq_df.join( - existing_df.select("assembly_id"), - on="assembly_id", - how="left_anti", - ) - - -def read_existing_df( - spark: SparkSession, - existing_table: Optional[str], - existing_path: Optional[str], -) -> DataFrame: - if existing_table: - logger.info("Reading existing table from metastore: %s", existing_table) - return spark.table(existing_table) - - if existing_path: - logger.info("Reading existing Delta table from path: %s", existing_path) - return spark.read.format("delta").load(existing_path) - - raise ValueError("Either --existing-table or --existing-path must be provided.") - +@click.command() +@click.option( + "--gtdb-table", + required=True, + help="Metastore table containing genome_id column", +) +@click.option( + "--output-dir", + required=True, + help="Directory where output text files will be written", +) +def main(gtdb_table: str, output_dir: str) -> None: + logging.basicConfig(level=logging.INFO) -def write_output(df: DataFrame, output_path: str) -> None: - logger.info("Writing missing assemblies to %s", output_path) + spark: SparkSession = get_spark_session() - (df.coalesce(1).write.format("delta").mode("overwrite").save(output_path)) + # Read the GTDB genome table: + r214_df = spark.table(gtdb_table).select("genome_id").distinct() + rm_prefix_df = ( + r214_df.withColumn( + "assembly_id", + regexp_replace("genome_id", r"^(GB_|RS_)", ""), + ) + .select("assembly_id") + .distinct() + ) -# ------------------------------------------------------------------------- -# Pipeline Orchestration -# ------------------------------------------------------------------------- -def run_pipeline( - spark: SparkSession, - existing_table: Optional[str], - existing_path: Optional[str], - output_path: str, -) -> None: - summary_path = 
Path("assembly_summary_refseq.txt") + logger.info("Total GTDB assemblies: %d", rm_prefix_df.count()) + # Download RefSeq summary in BERDL temp directory + summary_path = Path("/tmp/assembly_summary_refseq.txt") download_refseq_summary(summary_path) - assembly_ids = parse_refseq_gcf_ids(summary_path) + # Parse RefSeq GCF IDs + refseq_ids = parse_refseq_gcf_ids(summary_path) - refseq_df = build_refseq_df(spark, assembly_ids) + refseq_df = spark.createDataFrame( + [(x,) for x in refseq_ids], + ["assembly_id"], + ) - existing_df = read_existing_df( - spark, - existing_table=existing_table, - existing_path=existing_path, + # Compute missing values in GTDB + missing_df = refseq_df.join( + rm_prefix_df, + on="assembly_id", + how="left_anti", ) - missing_df = compute_missing_refseq(refseq_df, existing_df) + logger.info("Missing RefSeq assemblies: %d", missing_df.count()) - missing_count = missing_df.count() - logger.info("Missing RefSeq assemblies: %d", missing_count) + # Prepare output directory + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) - write_output(missing_df, output_path) + # Output 1: Write GTDB existing assemblies + gtdb_ids = rm_prefix_df.select("assembly_id").rdd.map(lambda x: x[0]).collect() + with open(output_path / "r214_assemblies.txt", "w") as f: + for assembly_id in gtdb_ids: + f.write(f"{assembly_id}\n") -# ------------------------------------------------------------------------- -# CLI -# ------------------------------------------------------------------------- -@click.command() -@click.option("--existing-table", help="Existing table in metastore containing RefSeq assemblies") -@click.option("--existing-path", help="Existing Delta path containing RefSeq assemblies") -@click.option("--output-path", required=True, help="Output path for missing RefSeq assemblies (Delta format)") -def main( - existing_table: Optional[str], - existing_path: Optional[str], - output_path: str, -) -> None: - spark = get_spark_session() - - 
run_pipeline( - spark=spark, - existing_table=existing_table, - existing_path=existing_path, - output_path=output_path, - ) + # Output 2: Write missing RefSeq assemblies + missing_ids = missing_df.select("assembly_id").rdd.map(lambda x: x[0]).collect() + with open(output_path / "missing_refseq_ids.txt", "w") as f: + for assembly_id in missing_ids: + f.write(f"{assembly_id}\n") -def cli(): - logging.basicConfig(level=logging.INFO) - main(standalone_mode=False) + logger.info("Output files written to %s", output_dir) if __name__ == "__main__": - cli() + main() From 679ecd5905a7a62aac1cd40a74125a5a140ebd4a Mon Sep 17 00:00:00 2001 From: YueWang Date: Tue, 17 Feb 2026 10:55:57 -0800 Subject: [PATCH 3/7] Refactor missing RefSeq utility to use distributed Spark text output and fix lint issues --- notebooks/__init__.py | 0 notebooks/pangenome_refseq.py | 87 ++++++++++++++++++++++++----------- 2 files changed, 61 insertions(+), 26 deletions(-) create mode 100644 notebooks/__init__.py diff --git a/notebooks/__init__.py b/notebooks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/notebooks/pangenome_refseq.py b/notebooks/pangenome_refseq.py index 11914245..e537a3cf 100644 --- a/notebooks/pangenome_refseq.py +++ b/notebooks/pangenome_refseq.py @@ -1,13 +1,31 @@ +""" +Utility script to identify missing RefSeq assemblies relative to GTDB. + +This script: +1. Reads a GTDB metastore table. +2. Removes GB_/RS_ prefixes from genome_id. +3. Downloads the latest RefSeq assembly summary. +4. Computes missing GCF assemblies. +5. 
Outputs two text files using Spark distributed write: + - r214_assemblies + - missing_refseq_ids +""" + +from __future__ import annotations + import logging +import tempfile import urllib.request from pathlib import Path +from typing import TYPE_CHECKING import click -from pyspark.sql import SparkSession from pyspark.sql.functions import regexp_replace from berdl_notebook_utils.setup_spark_session import get_spark_session +if TYPE_CHECKING: + from pyspark.sql import SparkSession logger = logging.getLogger(__name__) @@ -15,16 +33,22 @@ def download_refseq_summary(output_path: Path) -> Path: + """ + Download RefSeq assembly summary file. + """ logger.info("Downloading RefSeq assembly summary from %s", REFSEQ_URL) urllib.request.urlretrieve(REFSEQ_URL, output_path) # noqa: S310 return output_path def parse_refseq_gcf_ids(file_path: Path) -> list[str]: + """ + Parse all GCF_ assembly accessions from the RefSeq summary file. + """ assembly_ids: list[str] = [] - with open(file_path, encoding="utf-8") as f: - for line in f: + with file_path.open(encoding="utf-8") as file: + for line in file: if line.startswith("#"): continue @@ -44,14 +68,19 @@ def parse_refseq_gcf_ids(file_path: Path) -> list[str]: @click.option( "--output-dir", required=True, - help="Directory where output text files will be written", + help="Output directory (e.g. s3a://...) where text files will be written", ) def main(gtdb_table: str, output_dir: str) -> None: + """ + Run the missing RefSeq assembly detection pipeline. + """ logging.basicConfig(level=logging.INFO) spark: SparkSession = get_spark_session() - # Read the GTDB genome table: + # ------------------------------------------------------------------ + # 1. 
Read GTDB genome table + # ------------------------------------------------------------------ r214_df = spark.table(gtdb_table).select("genome_id").distinct() rm_prefix_df = ( @@ -63,13 +92,19 @@ def main(gtdb_table: str, output_dir: str) -> None: .distinct() ) - logger.info("Total GTDB assemblies: %d", rm_prefix_df.count()) + logger.info("GTDB assemblies: %d", rm_prefix_df.count()) + + # ------------------------------------------------------------------ + # 2. Download RefSeq summary securely + # ------------------------------------------------------------------ + with tempfile.NamedTemporaryFile(delete=False) as tmp: + summary_path = Path(tmp.name) - # Download RefSeq summary in BERDL temp directory - summary_path = Path("/tmp/assembly_summary_refseq.txt") download_refseq_summary(summary_path) - # Parse RefSeq GCF IDs + # ------------------------------------------------------------------ + # 3. Parse RefSeq GCF IDs + # ------------------------------------------------------------------ refseq_ids = parse_refseq_gcf_ids(summary_path) refseq_df = spark.createDataFrame( @@ -77,7 +112,11 @@ def main(gtdb_table: str, output_dir: str) -> None: ["assembly_id"], ) - # Compute missing values in GTDB + logger.info("RefSeq assemblies: %d", refseq_df.count()) + + # ------------------------------------------------------------------ + # 4. Compute missing assemblies + # ------------------------------------------------------------------ missing_df = refseq_df.join( rm_prefix_df, on="assembly_id", @@ -86,25 +125,21 @@ def main(gtdb_table: str, output_dir: str) -> None: logger.info("Missing RefSeq assemblies: %d", missing_df.count()) - # Prepare output directory - output_path = Path(output_dir) - output_path.mkdir(parents=True, exist_ok=True) + # ------------------------------------------------------------------ + # 5. 
Distributed Spark text output + # ------------------------------------------------------------------ - # Output 1: Write GTDB existing assemblies - gtdb_ids = rm_prefix_df.select("assembly_id").rdd.map(lambda x: x[0]).collect() - - with open(output_path / "r214_assemblies.txt", "w") as f: - for assembly_id in gtdb_ids: - f.write(f"{assembly_id}\n") - - # Output 2: Write missing RefSeq assemblies - missing_ids = missing_df.select("assembly_id").rdd.map(lambda x: x[0]).collect() + # Output 1: All GTDB assemblies + rm_prefix_df.select("assembly_id").orderBy("assembly_id").coalesce(1).write.mode("overwrite").text( + f"{output_dir}/r214_assemblies" + ) - with open(output_path / "missing_refseq_ids.txt", "w") as f: - for assembly_id in missing_ids: - f.write(f"{assembly_id}\n") + # Output 2: Missing RefSeq assemblies + missing_df.select("assembly_id").orderBy("assembly_id").coalesce(1).write.mode("overwrite").text( + f"{output_dir}/missing_refseq_ids" + ) - logger.info("Output files written to %s", output_dir) + logger.info("Output files successfully written to %s", output_dir) if __name__ == "__main__": From 554b224c1fd85694af51cdd2cf2f5b90c80d26db Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Mon, 9 Mar 2026 07:30:21 -0700 Subject: [PATCH 4/7] Renaming and restructuring repo --- .github/workflows/tests.yml | 22 +------ AI_COVENANT.md | 65 +++++++++++++++++++ README.md | 45 ++++++++++--- pyproject.toml | 8 +-- .../__init__.py | 0 .../audit/__init__.py | 0 .../audit/checkpoint.py | 6 +- .../audit/metrics.py | 6 +- .../audit/rejects.py | 8 +-- .../audit/run.py | 6 +- .../audit/schema.py | 2 +- .../core/__init__.py | 0 .../core/constants.py | 0 .../core/pipeline_run.py | 0 .../model/feature.py | 0 .../parsers/__init__.py | 0 .../parsers/bbmap_stats.py | 0 .../parsers/checkm2.py | 0 .../parsers/fasta.py | 2 +- .../parsers/gene_association_file.py | 0 .../parsers/genome_loader.py | 8 +-- .../parsers/genome_paths.py | 0 .../parsers/refseq_importer/__init__.py | 0 
.../parsers/refseq_importer/cli/__init__.py | 0 .../refseq_importer/cli/refseq_api_cli.py | 12 ++-- .../parsers/refseq_importer/core/__init__.py | 0 .../refseq_importer/core/cdm_builders.py | 0 .../refseq_importer/core/datasets_api.py | 0 .../refseq_importer/core/extractors.py | 0 .../refseq_importer/core/spark_delta.py | 0 .../refseq_importer/core/tables_finalize.py | 0 .../refseq_importer/core/taxon_processing.py | 0 .../parsers/refseq_pipeline/__init__.py | 0 .../parsers/refseq_pipeline/cli/__init__.py | 0 .../refseq_pipeline/cli/compare_snapshots.py | 4 +- .../cli/debug_parse_one_taxon.py | 10 +-- .../refseq_pipeline/cli/debug_register.py | 6 +- .../refseq_pipeline/cli/detect_updates.py | 6 +- .../cli/diff_changed_taxids.py | 8 +-- .../cli/fetch_taxon_reports.py | 8 +-- .../cli/refseq_update_manager.py | 6 +- .../refseq_pipeline/cli/register_table.py | 2 +- .../refseq_pipeline/cli/save_index_tsv.py | 4 +- .../refseq_pipeline/cli/snapshot_hashes.py | 6 +- .../parsers/refseq_pipeline/core/__init__.py | 0 .../parsers/refseq_pipeline/core/cdm_parse.py | 2 +- .../parsers/refseq_pipeline/core/config.py | 0 .../refseq_pipeline/core/datasets_api.py | 2 +- .../refseq_pipeline/core/debug_snapshot.py | 6 +- .../parsers/refseq_pipeline/core/driver.py | 4 +- .../refseq_pipeline/core/hashes_diff.py | 0 .../refseq_pipeline/core/hashes_snapshot.py | 6 +- .../parsers/refseq_pipeline/core/refseq_io.py | 2 +- .../refseq_pipeline/core/snapshot_utils.py | 0 .../refseq_pipeline/core/spark_delta.py | 0 .../parsers/refseq_pipeline/utils/__init__.py | 0 .../parsers/refseq_pipeline/utils/common.py | 0 .../parsers/refseq_pipeline/utils/logging.py | 0 .../parsers/uniprot/__init__.py | 0 .../parsers/uniprot/idmapping.py | 16 ++--- .../parsers/uniprot/metalink.py | 2 +- .../parsers/uniprot/relnotes.py | 2 +- .../parsers/uniprot/uniprot_kb.py | 6 +- .../parsers/uniprot/uniref.py | 2 +- .../pipelines/__init__.py | 0 .../pipelines/uniprot_kb_pipeline.py | 6 +- .../pipelines/uniref_pipeline.py 
| 6 +- .../readers/__init__.py | 0 .../readers/dsv.py | 4 +- .../transformers/__init__.py | 0 .../transformers/genome_depot/__init__.py | 0 .../transformers/genome_depot/schema.py | 0 .../utils/__init__.py | 0 .../utils/calculate_hash.py | 2 +- .../utils/cdm_logger.py | 0 .../utils/download/__init__.py | 0 .../utils/download/async_client.py | 4 +- .../utils/download/core.py | 2 +- .../utils/download/sync_client.py | 4 +- .../utils/gz.py | 2 +- .../utils/helpers.py | 0 .../utils/minio.py | 0 .../utils/spark_delta.py | 2 +- .../utils/xml_utils.py | 2 +- .../validation/__init__.py | 0 .../validation/dataframe_validator.py | 12 ++-- .../validation/df_nullable_fields.py | 2 +- .../validation/validation_result.py | 0 tests/audit/conftest.py | 2 +- tests/audit/test_checkpoint.py | 6 +- tests/audit/test_metrics.py | 6 +- tests/audit/test_rejects.py | 8 +-- tests/audit/test_run.py | 6 +- tests/audit/test_schema.py | 4 +- tests/conftest.py | 8 +-- tests/data/example_files/stats.json | 4 +- tests/data/results_multi/stats.json | 4 +- tests/data/results_single/stats.json | 2 +- tests/parsers/conftest.py | 4 +- .../refseq_importer/test_cdm_builders.py | 6 +- .../refseq_importer/test_extractors.py | 2 +- .../refseq_importer/test_refseq_api_cli.py | 14 ++-- .../refseq_importer/test_spark_delta.py | 2 +- .../refseq_importer/test_tables_finalize.py | 2 +- tests/parsers/test_bbmap_stats.py | 2 +- tests/parsers/test_checkm2.py | 2 +- tests/parsers/test_gene_association_file.py | 2 +- tests/parsers/test_genome_loader.py | 2 +- tests/parsers/test_genome_paths.py | 2 +- tests/parsers/uniprot/test_idmapping.py | 8 +-- tests/parsers/uniprot/test_metalink.py | 2 +- tests/parsers/uniprot/test_relnotes.py | 2 +- tests/parsers/uniprot/test_uniprot_kb.py | 2 +- tests/parsers/uniprot/test_uniref.py | 2 +- tests/readers/test_dsv.py | 2 +- .../readers/test_dsv_read_with_validation.py | 12 ++-- tests/utils/download/conftest.py | 4 +- tests/utils/download/test_async_client.py | 2 +- 
tests/utils/download/test_clients.py | 6 +- tests/utils/test_gz.py | 2 +- tests/utils/test_spark_delta.py | 14 ++-- tests/utils/test_xml_utils.py | 2 +- tests/validation/test_dataframe_validator.py | 8 +-- tests/validation/test_df_nullable_fields.py | 8 +-- uv.lock | 2 +- 125 files changed, 296 insertions(+), 220 deletions(-) create mode 100644 AI_COVENANT.md rename src/{cdm_data_loader_utils => cdm_data_loaders}/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/audit/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/audit/checkpoint.py (95%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/audit/metrics.py (94%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/audit/rejects.py (93%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/audit/run.py (95%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/audit/schema.py (98%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/core/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/core/constants.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/core/pipeline_run.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/model/feature.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/bbmap_stats.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/checkm2.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/fasta.py (95%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/gene_association_file.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/genome_loader.py (98%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/genome_paths.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_importer/__init__.py (100%) rename src/{cdm_data_loader_utils => 
cdm_data_loaders}/parsers/refseq_importer/cli/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_importer/cli/refseq_api_cli.py (88%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_importer/core/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_importer/core/cdm_builders.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_importer/core/datasets_api.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_importer/core/extractors.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_importer/core/spark_delta.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_importer/core/tables_finalize.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_importer/core/taxon_processing.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/cli/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/cli/compare_snapshots.py (97%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/cli/debug_parse_one_taxon.py (83%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/cli/debug_register.py (72%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/cli/detect_updates.py (94%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/cli/diff_changed_taxids.py (93%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/cli/fetch_taxon_reports.py (93%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/cli/refseq_update_manager.py (96%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/cli/register_table.py 
(92%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/cli/save_index_tsv.py (91%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/cli/snapshot_hashes.py (93%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/core/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/core/cdm_parse.py (97%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/core/config.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/core/datasets_api.py (97%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/core/debug_snapshot.py (80%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/core/driver.py (86%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/core/hashes_diff.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/core/hashes_snapshot.py (94%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/core/refseq_io.py (98%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/core/snapshot_utils.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/core/spark_delta.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/utils/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/utils/common.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/refseq_pipeline/utils/logging.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/uniprot/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/uniprot/idmapping.py (89%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/uniprot/metalink.py (97%) rename src/{cdm_data_loader_utils => 
cdm_data_loaders}/parsers/uniprot/relnotes.py (98%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/uniprot/uniprot_kb.py (98%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/parsers/uniprot/uniref.py (98%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/pipelines/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/pipelines/uniprot_kb_pipeline.py (94%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/pipelines/uniref_pipeline.py (94%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/readers/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/readers/dsv.py (96%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/transformers/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/transformers/genome_depot/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/transformers/genome_depot/schema.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/utils/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/utils/calculate_hash.py (98%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/utils/cdm_logger.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/utils/download/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/utils/download/async_client.py (98%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/utils/download/core.py (98%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/utils/download/sync_client.py (98%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/utils/gz.py (96%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/utils/helpers.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/utils/minio.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/utils/spark_delta.py (99%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/utils/xml_utils.py (98%) rename src/{cdm_data_loader_utils => 
cdm_data_loaders}/validation/__init__.py (100%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/validation/dataframe_validator.py (87%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/validation/df_nullable_fields.py (97%) rename src/{cdm_data_loader_utils => cdm_data_loaders}/validation/validation_result.py (100%) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index fdb3019d..f8eb20ac 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -14,7 +14,7 @@ on: - ready_for_review jobs: - code_format: + code_format_and_lint: name: Check code formatting runs-on: ubuntu-latest steps: @@ -33,29 +33,13 @@ jobs: shell: bash run: uv run ruff format --check - code_linting: - name: Run code lint checks - runs-on: ubuntu-latest - needs: code_format - steps: - - name: Checkout - uses: actions/checkout@v6 - - - name: Install uv - uses: astral-sh/setup-uv@v7.1.2 - with: - enable-cache: true - - - name: Install dependencies - run: uv sync - - name: Run code linting checks continue-on-error: true run: uv run ruff check --output-format=github . spark_tests: name: Run container tests - needs: code_format + needs: code_format_and_lint runs-on: ubuntu-latest steps: @@ -83,7 +67,7 @@ jobs: tests: name: Run local tests (${{ matrix.python-version }}, ${{ matrix.os }}) runs-on: ${{ matrix.os }} - needs: code_format + needs: code_format_and_lint strategy: fail-fast: false matrix: diff --git a/AI_COVENANT.md b/AI_COVENANT.md new file mode 100644 index 00000000..5631770e --- /dev/null +++ b/AI_COVENANT.md @@ -0,0 +1,65 @@ +# AI Covenant for Developers + +This covenant establishes community norms for responsible AI use in the cdm-data-loaders project. It aims to maintain trust, quality, and accountability while embracing AI as a useful tool. 
+ +## Core Principle: You Own Your Contributions + +**Everything you contribute is yours, regardless of what tools helped create it.** + +When you submit code, documentation, issues, or comments with AI assistance, you are the author. You are responsible for: + +- Understanding what you are submitting +- Verifying correctness and appropriateness +- Defending and explaining your choices during review +- Ensuring it meets project standards + +Do not submit AI-generated code without checking it first, and do not submit anything you cannot fully stand behind. + +## AI-Assisted Code Reviews + +AI review tools (Claude, Copilot, CodeRabbit, etc.) provide **automated quality checks, not human reviews**. + +- AI comments are suggestions, not requirements +- PR owners may close AI comments without response +- Human reviewers may use AI feedback to inform their own review +- A PR still requires human approval regardless of AI feedback + +## When to Disclose AI Assistance + +**Required disclosure:** + +- When proposing bug fixes or changes to code you don't fully understand, attribute the idea to AI so reviewers can assess appropriately. + +**Appreciated transparency:** + +- When brainstorming solutions, distinguish between "AI suggests X" and "I recommend X based on my expertise". This helps reviewers to prioritize ideas. + +**Not required:** + +- Routine use of AI for writing code, issues, or PR descriptions. +- AI co-authorship in commit messages. This is actively discouraged. + +## What This Means in Practice + +| Situation | Guidance | +| ------------------------------------------------ | -------------------------------------------- | +| Writing code with Copilot/Claude/etc. 
| No disclosure needed; you own the result | +| Submitting AI-suggested fix you fully understand | No disclosure needed | +| Submitting AI-suggested fix in unfamiliar code | Disclose AI origin for reviewer context | +| Drafting issue or PR description with AI | No disclosure needed; ensure it's accurate | +| Brainstorming in discussions | Be clear about AI-generated vs. expert ideas | +| Receiving AI review comments | Address or close at your discretion | + +## Trust and Accountability + +This covenant is built on trust. By contributing to this repository, you agree that: + +1. You will not submit AI-generated content without reviewing it +2. You will take responsibility for any issues arising from your contributions +3. You will be honest about the origins of ideas when it matters for review quality + +--- + +*This covenant may evolve as AI tools and community needs change. Feedback and suggestions are welcome.* + +Based heavily on the excellent AI Covenant established by the [LinkML project](https://github.com/linkml/linkml/blob/main/AI_COVENANT.md). diff --git a/README.md b/README.md index 2dea6782..9fab0a13 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ -# cdm-data-loader-utils +# cdm-data-loaders Repo for CDM input data loading and wrangling -- [cdm-data-loader-utils](#cdm-data-loader-utils) +- [cdm-data-loaders](#cdm-data-loaders) - [Environment and python management](#environment-and-python-management) - [Installation](#installation) + - [Running import pipelines](#running-import-pipelines) - [Development](#development) - [Spark and other non-python dependencies](#spark-and-other-non-python-dependencies) - [Tests](#tests) @@ -20,9 +21,17 @@ The data loader utils package uses [uv](https://docs.astral.sh/uv/) for python e ## Installation -The data loader utils run on python 3.13 and above. +The CDM data loaders run on python 3.13 and above. 
-To install dependencies (including python), run +Most python code can be run using the command + +```sh +> uv run +``` + +This will automatically launch a virtual environment and install all required dependencies. + +To manually set up the virtual environment and install dependencies (including python), run ```sh > uv sync @@ -39,6 +48,17 @@ To activate a virtual environment with these dependencies installed, run If you are using IDEs like VSCode, they should pick up the creation of the new environment and offer it for executing python code. +## Running import pipelines + +The repo provides a Docker container that can be used to run several import pipelines or to run unit tests for the repo. The [entrypoint script](scripts/entrypoint.sh) parses the container `run` arguments and launches the appropriate functions. + +Current endpoints include: + +- `test`: run the unit tests that do _not_ require external dependencies like Spark +- `uniprot`: run the UniProtKB (UniProt protein database) import pipeline; see [the UniProtKB pipeline](src/cdm_data_loaders/pipelines/uniprot_kb_pipeline.py) for arguments +- `uniref`: run the UniRef import pipeline; see [the UniRef pipeline](src/cdm_data_loaders/pipelines/uniref_pipeline.py) for arguments + + ## Development @@ -64,7 +84,7 @@ Run the container interactively as the user `runner`; current directory is mount > docker run --rm -e NB_USER=runner -it -v .:/tmp/cdm ghcr.io/berdatalakehouse/spark_notebook:main ``` -This will launch a bash shell; the contents of the `cdm-data-loader-utils` directory are mounted at `/tmp/cdm`. +This will launch a bash shell; the contents of the `cdm-data-loaders` directory are mounted at `/tmp/cdm`. Run the container and sleep: @@ -81,16 +101,23 @@ See the [BERDataLakehouse/spark_notebook](https://github.com/BERDataLakehouse/sp ### Tests -To run the tests, execute the command: +Tests are categorised using pytest markers to allow developers to execute some or all the tests.
See [pyproject.toml](pyproject.toml) for the markers used. + +To run all tests (requires a running Spark instance), execute the command: ```sh > uv run pytest ``` -To generate coverage for the tests, run +To run only tests that do not require Spark, run ```sh -> uv run pytest --cov=src --cov-report=xml tests/ +> uv run pytest -m "not requires_spark" +``` + +To generate coverage for the tests, run +```sh +> uv run pytest --cov=src --cov-report=xml ``` The standard python `coverage` package is used and coverage can be generated as html or other formats by changing the parameters. @@ -98,7 +125,7 @@ The standard python `coverage` package is used and coverage can be generated as ## Loading genomes, contigs, and features -The [genome loader](src/cdm_data_loader_utils/parsers/genome_loader.py) can be used to load and integrate data from related GFF and FASTA files. Currently, the loader requires a GFF file and two FASTA files (one for amino acid seqs, one for nucleic acid seqs) for each genome. The list of files to be processed should be specified in the genome paths file, which has the following format: +The [genome loader](src/cdm_data_loaders/parsers/genome_loader.py) can be used to load and integrate data from related GFF and FASTA files. Currently, the loader requires a GFF file and two FASTA files (one for amino acid seqs, one for nucleic acid seqs) for each genome. The list of files to be processed should be specified in the genome paths file, which has the following format: ```json { diff --git a/pyproject.toml b/pyproject.toml index 78e905b5..b140e715 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "cdm-data-loader-utils" +name = "cdm-data-loaders" version = "0.1.0" description = "Data loaders and wranglers for the CDM." 
requires-python = ">= 3.13" @@ -22,9 +22,9 @@ dependencies = [ ] [project.scripts] -idmapping = "cdm_data_loader_utils.parsers.uniprot.idmapping:cli" -uniprot_pipeline = "cdm_data_loader_utils.pipelines.uniprot_kb_pipeline:cli" -uniref_pipeline = "cdm_data_loader_utils.pipelines.uniref_pipeline:cli" +idmapping = "cdm_data_loaders.parsers.uniprot.idmapping:cli" +uniprot_pipeline = "cdm_data_loaders.pipelines.uniprot_kb_pipeline:cli" +uniref_pipeline = "cdm_data_loaders.pipelines.uniref_pipeline:cli" [dependency-groups] dev = [ diff --git a/src/cdm_data_loader_utils/__init__.py b/src/cdm_data_loaders/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/__init__.py rename to src/cdm_data_loaders/__init__.py diff --git a/src/cdm_data_loader_utils/audit/__init__.py b/src/cdm_data_loaders/audit/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/audit/__init__.py rename to src/cdm_data_loaders/audit/__init__.py diff --git a/src/cdm_data_loader_utils/audit/checkpoint.py b/src/cdm_data_loaders/audit/checkpoint.py similarity index 95% rename from src/cdm_data_loader_utils/audit/checkpoint.py rename to src/cdm_data_loaders/audit/checkpoint.py index d49a75e5..0aa152da 100644 --- a/src/cdm_data_loader_utils/audit/checkpoint.py +++ b/src/cdm_data_loaders/audit/checkpoint.py @@ -4,7 +4,7 @@ from pyspark.sql import SparkSession from pyspark.sql import functions as sf -from cdm_data_loader_utils.audit.schema import ( +from cdm_data_loaders.audit.schema import ( AUDIT_SCHEMA, CHECKPOINT, LAST_ENTRY_ID, @@ -17,8 +17,8 @@ UPDATED, current_run_expr, ) -from cdm_data_loader_utils.core.pipeline_run import PipelineRun -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.core.pipeline_run import PipelineRun +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger # Checkpoint table-related functions diff --git a/src/cdm_data_loader_utils/audit/metrics.py b/src/cdm_data_loaders/audit/metrics.py similarity 
index 94% rename from src/cdm_data_loader_utils/audit/metrics.py rename to src/cdm_data_loaders/audit/metrics.py index 120dc851..db725451 100644 --- a/src/cdm_data_loader_utils/audit/metrics.py +++ b/src/cdm_data_loaders/audit/metrics.py @@ -4,7 +4,7 @@ from pyspark.sql import DataFrame, Row, SparkSession from pyspark.sql import functions as sf -from cdm_data_loader_utils.audit.schema import ( +from cdm_data_loaders.audit.schema import ( AUDIT_SCHEMA, METRICS, N_INVALID, @@ -18,8 +18,8 @@ VALIDATION_ERRORS, current_run_expr, ) -from cdm_data_loader_utils.core.pipeline_run import PipelineRun -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.core.pipeline_run import PipelineRun +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger logger = get_cdm_logger() diff --git a/src/cdm_data_loader_utils/audit/rejects.py b/src/cdm_data_loaders/audit/rejects.py similarity index 93% rename from src/cdm_data_loader_utils/audit/rejects.py rename to src/cdm_data_loaders/audit/rejects.py index 2605ea3a..d92c10a6 100644 --- a/src/cdm_data_loader_utils/audit/rejects.py +++ b/src/cdm_data_loaders/audit/rejects.py @@ -4,7 +4,7 @@ from pyspark.sql import DataFrame from pyspark.sql.types import StructField -from cdm_data_loader_utils.audit.schema import ( +from cdm_data_loaders.audit.schema import ( AUDIT_SCHEMA, PARSED_ROW, PIPELINE, @@ -15,8 +15,8 @@ SOURCE, TIMESTAMP, ) -from cdm_data_loader_utils.core.pipeline_run import PipelineRun -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.core.pipeline_run import PipelineRun +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger logger = get_cdm_logger() @@ -30,7 +30,7 @@ def write_rejects( """Write rejected data to the rejects audit table. This should plug in directly to readers like the spark CSV reader, which put any non-compliant data into - a single column when run in PERMISSIVE mode (default for the cdm_data_loader_utils readers). 
+ a single column when run in PERMISSIVE mode (default for the cdm_data_loaders readers). It is expected that the dataframe will contain a column called ROW_ERRORS, which contains a list of strings describing the errors found in the rows. diff --git a/src/cdm_data_loader_utils/audit/run.py b/src/cdm_data_loaders/audit/run.py similarity index 95% rename from src/cdm_data_loader_utils/audit/run.py rename to src/cdm_data_loaders/audit/run.py index ad1558cb..e128b258 100644 --- a/src/cdm_data_loader_utils/audit/run.py +++ b/src/cdm_data_loaders/audit/run.py @@ -4,7 +4,7 @@ from pyspark.sql import SparkSession from pyspark.sql import functions as sf -from cdm_data_loader_utils.audit.schema import ( +from cdm_data_loaders.audit.schema import ( AUDIT_SCHEMA, END_TIME, ERROR, @@ -20,8 +20,8 @@ STATUS_SUCCESS, match_run, ) -from cdm_data_loader_utils.core.pipeline_run import PipelineRun -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.core.pipeline_run import PipelineRun +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger def _table_not_updated(delta: DeltaTable) -> bool: diff --git a/src/cdm_data_loader_utils/audit/schema.py b/src/cdm_data_loaders/audit/schema.py similarity index 98% rename from src/cdm_data_loader_utils/audit/schema.py rename to src/cdm_data_loaders/audit/schema.py index 8a5d4f07..0789d724 100644 --- a/src/cdm_data_loader_utils/audit/schema.py +++ b/src/cdm_data_loaders/audit/schema.py @@ -2,7 +2,7 @@ from pyspark.sql.types import ArrayType, IntegerType, LongType, StringType, StructField, StructType, TimestampType -from cdm_data_loader_utils.core.pipeline_run import PipelineRun +from cdm_data_loaders.core.pipeline_run import PipelineRun CHECKPOINT = "checkpoint" METRICS = "metrics" diff --git a/src/cdm_data_loader_utils/core/__init__.py b/src/cdm_data_loaders/core/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/core/__init__.py rename to src/cdm_data_loaders/core/__init__.py diff 
--git a/src/cdm_data_loader_utils/core/constants.py b/src/cdm_data_loaders/core/constants.py similarity index 100% rename from src/cdm_data_loader_utils/core/constants.py rename to src/cdm_data_loaders/core/constants.py diff --git a/src/cdm_data_loader_utils/core/pipeline_run.py b/src/cdm_data_loaders/core/pipeline_run.py similarity index 100% rename from src/cdm_data_loader_utils/core/pipeline_run.py rename to src/cdm_data_loaders/core/pipeline_run.py diff --git a/src/cdm_data_loader_utils/model/feature.py b/src/cdm_data_loaders/model/feature.py similarity index 100% rename from src/cdm_data_loader_utils/model/feature.py rename to src/cdm_data_loaders/model/feature.py diff --git a/src/cdm_data_loader_utils/parsers/__init__.py b/src/cdm_data_loaders/parsers/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/__init__.py rename to src/cdm_data_loaders/parsers/__init__.py diff --git a/src/cdm_data_loader_utils/parsers/bbmap_stats.py b/src/cdm_data_loaders/parsers/bbmap_stats.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/bbmap_stats.py rename to src/cdm_data_loaders/parsers/bbmap_stats.py diff --git a/src/cdm_data_loader_utils/parsers/checkm2.py b/src/cdm_data_loaders/parsers/checkm2.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/checkm2.py rename to src/cdm_data_loaders/parsers/checkm2.py diff --git a/src/cdm_data_loader_utils/parsers/fasta.py b/src/cdm_data_loaders/parsers/fasta.py similarity index 95% rename from src/cdm_data_loader_utils/parsers/fasta.py rename to src/cdm_data_loaders/parsers/fasta.py index 0c95d984..e4e19a29 100644 --- a/src/cdm_data_loader_utils/parsers/fasta.py +++ b/src/cdm_data_loaders/parsers/fasta.py @@ -2,7 +2,7 @@ import gzip -from cdm_data_loader_utils.model.feature import Feature +from cdm_data_loaders.model.feature import Feature DEFAULT_SPLIT = " " diff --git a/src/cdm_data_loader_utils/parsers/gene_association_file.py 
b/src/cdm_data_loaders/parsers/gene_association_file.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/gene_association_file.py rename to src/cdm_data_loaders/parsers/gene_association_file.py diff --git a/src/cdm_data_loader_utils/parsers/genome_loader.py b/src/cdm_data_loaders/parsers/genome_loader.py similarity index 98% rename from src/cdm_data_loader_utils/parsers/genome_loader.py rename to src/cdm_data_loaders/parsers/genome_loader.py index 65ca0bf5..711cc8c5 100644 --- a/src/cdm_data_loader_utils/parsers/genome_loader.py +++ b/src/cdm_data_loaders/parsers/genome_loader.py @@ -10,10 +10,10 @@ from Bio import SeqIO -from cdm_data_loader_utils.parsers.bbmap_stats import get_bbmap_stats -from cdm_data_loader_utils.parsers.checkm2 import get_checkm2_data -from cdm_data_loader_utils.parsers.genome_paths import get_genome_paths -from cdm_data_loader_utils.utils import calculate_hash as ch +from cdm_data_loaders.parsers.bbmap_stats import get_bbmap_stats +from cdm_data_loaders.parsers.checkm2 import get_checkm2_data +from cdm_data_loaders.parsers.genome_paths import get_genome_paths +from cdm_data_loaders.utils import calculate_hash as ch # Define SO terms mapping so_terms = { diff --git a/src/cdm_data_loader_utils/parsers/genome_paths.py b/src/cdm_data_loaders/parsers/genome_paths.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/genome_paths.py rename to src/cdm_data_loaders/parsers/genome_paths.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_importer/__init__.py b/src/cdm_data_loaders/parsers/refseq_importer/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_importer/__init__.py rename to src/cdm_data_loaders/parsers/refseq_importer/__init__.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_importer/cli/__init__.py b/src/cdm_data_loaders/parsers/refseq_importer/cli/__init__.py similarity index 100% rename from 
src/cdm_data_loader_utils/parsers/refseq_importer/cli/__init__.py rename to src/cdm_data_loaders/parsers/refseq_importer/cli/__init__.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_importer/cli/refseq_api_cli.py b/src/cdm_data_loaders/parsers/refseq_importer/cli/refseq_api_cli.py similarity index 88% rename from src/cdm_data_loader_utils/parsers/refseq_importer/cli/refseq_api_cli.py rename to src/cdm_data_loaders/parsers/refseq_importer/cli/refseq_api_cli.py index d9227a29..44a4d412 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_importer/cli/refseq_api_cli.py +++ b/src/cdm_data_loaders/parsers/refseq_importer/cli/refseq_api_cli.py @@ -2,13 +2,13 @@ Example usage: PYTHONPATH=src/parsers \ -python -m cdm_data_loader_utils.parsers.refseq_importer.cli.refseq_api_cli \ +python -m cdm_data_loaders.parsers.refseq_importer.cli.refseq_api_cli \ --taxid "224325, 2741724, 193567" \ --database refseq_api \ --mode overwrite \ --debug \ --unique-per-taxon \ - --data-dir /global_share/alinawang/cdm-data-loader-utils/output/taxon_data + --data-dir /global_share/alinawang/cdm-data-loaders/output/taxon_data """ @@ -17,10 +17,10 @@ import click -from cdm_data_loader_utils.parsers.refseq_importer.core.cdm_builders import build_cdm_datasource -from cdm_data_loader_utils.parsers.refseq_importer.core.spark_delta import build_spark, write_delta -from cdm_data_loader_utils.parsers.refseq_importer.core.tables_finalize import finalize_tables, write_and_preview -from cdm_data_loader_utils.parsers.refseq_importer.core.taxon_processing import process_taxon +from cdm_data_loaders.parsers.refseq_importer.core.cdm_builders import build_cdm_datasource +from cdm_data_loaders.parsers.refseq_importer.core.spark_delta import build_spark, write_delta +from cdm_data_loaders.parsers.refseq_importer.core.tables_finalize import finalize_tables, write_and_preview +from cdm_data_loaders.parsers.refseq_importer.core.taxon_processing import process_taxon # ---------------- Helpers 
---------------- diff --git a/src/cdm_data_loader_utils/parsers/refseq_importer/core/__init__.py b/src/cdm_data_loaders/parsers/refseq_importer/core/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_importer/core/__init__.py rename to src/cdm_data_loaders/parsers/refseq_importer/core/__init__.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_importer/core/cdm_builders.py b/src/cdm_data_loaders/parsers/refseq_importer/core/cdm_builders.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_importer/core/cdm_builders.py rename to src/cdm_data_loaders/parsers/refseq_importer/core/cdm_builders.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_importer/core/datasets_api.py b/src/cdm_data_loaders/parsers/refseq_importer/core/datasets_api.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_importer/core/datasets_api.py rename to src/cdm_data_loaders/parsers/refseq_importer/core/datasets_api.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_importer/core/extractors.py b/src/cdm_data_loaders/parsers/refseq_importer/core/extractors.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_importer/core/extractors.py rename to src/cdm_data_loaders/parsers/refseq_importer/core/extractors.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_importer/core/spark_delta.py b/src/cdm_data_loaders/parsers/refseq_importer/core/spark_delta.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_importer/core/spark_delta.py rename to src/cdm_data_loaders/parsers/refseq_importer/core/spark_delta.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_importer/core/tables_finalize.py b/src/cdm_data_loaders/parsers/refseq_importer/core/tables_finalize.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_importer/core/tables_finalize.py rename to src/cdm_data_loaders/parsers/refseq_importer/core/tables_finalize.py 
diff --git a/src/cdm_data_loader_utils/parsers/refseq_importer/core/taxon_processing.py b/src/cdm_data_loaders/parsers/refseq_importer/core/taxon_processing.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_importer/core/taxon_processing.py rename to src/cdm_data_loaders/parsers/refseq_importer/core/taxon_processing.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/__init__.py b/src/cdm_data_loaders/parsers/refseq_pipeline/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/__init__.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/__init__.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/__init__.py b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/__init__.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/cli/__init__.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/compare_snapshots.py b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/compare_snapshots.py similarity index 97% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/compare_snapshots.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/cli/compare_snapshots.py index 288584b9..44c78b89 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/compare_snapshots.py +++ b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/compare_snapshots.py @@ -16,7 +16,7 @@ from delta import configure_spark_with_delta_pip from pyspark.sql import SparkSession -from cdm_data_loader_utils.parsers.refseq_pipeline.core.snapshot_utils import detect_updated_or_new_hashes_from_path +from cdm_data_loaders.parsers.refseq_pipeline.core.snapshot_utils import detect_updated_or_new_hashes_from_path def build_spark_session(app_name="Compare Snapshot Hashes") -> SparkSession: @@ -74,7 +74,7 @@ def run_compare_snapshots(spark, delta_path: Path, old_tag: str, new_tag: 
str): def main(database, table, old_tag, new_tag, output_dir): spark = build_spark_session() - project_root = Path("/global_share/alinawang/cdm-data-loader-utils") + project_root = Path("/global_share/alinawang/cdm-data-loaders") delta_path = project_root / "delta_data" / "refseq" / "refseq_api" / "assembly_hashes" print(f"[compare] Using Delta path: {delta_path}") diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/debug_parse_one_taxon.py b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/debug_parse_one_taxon.py similarity index 83% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/debug_parse_one_taxon.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/cli/debug_parse_one_taxon.py index 14c4beb4..48cf792c 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/debug_parse_one_taxon.py +++ b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/debug_parse_one_taxon.py @@ -1,9 +1,9 @@ import click -from cdm_data_loader_utils.parsers.refseq_pipeline.core.cdm_parse import parse_reports -from cdm_data_loader_utils.parsers.refseq_pipeline.core.datasets_api import fetch_reports_by_taxon -from cdm_data_loader_utils.parsers.refseq_pipeline.core.driver import process_and_write_reports -from cdm_data_loader_utils.parsers.refseq_pipeline.core.spark_delta import build_spark +from cdm_data_loaders.parsers.refseq_pipeline.core.cdm_parse import parse_reports +from cdm_data_loaders.parsers.refseq_pipeline.core.datasets_api import fetch_reports_by_taxon +from cdm_data_loaders.parsers.refseq_pipeline.core.driver import process_and_write_reports +from cdm_data_loaders.parsers.refseq_pipeline.core.spark_delta import build_spark """ @@ -12,7 +12,7 @@ --database refseq_api \ --table assembly_stats \ --mode overwrite \ - --data-dir /global_share/alinawang/cdm-data-loader-utils/delta_data \ + --data-dir /global_share/alinawang/cdm-data-loaders/delta_data \ --preview-only """ diff --git 
a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/debug_register.py b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/debug_register.py similarity index 72% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/debug_register.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/cli/debug_register.py index 6aa9df95..5b7b747e 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/debug_register.py +++ b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/debug_register.py @@ -1,15 +1,15 @@ from delta import configure_spark_with_delta_pip from pyspark.sql import SparkSession -from cdm_data_loader_utils.parsers.refseq_pipeline.core.spark_delta import read_delta_table, register_table +from cdm_data_loaders.parsers.refseq_pipeline.core.spark_delta import read_delta_table, register_table """ -PYTHONPATH=/global_share/alinawang/cdm-data-loader-utils/src/parsers python debug_register.py +PYTHONPATH=/global_share/alinawang/cdm-data-loaders/src/parsers python debug_register.py """ database = "refseq_api" table = "assembly_hashes" -delta_path = "/global_share/alinawang/cdm-data-loader-utils/delta_data/refseq/refseq_api/assembly_hashes" +delta_path = "/global_share/alinawang/cdm-data-loaders/delta_data/refseq/refseq_api/assembly_hashes" builder = ( SparkSession.builder.appName("Delta Table Inspector") diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/detect_updates.py b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/detect_updates.py similarity index 94% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/detect_updates.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/cli/detect_updates.py index 2de49dd0..c8d4b68e 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/detect_updates.py +++ b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/detect_updates.py @@ -3,8 +3,8 @@ import click -from cdm_data_loader_utils.parsers.refseq_pipeline.core.snapshot_utils import 
detect_updated_or_new_hashes_from_path -from cdm_data_loader_utils.parsers.refseq_pipeline.core.spark_delta import build_spark +from cdm_data_loaders.parsers.refseq_pipeline.core.snapshot_utils import detect_updated_or_new_hashes_from_path +from cdm_data_loaders.parsers.refseq_pipeline.core.spark_delta import build_spark """ We implemented a snapshot comparison tool for RefSeq assemblies. @@ -53,7 +53,7 @@ def main(database, table, old_tag, new_tag, output): # project_root = Path(__file__).resolve().parents[2] # delta_path = project_root / "delta_data" / "refseq" / database / table - project_root = Path("/global_share/alinawang/cdm-data-loader-utils") + project_root = Path("/global_share/alinawang/cdm-data-loaders") delta_path = project_root / "delta_data" / "refseq" / "refseq_api" / "assembly_hashes" print(f"[detect] Using Delta path: {delta_path}") diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/diff_changed_taxids.py b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/diff_changed_taxids.py similarity index 93% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/diff_changed_taxids.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/cli/diff_changed_taxids.py index fc035eac..8360260e 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/diff_changed_taxids.py +++ b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/diff_changed_taxids.py @@ -13,9 +13,9 @@ import click -from cdm_data_loader_utils.parsers.refseq_pipeline.core.hashes_diff import diff_hash_and_get_changed_taxids -from cdm_data_loader_utils.parsers.refseq_pipeline.core.refseq_io import load_refseq_assembly_index -from cdm_data_loader_utils.parsers.refseq_pipeline.core.spark_delta import build_spark +from cdm_data_loaders.parsers.refseq_pipeline.core.hashes_diff import diff_hash_and_get_changed_taxids +from cdm_data_loaders.parsers.refseq_pipeline.core.refseq_io import load_refseq_assembly_index +from 
cdm_data_loaders.parsers.refseq_pipeline.core.spark_delta import build_spark def run_diff_changed_taxids(database: str, hash_table: str, new_tag: str, old_tag: str) -> list[str]: @@ -28,7 +28,7 @@ def run_diff_changed_taxids(database: str, hash_table: str, new_tag: str, old_ta # PROJECT_ROOT = Path(__file__).resolve().parents[2] # delta_path = PROJECT_ROOT / "delta_data" / "refseq" / database / hash_table - project_root = Path("/global_share/alinawang/cdm-data-loader-utils") + project_root = Path("/global_share/alinawang/cdm-data-loaders") delta_path = project_root / "delta_data" / "refseq" / "refseq_api" / "assembly_hashes" if not delta_path.exists(): diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/fetch_taxon_reports.py b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/fetch_taxon_reports.py similarity index 93% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/fetch_taxon_reports.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/cli/fetch_taxon_reports.py index 294cbc6c..03301eaa 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/fetch_taxon_reports.py +++ b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/fetch_taxon_reports.py @@ -5,9 +5,9 @@ from pyspark.sql import functions as F from pyspark.sql import types as T -from cdm_data_loader_utils.parsers.refseq_pipeline.core.cdm_parse import parse_reports -from cdm_data_loader_utils.parsers.refseq_pipeline.core.datasets_api import fetch_reports_by_taxon -from cdm_data_loader_utils.parsers.refseq_pipeline.core.spark_delta import ( +from cdm_data_loaders.parsers.refseq_pipeline.core.cdm_parse import parse_reports +from cdm_data_loaders.parsers.refseq_pipeline.core.datasets_api import fetch_reports_by_taxon +from cdm_data_loaders.parsers.refseq_pipeline.core.spark_delta import ( build_spark, cleanup_after_write, write_delta_table, @@ -69,7 +69,7 @@ def main(database, table, taxids_json, mode, prefer_spark): spark: SparkSession = build_spark(database) 
num_success = 0 num_failed = 0 - DATA_DIR = "/global_share/alinawang/cdm-data-loader-utils/delta_data/refseq" + DATA_DIR = "/global_share/alinawang/cdm-data-loaders/delta_data/refseq" for i, tx in enumerate(taxids, 1): print(f"[fetch] Processing taxid {i}/{len(taxids)}: {tx}") diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/refseq_update_manager.py b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/refseq_update_manager.py similarity index 96% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/refseq_update_manager.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/cli/refseq_update_manager.py index 7a01d18f..816cd86e 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/refseq_update_manager.py +++ b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/refseq_update_manager.py @@ -4,8 +4,8 @@ import click -from cdm_data_loader_utils.parsers.refseq_pipeline.core.config import REFSEQ_ASSEMBLY_SUMMARY_URL -from cdm_data_loader_utils.parsers.refseq_pipeline.core.refseq_io import ( +from cdm_data_loaders.parsers.refseq_pipeline.core.config import REFSEQ_ASSEMBLY_SUMMARY_URL +from cdm_data_loaders.parsers.refseq_pipeline.core.refseq_io import ( download_text, normalize_multiline_text, text_sha256, @@ -26,7 +26,7 @@ """ -from cdm_data_loader_utils.parsers.refseq_pipeline.cli import detect_updates, snapshot_hashes +from cdm_data_loaders.parsers.refseq_pipeline.cli import detect_updates, snapshot_hashes logger = logging.getLogger(__name__) logging.basicConfig( diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/register_table.py b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/register_table.py similarity index 92% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/register_table.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/cli/register_table.py index b57b1f18..719bb771 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/register_table.py +++ 
b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/register_table.py @@ -2,7 +2,7 @@ import click -from cdm_data_loader_utils.parsers.refseq_pipeline.core.spark_delta import build_spark +from cdm_data_loaders.parsers.refseq_pipeline.core.spark_delta import build_spark @click.command() diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/save_index_tsv.py b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/save_index_tsv.py similarity index 91% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/save_index_tsv.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/cli/save_index_tsv.py index 4f464f92..d5b05453 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/save_index_tsv.py +++ b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/save_index_tsv.py @@ -3,8 +3,8 @@ import click -from cdm_data_loader_utils.parsers.refseq_pipeline.core.config import REFSEQ_ASM_REPORTS -from cdm_data_loader_utils.parsers.refseq_pipeline.core.refseq_io import download_text +from cdm_data_loaders.parsers.refseq_pipeline.core.config import REFSEQ_ASM_REPORTS +from cdm_data_loaders.parsers.refseq_pipeline.core.refseq_io import download_text def save_assembly_index(destination_path: str) -> bool: diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/snapshot_hashes.py b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/snapshot_hashes.py similarity index 93% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/snapshot_hashes.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/cli/snapshot_hashes.py index d908d695..5b514842 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/cli/snapshot_hashes.py +++ b/src/cdm_data_loaders/parsers/refseq_pipeline/cli/snapshot_hashes.py @@ -5,9 +5,9 @@ import click from pyspark.sql import functions as F -from cdm_data_loader_utils.parsers.refseq_pipeline.core.hashes_snapshot import snapshot_hashes_for_accessions -from 
cdm_data_loader_utils.parsers.refseq_pipeline.core.refseq_io import parse_assembly_summary -from cdm_data_loader_utils.parsers.refseq_pipeline.core.spark_delta import build_spark, write_delta_table +from cdm_data_loaders.parsers.refseq_pipeline.core.hashes_snapshot import snapshot_hashes_for_accessions +from cdm_data_loaders.parsers.refseq_pipeline.core.refseq_io import parse_assembly_summary +from cdm_data_loaders.parsers.refseq_pipeline.core.spark_delta import build_spark, write_delta_table # --------------------------- diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/__init__.py b/src/cdm_data_loaders/parsers/refseq_pipeline/core/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/core/__init__.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/core/__init__.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/cdm_parse.py b/src/cdm_data_loaders/parsers/refseq_pipeline/core/cdm_parse.py similarity index 97% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/core/cdm_parse.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/core/cdm_parse.py index 4a947aef..6ff8c172 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/cdm_parse.py +++ b/src/cdm_data_loaders/parsers/refseq_pipeline/core/cdm_parse.py @@ -3,7 +3,7 @@ from pyspark.sql import Row -from cdm_data_loader_utils.parsers.refseq_pipeline.core.config import CDM_NAMESPACE, CDM_SCHEMA, EXPECTED_COLS +from cdm_data_loaders.parsers.refseq_pipeline.core.config import CDM_NAMESPACE, CDM_SCHEMA, EXPECTED_COLS if TYPE_CHECKING: from pyspark.sql import DataFrame as SparkDF diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/config.py b/src/cdm_data_loaders/parsers/refseq_pipeline/core/config.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/core/config.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/core/config.py diff --git 
a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/datasets_api.py b/src/cdm_data_loaders/parsers/refseq_pipeline/core/datasets_api.py similarity index 97% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/core/datasets_api.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/core/datasets_api.py index 5d70ad62..57c84d67 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/datasets_api.py +++ b/src/cdm_data_loaders/parsers/refseq_pipeline/core/datasets_api.py @@ -4,7 +4,7 @@ import requests -from cdm_data_loader_utils.parsers.refseq_pipeline.core.config import NCBI_BASE_V2 +from cdm_data_loaders.parsers.refseq_pipeline.core.config import NCBI_BASE_V2 # ------------------------------- # Logging setup diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/debug_snapshot.py b/src/cdm_data_loaders/parsers/refseq_pipeline/core/debug_snapshot.py similarity index 80% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/core/debug_snapshot.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/core/debug_snapshot.py index 8d905e17..147b4b9a 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/debug_snapshot.py +++ b/src/cdm_data_loaders/parsers/refseq_pipeline/core/debug_snapshot.py @@ -1,11 +1,11 @@ from pyspark.sql import functions as F -from cdm_data_loader_utils.parsers.refseq_pipeline.core.hashes_snapshot import ( +from cdm_data_loaders.parsers.refseq_pipeline.core.hashes_snapshot import ( snapshot_hashes_for_accessions, write_hash_snapshot, ) -from cdm_data_loader_utils.parsers.refseq_pipeline.core.refseq_io import load_refseq_assembly_index -from cdm_data_loader_utils.parsers.refseq_pipeline.core.spark_delta import build_spark +from cdm_data_loaders.parsers.refseq_pipeline.core.refseq_io import load_refseq_assembly_index +from cdm_data_loaders.parsers.refseq_pipeline.core.spark_delta import build_spark ## python -m refseq_pipeline.core.debug_snapshot ## diff --git 
a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/driver.py b/src/cdm_data_loaders/parsers/refseq_pipeline/core/driver.py similarity index 86% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/core/driver.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/core/driver.py index 200e06fa..b44c4e9a 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/driver.py +++ b/src/cdm_data_loaders/parsers/refseq_pipeline/core/driver.py @@ -2,8 +2,8 @@ from pyspark.sql import SparkSession -from cdm_data_loader_utils.parsers.refseq_pipeline.core.cdm_parse import parse_reports -from cdm_data_loader_utils.parsers.refseq_pipeline.core.spark_delta import cleanup_after_write, write_delta_table +from cdm_data_loaders.parsers.refseq_pipeline.core.cdm_parse import parse_reports +from cdm_data_loaders.parsers.refseq_pipeline.core.spark_delta import cleanup_after_write, write_delta_table def process_and_write_reports( diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/hashes_diff.py b/src/cdm_data_loaders/parsers/refseq_pipeline/core/hashes_diff.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/core/hashes_diff.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/core/hashes_diff.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/hashes_snapshot.py b/src/cdm_data_loaders/parsers/refseq_pipeline/core/hashes_snapshot.py similarity index 94% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/core/hashes_snapshot.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/core/hashes_snapshot.py index a40aaca9..2aef1b32 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/hashes_snapshot.py +++ b/src/cdm_data_loaders/parsers/refseq_pipeline/core/hashes_snapshot.py @@ -4,13 +4,13 @@ from pyspark.sql import SparkSession from pyspark.sql.types import StringType, StructField, StructType -from cdm_data_loader_utils.parsers.refseq_pipeline.core.config import 
DEFAULT_HASH_TABLE -from cdm_data_loader_utils.parsers.refseq_pipeline.core.refseq_io import ( +from cdm_data_loaders.parsers.refseq_pipeline.core.config import DEFAULT_HASH_TABLE +from cdm_data_loaders.parsers.refseq_pipeline.core.refseq_io import ( fetch_annotation_hash, fetch_md5_checksums, text_sha256, ) -from cdm_data_loader_utils.parsers.refseq_pipeline.core.spark_delta import write_delta_table +from cdm_data_loaders.parsers.refseq_pipeline.core.spark_delta import write_delta_table # Delta schema for hash snapshots HASH_SCHEMA = StructType( diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/refseq_io.py b/src/cdm_data_loaders/parsers/refseq_pipeline/core/refseq_io.py similarity index 98% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/core/refseq_io.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/core/refseq_io.py index bc4c1a88..6b139a8a 100644 --- a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/refseq_io.py +++ b/src/cdm_data_loaders/parsers/refseq_pipeline/core/refseq_io.py @@ -6,7 +6,7 @@ import requests from pyspark.sql import DataFrame, SparkSession -from cdm_data_loader_utils.parsers.refseq_pipeline.core.config import REFSEQ_ASSEMBLY_SUMMARY_URL +from cdm_data_loaders.parsers.refseq_pipeline.core.config import REFSEQ_ASSEMBLY_SUMMARY_URL """ python -m refseq_pipeline.core.refseq_io diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/snapshot_utils.py b/src/cdm_data_loaders/parsers/refseq_pipeline/core/snapshot_utils.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/core/snapshot_utils.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/core/snapshot_utils.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/core/spark_delta.py b/src/cdm_data_loaders/parsers/refseq_pipeline/core/spark_delta.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/core/spark_delta.py rename to 
src/cdm_data_loaders/parsers/refseq_pipeline/core/spark_delta.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/utils/__init__.py b/src/cdm_data_loaders/parsers/refseq_pipeline/utils/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/utils/__init__.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/utils/__init__.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/utils/common.py b/src/cdm_data_loaders/parsers/refseq_pipeline/utils/common.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/utils/common.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/utils/common.py diff --git a/src/cdm_data_loader_utils/parsers/refseq_pipeline/utils/logging.py b/src/cdm_data_loaders/parsers/refseq_pipeline/utils/logging.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq_pipeline/utils/logging.py rename to src/cdm_data_loaders/parsers/refseq_pipeline/utils/logging.py diff --git a/src/cdm_data_loader_utils/parsers/uniprot/__init__.py b/src/cdm_data_loaders/parsers/uniprot/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/uniprot/__init__.py rename to src/cdm_data_loaders/parsers/uniprot/__init__.py diff --git a/src/cdm_data_loader_utils/parsers/uniprot/idmapping.py b/src/cdm_data_loaders/parsers/uniprot/idmapping.py similarity index 89% rename from src/cdm_data_loader_utils/parsers/uniprot/idmapping.py rename to src/cdm_data_loaders/parsers/uniprot/idmapping.py index ac3e4466..0e54d5e7 100644 --- a/src/cdm_data_loader_utils/parsers/uniprot/idmapping.py +++ b/src/cdm_data_loaders/parsers/uniprot/idmapping.py @@ -31,14 +31,14 @@ from pyspark.sql import functions as sf from pyspark.sql.types import StringType, StructField -from cdm_data_loader_utils.core.constants import CDM_LAKE_S3, INVALID_DATA_FIELD_NAME -from cdm_data_loader_utils.core.pipeline_run import PipelineRun -from 
cdm_data_loader_utils.readers.dsv import read -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger -from cdm_data_loader_utils.utils.minio import list_remote_dir_contents -from cdm_data_loader_utils.utils.spark_delta import APPEND, set_up_workspace, write_delta -from cdm_data_loader_utils.validation.dataframe_validator import DataFrameValidator, Validator -from cdm_data_loader_utils.validation.df_nullable_fields import validate as check_nullable_fields +from cdm_data_loaders.core.constants import CDM_LAKE_S3, INVALID_DATA_FIELD_NAME +from cdm_data_loaders.core.pipeline_run import PipelineRun +from cdm_data_loaders.readers.dsv import read +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.minio import list_remote_dir_contents +from cdm_data_loaders.utils.spark_delta import APPEND, set_up_workspace, write_delta +from cdm_data_loaders.validation.dataframe_validator import DataFrameValidator, Validator +from cdm_data_loaders.validation.df_nullable_fields import validate as check_nullable_fields APP_NAME = "uniprot_idmapping" NOW = datetime.datetime.now(tz=datetime.UTC) diff --git a/src/cdm_data_loader_utils/parsers/uniprot/metalink.py b/src/cdm_data_loaders/parsers/uniprot/metalink.py similarity index 97% rename from src/cdm_data_loader_utils/parsers/uniprot/metalink.py rename to src/cdm_data_loaders/parsers/uniprot/metalink.py index 45c1a282..601d497a 100644 --- a/src/cdm_data_loader_utils/parsers/uniprot/metalink.py +++ b/src/cdm_data_loaders/parsers/uniprot/metalink.py @@ -12,7 +12,7 @@ from defusedxml.ElementTree import parse -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger NS = {"": "http://www.metalinker.org/"} NOW = datetime.datetime.now(tz=datetime.UTC) diff --git a/src/cdm_data_loader_utils/parsers/uniprot/relnotes.py b/src/cdm_data_loaders/parsers/uniprot/relnotes.py similarity index 98% rename from 
src/cdm_data_loader_utils/parsers/uniprot/relnotes.py rename to src/cdm_data_loaders/parsers/uniprot/relnotes.py index 9a38125f..6a14f098 100644 --- a/src/cdm_data_loader_utils/parsers/uniprot/relnotes.py +++ b/src/cdm_data_loaders/parsers/uniprot/relnotes.py @@ -23,7 +23,7 @@ from pathlib import Path from typing import Any -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger RELEASE_VERSION_DATE: re.Pattern[str] = re.compile( r"is pleased to announce UniProt Knowledgebase \(UniProtKB\) Release\s+(\w+) \((\d{1,2}-[a-zA-Z]+-\d{4})\)\." diff --git a/src/cdm_data_loader_utils/parsers/uniprot/uniprot_kb.py b/src/cdm_data_loaders/parsers/uniprot/uniprot_kb.py similarity index 98% rename from src/cdm_data_loader_utils/parsers/uniprot/uniprot_kb.py rename to src/cdm_data_loaders/parsers/uniprot/uniprot_kb.py index d34fde6f..399d84e2 100644 --- a/src/cdm_data_loader_utils/parsers/uniprot/uniprot_kb.py +++ b/src/cdm_data_loaders/parsers/uniprot/uniprot_kb.py @@ -10,9 +10,9 @@ from lxml.etree import Element, tounicode -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger -from cdm_data_loader_utils.utils.helpers import _ensembl_type -from cdm_data_loader_utils.utils.xml_utils import get_text +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.helpers import _ensembl_type +from cdm_data_loaders.utils.xml_utils import get_text CONTENT = "content" DB = "db" diff --git a/src/cdm_data_loader_utils/parsers/uniprot/uniref.py b/src/cdm_data_loaders/parsers/uniprot/uniref.py similarity index 98% rename from src/cdm_data_loader_utils/parsers/uniprot/uniref.py rename to src/cdm_data_loaders/parsers/uniprot/uniref.py index f5a26bf7..73ac7200 100644 --- a/src/cdm_data_loader_utils/parsers/uniprot/uniref.py +++ b/src/cdm_data_loaders/parsers/uniprot/uniref.py @@ -18,7 +18,7 @@ from lxml.etree import Element, tounicode -from cdm_data_loader_utils.utils.cdm_logger 
import get_cdm_logger +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger UNIREF_URL = "http://uniprot.org/uniref" UNIREF_URL_BRKT = f"{{{UNIREF_URL}}}" diff --git a/src/cdm_data_loader_utils/pipelines/__init__.py b/src/cdm_data_loaders/pipelines/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/pipelines/__init__.py rename to src/cdm_data_loaders/pipelines/__init__.py diff --git a/src/cdm_data_loader_utils/pipelines/uniprot_kb_pipeline.py b/src/cdm_data_loaders/pipelines/uniprot_kb_pipeline.py similarity index 94% rename from src/cdm_data_loader_utils/pipelines/uniprot_kb_pipeline.py rename to src/cdm_data_loaders/pipelines/uniprot_kb_pipeline.py index b58a3e77..df9e6bbe 100644 --- a/src/cdm_data_loader_utils/pipelines/uniprot_kb_pipeline.py +++ b/src/cdm_data_loaders/pipelines/uniprot_kb_pipeline.py @@ -11,9 +11,9 @@ from pydantic import Field from pydantic_settings import BaseSettings -from cdm_data_loader_utils.parsers.uniprot.uniprot_kb import ENTRY_XML_TAG, parse_uniprot_entry -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger -from cdm_data_loader_utils.utils.xml_utils import stream_xml_file +from cdm_data_loaders.parsers.uniprot.uniprot_kb import ENTRY_XML_TAG, parse_uniprot_entry +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.xml_utils import stream_xml_file logger = get_cdm_logger() diff --git a/src/cdm_data_loader_utils/pipelines/uniref_pipeline.py b/src/cdm_data_loaders/pipelines/uniref_pipeline.py similarity index 94% rename from src/cdm_data_loader_utils/pipelines/uniref_pipeline.py rename to src/cdm_data_loaders/pipelines/uniref_pipeline.py index e123e703..4e77503b 100644 --- a/src/cdm_data_loader_utils/pipelines/uniref_pipeline.py +++ b/src/cdm_data_loaders/pipelines/uniref_pipeline.py @@ -11,9 +11,9 @@ from pydantic import Field from pydantic_settings import BaseSettings -from cdm_data_loader_utils.parsers.uniprot.uniref import UNIREF_URL, UNIREF_VARIANTS, 
parse_uniref_entry -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger -from cdm_data_loader_utils.utils.xml_utils import stream_xml_file +from cdm_data_loaders.parsers.uniprot.uniref import UNIREF_URL, UNIREF_VARIANTS, parse_uniref_entry +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.xml_utils import stream_xml_file logger = get_cdm_logger() diff --git a/src/cdm_data_loader_utils/readers/__init__.py b/src/cdm_data_loaders/readers/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/readers/__init__.py rename to src/cdm_data_loaders/readers/__init__.py diff --git a/src/cdm_data_loader_utils/readers/dsv.py b/src/cdm_data_loaders/readers/dsv.py similarity index 96% rename from src/cdm_data_loader_utils/readers/dsv.py rename to src/cdm_data_loaders/readers/dsv.py index 0715d793..04712bbd 100644 --- a/src/cdm_data_loader_utils/readers/dsv.py +++ b/src/cdm_data_loaders/readers/dsv.py @@ -5,8 +5,8 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StringType, StructField, StructType -from cdm_data_loader_utils.core.constants import INVALID_DATA_FIELD_NAME -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger, log_and_die +from cdm_data_loaders.core.constants import INVALID_DATA_FIELD_NAME +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger, log_and_die # mapping of delimiters to format names (for logging) # spark defaults to separating on commas if nothing is specified diff --git a/src/cdm_data_loader_utils/transformers/__init__.py b/src/cdm_data_loaders/transformers/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/transformers/__init__.py rename to src/cdm_data_loaders/transformers/__init__.py diff --git a/src/cdm_data_loader_utils/transformers/genome_depot/__init__.py b/src/cdm_data_loaders/transformers/genome_depot/__init__.py similarity index 100% rename from 
src/cdm_data_loader_utils/transformers/genome_depot/__init__.py rename to src/cdm_data_loaders/transformers/genome_depot/__init__.py diff --git a/src/cdm_data_loader_utils/transformers/genome_depot/schema.py b/src/cdm_data_loaders/transformers/genome_depot/schema.py similarity index 100% rename from src/cdm_data_loader_utils/transformers/genome_depot/schema.py rename to src/cdm_data_loaders/transformers/genome_depot/schema.py diff --git a/src/cdm_data_loader_utils/utils/__init__.py b/src/cdm_data_loaders/utils/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/utils/__init__.py rename to src/cdm_data_loaders/utils/__init__.py diff --git a/src/cdm_data_loader_utils/utils/calculate_hash.py b/src/cdm_data_loaders/utils/calculate_hash.py similarity index 98% rename from src/cdm_data_loader_utils/utils/calculate_hash.py rename to src/cdm_data_loaders/utils/calculate_hash.py index d25a102a..ed25c7ee 100644 --- a/src/cdm_data_loader_utils/utils/calculate_hash.py +++ b/src/cdm_data_loaders/utils/calculate_hash.py @@ -4,7 +4,7 @@ import hashlib from typing import Any -from cdm_data_loader_utils.parsers.fasta import read_fasta +from cdm_data_loaders.parsers.fasta import read_fasta def _hash_string(s: str) -> str: diff --git a/src/cdm_data_loader_utils/utils/cdm_logger.py b/src/cdm_data_loaders/utils/cdm_logger.py similarity index 100% rename from src/cdm_data_loader_utils/utils/cdm_logger.py rename to src/cdm_data_loaders/utils/cdm_logger.py diff --git a/src/cdm_data_loader_utils/utils/download/__init__.py b/src/cdm_data_loaders/utils/download/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/utils/download/__init__.py rename to src/cdm_data_loaders/utils/download/__init__.py diff --git a/src/cdm_data_loader_utils/utils/download/async_client.py b/src/cdm_data_loaders/utils/download/async_client.py similarity index 98% rename from src/cdm_data_loader_utils/utils/download/async_client.py rename to 
src/cdm_data_loaders/utils/download/async_client.py index aea4c161..09a9a4d8 100644 --- a/src/cdm_data_loader_utils/utils/download/async_client.py +++ b/src/cdm_data_loaders/utils/download/async_client.py @@ -32,8 +32,8 @@ wait_exponential, ) -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger -from cdm_data_loader_utils.utils.download.core import ( +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.download.core import ( DownloadCore, DownloadError, NonRetryableDownloadError, diff --git a/src/cdm_data_loader_utils/utils/download/core.py b/src/cdm_data_loaders/utils/download/core.py similarity index 98% rename from src/cdm_data_loader_utils/utils/download/core.py rename to src/cdm_data_loaders/utils/download/core.py index 819b83b6..1a189f4b 100644 --- a/src/cdm_data_loader_utils/utils/download/core.py +++ b/src/cdm_data_loaders/utils/download/core.py @@ -8,7 +8,7 @@ import httpx -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger, log_and_die +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger, log_and_die logger: logging.Logger = get_cdm_logger() diff --git a/src/cdm_data_loader_utils/utils/download/sync_client.py b/src/cdm_data_loaders/utils/download/sync_client.py similarity index 98% rename from src/cdm_data_loader_utils/utils/download/sync_client.py rename to src/cdm_data_loaders/utils/download/sync_client.py index 1070ba87..0f3c1369 100644 --- a/src/cdm_data_loader_utils/utils/download/sync_client.py +++ b/src/cdm_data_loaders/utils/download/sync_client.py @@ -32,8 +32,8 @@ wait_exponential, ) -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger -from cdm_data_loader_utils.utils.download.core import ( +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.download.core import ( DownloadCore, DownloadError, NonRetryableDownloadError, diff --git a/src/cdm_data_loader_utils/utils/gz.py b/src/cdm_data_loaders/utils/gz.py similarity 
index 96% rename from src/cdm_data_loader_utils/utils/gz.py rename to src/cdm_data_loaders/utils/gz.py index 332deb85..00bac119 100644 --- a/src/cdm_data_loader_utils/utils/gz.py +++ b/src/cdm_data_loaders/utils/gz.py @@ -6,7 +6,7 @@ import click -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger logger = get_cdm_logger() diff --git a/src/cdm_data_loader_utils/utils/helpers.py b/src/cdm_data_loaders/utils/helpers.py similarity index 100% rename from src/cdm_data_loader_utils/utils/helpers.py rename to src/cdm_data_loaders/utils/helpers.py diff --git a/src/cdm_data_loader_utils/utils/minio.py b/src/cdm_data_loaders/utils/minio.py similarity index 100% rename from src/cdm_data_loader_utils/utils/minio.py rename to src/cdm_data_loaders/utils/minio.py diff --git a/src/cdm_data_loader_utils/utils/spark_delta.py b/src/cdm_data_loaders/utils/spark_delta.py similarity index 99% rename from src/cdm_data_loader_utils/utils/spark_delta.py rename to src/cdm_data_loaders/utils/spark_delta.py index 2b1482c2..e2422d77 100644 --- a/src/cdm_data_loader_utils/utils/spark_delta.py +++ b/src/cdm_data_loaders/utils/spark_delta.py @@ -4,7 +4,7 @@ from berdl_notebook_utils.spark.database import create_namespace_if_not_exists from pyspark.sql import DataFrame, DataFrameWriter, SparkSession -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger logger = get_cdm_logger() diff --git a/src/cdm_data_loader_utils/utils/xml_utils.py b/src/cdm_data_loaders/utils/xml_utils.py similarity index 98% rename from src/cdm_data_loader_utils/utils/xml_utils.py rename to src/cdm_data_loaders/utils/xml_utils.py index c844974c..57d395d3 100644 --- a/src/cdm_data_loader_utils/utils/xml_utils.py +++ b/src/cdm_data_loaders/utils/xml_utils.py @@ -17,7 +17,7 @@ from lxml.etree import Element, iterparse -from cdm_data_loader_utils.utils.cdm_logger import 
get_cdm_logger +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger def get_text(elem: Element | None, default: str | None = None) -> str | None: diff --git a/src/cdm_data_loader_utils/validation/__init__.py b/src/cdm_data_loaders/validation/__init__.py similarity index 100% rename from src/cdm_data_loader_utils/validation/__init__.py rename to src/cdm_data_loaders/validation/__init__.py diff --git a/src/cdm_data_loader_utils/validation/dataframe_validator.py b/src/cdm_data_loaders/validation/dataframe_validator.py similarity index 87% rename from src/cdm_data_loader_utils/validation/dataframe_validator.py rename to src/cdm_data_loaders/validation/dataframe_validator.py index 5e0243bd..51b1e08a 100644 --- a/src/cdm_data_loader_utils/validation/dataframe_validator.py +++ b/src/cdm_data_loaders/validation/dataframe_validator.py @@ -8,12 +8,12 @@ from pyspark.sql import functions as sf from pyspark.sql.types import StructField -from cdm_data_loader_utils.audit.metrics import write_metrics -from cdm_data_loader_utils.audit.rejects import write_rejects -from cdm_data_loader_utils.audit.schema import ROW_ERRORS -from cdm_data_loader_utils.core.pipeline_run import PipelineRun -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger -from cdm_data_loader_utils.validation.validation_result import ValidationResult +from cdm_data_loaders.audit.metrics import write_metrics +from cdm_data_loaders.audit.rejects import write_rejects +from cdm_data_loaders.audit.schema import ROW_ERRORS +from cdm_data_loaders.core.pipeline_run import PipelineRun +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.validation.validation_result import ValidationResult logger = get_cdm_logger() diff --git a/src/cdm_data_loader_utils/validation/df_nullable_fields.py b/src/cdm_data_loaders/validation/df_nullable_fields.py similarity index 97% rename from src/cdm_data_loader_utils/validation/df_nullable_fields.py rename to 
src/cdm_data_loaders/validation/df_nullable_fields.py index b885a7ff..72d184aa 100644 --- a/src/cdm_data_loader_utils/validation/df_nullable_fields.py +++ b/src/cdm_data_loaders/validation/df_nullable_fields.py @@ -4,7 +4,7 @@ from pyspark.sql import functions as sf from pyspark.sql.types import StructField -from cdm_data_loader_utils.audit.schema import ROW_ERRORS +from cdm_data_loaders.audit.schema import ROW_ERRORS COLLECTED_ERRORS = "collected_errors" diff --git a/src/cdm_data_loader_utils/validation/validation_result.py b/src/cdm_data_loaders/validation/validation_result.py similarity index 100% rename from src/cdm_data_loader_utils/validation/validation_result.py rename to src/cdm_data_loaders/validation/validation_result.py diff --git a/tests/audit/conftest.py b/tests/audit/conftest.py index 4e2990c5..4f389ca3 100644 --- a/tests/audit/conftest.py +++ b/tests/audit/conftest.py @@ -5,7 +5,7 @@ from pyspark.sql import SparkSession -from cdm_data_loader_utils.audit.schema import ( +from cdm_data_loaders.audit.schema import ( AUDIT_SCHEMA, CHECKPOINT, END_TIME, diff --git a/tests/audit/test_checkpoint.py b/tests/audit/test_checkpoint.py index 27d7d45c..466a17b0 100644 --- a/tests/audit/test_checkpoint.py +++ b/tests/audit/test_checkpoint.py @@ -6,8 +6,8 @@ import pytest from pyspark.sql import SparkSession -from cdm_data_loader_utils.audit.checkpoint import load_checkpoint, update_checkpoint_status, upsert_checkpoint -from cdm_data_loader_utils.audit.schema import ( +from cdm_data_loaders.audit.checkpoint import load_checkpoint, update_checkpoint_status, upsert_checkpoint +from cdm_data_loaders.audit.schema import ( CHECKPOINT, LAST_ENTRY_ID, PIPELINE, @@ -19,7 +19,7 @@ STATUS_RUNNING, UPDATED, ) -from cdm_data_loader_utils.core.pipeline_run import PipelineRun +from cdm_data_loaders.core.pipeline_run import PipelineRun from tests.audit.conftest import DEFAULT_DATA, check_saved_data, create_table from tests.conftest import PIPELINE_RUN, TEST_NS diff --git 
a/tests/audit/test_metrics.py b/tests/audit/test_metrics.py index e5c24d36..2a92ac32 100644 --- a/tests/audit/test_metrics.py +++ b/tests/audit/test_metrics.py @@ -8,8 +8,8 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructType -from cdm_data_loader_utils.audit.metrics import write_metrics -from cdm_data_loader_utils.audit.schema import ( +from cdm_data_loaders.audit.metrics import write_metrics +from cdm_data_loaders.audit.schema import ( METRICS, N_INVALID, N_READ, @@ -19,7 +19,7 @@ SOURCE, VALIDATION_ERRORS, ) -from cdm_data_loader_utils.core.pipeline_run import PipelineRun +from cdm_data_loaders.core.pipeline_run import PipelineRun from tests.audit.conftest import ( DEFAULT_DATA, INIT_TIMESTAMP_FIELDS, diff --git a/tests/audit/test_rejects.py b/tests/audit/test_rejects.py index 6a80e94a..5c66468b 100644 --- a/tests/audit/test_rejects.py +++ b/tests/audit/test_rejects.py @@ -10,8 +10,8 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StringType, StructField, StructType -from cdm_data_loader_utils.audit.rejects import write_rejects -from cdm_data_loader_utils.audit.schema import ( +from cdm_data_loaders.audit.rejects import write_rejects +from cdm_data_loaders.audit.schema import ( PARSED_ROW, PIPELINE, RAW_ROW, @@ -21,8 +21,8 @@ SOURCE, TIMESTAMP, ) -from cdm_data_loader_utils.core.constants import INVALID_DATA_FIELD_NAME -from cdm_data_loader_utils.core.pipeline_run import PipelineRun +from cdm_data_loaders.core.constants import INVALID_DATA_FIELD_NAME +from cdm_data_loaders.core.pipeline_run import PipelineRun from tests.audit.conftest import create_table n_rows_per_file = 4 diff --git a/tests/audit/test_run.py b/tests/audit/test_run.py index 54b96d1a..d825937b 100644 --- a/tests/audit/test_run.py +++ b/tests/audit/test_run.py @@ -6,8 +6,8 @@ import pytest from pyspark.sql import SparkSession -from cdm_data_loader_utils.audit.run import complete_run, fail_run, start_run -from 
cdm_data_loader_utils.audit.schema import ( +from cdm_data_loaders.audit.run import complete_run, fail_run, start_run +from cdm_data_loaders.audit.schema import ( END_TIME, ERROR, PIPELINE, @@ -20,7 +20,7 @@ STATUS_ERROR, STATUS_SUCCESS, ) -from cdm_data_loader_utils.core.pipeline_run import PipelineRun +from cdm_data_loaders.core.pipeline_run import PipelineRun from tests.audit.conftest import DEFAULT_DATA, check_saved_data, create_table from tests.conftest import PIPELINE_RUN, TEST_NS diff --git a/tests/audit/test_schema.py b/tests/audit/test_schema.py index 1bf179a9..3d36171e 100644 --- a/tests/audit/test_schema.py +++ b/tests/audit/test_schema.py @@ -2,8 +2,8 @@ import pytest -from cdm_data_loader_utils.audit.schema import PIPELINE, RUN_ID, SOURCE, current_run_expr, match_run -from cdm_data_loader_utils.core.pipeline_run import PipelineRun +from cdm_data_loaders.audit.schema import PIPELINE, RUN_ID, SOURCE, current_run_expr, match_run +from cdm_data_loaders.core.pipeline_run import PipelineRun from tests.audit.conftest import PIPELINE_RUN diff --git a/tests/conftest.py b/tests/conftest.py index bf77c6bb..37d704b9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,16 +22,16 @@ StructType, ) -from cdm_data_loader_utils.audit.schema import ( +from cdm_data_loaders.audit.schema import ( NAMESPACE, PIPELINE, ROW_ERRORS, RUN_ID, SOURCE, ) -from cdm_data_loader_utils.core.pipeline_run import PipelineRun -from cdm_data_loader_utils.readers.dsv import INVALID_DATA_FIELD -from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.core.pipeline_run import PipelineRun +from cdm_data_loaders.readers.dsv import INVALID_DATA_FIELD +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger SAVE_DIR = "spark.sql.warehouse.dir" diff --git a/tests/data/example_files/stats.json b/tests/data/example_files/stats.json index 9b67f048..e0cf637e 100644 --- a/tests/data/example_files/stats.json +++ b/tests/data/example_files/stats.json @@ -24,7 
+24,7 @@ "scaf_pct_gt50K": 94.60626, "gc_avg": 0.65985, "gc_std": 0.02052, - "filename": "/home/runner/work/cdm-data-loader-utils/cdm-data-loader-utils/tests/data/FW305-3-2-15-C-TSA1/FW305-3-2-15-C-TSA1_scaffolds.fna" + "filename": "/home/runner/work/cdm-data-loaders/cdm-data-loaders/tests/data/FW305-3-2-15-C-TSA1/FW305-3-2-15-C-TSA1_scaffolds.fna" } { "scaffolds": 28, @@ -52,5 +52,5 @@ "scaf_pct_gt50K": 95.694664, "gc_avg": 0.69368, "gc_std": 0.03501, - "filename": "/home/runner/work/cdm-data-loader-utils/cdm-data-loader-utils/tests/data/FW305-C-112.1/FW305-C-112.1_scaffolds.fna" + "filename": "/home/runner/work/cdm-data-loaders/cdm-data-loaders/tests/data/FW305-C-112.1/FW305-C-112.1_scaffolds.fna" } diff --git a/tests/data/results_multi/stats.json b/tests/data/results_multi/stats.json index 1045e732..16ca3770 100644 --- a/tests/data/results_multi/stats.json +++ b/tests/data/results_multi/stats.json @@ -3,12 +3,12 @@ "contigs": 61, "gc_avg": 0.65985, "gc_std": 0.02052, - "filename": "/home/runner/work/cdm-data-loader-utils/cdm-data-loader-utils/tests/data/FW305-3-2-15-C-TSA1/FW305-3-2-15-C-TSA1_scaffolds.fna" + "filename": "/home/runner/work/cdm-data-loaders/cdm-data-loaders/tests/data/FW305-3-2-15-C-TSA1/FW305-3-2-15-C-TSA1_scaffolds.fna" } { "scaffolds": 28, "contigs": 31, "gc_avg": 0.69368, "gc_std": 0.03501, - "filename": "/home/runner/work/cdm-data-loader-utils/cdm-data-loader-utils/tests/data/FW305-C-112.1/FW305-C-112.1_scaffolds.fna" + "filename": "/home/runner/work/cdm-data-loaders/cdm-data-loaders/tests/data/FW305-C-112.1/FW305-C-112.1_scaffolds.fna" } diff --git a/tests/data/results_single/stats.json b/tests/data/results_single/stats.json index 77c69432..8a201e96 100644 --- a/tests/data/results_single/stats.json +++ b/tests/data/results_single/stats.json @@ -3,5 +3,5 @@ "contigs": 61, "gc_avg": 0.65985, "gc_std": 0.02052, - "filename": 
"/home/runner/work/cdm-data-loader-utils/cdm-data-loader-utils/tests/data/FW305-3-2-15-C-TSA1/FW305-3-2-15-C-TSA1_scaffolds.fna" + "filename": "/home/runner/work/cdm-data-loaders/cdm-data-loaders/tests/data/FW305-3-2-15-C-TSA1/FW305-3-2-15-C-TSA1_scaffolds.fna" } diff --git a/tests/parsers/conftest.py b/tests/parsers/conftest.py index fa41f7a2..28939abb 100644 --- a/tests/parsers/conftest.py +++ b/tests/parsers/conftest.py @@ -10,14 +10,14 @@ "contigs": 61, "gc_avg": 0.65985, "gc_std": 0.02052, - "filename": "/home/runner/work/cdm-data-loader-utils/cdm-data-loader-utils/tests/data/FW305-3-2-15-C-TSA1/FW305-3-2-15-C-TSA1_scaffolds.fna", + "filename": "/home/runner/work/cdm-data-loaders/cdm-data-loaders/tests/data/FW305-3-2-15-C-TSA1/FW305-3-2-15-C-TSA1_scaffolds.fna", }, "FW305-C-112.1_scaffolds.fna": { "scaffolds": 28, "contigs": 31, "gc_avg": 0.69368, "gc_std": 0.03501, - "filename": "/home/runner/work/cdm-data-loader-utils/cdm-data-loader-utils/tests/data/FW305-C-112.1/FW305-C-112.1_scaffolds.fna", + "filename": "/home/runner/work/cdm-data-loaders/cdm-data-loaders/tests/data/FW305-C-112.1/FW305-C-112.1_scaffolds.fna", }, } diff --git a/tests/parsers/refseq_importer/test_cdm_builders.py b/tests/parsers/refseq_importer/test_cdm_builders.py index 4cb87729..16a12c2d 100644 --- a/tests/parsers/refseq_importer/test_cdm_builders.py +++ b/tests/parsers/refseq_importer/test_cdm_builders.py @@ -1,7 +1,7 @@ import pytest from pyspark.sql import SparkSession -from cdm_data_loader_utils.parsers.refseq_importer.core.cdm_builders import ( +from cdm_data_loaders.parsers.refseq_importer.core.cdm_builders import ( build_cdm_contig_collection, build_cdm_entity, build_cdm_identifier_rows, @@ -9,7 +9,7 @@ build_entity_id, ) -### pytest cdm_data_loader_utils.parsers.refseq_importer/tests/test_cdm_builders.py ### +### pytest cdm_data_loaders.parsers.refseq_importer/tests/test_cdm_builders.py ### # ------------------------------------------------------------- @@ -19,7 +19,7 @@ def 
spark(): spark = ( SparkSession.builder.master("local[1]") - .appName("cdm_data_loader_utils.parsers.refseq_importer_tests") + .appName("cdm_data_loaders.parsers.refseq_importer_tests") .getOrCreate() ) yield spark diff --git a/tests/parsers/refseq_importer/test_extractors.py b/tests/parsers/refseq_importer/test_extractors.py index 317fa7ed..ec4d93c7 100644 --- a/tests/parsers/refseq_importer/test_extractors.py +++ b/tests/parsers/refseq_importer/test_extractors.py @@ -1,6 +1,6 @@ import pytest -from cdm_data_loader_utils.parsers.refseq_importer.core.extractors import ( +from cdm_data_loaders.parsers.refseq_importer.core.extractors import ( PAT_BIOSAMPLE, _coalesce, _deep_collect_regex, diff --git a/tests/parsers/refseq_importer/test_refseq_api_cli.py b/tests/parsers/refseq_importer/test_refseq_api_cli.py index e1cf1757..b88204bf 100644 --- a/tests/parsers/refseq_importer/test_refseq_api_cli.py +++ b/tests/parsers/refseq_importer/test_refseq_api_cli.py @@ -2,7 +2,7 @@ from click.testing import CliRunner -from cdm_data_loader_utils.parsers.refseq_importer.cli.refseq_api_cli import ( +from cdm_data_loaders.parsers.refseq_importer.cli.refseq_api_cli import ( cli, main, parse_taxid_args, @@ -31,11 +31,11 @@ def test_parse_taxid_args_file(tmp_path) -> None: # ------------------------------------------------- -@patch("cdm_data_loader_utils.parsers.refseq_importer.cli.refseq_api_cli.write_and_preview") -@patch("cdm_data_loader_utils.parsers.refseq_importer.cli.refseq_api_cli.finalize_tables") -@patch("cdm_data_loader_utils.parsers.refseq_importer.cli.refseq_api_cli.process_taxon") -@patch("cdm_data_loader_utils.parsers.refseq_importer.cli.refseq_api_cli.write_delta") -@patch("cdm_data_loader_utils.parsers.refseq_importer.cli.refseq_api_cli.build_spark") +@patch("cdm_data_loaders.parsers.refseq_importer.cli.refseq_api_cli.write_and_preview") +@patch("cdm_data_loaders.parsers.refseq_importer.cli.refseq_api_cli.finalize_tables") 
+@patch("cdm_data_loaders.parsers.refseq_importer.cli.refseq_api_cli.process_taxon") +@patch("cdm_data_loaders.parsers.refseq_importer.cli.refseq_api_cli.write_delta") +@patch("cdm_data_loaders.parsers.refseq_importer.cli.refseq_api_cli.build_spark") def test_main_end_to_end( mock_build, mock_write_delta, @@ -71,7 +71,7 @@ def test_main_end_to_end( def test_cli_invocation() -> None: - with patch("cdm_data_loader_utils.parsers.refseq_importer.cli.refseq_api_cli.main") as mock_main: + with patch("cdm_data_loaders.parsers.refseq_importer.cli.refseq_api_cli.main") as mock_main: runner = CliRunner() result = runner.invoke( cli, diff --git a/tests/parsers/refseq_importer/test_spark_delta.py b/tests/parsers/refseq_importer/test_spark_delta.py index b5cd9d0f..7b5f84fa 100644 --- a/tests/parsers/refseq_importer/test_spark_delta.py +++ b/tests/parsers/refseq_importer/test_spark_delta.py @@ -5,7 +5,7 @@ from pyspark.sql import Row, SparkSession from pyspark.sql.types import StringType, StructField, StructType -from cdm_data_loader_utils.parsers.refseq_importer.core.spark_delta import ( +from cdm_data_loaders.parsers.refseq_importer.core.spark_delta import ( build_spark, preview_or_skip, write_delta, diff --git a/tests/parsers/refseq_importer/test_tables_finalize.py b/tests/parsers/refseq_importer/test_tables_finalize.py index c71911ce..869905bc 100644 --- a/tests/parsers/refseq_importer/test_tables_finalize.py +++ b/tests/parsers/refseq_importer/test_tables_finalize.py @@ -2,7 +2,7 @@ from pyspark.sql import Row, SparkSession from pyspark.sql.types import StringType, StructField, StructType -from cdm_data_loader_utils.parsers.refseq_importer.core.tables_finalize import finalize_tables, list_of_dicts_to_spark +from cdm_data_loaders.parsers.refseq_importer.core.tables_finalize import finalize_tables, list_of_dicts_to_spark # ------------------------------------------------------------------- diff --git a/tests/parsers/test_bbmap_stats.py b/tests/parsers/test_bbmap_stats.py 
index 85575026..bae0670d 100644 --- a/tests/parsers/test_bbmap_stats.py +++ b/tests/parsers/test_bbmap_stats.py @@ -6,7 +6,7 @@ import pytest -from cdm_data_loader_utils.parsers.bbmap_stats import get_bbmap_stats +from cdm_data_loaders.parsers.bbmap_stats import get_bbmap_stats from tests.parsers.conftest import RESULTS diff --git a/tests/parsers/test_checkm2.py b/tests/parsers/test_checkm2.py index 63f70915..f5c7d8dc 100644 --- a/tests/parsers/test_checkm2.py +++ b/tests/parsers/test_checkm2.py @@ -5,7 +5,7 @@ import pytest -from cdm_data_loader_utils.parsers.checkm2 import get_checkm2_data +from cdm_data_loaders.parsers.checkm2 import get_checkm2_data from tests.parsers.conftest import RESULTS diff --git a/tests/parsers/test_gene_association_file.py b/tests/parsers/test_gene_association_file.py index 52256569..26d0f917 100644 --- a/tests/parsers/test_gene_association_file.py +++ b/tests/parsers/test_gene_association_file.py @@ -14,7 +14,7 @@ from pyspark.sql import Row, SparkSession from pyspark.sql.functions import col -from cdm_data_loader_utils.parsers.gene_association_file import ( +from cdm_data_loaders.parsers.gene_association_file import ( AGGREGATOR, ANNOTATION_DATE, DB, diff --git a/tests/parsers/test_genome_loader.py b/tests/parsers/test_genome_loader.py index 006d6a9c..1d026b11 100644 --- a/tests/parsers/test_genome_loader.py +++ b/tests/parsers/test_genome_loader.py @@ -4,7 +4,7 @@ import pytest -from cdm_data_loader_utils.parsers.genome_loader import MultiGenomeDataFileCreator +from cdm_data_loaders.parsers.genome_loader import MultiGenomeDataFileCreator @pytest.mark.parametrize( diff --git a/tests/parsers/test_genome_paths.py b/tests/parsers/test_genome_paths.py index 99adedc4..846b40ef 100644 --- a/tests/parsers/test_genome_paths.py +++ b/tests/parsers/test_genome_paths.py @@ -7,7 +7,7 @@ import pytest -from cdm_data_loader_utils.parsers.genome_paths import get_genome_paths +from cdm_data_loaders.parsers.genome_paths import get_genome_paths GPF_DIR 
= "genome_paths_file" diff --git a/tests/parsers/uniprot/test_idmapping.py b/tests/parsers/uniprot/test_idmapping.py index d23f5330..d78f68c3 100644 --- a/tests/parsers/uniprot/test_idmapping.py +++ b/tests/parsers/uniprot/test_idmapping.py @@ -7,7 +7,7 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql import functions as sf -from cdm_data_loader_utils.audit.schema import ( +from cdm_data_loaders.audit.schema import ( AUDIT_SCHEMA, METRICS, N_INVALID, @@ -19,9 +19,9 @@ SOURCE, VALIDATION_ERRORS, ) -from cdm_data_loader_utils.core.pipeline_run import PipelineRun -from cdm_data_loader_utils.parsers.uniprot.idmapping import ingest -from cdm_data_loader_utils.utils.spark_delta import write_delta +from cdm_data_loaders.core.pipeline_run import PipelineRun +from cdm_data_loaders.parsers.uniprot.idmapping import ingest +from cdm_data_loaders.utils.spark_delta import write_delta from tests.audit.conftest import ( INIT_TIMESTAMP_FIELDS, ) diff --git a/tests/parsers/uniprot/test_metalink.py b/tests/parsers/uniprot/test_metalink.py index 0a1d6824..7852e347 100644 --- a/tests/parsers/uniprot/test_metalink.py +++ b/tests/parsers/uniprot/test_metalink.py @@ -5,7 +5,7 @@ import pytest -from cdm_data_loader_utils.parsers.uniprot.metalink import generate_data_source_table, get_files +from cdm_data_loaders.parsers.uniprot.metalink import generate_data_source_table, get_files def test_generate_data_source_table(test_data_dir: Path) -> None: diff --git a/tests/parsers/uniprot/test_relnotes.py b/tests/parsers/uniprot/test_relnotes.py index ef4c610d..2ab394d8 100644 --- a/tests/parsers/uniprot/test_relnotes.py +++ b/tests/parsers/uniprot/test_relnotes.py @@ -5,7 +5,7 @@ import pytest -from cdm_data_loader_utils.parsers.uniprot.relnotes import parse, parse_relnotes +from cdm_data_loaders.parsers.uniprot.relnotes import parse, parse_relnotes def test_parse_relnotes(test_data_dir: Path) -> None: diff --git a/tests/parsers/uniprot/test_uniprot_kb.py 
b/tests/parsers/uniprot/test_uniprot_kb.py index f99d9e4e..56ef0764 100644 --- a/tests/parsers/uniprot/test_uniprot_kb.py +++ b/tests/parsers/uniprot/test_uniprot_kb.py @@ -7,7 +7,7 @@ import pytest from lxml.etree import XMLParser, fromstring -from cdm_data_loader_utils.parsers.uniprot.uniprot_kb import ( +from cdm_data_loaders.parsers.uniprot.uniprot_kb import ( CONTENT, DB, DESCRIPTION, diff --git a/tests/parsers/uniprot/test_uniref.py b/tests/parsers/uniprot/test_uniref.py index 4141b2ae..0abe8e74 100644 --- a/tests/parsers/uniprot/test_uniref.py +++ b/tests/parsers/uniprot/test_uniref.py @@ -7,7 +7,7 @@ import pytest from lxml.etree import fromstring -from cdm_data_loader_utils.parsers.uniprot.uniref import ( +from cdm_data_loaders.parsers.uniprot.uniref import ( UNIREF_URL, extract_cluster, extract_cross_refs, diff --git a/tests/readers/test_dsv.py b/tests/readers/test_dsv.py index 369c2d95..07d9814a 100644 --- a/tests/readers/test_dsv.py +++ b/tests/readers/test_dsv.py @@ -11,7 +11,7 @@ from pyspark.sql.types import IntegerType, StringType, StructField, StructType from pyspark.testing import assertSchemaEqual -from cdm_data_loader_utils.readers.dsv import read, read_csv, read_tsv +from cdm_data_loaders.readers.dsv import read, read_csv, read_tsv from tests.conftest import ALL_LINES, MISSING_REQUIRED, TOO_FEW_COLS, TOO_MANY_COLS, TYPE_MISMATCH, VALID from tests.helpers import assertDataFrameEqual diff --git a/tests/readers/test_dsv_read_with_validation.py b/tests/readers/test_dsv_read_with_validation.py index c6383b2c..32cf9372 100644 --- a/tests/readers/test_dsv_read_with_validation.py +++ b/tests/readers/test_dsv_read_with_validation.py @@ -6,7 +6,7 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StringType, StructField -from cdm_data_loader_utils.audit.schema import ( +from cdm_data_loaders.audit.schema import ( AUDIT_SCHEMA, METRICS, N_INVALID, @@ -14,11 +14,11 @@ N_VALID, VALIDATION_ERRORS, ) -from 
cdm_data_loader_utils.core.constants import INVALID_DATA_FIELD_NAME -from cdm_data_loader_utils.core.pipeline_run import PipelineRun -from cdm_data_loader_utils.readers.dsv import read -from cdm_data_loader_utils.validation.dataframe_validator import DataFrameValidator, Validator -from cdm_data_loader_utils.validation.df_nullable_fields import validate as nullable_fields +from cdm_data_loaders.core.constants import INVALID_DATA_FIELD_NAME +from cdm_data_loaders.core.pipeline_run import PipelineRun +from cdm_data_loaders.readers.dsv import read +from cdm_data_loaders.validation.dataframe_validator import DataFrameValidator, Validator +from cdm_data_loaders.validation.df_nullable_fields import validate as nullable_fields from tests.conftest import ALL_LINES, MISSING_REQUIRED, TEST_NS, TOO_FEW_COLS, TOO_MANY_COLS, TYPE_MISMATCH, VALID from tests.helpers import create_empty_delta_table diff --git a/tests/utils/download/conftest.py b/tests/utils/download/conftest.py index 8e5774f1..37dcdb09 100644 --- a/tests/utils/download/conftest.py +++ b/tests/utils/download/conftest.py @@ -7,8 +7,8 @@ import httpx import pytest -from cdm_data_loader_utils.utils.download.async_client import AsyncFileDownloader -from cdm_data_loader_utils.utils.download.sync_client import FileDownloader +from cdm_data_loaders.utils.download.async_client import AsyncFileDownloader +from cdm_data_loaders.utils.download.sync_client import FileDownloader @dataclass diff --git a/tests/utils/download/test_async_client.py b/tests/utils/download/test_async_client.py index f42518e6..435e447c 100644 --- a/tests/utils/download/test_async_client.py +++ b/tests/utils/download/test_async_client.py @@ -7,7 +7,7 @@ import httpx import pytest -from cdm_data_loader_utils.utils.download.async_client import AsyncFileDownloader +from cdm_data_loaders.utils.download.async_client import AsyncFileDownloader DOWNLOAD_URL = "https://example.com/file.txt" diff --git a/tests/utils/download/test_clients.py 
b/tests/utils/download/test_clients.py index 1a1c305a..3be2460c 100644 --- a/tests/utils/download/test_clients.py +++ b/tests/utils/download/test_clients.py @@ -7,13 +7,13 @@ import httpx import pytest -from cdm_data_loader_utils.utils.download.async_client import AsyncFileDownloader -from cdm_data_loader_utils.utils.download.core import ( +from cdm_data_loaders.utils.download.async_client import AsyncFileDownloader +from cdm_data_loaders.utils.download.core import ( ChecksumMismatchError, DownloadError, NonRetryableDownloadError, ) -from cdm_data_loader_utils.utils.download.sync_client import FileDownloader +from cdm_data_loaders.utils.download.sync_client import FileDownloader from tests.utils.download.conftest import DownloaderAdapter DOWNLOAD_URL = "https://example.com/file.txt" diff --git a/tests/utils/test_gz.py b/tests/utils/test_gz.py index dcd45677..b3276dbc 100644 --- a/tests/utils/test_gz.py +++ b/tests/utils/test_gz.py @@ -7,7 +7,7 @@ import pytest from click.testing import CliRunner -from cdm_data_loader_utils.utils.gz import compress_file, compress_files, main +from cdm_data_loaders.utils.gz import compress_file, compress_files, main @pytest.fixture diff --git a/tests/utils/test_spark_delta.py b/tests/utils/test_spark_delta.py index ed0d3934..db40b11a 100644 --- a/tests/utils/test_spark_delta.py +++ b/tests/utils/test_spark_delta.py @@ -8,8 +8,8 @@ import pytest from pyspark.sql import DataFrame, DataFrameWriter, Row, SparkSession -from cdm_data_loader_utils.utils import spark_delta -from cdm_data_loader_utils.utils.spark_delta import ( +from cdm_data_loaders.utils import spark_delta +from cdm_data_loaders.utils.spark_delta import ( APPEND, DEFAULT_APP_NAME, DEFAULT_NAMESPACE, @@ -70,7 +70,7 @@ def spark_db(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Generator[tuple """Provide a Spark session with a per-test warehouse dir and patched workspace setup.""" # patch the create_namespace_if_not_exists function monkeypatch.setattr( - 
"cdm_data_loader_utils.utils.spark_delta.create_namespace_if_not_exists", + "cdm_data_loaders.utils.spark_delta.create_namespace_if_not_exists", fake_create_namespace_if_not_exists, ) @@ -99,7 +99,7 @@ def fake_get_spark_session(*args: str, **kwargs: str | bool) -> str: # noqa: AR return "fake spark session" monkeypatch.setattr( - "cdm_data_loader_utils.utils.spark_delta.get_spark_session", + "cdm_data_loaders.utils.spark_delta.get_spark_session", fake_get_spark_session, ) @@ -147,12 +147,12 @@ def fake_create_ns(*args: str, **kwargs: str | bool) -> str: return "delta namespace" monkeypatch.setattr( - "cdm_data_loader_utils.utils.spark_delta.get_spark_session", + "cdm_data_loaders.utils.spark_delta.get_spark_session", fake_get_spark_session, ) monkeypatch.setattr( - "cdm_data_loader_utils.utils.spark_delta.create_namespace_if_not_exists", + "cdm_data_loaders.utils.spark_delta.create_namespace_if_not_exists", fake_create_ns, ) @@ -212,7 +212,7 @@ def fake_create_namespace_if_not_exists( # patch the create_namespace_if_not_exists function monkeypatch.setattr( - "cdm_data_loader_utils.utils.spark_delta.create_namespace_if_not_exists", + "cdm_data_loaders.utils.spark_delta.create_namespace_if_not_exists", fake_create_namespace_if_not_exists, ) diff --git a/tests/utils/test_xml_utils.py b/tests/utils/test_xml_utils.py index 02deb050..f014b879 100644 --- a/tests/utils/test_xml_utils.py +++ b/tests/utils/test_xml_utils.py @@ -1,6 +1,6 @@ import xml.etree.ElementTree as ET -from cdm_data_loader_utils.utils.xml_utils import ( +from cdm_data_loaders.utils.xml_utils import ( clean_dict, get_attr, get_text, diff --git a/tests/validation/test_dataframe_validator.py b/tests/validation/test_dataframe_validator.py index 63a316ce..925ca22d 100644 --- a/tests/validation/test_dataframe_validator.py +++ b/tests/validation/test_dataframe_validator.py @@ -7,10 +7,10 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructField, StructType -from 
cdm_data_loader_utils.audit.schema import METRICS, REJECTS, ROW_ERRORS -from cdm_data_loader_utils.core.constants import INVALID_DATA_FIELD_NAME -from cdm_data_loader_utils.core.pipeline_run import PipelineRun -from cdm_data_loader_utils.validation.dataframe_validator import DataFrameValidator, Validator +from cdm_data_loaders.audit.schema import METRICS, REJECTS, ROW_ERRORS +from cdm_data_loaders.core.constants import INVALID_DATA_FIELD_NAME +from cdm_data_loaders.core.pipeline_run import PipelineRun +from cdm_data_loaders.validation.dataframe_validator import DataFrameValidator, Validator from tests.audit.conftest import create_table diff --git a/tests/validation/test_df_nullable_fields.py b/tests/validation/test_df_nullable_fields.py index e008e808..61a827a6 100644 --- a/tests/validation/test_df_nullable_fields.py +++ b/tests/validation/test_df_nullable_fields.py @@ -4,10 +4,10 @@ from pyspark.sql import SparkSession from pyspark.sql.types import StructField, StructType -from cdm_data_loader_utils.audit.schema import ROW_ERRORS -from cdm_data_loader_utils.core.constants import INVALID_DATA_FIELD_NAME -from cdm_data_loader_utils.readers.dsv import INVALID_DATA_FIELD -from cdm_data_loader_utils.validation.df_nullable_fields import validate +from cdm_data_loaders.audit.schema import ROW_ERRORS +from cdm_data_loaders.core.constants import INVALID_DATA_FIELD_NAME +from cdm_data_loaders.readers.dsv import INVALID_DATA_FIELD +from cdm_data_loaders.validation.df_nullable_fields import validate from tests.conftest import ALL_LINES, MISSING_REQUIRED, TOO_FEW_COLS, TOO_MANY_COLS, TYPE_MISMATCH, VALID diff --git a/uv.lock b/uv.lock index 168e198f..4e0b624b 100644 --- a/uv.lock +++ b/uv.lock @@ -413,7 +413,7 @@ crt = [ ] [[package]] -name = "cdm-data-loader-utils" +name = "cdm-data-loaders" version = "0.1.0" source = { editable = "." 
} dependencies = [ From d5ab82085cf3722710f71c035b6121b8168e6e7d Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Mon, 9 Mar 2026 07:39:11 -0700 Subject: [PATCH 5/7] Reformatting file --- tests/parsers/refseq_importer/test_cdm_builders.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/parsers/refseq_importer/test_cdm_builders.py b/tests/parsers/refseq_importer/test_cdm_builders.py index 16a12c2d..398590c1 100644 --- a/tests/parsers/refseq_importer/test_cdm_builders.py +++ b/tests/parsers/refseq_importer/test_cdm_builders.py @@ -18,9 +18,7 @@ @pytest.fixture(scope="session") def spark(): spark = ( - SparkSession.builder.master("local[1]") - .appName("cdm_data_loaders.parsers.refseq_importer_tests") - .getOrCreate() + SparkSession.builder.master("local[1]").appName("cdm_data_loaders.parsers.refseq_importer_tests").getOrCreate() ) yield spark spark.stop() From 58a2e8c53e7c0a90beb26241f0949c388820ee67 Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Mon, 9 Mar 2026 07:56:33 -0700 Subject: [PATCH 6/7] Removing unused old code --- pyproject.toml | 5 +- src/cdm_data_loaders/model/feature.py | 22 - src/cdm_data_loaders/parsers/fasta.py | 44 -- src/cdm_data_loaders/parsers/genome_loader.py | 519 ------------------ src/cdm_data_loaders/parsers/genome_paths.py | 65 --- src/cdm_data_loaders/transformers/__init__.py | 0 .../transformers/genome_depot/__init__.py | 0 .../transformers/genome_depot/schema.py | 437 --------------- src/cdm_data_loaders/utils/calculate_hash.py | 96 ---- tests/parsers/test_genome_loader.py | 124 ----- tests/parsers/test_genome_paths.py | 145 ----- uv.lock | 40 +- 12 files changed, 2 insertions(+), 1495 deletions(-) delete mode 100644 src/cdm_data_loaders/model/feature.py delete mode 100644 src/cdm_data_loaders/parsers/fasta.py delete mode 100644 src/cdm_data_loaders/parsers/genome_loader.py delete mode 100644 src/cdm_data_loaders/parsers/genome_paths.py delete mode 100644 src/cdm_data_loaders/transformers/__init__.py 
delete mode 100644 src/cdm_data_loaders/transformers/genome_depot/__init__.py delete mode 100644 src/cdm_data_loaders/transformers/genome_depot/schema.py delete mode 100644 src/cdm_data_loaders/utils/calculate_hash.py delete mode 100644 tests/parsers/test_genome_loader.py delete mode 100644 tests/parsers/test_genome_paths.py diff --git a/pyproject.toml b/pyproject.toml index b140e715..dc8f6d8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,6 @@ uniref_pipeline = "cdm_data_loaders.pipelines.uniref_pipeline:cli" [dependency-groups] dev = [ "berdl-notebook-utils", - "biopython>=1.86", "pytest>=9.0.2", "pytest-asyncio>=1.3.0", "pytest-cov>=7.0.0", @@ -52,9 +51,7 @@ minio = [ "tqdm>=4.67.3", ] -biopython = [ - "biopython>=1.86", -] +biopython = [] [tool.ruff] line-length = 120 diff --git a/src/cdm_data_loaders/model/feature.py b/src/cdm_data_loaders/model/feature.py deleted file mode 100644 index a716cdfa..00000000 --- a/src/cdm_data_loaders/model/feature.py +++ /dev/null @@ -1,22 +0,0 @@ -"""Class for representing feature data.""" - - -class Feature: - def __init__( - self: "Feature", - feature_id: str, - sequence: str, - description: str | None = None, - aliases=None, - ) -> None: - self.id = feature_id - self.seq = sequence - self.description = description - self.ontology_terms = {} - self.aliases = aliases - - def add_ontology_term(self: "Feature", ontology_term: str, value: str) -> None: - if ontology_term not in self.ontology_terms: - self.ontology_terms[ontology_term] = [] - if value not in self.ontology_terms[ontology_term]: - self.ontology_terms[ontology_term].append(value) diff --git a/src/cdm_data_loaders/parsers/fasta.py b/src/cdm_data_loaders/parsers/fasta.py deleted file mode 100644 index e4e19a29..00000000 --- a/src/cdm_data_loaders/parsers/fasta.py +++ /dev/null @@ -1,44 +0,0 @@ -"""FASTA file parser.""" - -import gzip - -from cdm_data_loaders.model.feature import Feature - -DEFAULT_SPLIT = " " - - -def extract_features(faa_str: str, 
split: str = DEFAULT_SPLIT, h_func=None) -> list[Feature]: - features = [] - active_seq = None - seq_lines = [] - for line in faa_str.split("\n"): - if line.startswith(">"): - if active_seq is not None: - active_seq.seq = "".join(seq_lines) - features.append(active_seq) - seq_lines = [] - seq_id = line[1:] - desc = None - if h_func: - seq_id, desc = h_func(seq_id) - elif split: - header_data = line[1:].split(split, 1) - seq_id = header_data[0] - if len(header_data) > 1: - desc = header_data[1] - active_seq = Feature(seq_id, "", desc) - else: - seq_lines.append(line.strip()) - if len(seq_lines) > 0: - active_seq.seq = "".join(seq_lines) - features.append(active_seq) - return features - - -def read_fasta(f: str, split: str = DEFAULT_SPLIT, h_func=None) -> list[Feature]: - if f.endswith(".gz"): - with gzip.open(f, "rb") as fh: - return extract_features(fh.read().decode("utf-8"), split, h_func) - - with open(f) as fh: - return extract_features(fh.read(), split, h_func) diff --git a/src/cdm_data_loaders/parsers/genome_loader.py b/src/cdm_data_loaders/parsers/genome_loader.py deleted file mode 100644 index 711cc8c5..00000000 --- a/src/cdm_data_loaders/parsers/genome_loader.py +++ /dev/null @@ -1,519 +0,0 @@ -"""Loader for genome files.""" - -import argparse -import csv -import gzip -import json -import sys -from pathlib import Path -from typing import Any - -from Bio import SeqIO - -from cdm_data_loaders.parsers.bbmap_stats import get_bbmap_stats -from cdm_data_loaders.parsers.checkm2 import get_checkm2_data -from cdm_data_loaders.parsers.genome_paths import get_genome_paths -from cdm_data_loaders.utils import calculate_hash as ch - -# Define SO terms mapping -so_terms = { - "gene": "SO:0000704", - "pseudogene": "SO:0000336", - "ncRNA_gene": "SO:0001263", - "mRNA": "SO:0000234", - "CDS": "SO:0000316", - "exon": "SO:0000147", - "five_prime_UTR": "SO:0000204", - "three_prime_UTR": "SO:0000205", - "ncRNA": "SO:0000655", - "rRNA": "SO:0000252", - "tRNA": "SO:0000253", - 
"SRP_RNA": "SO:0000590", - "RNase_P_RNA": "SO:0000386", - "riboswitch": "SO:0000035", - "direct_repeat": "SO:0000319", - "origin_of_replication": "SO:0000296", - "CRISPR": "SO:0001459", - "mobile_genetic_element": "SO:0001037", - "region": "SO:0000001", - "sequence_feature": "SO:0000110", -} - - -class GenomeDataFileCreator: - def __init__(self, contigset_file, gff_file, protein_file, output_dir) -> None: - self.contigset_file = contigset_file - self.gff_file = gff_file - self.protein_file = protein_file - self.contigset_dir = Path(contigset_file).parent - self.output_dir = output_dir - - self.features = [] - self.feature_associations = [] - self.feature_protein_associations = [] - self.gff_hash = None - self.protein_count = 0 - self.feature_with_protein_mapping = 0 - - # TODO - # Put self.genome_id and use in contigset - - self.contigset = {} - self.contigs = [] - self.structural_annotation = {} - - self.contigset_hash, self.contig_id_map, self.protein_id_map = ch.compute_hash( - self.contigset_file, self.protein_file - ) - - @staticmethod - def parse_attributes(attributes_str): - """Parse the GFF3 attributes field into a dictionary.""" - attributes = {} - for attribute in attributes_str.strip(";").split(";"): - if "=" in attribute: - key, value = attribute.split("=", 1) - key = key.strip('"') - value = value.strip('"') - attributes[key] = value - return attributes - - # TODO: Update to use prepare_feature_data - def prepare_gff3_data(self) -> None: - """Prepare data for insertion into the database.""" - print(f"Preparing GFF3 data from: {self.gff_file}") - if not self.gff_hash: - print("Error: GFF file hash not calculated.") - return - - open_func = gzip.open if self.gff_file.endswith(".gz") else open - - try: - with open_func(self.gff_file, "rt", encoding="utf-8", errors="ignore") as file: - reader = csv.reader(file, delimiter="\t") - for row in reader: - if row[0].startswith("#") or len(row) < 9: - continue - - seq_id = row[0] - # source = row[1] - feature_type 
= row[2] - start = int(row[3]) - end = int(row[4]) - # score = row[5] if row[5] != "." else None - strand = row[6] if row[6] in ["+", "-"] else None - phase = row[7] if row[7] in ["0", "1", "2"] else None - attributes_str = row[8] - - feature_ontology = so_terms.get(feature_type, "") - - # Parse attributes - attributes = self.parse_attributes(attributes_str) - feature_name = attributes.get("ID", None) - # protein_accession = attributes.get("protein_id", None) - protein_hash = None - - # Mapping between features in gff and proteins in protein file - if feature_type == "CDS": - protein_hash = self.protein_id_map.get(feature_name, None) - - # Note: This is to handle cases where protein_id in protein file are stashed in - # protein_id field of attributes - # TODO: check if we need more ways to handle this - if not protein_hash and "protein_id" in attributes: - protein_hash = self.protein_id_map.get("protein_id", None) - - if protein_hash: - self.feature_with_protein_mapping += 1 - - contig_hash = self.contig_id_map.get(seq_id, "") - contigset_hash = self.contigset_hash - - # Generate a unique hash ID for each feature - # TODO: May be switch this to use feature_ontology - # If you are working with contigsets of very close strains - # the same contig_hash may appear in each. - # Including contigset_hash ensures that features - # are uniquely identified across different contigsets. 
- - feature_hash = ch.generate_hash_id(contigset_hash, contig_hash, start, end, feature_type) - - # Prepare feature data including hash of the contigset and contig - feature_data = { - "contigset_hash": contigset_hash, - "feature_hash": feature_hash, - "feature_type": feature_type, - "feature_ontology": feature_ontology, - "start": start, - "end": end, - "strand": strand, - "phase": phase, - "contig_hash": self.contig_id_map.get(seq_id, ""), - "protein_hash": protein_hash, - } - self.features.append(feature_data) - - # Add all other attributes to the associations - - self.feature_associations.append( - { - "gff_hash": self.gff_hash, - "feature_hash": feature_hash, - "feature_attributes": json.dumps(attributes).replace('"', ""), - } - ) - - print(f"Finished preparing GFF3 data. Total features: {len(self.features)}") - except Exception as e: - print(f"Error reading GFF file {self.gff_file}: {e}") - - def prepare_contig_data(self) -> None: - """ - Calculate statistics for each contig in the assembly file. - Handles both compressed (.gz) and uncompressed files. 
- """ - if not self.contigset_hash: - self.contigset_hash, self.contig_id_map, self.protein_id_map = ch.compute_hash( - self.contigset_file, self.protein_file - ) - - open_func = gzip.open if self.contigset_file.endswith(".gz") else open - - with open_func(self.contigset_file, "rt") as handle: - sequences = SeqIO.parse(handle, "fasta") - for seq_record in sequences: - contig_name = seq_record.id - sequence = str(seq_record.seq).upper() - length = len(sequence) - gc_content = (sequence.count("G") + sequence.count("C")) / length if length > 0 else 0 - # TODO: check if contig_id_map generated or not - self.contigs.append( - { - "contig_hash": self.contig_id_map[contig_name], - "contig_name": contig_name, - "length": length, - "gc_content": gc_content, - "contigset_hash": self.contigset_hash, - "contigset_file": self.contigset_file, - } - ) - - def prepare_contigset_data(self) -> None: - """ - Calculate statistics for each contig in the assembly file. - Handles both compressed (.gz) and uncompressed files. 
- """ - if not self.contigset_hash: - self.contigset_hash, self.contig_id_map, self.protein_id_map = ch.compute_hash( - self.contigset_file, self.protein_file - ) - - print(self.contigset_file) - - if self.contigset_hash: - self.contigset = { - "contigset_hash": self.contigset_hash, - } - - def prepare_structural_annotation(self) -> None: - self.structural_annotation = { - "contigset_hash": self.contigset_hash, - "gff_hash": self.gff_hash, - "contigset_file": self.contigset_file, - "gff_file": self.gff_file, - } - - def run(self) -> tuple[Any]: - """Import the genomic, feature, and protein data from a set of files.""" - self.gff_hash = ch.calculate_sha256_checksums(self.gff_file) - self.prepare_gff3_data() - self.prepare_contig_data() - self.prepare_contigset_data() - self.prepare_structural_annotation() - - return ( - self.contigset, - self.contigs, - self.features, - self.feature_associations, - self.structural_annotation, - ) - - -class MultiGenomeDataFileCreator: - """Parser that takes in GFF, FAA, and FNA files and generates CDM table data.""" - - def __init__( - self: "MultiGenomeDataFileCreator", - genome_paths_file: str, - output_dir: str, - checkm2_file: str | None = None, - stats_file: str | None = None, - ) -> None: - """Initialise the MultiGenomeDataFileCreator.""" - errs = [] - if not genome_paths_file or not genome_paths_file.strip(): - errs.append("Missing genome_paths_file") - else: - self.genome_paths_file = Path(genome_paths_file.strip()) - if not self.genome_paths_file.exists(): - errs.append(f"genome_paths_file '{self.genome_paths_file}' does not exist") - - if not output_dir or not output_dir.strip(): - errs.append("Missing output_dir") - else: - self.output_dir = Path(output_dir.strip()) - - if errs: - err_msg = f"MultiGenomeDataFileCreator init error:\n{'\n'.join(errs)}" - raise RuntimeError(err_msg) - - self.checkm2_file = None - self.stats_file = None - if checkm2_file and checkm2_file.strip(): - self.checkm2_file = 
Path(checkm2_file.strip()) - if stats_file and stats_file.strip(): - self.stats_file = Path(stats_file.strip()) - - def write_to_tsv( - self: "MultiGenomeDataFileCreator", - contigset, - contigs, - features, - associations, - structural_annotation, - headers_written, - ) -> None: - """Write data to TSV files incrementally.""" - try: - # Contigset - contigset_out_file = self.output_dir / "contigset.tsv" - write_header = not headers_written["contigset"] - - contigset_fields = [ - "contigset_hash", - "checkm2_contamination", - "checkm2_completeness", - "scaffolds", - "contigs", - "scaf_bp", - "contig_bp", - "gap_pct", - "scaf_N50", - "scaf_L50", - "ctg_N50", - "ctg_L50", - "scaf_N90", - "scaf_L90", - "ctg_N90", - "ctg_L90", - "scaf_logsum", - "scaf_powsum", - "ctg_logsum", - "ctg_powsum", - "asm_score", - "scaf_max", - "ctg_max", - "scaf_n_gt50K", - "scaf_l_gt50k", - "scaf_pct_gt50K", - "gc_avg", - "gc_std", - ] - - with contigset_out_file.open("a", newline="") as f_out: - writer = csv.DictWriter(f_out, fieldnames=contigset_fields, delimiter="\t") - if write_header: - writer.writeheader() - headers_written["contigset"] = True - writer.writerow(contigset) - print(f"Contigset appended to {contigset_out_file}") - - # Contig - contig_file = self.output_dir / "contig.tsv" - write_header = not headers_written["contigs"] - with contig_file.open("a", newline="") as f_out: - writer = csv.DictWriter( - f_out, - fieldnames=[ - "contig_hash", - "contig_name", - "length", - "gc_content", - "contigset_hash", - "contigset_file", - ], - delimiter="\t", - ) - if write_header: - writer.writeheader() - headers_written["contig"] = True - writer.writerows(contigs) - print(f"Contig appended to {contig_file}") - - # Structural annotation - sa_file = self.output_dir / "structural_annotation.tsv" - write_header = not headers_written["structural_annotation"] - with sa_file.open("a", newline="") as f_out: - writer = csv.DictWriter( - f_out, - fieldnames=["contigset_hash", "gff_hash", 
"contigset_file", "gff_file"], - delimiter="\t", - ) - if write_header: - writer.writeheader() - headers_written["structural_annotation"] = True - writer.writerow(structural_annotation) - print(f"Structural annotation appended to {sa_file}") - - # Features - features_file = self.output_dir / "feature.tsv" - write_header = not headers_written["features"] - with features_file.open("a", newline="") as f_out: - contig_fieldnames = [ - "contigset_hash", - "contig_hash", - "feature_hash", - "feature_type", - "feature_ontology", - "start", - "end", - "strand", - "phase", - "protein_hash", - ] - writer = csv.DictWriter(f_out, fieldnames=contig_fieldnames, delimiter="\t") - if write_header: - writer.writeheader() - headers_written["features"] = True - writer.writerows(features) - print(f"Features appended to {features_file}") - - # Feature associations - associations_file = self.output_dir / "feature_association.tsv" # Renamed file - write_header = not headers_written["associations"] - with associations_file.open("a", newline="") as f_out: - fieldnames = ["feature_hash", "gff_hash", "feature_attributes"] - writer = csv.DictWriter(f_out, fieldnames=fieldnames, delimiter="\t") - if write_header: - writer.writeheader() - headers_written["associations"] = True - writer.writerows(associations) - print(f"Feature associations appended to {associations_file}") - - except Exception as e: - print(f"Error writing to TSV files: {e}") - - def create_all_tables(self: "MultiGenomeDataFileCreator") -> None: - checkm2_data = None - stats_data = None - - genome_paths = get_genome_paths(self.genome_paths_file) - # check for checkm2 and stats data - if self.checkm2_file: - checkm2_data = get_checkm2_data(self.checkm2_file) - if self.stats_file: - stats_data = get_bbmap_stats(self.stats_file) - - if self.output_dir.exists(): - print(f"Error: The directory {self.output_dir} already exists.") - print(f"Error: Remove directory {self.output_dir} to continue") - sys.exit(1) # Exit with a non-zero 
code to indicate an error - - # create the output directory - self.output_dir.mkdir(parents=True) - - genome_ids = sorted(genome_paths.keys()) - headers_written = { - "contigs": False, - "contigset": False, - "structural_annotation": False, - "features": False, - "associations": False, - } - - for gid in genome_ids: - paths = genome_paths.get(gid) - # this should not happen - if not paths: - print(f"No paths found for genome ID: {gid}") - continue - - # _scaffolds.fna - contigset_file = paths.get("fna") - # _genes.gff - gff_file = paths.get("gff") - # _genes.faa - protein_file = paths.get("protein") - if not contigset_file or not gff_file or not protein_file: - print(f"Missing file paths for genome ID: {gid}") - continue - - print(f"\n==Processing contigset {contigset_file}==\n") - parser = GenomeDataFileCreator(contigset_file, gff_file, protein_file, self.output_dir) - contigset, contigs, features, associations, structural_annotation = parser.run() - - # get the stem of the contigset file name and check the checkm2 data for it - if checkm2_data and Path(contigset_file).stem in checkm2_data: - contigset.update(checkm2_data[Path(contigset_file).stem]) - # check for the contigset file name - if stats_data and Path(contigset_file).name in stats_data: - contigset.update(stats_data[Path(contigset_file).name]) - del contigset["filename"] - - # Write data to TSV files incrementally - self.write_to_tsv( - contigset, - contigs, - features, - associations, - structural_annotation, - headers_written, - ) - - # Clear data to free memory - contigset.clear() - contigs.clear() - features.clear() - associations.clear() - - @classmethod - def from_args(cls: type["MultiGenomeDataFileCreator"]) -> "MultiGenomeDataFileCreator": - """Parse command-line arguments and create an instance of MultiGenomeDataFileCreator.""" - parser = argparse.ArgumentParser(description="Create tables for a specified genome ID.") - parser.add_argument("--genome_paths_file", type=str, required=True, 
help="Path to genome paths JSON file") - - parser.add_argument("--output_dir", type=str, required=True, help="Output path to save the table files") - - parser.add_argument( - "--stats", - type=str, - default="", - help="Path to the directory containing the `stats.json` fna stats file", - ) - - parser.add_argument( - "--checkm2", - type=str, - default="", - help="Path to the directory containing the `quality_report.tsv` file produced by checkm2", - ) - - args = parser.parse_args() - - return cls( - genome_paths_file=args.genome_paths_file.strip(), - stats_file=args.stats_file.strip(), - checkm2_file=args.checkm2_file.strip(), - output_dir=args.output_dir.strip(), - ) - - def run(self: "MultiGenomeDataFileCreator") -> None: - """Main method to parse input files and generate TSV output.""" - self.create_all_tables() - - -if __name__ == "__main__": - # Create an instance of MultiGenomeDataFileCreator using command-line arguments - creator = MultiGenomeDataFileCreator.from_args() - creator.run() diff --git a/src/cdm_data_loaders/parsers/genome_paths.py b/src/cdm_data_loaders/parsers/genome_paths.py deleted file mode 100644 index e60e98c8..00000000 --- a/src/cdm_data_loaders/parsers/genome_paths.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Parse the genome_paths_file.""" - -import json -from pathlib import Path -from typing import Any - -VALID_FILE_TYPES = ["gff", "fna", "protein"] - - -def get_genome_paths(genome_paths_file: Path) -> dict[str, dict[str, Any]]: - """Read the genome paths file and retrieve the list of files from it. 
- - :param genome_paths_file: path to the genome paths JSON file - :type genome_paths_file: Path - :return: dictionary, indexed by contigset ID, of file types and paths - :rtype: dict[str, dict[str, Any]] - """ - try: - with genome_paths_file.open() as json_file: - genome_paths = json.load(json_file) - except Exception as err: - err_msg = f"error parsing genome_paths_file: {err!s}" - raise RuntimeError(err_msg) from err - - if not isinstance(genome_paths, dict): - err_msg = "genome_paths_file is not in the correct format" - raise TypeError(err_msg) - - if not genome_paths: - err_msg = "no valid data found in genome_paths_file" - raise RuntimeError(err_msg) - - entries = {} - err_list = [] - entry_list = sorted(genome_paths.keys()) - for entry_id in entry_list: - entry_data = genome_paths[entry_id] - if not entry_id: - err_list.append(f"No ID specified for entry {json.dumps(entry_data, indent=None, sort_keys=True)}") - continue - - if not isinstance(entry_data, dict): - err_list.append(f"{entry_id}: invalid entry format") - continue - - if not entry_data: - err_list.append(f"{entry_id}: no valid file types or paths found") - continue - - invalid_keys = [k for k in entry_data if k not in VALID_FILE_TYPES] - if invalid_keys: - err_list.append(f"{entry_id}: invalid keys: {', '.join(sorted(invalid_keys))}") - continue - - # entries will be unique so don't have to check whether entry_id already exists - entries[entry_id] = entry_data - - if not entries: - err_list.append("No valid entries found in genome_paths_file") - - if err_list: - err_msg = "Please ensure that the genome_paths_file is in the correct format.\n\n" - raise RuntimeError(err_msg + "\n".join(err_list)) - - return genome_paths diff --git a/src/cdm_data_loaders/transformers/__init__.py b/src/cdm_data_loaders/transformers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/cdm_data_loaders/transformers/genome_depot/__init__.py 
b/src/cdm_data_loaders/transformers/genome_depot/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/cdm_data_loaders/transformers/genome_depot/schema.py b/src/cdm_data_loaders/transformers/genome_depot/schema.py deleted file mode 100644 index ac43d45a..00000000 --- a/src/cdm_data_loaders/transformers/genome_depot/schema.py +++ /dev/null @@ -1,437 +0,0 @@ -"""Database tables and fields for the genome-depot (Alexeydata) ENIGMA database.""" - -GENOME_DEPOT_SCHEMA = { - "browser_cazy_family": [ - "id", # int not_null auto_increment, - "cazy_id", # varchar(12) not_null, - "description", # longtext not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "cazy_id" ("cazy_id") - ], - "browser_cog_class": [ - "id", # int not_null auto_increment, - "cog_id", # varchar(1) not_null, - "description", # varchar(80) not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "cog_id" ("cog_id") - ], - "browser_config": [ - "id", # int not_null auto_increment, - "param", # varchar(255) not_null, - "value", # varchar(255) not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "param" ("param") - ], - "browser_ec_number": [ - "id", # int not_null auto_increment, - "ec_number", # varchar(12) not_null, - "description", # longtext not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "ec_number" ("ec_number") - ], - "browser_eggnog_description": [ - "id", # int not_null auto_increment, - "fingerprint", # varchar(32) not_null, - "description", # longtext not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "fingerprint" ("fingerprint") - ], - "browser_go_term": [ - "id", # int not_null auto_increment, - "go_id", # varchar(12) not_null, - "go_namespace", # varchar(50) not_null, - "description", # longtext not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "go_id" ("go_id") - ], - "browser_kegg_ortholog": [ - "id", # int not_null auto_increment, - "kegg_id", # varchar(10) not_null, - "description", # longtext not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "kegg_id" ("kegg_id") - ], - 
"browser_kegg_pathway": [ - "id", # int not_null auto_increment, - "kegg_id", # varchar(10) not_null, - "description", # varchar(200) not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "kegg_id" ("kegg_id") - ], - "browser_kegg_reaction": [ - "id", # int not_null auto_increment, - "kegg_id", # varchar(10) not_null, - "description", # longtext not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "kegg_id" ("kegg_id") - ], - "browser_sample": [ - "id", # int not_null auto_increment, - "sample_id", # varchar(100) not_null, - "full_name", # varchar(200) not_null, - "description", # varchar(250) not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "sample_id" ("sample_id") - ], - "browser_tag": [ - "id", # int not_null auto_increment, - "name", # varchar(50) not_null, - "description", # varchar(300) not_null, - "color", # varchar(7) not_null, - "textcolor", # varchar(7) not_null, - # PRIMARY KEY ("id"), - # KEY "browser_tag_name_10f69202" ("name") - ], - "browser_taxon": [ - "id", # int not_null auto_increment, - "taxonomy_id", # varchar(10) not_null, - "eggnog_taxid", # varchar(10) DEFAULT NULL, - "rank", # varchar(20) not_null, - "parent_id", # varchar(10) not_null, - "name", # varchar(100) not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "taxonomy_id" ("taxonomy_id") - ], - "browser_tc_family": [ - "id", # int not_null auto_increment, - "tc_id", # varchar(15) not_null, - "description", # longtext not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "tc_id" ("tc_id") - ], - "browser_ortholog_group": [ - "id", # int not_null auto_increment, - "eggnog_id", # varchar(15) not_null, - "taxon_id", # int DEFAULT NULL, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_ortholog_group_eggnog_id_taxon_id_95fd5f69_uniq" ("eggnog_id","taxon_id"), - # KEY "browser_ortholog_group_taxon_id_e0788f6d_fk_browser_taxon_id" ("taxon_id"), - # KEY "browser_ortholog_group_eggnog_id_16309ff7" ("eggnog_id"), - # CONSTRAINT "browser_ortholog_group_taxon_id_e0788f6d_fk_browser_taxon_id" FOREIGN KEY ("taxon_id") 
REFERENCES "browser_taxon" ("id") - ], - "browser_protein": [ - "id", # int not_null auto_increment, - "name", # varchar(100) not_null, - "length", # int not_null, - "protein_hash", # varchar(32) not_null, - "sequence", # longtext not_null, - "eggnog_description_id", # int DEFAULT NULL, - "taxonomy_id_id", # int DEFAULT NULL, - # PRIMARY KEY ("id"), - # UNIQUE KEY "protein_hash" ("protein_hash"), - # KEY "browser_protein_eggnog_description_i_7d992f52_fk_browser_e" ("eggnog_description_id"), - # KEY "browser_protein_taxonomy_id_id_ec28e08a_fk_browser_taxon_id" ("taxonomy_id_id"), - # CONSTRAINT "browser_protein_eggnog_description_i_7d992f52_fk_browser_e" FOREIGN KEY ("eggnog_description_id") REFERENCES "browser_eggnog_description" ("id"), - # CONSTRAINT "browser_protein_taxonomy_id_id_ec28e08a_fk_browser_taxon_id" FOREIGN KEY ("taxonomy_id_id") REFERENCES "browser_taxon" ("id") - ], - "browser_protein_cazy_families": [ - "id", # int not_null auto_increment, - "protein_id", # int not_null, - "cazy_family_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_protein_cazy_fam_protein_id_cazy_family_i_98e6a206_uniq" ("protein_id","cazy_family_id"), - # KEY "browser_protein_cazy_cazy_family_id_0b498a05_fk_browser_c" ("cazy_family_id"), - # CONSTRAINT "browser_protein_cazy_cazy_family_id_0b498a05_fk_browser_c" FOREIGN KEY ("cazy_family_id") REFERENCES "browser_cazy_family" ("id"), - # CONSTRAINT "browser_protein_cazy_protein_id_9d8c2d94_fk_browser_p" FOREIGN KEY ("protein_id") REFERENCES "browser_protein" ("id") - ], - "browser_protein_cog_classes": [ - "id", # int not_null auto_increment, - "protein_id", # int not_null, - "cog_class_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_protein_cog_clas_protein_id_cog_class_id_68b06ebc_uniq" ("protein_id","cog_class_id"), - # KEY "browser_protein_cog__cog_class_id_628912e9_fk_browser_c" ("cog_class_id"), - # CONSTRAINT "browser_protein_cog__cog_class_id_628912e9_fk_browser_c" FOREIGN KEY 
("cog_class_id") REFERENCES "browser_cog_class" ("id"), - # CONSTRAINT "browser_protein_cog__protein_id_43c3620b_fk_browser_p" FOREIGN KEY ("protein_id") REFERENCES "browser_protein" ("id") - ], - "browser_protein_ec_numbers": [ - "id", # int not_null auto_increment, - "protein_id", # int not_null, - "ec_number_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_protein_ec_numbers_protein_id_ec_number_id_0da1a728_uniq" ("protein_id","ec_number_id"), - # KEY "browser_protein_ec_n_ec_number_id_42515a8c_fk_browser_e" ("ec_number_id"), - # CONSTRAINT "browser_protein_ec_n_ec_number_id_42515a8c_fk_browser_e" FOREIGN KEY ("ec_number_id") REFERENCES "browser_ec_number" ("id"), - # CONSTRAINT "browser_protein_ec_n_protein_id_c0731ed9_fk_browser_p" FOREIGN KEY ("protein_id") REFERENCES "browser_protein" ("id") - ], - "browser_protein_go_terms": [ - "id", # int not_null auto_increment, - "protein_id", # int not_null, - "go_term_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_protein_go_terms_protein_id_go_term_id_e8a154a9_uniq" ("protein_id","go_term_id"), - # KEY "browser_protein_go_t_go_term_id_fff0186d_fk_browser_g" ("go_term_id"), - # CONSTRAINT "browser_protein_go_t_go_term_id_fff0186d_fk_browser_g" FOREIGN KEY ("go_term_id") REFERENCES "browser_go_term" ("id"), - # CONSTRAINT "browser_protein_go_t_protein_id_8eb220cd_fk_browser_p" FOREIGN KEY ("protein_id") REFERENCES "browser_protein" ("id") - ], - "browser_protein_kegg_orthologs": [ - "id", # int not_null auto_increment, - "protein_id", # int not_null, - "kegg_ortholog_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_protein_kegg_ort_protein_id_kegg_ortholog_20402b1d_uniq" ("protein_id","kegg_ortholog_id"), - # KEY "browser_protein_kegg_kegg_ortholog_id_2495a14d_fk_browser_k" ("kegg_ortholog_id"), - # CONSTRAINT "browser_protein_kegg_kegg_ortholog_id_2495a14d_fk_browser_k" FOREIGN KEY ("kegg_ortholog_id") REFERENCES "browser_kegg_ortholog" ("id"), - # 
CONSTRAINT "browser_protein_kegg_protein_id_ca765f00_fk_browser_p" FOREIGN KEY ("protein_id") REFERENCES "browser_protein" ("id") - ], - "browser_protein_kegg_pathways": [ - "id", # int not_null auto_increment, - "protein_id", # int not_null, - "kegg_pathway_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_protein_kegg_pat_protein_id_kegg_pathway__3de66af8_uniq" ("protein_id","kegg_pathway_id"), - # KEY "browser_protein_kegg_kegg_pathway_id_47214837_fk_browser_k" ("kegg_pathway_id"), - # CONSTRAINT "browser_protein_kegg_kegg_pathway_id_47214837_fk_browser_k" FOREIGN KEY ("kegg_pathway_id") REFERENCES "browser_kegg_pathway" ("id"), - # CONSTRAINT "browser_protein_kegg_protein_id_f703bb73_fk_browser_p" FOREIGN KEY ("protein_id") REFERENCES "browser_protein" ("id") - ], - "browser_protein_kegg_reactions": [ - "id", # int not_null auto_increment, - "protein_id", # int not_null, - "kegg_reaction_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_protein_kegg_rea_protein_id_kegg_reaction_956ee52f_uniq" ("protein_id","kegg_reaction_id"), - # KEY "browser_protein_kegg_kegg_reaction_id_b069e895_fk_browser_k" ("kegg_reaction_id"), - # CONSTRAINT "browser_protein_kegg_kegg_reaction_id_b069e895_fk_browser_k" FOREIGN KEY ("kegg_reaction_id") REFERENCES "browser_kegg_reaction" ("id"), - # CONSTRAINT "browser_protein_kegg_protein_id_c42ec7ca_fk_browser_p" FOREIGN KEY ("protein_id") REFERENCES "browser_protein" ("id") - ], - "browser_protein_ortholog_groups": [ - "id", # int not_null auto_increment, - "protein_id", # int not_null, - "ortholog_group_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_protein_ortholog_protein_id_ortholog_grou_00502bdd_uniq" ("protein_id","ortholog_group_id"), - # KEY "browser_protein_orth_ortholog_group_id_89ef3520_fk_browser_o" ("ortholog_group_id"), - # CONSTRAINT "browser_protein_orth_ortholog_group_id_89ef3520_fk_browser_o" FOREIGN KEY ("ortholog_group_id") REFERENCES 
"browser_ortholog_group" ("id"), - # CONSTRAINT "browser_protein_orth_protein_id_16a9accd_fk_browser_p" FOREIGN KEY ("protein_id") REFERENCES "browser_protein" ("id") - ], - "browser_protein_tc_families": [ - "id", # int not_null auto_increment, - "protein_id", # int not_null, - "tc_family_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_protein_tc_famil_protein_id_tc_family_id_dd3a5cca_uniq" ("protein_id","tc_family_id"), - # KEY "browser_protein_tc_f_tc_family_id_3202c6dc_fk_browser_t" ("tc_family_id"), - # CONSTRAINT "browser_protein_tc_f_protein_id_7bba3b67_fk_browser_p" FOREIGN KEY ("protein_id") REFERENCES "browser_protein" ("id"), - # CONSTRAINT "browser_protein_tc_f_tc_family_id_3202c6dc_fk_browser_t" FOREIGN KEY ("tc_family_id") REFERENCES "browser_tc_family" ("id") - ], - "browser_sample_metadata": [ - "id", # int not_null auto_increment, - "source", # varchar(30) not_null, - "url", # varchar(250) not_null, - "key", # varchar(250) not_null, - "value", # longtext not_null, - "sample_id", # int not_null, - # PRIMARY KEY ("id"), - # KEY "browser_sample_metadata_sample_id_7e68900d_fk_browser_sample_id" ("sample_id"), - # CONSTRAINT "browser_sample_metadata_sample_id_7e68900d_fk_browser_sample_id" FOREIGN KEY ("sample_id") REFERENCES "browser_sample" ("id") - ], - "browser_strain": [ - "id", # int not_null auto_increment, - "strain_id", # varchar(100) not_null, - "full_name", # varchar(200) not_null, - "order", # varchar(100) not_null, - "taxon_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "strain_id" ("strain_id"), - # KEY "browser_strain_taxon_id_b809e6e5_fk_browser_taxon_id" ("taxon_id"), - # CONSTRAINT "browser_strain_taxon_id_b809e6e5_fk_browser_taxon_id" FOREIGN KEY ("taxon_id") REFERENCES "browser_taxon" ("id") - ], - "browser_strain_metadata": [ - "id", # int not_null auto_increment, - "source", # varchar(30) not_null, - "url", # varchar(250) not_null, - "key", # varchar(250) not_null, - "value", # longtext not_null, - 
"strain_id", # int not_null, - # PRIMARY KEY ("id"), - # KEY "browser_strain_metadata_strain_id_ec4e0f0a_fk_browser_strain_id" ("strain_id"), - # CONSTRAINT "browser_strain_metadata_strain_id_ec4e0f0a_fk_browser_strain_id" FOREIGN KEY ("strain_id") REFERENCES "browser_strain" ("id") - ], - "django_admin_log": [ - "id", # int not_null auto_increment, - "action_time", # datetime(6) not_null, - "object_id", # longtext, - "object_repr", # varchar(200) not_null, - "action_flag", # smallint unsigned not_null, - "change_message", # longtext not_null, - "content_type_id", # int DEFAULT NULL, - "user_id", # int not_null, - # PRIMARY KEY ("id"), - # KEY "django_admin_log_content_type_id_c4bce8eb_fk_django_co" ("content_type_id"), - # KEY "django_admin_log_user_id_c564eba6_fk_auth_user_id" ("user_id"), - # CONSTRAINT "django_admin_log_content_type_id_c4bce8eb_fk_django_co" FOREIGN KEY ("content_type_id") REFERENCES "django_content_type" ("id"), - # CONSTRAINT "django_admin_log_user_id_c564eba6_fk_auth_user_id" FOREIGN KEY ("user_id") REFERENCES "auth_user" ("id"), - # CONSTRAINT "django_admin_log_chk_1" CHECK (("action_flag" >= 0)) - ], - "auth_group_permissions": [ - "id", # int not_null auto_increment, - "group_id", # int not_null, - "permission_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "auth_group_permissions_group_id_permission_id_0cd325b0_uniq" ("group_id","permission_id"), - # KEY "auth_group_permissio_permission_id_84c5c92e_fk_auth_perm" ("permission_id"), - # CONSTRAINT "auth_group_permissio_permission_id_84c5c92e_fk_auth_perm" FOREIGN KEY ("permission_id") REFERENCES "auth_permission" ("id"), - # CONSTRAINT "auth_group_permissions_group_id_b120cbf9_fk_auth_group_id" FOREIGN KEY ("group_id") REFERENCES "auth_group" ("id") - ], - "browser_genome": [ - "id", # int not_null auto_increment, - "name", # varchar(200) not_null, - "description", # longtext not_null, - "contigs", # int unsigned not_null, - "size", # int unsigned not_null, - "genes", # int 
unsigned not_null, - "json_url", # varchar(200) not_null, - "pub_date", # datetime(6) not_null, - "external_url", # varchar(200) not_null, - "external_id", # varchar(40) not_null, - "gbk_filepath", # varchar(200) not_null, - "sample_id", # int DEFAULT NULL, - "strain_id", # int DEFAULT NULL, - "taxon_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "name" ("name"), - # KEY "browser_genome_sample_id_75fbf0ab_fk_browser_sample_id" ("sample_id"), - # KEY "browser_genome_strain_id_f18624c7_fk_browser_strain_id" ("strain_id"), - # KEY "browser_genome_taxon_id_f69a69a0_fk_browser_taxon_id" ("taxon_id"), - # CONSTRAINT "browser_genome_sample_id_75fbf0ab_fk_browser_sample_id" FOREIGN KEY ("sample_id") REFERENCES "browser_sample" ("id"), - # CONSTRAINT "browser_genome_strain_id_f18624c7_fk_browser_strain_id" FOREIGN KEY ("strain_id") REFERENCES "browser_strain" ("id"), - # CONSTRAINT "browser_genome_taxon_id_f69a69a0_fk_browser_taxon_id" FOREIGN KEY ("taxon_id") REFERENCES "browser_taxon" ("id"), - # CONSTRAINT "browser_genome_chk_1" CHECK (("contigs" >= 0)), - # CONSTRAINT "browser_genome_chk_2" CHECK (("size" >= 0)), - # CONSTRAINT "browser_genome_chk_3" CHECK (("genes" >= 0)) - ], - "browser_genome_tags": [ - "id", # int not_null auto_increment, - "genome_id", # int not_null, - "tag_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_genome_tags_genome_id_tag_id_a75ec35f_uniq" ("genome_id","tag_id"), - # KEY "browser_genome_tags_tag_id_cbc127ca_fk_browser_tag_id" ("tag_id"), - # CONSTRAINT "browser_genome_tags_genome_id_dbeb3fc4_fk_browser_genome_id" FOREIGN KEY ("genome_id") REFERENCES "browser_genome" ("id"), - # CONSTRAINT "browser_genome_tags_tag_id_cbc127ca_fk_browser_tag_id" FOREIGN KEY ("tag_id") REFERENCES "browser_tag" ("id") - ], - "browser_regulon": [ - "id", # int not_null auto_increment, - "name", # varchar(50) not_null, - "description", # longtext not_null, - "genome_id", # int not_null, - # PRIMARY KEY ("id"), - # KEY 
"browser_regulon_genome_id_0da5b8a6_fk_browser_genome_id" ("genome_id"), - # KEY "browser_regulon_name_961e8e34" ("name"), - # CONSTRAINT "browser_regulon_genome_id_0da5b8a6_fk_browser_genome_id" FOREIGN KEY ("genome_id") REFERENCES "browser_genome" ("id") - ], - "browser_contig": [ - "id", # int not_null auto_increment, - "contig_id", # varchar(100) not_null, - "name", # varchar(250) not_null, - "size", # int not_null, - "genome_id", # int DEFAULT NULL, - # PRIMARY KEY ("id"), - # KEY "browser_contig_genome_id_a98adf71_fk_browser_genome_id" ("genome_id"), - # CONSTRAINT "browser_contig_genome_id_a98adf71_fk_browser_genome_id" FOREIGN KEY ("genome_id") REFERENCES "browser_genome" ("id") - ], - "browser_operon": [ - "id", # int not_null auto_increment, - "name", # varchar(250) not_null, - "start", # int not_null, - "end", # int not_null, - "strand", # int not_null, - "contig_id", # int DEFAULT NULL, - "genome_id", # int DEFAULT NULL, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_operon_genome_id_name_eae14085_uniq" ("genome_id","name"), - # KEY "browser_operon_contig_id_a4110648_fk_browser_contig_id" ("contig_id"), - # CONSTRAINT "browser_operon_contig_id_a4110648_fk_browser_contig_id" FOREIGN KEY ("contig_id") REFERENCES "browser_contig" ("id"), - # CONSTRAINT "browser_operon_genome_id_3e25f150_fk_browser_genome_id" FOREIGN KEY ("genome_id") REFERENCES "browser_genome" ("id") - ], - "browser_site": [ - "id", # int not_null auto_increment, - "name", # varchar(50) not_null, - "type", # varchar(20) not_null, - "start", # int not_null, - "end", # int not_null, - "strand", # int not_null, - "sequence", # longtext not_null, - "contig_id", # int DEFAULT NULL, - "genome_id", # int not_null, - "regulon_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_site_regulon_id_name_e35362d1_uniq" ("regulon_id","name"), - # KEY "browser_site_contig_id_54dc54ad_fk_browser_contig_id" ("contig_id"), - # KEY "browser_site_genome_id_47862a9e_fk_browser_genome_id" 
("genome_id"), - # CONSTRAINT "browser_site_contig_id_54dc54ad_fk_browser_contig_id" FOREIGN KEY ("contig_id") REFERENCES "browser_contig" ("id"), - # CONSTRAINT "browser_site_genome_id_47862a9e_fk_browser_genome_id" FOREIGN KEY ("genome_id") REFERENCES "browser_genome" ("id"), - # CONSTRAINT "browser_site_regulon_id_64cf9422_fk_browser_regulon_id" FOREIGN KEY ("regulon_id") REFERENCES "browser_regulon" ("id") - ], - "browser_site_operons": [ - "id", # int not_null auto_increment, - "site_id", # int not_null, - "operon_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_site_operons_site_id_operon_id_4e1afc7b_uniq" ("site_id","operon_id"), - # KEY "browser_site_operons_operon_id_f202c4b2_fk_browser_operon_id" ("operon_id"), - # CONSTRAINT "browser_site_operons_operon_id_f202c4b2_fk_browser_operon_id" FOREIGN KEY ("operon_id") REFERENCES "browser_operon" ("id"), - # CONSTRAINT "browser_site_operons_site_id_842b2fa4_fk_browser_site_id" FOREIGN KEY ("site_id") REFERENCES "browser_site" ("id") - ], - "browser_gene": [ - "id", # int not_null auto_increment, - "name", # varchar(50) not_null, - "locus_tag", # varchar(50) not_null, - "type", # varchar(20) not_null, - "start", # int not_null, - "end", # int not_null, - "strand", # int not_null, - "function", # varchar(250) not_null, - "contig_id", # int DEFAULT NULL, - "genome_id", # int not_null, - "operon_id", # int DEFAULT NULL, - "protein_id", # int DEFAULT NULL, - # PRIMARY KEY ("id"), - # KEY "browser_gene_genome_id_fb585326_fk_browser_genome_id" ("genome_id"), - # KEY "browser_gene_operon_id_72d8bf49_fk_browser_operon_id" ("operon_id"), - # KEY "browser_gene_protein_id_e73b3cb7_fk_browser_protein_id" ("protein_id"), - # KEY "browser_gene_contig_id_b190eedc_fk_browser_contig_id" ("contig_id"), - # KEY "browser_gene_name_edef1e56" ("name"), - # KEY "browser_gene_locus_tag_7f163d55" ("locus_tag"), - # KEY "browser_gene_function_d53e81f6" ("function"), - # CONSTRAINT 
"browser_gene_contig_id_b190eedc_fk_browser_contig_id" FOREIGN KEY ("contig_id") REFERENCES "browser_contig" ("id"), - # CONSTRAINT "browser_gene_genome_id_fb585326_fk_browser_genome_id" FOREIGN KEY ("genome_id") REFERENCES "browser_genome" ("id"), - # CONSTRAINT "browser_gene_operon_id_72d8bf49_fk_browser_operon_id" FOREIGN KEY ("operon_id") REFERENCES "browser_operon" ("id"), - # CONSTRAINT "browser_gene_protein_id_e73b3cb7_fk_browser_protein_id" FOREIGN KEY ("protein_id") REFERENCES "browser_protein" ("id") - ], - "browser_regulon_regulators": [ - "id", # int not_null auto_increment, - "regulon_id", # int not_null, - "gene_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_regulon_regulators_regulon_id_gene_id_31f79fdf_uniq" ("regulon_id","gene_id"), - # KEY "browser_regulon_regulators_gene_id_9397e934_fk_browser_gene_id" ("gene_id"), - # CONSTRAINT "browser_regulon_regu_regulon_id_05cf48a4_fk_browser_r" FOREIGN KEY ("regulon_id") REFERENCES "browser_regulon" ("id"), - # CONSTRAINT "browser_regulon_regulators_gene_id_9397e934_fk_browser_gene_id" FOREIGN KEY ("gene_id") REFERENCES "browser_gene" ("id") - ], - "browser_site_genes": [ - "id", # int not_null auto_increment, - "site_id", # int not_null, - "gene_id", # int not_null, - # PRIMARY KEY ("id"), - # UNIQUE KEY "browser_site_genes_site_id_gene_id_117d2308_uniq" ("site_id","gene_id"), - # KEY "browser_site_genes_gene_id_e364c0fc_fk_browser_gene_id" ("gene_id"), - # CONSTRAINT "browser_site_genes_gene_id_e364c0fc_fk_browser_gene_id" FOREIGN KEY ("gene_id") REFERENCES "browser_gene" ("id"), - # CONSTRAINT "browser_site_genes_site_id_1c58c050_fk_browser_site_id" FOREIGN KEY ("site_id") REFERENCES "browser_site" ("id") - ], - "browser_annotation": [ - "id", # int not_null auto_increment, - "source", # varchar(30) not_null, - "url", # varchar(300) not_null, - "key", # varchar(30) not_null, - "value", # varchar(50) not_null, - "note", # longtext not_null, - "gene_id_id", # int not_null, - # 
PRIMARY KEY ("id"), - # KEY "browser_annotation_gene_id_id_cb275c89_fk_browser_gene_id" ("gene_id_id"), - # KEY "browser_annotation_source_79ad3072" ("source"), - # KEY "browser_annotation_key_fbcc98fb" ("key"), - # KEY "browser_annotation_value_8d85b11e" ("value"), - # CONSTRAINT "browser_annotation_gene_id_id_cb275c89_fk_browser_gene_id" FOREIGN KEY ("gene_id_id") REFERENCES "browser_gene" ("id") - ], -} diff --git a/src/cdm_data_loaders/utils/calculate_hash.py b/src/cdm_data_loaders/utils/calculate_hash.py deleted file mode 100644 index ed25c7ee..00000000 --- a/src/cdm_data_loaders/utils/calculate_hash.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Utils for calculating hashes.""" - -import gzip -import hashlib -from typing import Any - -from cdm_data_loaders.parsers.fasta import read_fasta - - -def _hash_string(s: str) -> str: - return hashlib.sha256(s.encode("utf-8")).hexdigest() - - -class HashSeq(str): - def __new__(cls, v) -> "HashSeq": - return super().__new__(cls, v.upper()) - - @property - def hash_value(self): - return _hash_string(self) - - -class HashSeqList(list): - def append(self: "HashSeqList", o: str | HashSeq) -> None: - if isinstance(o, str): - super().append(HashSeq(o)) - elif isinstance(o, HashSeq): - super().append(o) - else: - err_msg = f"Invalid type: {type(o)}" - raise TypeError(err_msg) - - @property - def hash_value(self: "HashSeqList") -> str: - h_list = [x.hash_value for x in self] - hash_seq = "_".join(sorted(h_list)) - return _hash_string(hash_seq) - - -def contig_set_hash(features) -> str: - hl = HashSeqList() - for contig in features: - seq = HashSeq(contig.seq) - hl.append(seq) - return hl.hash_value - - -# TODO: check protein_id_map issue -def compute_hash(contigset_file: str, protein_file: str) -> tuple[str, dict[str, Any], dict[str, Any]]: - """ - Compute the hash of the entire contigset and contigs. 
- """ - contig_hash_dict = {} - contigs = read_fasta(contigset_file) - contigset_hash = contig_set_hash(contigs) - for f in contigs: - contig_hash_dict[f.id] = HashSeq(f.seq).hash_value - contig_id_map = contig_hash_dict - - protein_hash_dict = {} - proteins = read_fasta(protein_file) - for f in proteins: - protein_hash_dict[f.id] = HashSeq(f.seq).hash_value - protein_id_map = protein_hash_dict - - return contigset_hash, contig_id_map, protein_id_map - - -def generate_file_sha256(filepath: str, blocksize=65536) -> str | None: - """Generate the SHA-256 checksum of a file's decompressed content.""" - sha256 = hashlib.sha256() - open_func = gzip.open if filepath.endswith(".gz") else open - try: - with open_func(filepath, "rt", encoding="utf-8", errors="ignore") as f: - for block in iter(lambda: f.read(blocksize), ""): - sha256.update(block.encode("utf-8")) - return sha256.hexdigest() - except Exception as e: - print(f"Error generating SHA-256 for {filepath}: {e}") - return None - - -def generate_hash_id(*args): - """Generate a hash-based ID from the given arguments.""" - unique_string = "".join(map(str, args)) - return hashlib.sha256(unique_string.encode("utf-8")).hexdigest() - - -def calculate_sha256_checksums(gff_file: str) -> str | None: - """Calculate checksums for the contigset and its contigs.""" - print(f"Calculating sha256 for GFF: {gff_file}") - gff_hash = generate_file_sha256(gff_file) - if not gff_hash: - print(f"Error calculating sha256 for GFF file {gff_file}") - return None - return gff_hash diff --git a/tests/parsers/test_genome_loader.py b/tests/parsers/test_genome_loader.py deleted file mode 100644 index 1d026b11..00000000 --- a/tests/parsers/test_genome_loader.py +++ /dev/null @@ -1,124 +0,0 @@ -"""Tests for the MultiGenomeDataFileCreator.""" - -from pathlib import Path - -import pytest - -from cdm_data_loaders.parsers.genome_loader import MultiGenomeDataFileCreator - - -@pytest.mark.parametrize( - "genome_paths_file", - [ - [None, "Missing 
genome_paths_file"], - ["", "Missing genome_paths_file"], - [" ", "Missing genome_paths_file"], - ["does/not/exist", "genome_paths_file 'does/not/exist' does not exist"], - ["tests/data/genome_paths_file/valid.json"], - ], -) -@pytest.mark.parametrize( - "output_dir", - [ - [None, "Missing output_dir"], - ["", "Missing output_dir"], - [" ", "Missing output_dir"], - ["valid_dir"], - ], -) -@pytest.mark.parametrize( - "checkm2_path", - [ - [None, None], - ["", None], - [" ", None], - ["valid_dir", "valid_dir"], - [" some valid_dir \t", "some valid_dir"], - ], -) -@pytest.mark.parametrize( - "stats_path", - [ - [None, None], - ["", None], - [" ", None], - ["valid_dir", "valid_dir"], - [" some valid_dir \t", "some valid_dir"], - ], -) -def test_init_multigenomedatafilecreator( - genome_paths_file: list[str | None], - output_dir: list[str | None], - checkm2_path: list[str | None], - stats_path: list[str | None], -) -> None: - """Test that MGDFC init fails without required params.""" - errs = [] - if len(genome_paths_file) == 2: - errs.append(genome_paths_file[1]) - if len(output_dir) == 2: - errs.append(output_dir[1]) - - params = [x[0] for x in [genome_paths_file, output_dir, checkm2_path, stats_path]] - - if errs: - err_msg = "MultiGenomeDataFileCreator init error:\n" + "\n".join(errs) - with pytest.raises(RuntimeError, match=err_msg): - MultiGenomeDataFileCreator(*params) - else: - mgdfc = MultiGenomeDataFileCreator(*params) - assert mgdfc.genome_paths_file == Path(genome_paths_file[0]) - assert mgdfc.output_dir == Path(output_dir[0]) - if checkm2_path[1] is None: - assert mgdfc.checkm2_file is None - else: - assert mgdfc.checkm2_file == Path(checkm2_path[1]) - if stats_path[1] is None: - assert mgdfc.stats_file is None - else: - assert mgdfc.stats_file == Path(stats_path[1]) - - -@pytest.mark.parametrize("use_checkm2", [True, False]) -def test_file_creation(use_checkm2: bool, test_data_dir: Path, tmp_path: Path) -> None: - """Check files are created.""" - test_dir = 
tmp_path / "test_directory" - expected_dir_name = "file_creation_checkm2" if use_checkm2 else "file_creation" - expected_dir = test_data_dir / expected_dir_name - genome_paths_file = test_data_dir / "genome_paths_file" / "valid.json" - - # Initialize the creators for each test case - feature_protein_creator = MultiGenomeDataFileCreator( - str(genome_paths_file), - str(test_dir), - "tests/data/example_files/checkm2/quality_report.tsv" if use_checkm2 else None, - "tests/data/example_files/stats.json", - ) - feature_protein_creator.create_all_tables() - - # Define file paths and expected line counts - files_and_expected_lines = { - test_dir / "contig.tsv": 89, - test_dir / "contigset.tsv": 3, - test_dir / "feature.tsv": 12028, - test_dir / "feature_association.tsv": 12028, - test_dir / "structural_annotation.tsv": 3, - } - - err_list = [] - for file, n_lines in files_and_expected_lines.items(): - assert file.exists() - parsed_lines = file.read_text().strip().split("\n") - # parse file, check number of lines - assert len(parsed_lines) == n_lines - # read in the expected file and check they are identical - expected_file = expected_dir / file.name - expected_lines = expected_file.read_text().strip().split("\n") - if parsed_lines == expected_lines: - assert parsed_lines == expected_lines - else: - err_list.append(file) - print(f"Error: {expected_file!s} differs from {file!s}") - - if err_list: - assert err_list == [] diff --git a/tests/parsers/test_genome_paths.py b/tests/parsers/test_genome_paths.py deleted file mode 100644 index 846b40ef..00000000 --- a/tests/parsers/test_genome_paths.py +++ /dev/null @@ -1,145 +0,0 @@ -"""Tests for the genome paths file parser.""" - -import json -import re -from pathlib import Path -from typing import Any - -import pytest - -from cdm_data_loaders.parsers.genome_paths import get_genome_paths - -GPF_DIR = "genome_paths_file" - - -def test_get_genome_paths_empty(tmp_path: Path) -> None: - """Test that an error is thrown by a non-existent 
file.""" - full_path = tmp_path / "some-file" - with pytest.raises( - RuntimeError, - match=re.escape("error parsing genome_paths_file: [Errno 2] No such file or directory"), - ): - get_genome_paths(full_path) - - -def test_get_genome_paths_empty_file(tmp_path: Path) -> None: - """Test that an empty file throws an error.""" - full_path = tmp_path / "some-file" - full_path.touch() - - with pytest.raises( - RuntimeError, - match=re.escape("error parsing genome_paths_file: Expecting value: line 1 column 1 (char 0)"), - ): - get_genome_paths(full_path) - - -format_errors = { - # JSON parse errors - "empty": "error parsing genome_paths_file: Expecting value: line 1 column 1 (char 0)", - "ws": "error parsing genome_paths_file: Expecting value: line 4 column 5 (char 8)", - "str": "error parsing genome_paths_file: Expecting value: line 1 column 1 (char 0)", - "unclosed_str": "error parsing genome_paths_file: Unterminated string starting at: line 1 column 9 (char 8)", - "null_key": "error parsing genome_paths_file: Expecting property name enclosed in double quotes: line 1 column 10 (char 9)", - # wrong format (array) - "array_of_objects": "genome_paths_file is not in the correct format", - "empty_array": "genome_paths_file is not in the correct format", - # no data - "empty_object": "no valid data found in genome_paths_file", -} - -err_types = {"empty_array": TypeError, "array_of_objects": TypeError} - - -@pytest.mark.parametrize( - "params", - [ - pytest.param( - { - "err_msg": format_errors[err_id], - "input": err_id, - }, - id=err_id, - ) - for err_id in format_errors - ], -) -def test_get_genome_paths_invalid_format( - params: dict[str, str], - test_data_dir: Path, - monkeypatch: pytest.MonkeyPatch, - json_test_strings: dict[str, Any], -) -> None: - """Test that invalid JSON structures throw an error.""" - full_path = test_data_dir / GPF_DIR / "valid.json" - - def mockreturn(_) -> dict[str, Any] | list[str | Any]: - return json.loads(json_test_strings[params["input"]]) 
- - # patch json.load to return the data structure in params - monkeypatch.setattr(json, "load", mockreturn) - - with pytest.raises( - err_types.get(params["input"], RuntimeError), - match=re.escape(params["err_msg"]), - ): - get_genome_paths(full_path) - - -error_list = { - "no_entry": [{"": {"this": "that"}}, 'No ID specified for entry {"this": "that"}', ValueError], - "invalid_entry_format_arr": [{"id": []}, "id: invalid entry format"], - "invalid_entry_format_str": [{"id": "some string"}, "id: invalid entry format"], - "invalid_entry_format_None": [{"id": None}, "id: invalid entry format"], - "no_valid_paths": [{"id": {}}, "id: no valid file types or paths found"], - "invalid_keys": [{"id": {"pap": 1, "pip": 2, "pop": 3}}, "id: invalid keys: pap, pip, pop"], -} - - -@pytest.mark.parametrize( - "params", - [ - pytest.param( - { - "err_msg": error_list[err_id][1], - "input": error_list[err_id][0], - }, - id=err_id, - ) - for err_id in error_list - ], -) -def test_get_genome_paths_valid_input_invalid_format( - params: dict[str, str], test_data_dir: Path, monkeypatch: pytest.MonkeyPatch -) -> None: - """Test that invalid JSON structures throw an error.""" - full_path = test_data_dir / GPF_DIR / "valid.json" - - def mockreturn(_) -> dict[str, Any] | list[str | Any]: - return params["input"] - - # patch json.load to return the data structure in params - monkeypatch.setattr(json, "load", mockreturn) - - with pytest.raises( - RuntimeError, - match=f"Please ensure that the genome_paths_file is in the correct format.\n\n{params['err_msg']}", - ): - get_genome_paths(full_path) - - -def test_get_genome_paths(test_data_dir: Path) -> None: - """Test that the genome paths file can be correctly parsed.""" - full_path = test_data_dir / GPF_DIR / "valid.json" - assert get_genome_paths(full_path) == { - "FW305-3-2-15-C-TSA1.1": { - "fna": "tests/data/FW305-3-2-15-C-TSA1/FW305-3-2-15-C-TSA1_scaffolds.fna", - "gff": "tests/data/FW305-3-2-15-C-TSA1/FW305-3-2-15-C-TSA1_genes.gff", - 
"protein": "tests/data/FW305-3-2-15-C-TSA1/FW305-3-2-15-C-TSA1_genes.faa", - }, - "FW305-C-112.1": { - "fna": "tests/data/FW305-C-112.1/FW305-C-112.1_scaffolds.fna", - "gff": "tests/data/FW305-C-112.1/FW305-C-112.1_genes.gff", - "protein": "tests/data/FW305-C-112.1/FW305-C-112.1_genes.faa", - }, - } diff --git a/uv.lock b/uv.lock index 4e0b624b..dbb4d8c2 100644 --- a/uv.lock +++ b/uv.lock @@ -324,38 +324,6 @@ dependencies = [ { name = "sidecar" }, ] -[[package]] -name = "biopython" -version = "1.86" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9d/61/c59a849bd457c8a1b408ae828dbcc15e674962b5a29705e869e15b32bf25/biopython-1.86.tar.gz", hash = "sha256:93a50b586a4d2cec68ab2f99d03ef583c5761d8fba5535cb8e81da781d0d92ff", size = 19835323, upload-time = "2025-10-28T21:18:31.041Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fd/6e/84d6c66ab93095aa7adb998a8eef045328470eafd36b9237c4db213e587c/biopython-1.86-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fb3a11a98e49428720dca227e2a5bdd57c973ee7c4df3cf6734c0aa13fd134c7", size = 2693185, upload-time = "2025-10-28T21:27:39.709Z" }, - { url = "https://files.pythonhosted.org/packages/12/75/60386f2640f13765b1651f2f26d8b4f893c46ee663df3ca76eda966d4f6a/biopython-1.86-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e161f3d3b6e65fbfd1ce22a01c3e9fa9da789adde4972fd0cc2370795ea5357b", size = 2669980, upload-time = "2025-10-28T21:26:58.839Z" }, - { url = "https://files.pythonhosted.org/packages/dd/de/a39adb98a0552a257219503c236ef17f007598af55326c0d143db52e5a92/biopython-1.86-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5aa8c9e92ee6fe59dfe0d2c2daf9a9eec6b812c78328caad038f79163c500218", size = 3209657, upload-time = "2025-10-29T00:36:28.842Z" }, - { url = 
"https://files.pythonhosted.org/packages/0b/c7/b2e7aca3de8981f4ecb6ab1e0334c3c4a512e5e9898b57b3d8734b086da7/biopython-1.86-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:593ec6a2a4fedec08ddcee1a8a0e0b0ed56835b2714904b352ec4a93d5b9d973", size = 3235774, upload-time = "2025-10-29T00:36:34.07Z" }, - { url = "https://files.pythonhosted.org/packages/52/ed/e6647b0b9cf2bb67347612e8e443b84378c44768a8d8439276e4ba881178/biopython-1.86-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd2f9ebf9b14d67ca92f48779c4f0ba404c35dba3e8b9d6c34d1a3591c3b746d", size = 3178415, upload-time = "2025-10-28T23:54:05.475Z" }, - { url = "https://files.pythonhosted.org/packages/ff/37/f6a14b835842c66a52f212136a99416265f5ce76813d668ceac1cb306357/biopython-1.86-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:137fe9aafd93baa5127d17534b473f6646f92a883f52b34f7c306b800ac50038", size = 3197201, upload-time = "2025-10-28T23:54:10.462Z" }, - { url = "https://files.pythonhosted.org/packages/f2/73/0eac930016c509763c174a0e25e92e6d7a711f6f5de1f7001e54fd5c49f7/biopython-1.86-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e784dc8382430c9893aa084ca18fe8a8815b5811f1c324492ef3f4b54e664fff", size = 3145106, upload-time = "2025-10-28T23:54:15.235Z" }, - { url = "https://files.pythonhosted.org/packages/00/aa/26e836274d03402e8011b04a1714d4ac2f704add303a493e54d2d5646973/biopython-1.86-cp313-cp313-win32.whl", hash = "sha256:5329a777ba90ea624447173046e77c4df2862acc46eea4e94fe2211fe041750f", size = 2698051, upload-time = "2025-10-28T21:32:55.225Z" }, - { url = "https://files.pythonhosted.org/packages/ae/27/fa1f8fa57f2ac8fdc41d14ab36001b8ba0fce5eac01585227b99a4da0e9d/biopython-1.86-cp313-cp313-win_amd64.whl", hash = "sha256:f6f2f1dc75423b15d8a22b8eceae32785736612b6740688526401b8c2d821270", size = 2733649, upload-time = "2025-10-28T21:32:51.052Z" }, - { url = 
"https://files.pythonhosted.org/packages/a4/2d/5b87ab859d38f2c7d7d1f9df375b4734737c2ef62cf8506983e882419a30/biopython-1.86-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:236ca61aa996f12cbc65a8d6a15abfac70b9ee800656629b784c6a240e7d8dc0", size = 2694733, upload-time = "2025-10-29T00:27:49.142Z" }, - { url = "https://files.pythonhosted.org/packages/24/7e/a80fad6dbfa1335c506b1565d2b3fdd78cda705408a839c5583a9cfca8b6/biopython-1.86-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:f96b7441f456c7eecad5c6e61e75b0db1435c489be7cc5e4f97dd4e60921747c", size = 2670131, upload-time = "2025-10-29T00:27:53.758Z" }, - { url = "https://files.pythonhosted.org/packages/2d/0a/6c12e9262b99f395bd66535c4a4203bd70833c11f47ac0730fca6ba2b5f8/biopython-1.86-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d53a78bf960397826219f08f87b061ad7f227527d19986e830eeab60d370b597", size = 3209810, upload-time = "2025-10-29T00:36:45.88Z" }, - { url = "https://files.pythonhosted.org/packages/3a/f9/265211154d2bb4cffe78a57b8e57cfbb165cf41cf3d1b68e2a6b073b3b8a/biopython-1.86-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bb86e4383c02fdb2571a38947153346e6f5cd38e22de1df40f54d2a3c51d02a8", size = 3235347, upload-time = "2025-10-29T00:36:51.164Z" }, - { url = "https://files.pythonhosted.org/packages/64/e5/58d8e48d3b4100a7fd8bae97f0dd7179c30f19861841d1a0bb7827e0033e/biopython-1.86-cp314-cp314-win32.whl", hash = "sha256:ffeba620c4786ea836efee235a9c6333b94e922b89de1449a4782dcc15246ff1", size = 2698198, upload-time = "2025-10-29T00:28:02.812Z" }, - { url = "https://files.pythonhosted.org/packages/e2/ca/aa166eb588a2d4eea381c92e5a2a3d09b4b4887b0f0e8f3acf999fb88157/biopython-1.86-cp314-cp314-win_amd64.whl", hash = "sha256:efbb9bc4415a1e2c1c986ba261b02857bc0c9eed098b15493f1cc5c4a1e02409", size = 2734693, upload-time = "2025-10-29T00:27:58.312Z" }, - { url = 
"https://files.pythonhosted.org/packages/50/da/8c227d701ec9c94d9870b1879982e3dd114da130b0816d3f9b937318d31a/biopython-1.86-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:caa70c1639b3306549605f9273753bdbf8cd6d6d352cecf23afbda3c911694f3", size = 2697389, upload-time = "2025-10-29T00:28:07.037Z" }, - { url = "https://files.pythonhosted.org/packages/8c/1e/66b0b5622ef6a3a14c449d1c8d69749480b37518e4c1e3a8a86fc668dad7/biopython-1.86-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d077f01d1f69f77a26cac46163d4ea45eb4e6509a68feb7f15e665b7e1de0a99", size = 2673857, upload-time = "2025-10-29T00:28:11.488Z" }, - { url = "https://files.pythonhosted.org/packages/76/05/7c8f9800e6960da2007eb75128c8ec0b22e1a0064e8802e8acfad53cdca8/biopython-1.86-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4506ce7dbdf885cb24d1f5439362c3c07f1b6f90761a0d20fe16a2a9ea5702a5", size = 3253007, upload-time = "2025-10-29T00:36:56.066Z" }, - { url = "https://files.pythonhosted.org/packages/14/dd/a2177328d841fda0a12e67c65d06279691e25363a2805f561b3665cae114/biopython-1.86-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dcd94717e83ba891ebd9acaecbf05ad38313095ca5706caf6c38fa3f2aa17528", size = 3272883, upload-time = "2025-10-29T00:37:01.189Z" }, - { url = "https://files.pythonhosted.org/packages/ce/04/1aa91f64db5e0728d596fcf7302e2ae2035800c0676e94ea09645a948b91/biopython-1.86-cp314-cp314t-win32.whl", hash = "sha256:2f6b205dcb4101cefa5c615114bd35a19f656abb9d340eb3cf190f829e43800a", size = 2701649, upload-time = "2025-10-29T00:28:20.527Z" }, - { url = "https://files.pythonhosted.org/packages/63/7c/4acaca39102d667175bb3d6502dea91c346f8674c06d5df0dbb678971596/biopython-1.86-cp314-cp314t-win_amd64.whl", hash = "sha256:efeee7c37f2331d2c55704df39e122189cc237ffd7511f34158418ad728131b8", size = 2741364, upload-time = "2025-10-29T00:28:15.752Z" }, -] - [[package]] name = "bioregistry" version = "0.13.21" 
@@ -430,9 +398,6 @@ dependencies = [ ] [package.optional-dependencies] -biopython = [ - { name = "biopython" }, -] minio = [ { name = "boto3", extra = ["crt"] }, { name = "tqdm" }, @@ -441,7 +406,6 @@ minio = [ [package.dev-dependencies] dev = [ { name = "berdl-notebook-utils" }, - { name = "biopython" }, { name = "pytest" }, { name = "pytest-asyncio" }, { name = "pytest-cov" }, @@ -459,7 +423,6 @@ xml = [ [package.metadata] requires-dist = [ - { name = "biopython", marker = "extra == 'biopython'", specifier = ">=1.86" }, { name = "bioregistry", specifier = ">=0.13.20" }, { name = "boto3", extras = ["crt"], specifier = ">=1.42.55" }, { name = "boto3", extras = ["crt"], marker = "extra == 'minio'", specifier = ">=1.42.0" }, @@ -478,7 +441,6 @@ provides-extras = ["minio", "biopython"] [package.metadata.requires-dev] dev = [ { name = "berdl-notebook-utils", git = "https://github.com/BERDataLakehouse/spark_notebook.git?subdirectory=notebook_utils" }, - { name = "biopython", specifier = ">=1.86" }, { name = "pytest", specifier = ">=9.0.2" }, { name = "pytest-asyncio", specifier = ">=1.3.0" }, { name = "pytest-cov", specifier = ">=7.0.0" }, @@ -2062,7 +2024,7 @@ name = "pexpect" version = "4.9.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "ptyprocess" }, + { name = "ptyprocess", marker = "(platform_python_implementation != 'PyPy' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" } wheels = [ From a9c4928455dea013a16e355c6e817ff0187fbe5f Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Wed, 18 Mar 2026 16:37:20 -0700 Subject: [PATCH 7/7] Adding xml file splitter app --- Dockerfile | 11 +++++++++-- 
scripts/entrypoint.sh | 6 +++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 352463b4..35a2085c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,14 +3,21 @@ # Dockerfile is based heavily on the example uv dockerfile: # https://github.com/astral-sh/uv-docker-example +# Pull the pre-built Rust app from ghcr.io +FROM ghcr.io/ialarmedalien/xml_file_splitter:latest AS rust-app + # Use a Python image with uv pre-installed FROM ghcr.io/astral-sh/uv:python3.13-trixie-slim # Set environment variable to noninteractive to prevent prompts during apt operations ENV DEBIAN_FRONTEND=noninteractive -# add tini -RUN apt-get update -y && apt-get install -y --no-install-recommends tini git +# add tini and git +RUN apt-get update -y && apt-get install -y --no-install-recommends tini git ca-certificates && \ + rm -rf /var/lib/apt/lists/* + +# copy only the compiled xml-file-splitter binary from the Rust image +COPY --from=rust-app /usr/local/bin/xml_file_splitter /usr/local/bin/xml_file_splitter # Setup a non-root user RUN groupadd --system --gid 999 nonroot \ diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh index 5aad624e..dc83be94 100755 --- a/scripts/entrypoint.sh +++ b/scripts/entrypoint.sh @@ -3,7 +3,7 @@ set -euo pipefail # Ensure at least one argument is provided if [ "$#" -eq 0 ]; then - echo "Usage: $0 {uniref|uniprot|test} [args...]" + echo "Usage: $0 {uniref|uniprot|test|xml_split} [args...]" exit 1 fi @@ -11,6 +11,10 @@ cmd="$1" shift case "$cmd" in + xml_split) + # Run the xml_file_splitter app + exec /usr/bin/tini -- xml_file_splitter "$@" + ;; uniref) # Run the uniref pipeline with any additional arguments via tini exec /usr/bin/tini -- uv run --no-sync uniref_pipeline "$@"