diff --git a/malariagen_data/__init__.py b/malariagen_data/__init__.py index 0934a0359..48e4eb2d9 100644 --- a/malariagen_data/__init__.py +++ b/malariagen_data/__init__.py @@ -7,6 +7,7 @@ from .anopheles import AnophelesDataResource, Region from .pf7 import Pf7 from .pf8 import Pf8 +from .pf9 import Pf9 from .pv4 import Pv4 from .util import SiteClass diff --git a/malariagen_data/pf9.py b/malariagen_data/pf9.py new file mode 100644 index 000000000..ad5d50144 --- /dev/null +++ b/malariagen_data/pf9.py @@ -0,0 +1,43 @@ +import os + +from .plasmodium import PlasmodiumDataResource + + +class Pf9(PlasmodiumDataResource): + """Provides access to data from the Pf9 release. + + Parameters + ---------- + url : str, optional + Base path to data. Default uses Google Cloud Storage "gs://pf9-release/", + or specify a local path on your file system if data have been downloaded. + data_config : str, optional + Path to config for structure of Pf9 data resource. Defaults to config included + with the malariagen_data package. + **kwargs + Passed through to fsspec when setting up file system access. + + Examples + -------- + Access data from Google Cloud Storage (default): + + >>> import malariagen_data + >>> pf9 = malariagen_data.Pf9() + + Access data downloaded to a local file system: + + >>> pf9 = malariagen_data.Pf9("/local/path/to/pf9-release/") + + """ + + def __init__( + self, + url=None, + data_config=None, + **kwargs, + ): + # setup filesystem + if not data_config: + working_dir = os.path.dirname(os.path.abspath(__file__)) + data_config = os.path.join(working_dir, "pf9_config.json") + super().__init__(data_config=data_config, url=url) diff --git a/malariagen_data/pf9_config.json b/malariagen_data/pf9_config.json new file mode 100644 index 000000000..fd615542d --- /dev/null +++ b/malariagen_data/pf9_config.json @@ -0,0 +1,118 @@ +{ + "default_url": "gs://pf9-release/" , + "metadata_path": "metadata/Pf9_samples.txt", + "reference_path": "reference/PlasmoDB-54-Pfalciparum3D7-Genome.zarr/", + "reference_contigs": [ + "Pf3D7_01_v3", + "Pf3D7_02_v3", + "Pf3D7_03_v3", + "Pf3D7_04_v3", + "Pf3D7_05_v3", + "Pf3D7_06_v3", + "Pf3D7_07_v3", + "Pf3D7_08_v3", + "Pf3D7_09_v3", + "Pf3D7_10_v3", + "Pf3D7_11_v3", + "Pf3D7_12_v3", + "Pf3D7_13_v3", + "Pf3D7_14_v3", + "Pf3D7_API_v3", + "Pf3D7_MIT_v3" + ], + "annotations_path": "annotations/PlasmoDB-55_Pfalciparum3D7.gff.gz", + "variant_calls_zarr_path": "zarr/", + "default_variant_variables": { + "FILTER_PASS": ["variants"], + "is_snp": ["variants"], + "numalt": ["variants"], + "CDS": ["variants"] + }, + "extended_calldata_variables": { + "DP": ["variants", "samples"], + "GQ": ["variants", "samples"], + "MIN_DP": ["variants", "samples"], + "PGT": ["variants", "samples"], + "PID": ["variants", "samples"], + "PS": ["variants", "samples"], + "RGQ": ["variants", "samples"], + "PL": ["variants", "samples", "genotypes"], + "SB": ["variants", "samples", "sb_statistics"] + }, + "extended_variant_fields": { + "AC": ["variants", "alt_alleles"], + "AF": ["variants", "alt_alleles"], + "AN": ["variants"], + "ANN_AA_length": ["variants", "alt_alleles"], + "ANN_AA_pos": ["variants", "alt_alleles"], + "ANN_Allele": ["variants", "alt_alleles"], + "ANN_Annotation": ["variants", "alt_alleles"], + "ANN_Annotation_Impact": ["variants", "alt_alleles"], + "ANN_CDS_length": ["variants", "alt_alleles"], + "ANN_CDS_pos": ["variants", "alt_alleles"], + "ANN_Distance": ["variants", "alt_alleles"], + "ANN_Feature_ID": ["variants", "alt_alleles"], + "ANN_Feature_Type": ["variants", "alt_alleles"], + "ANN_Gene_ID": ["variants", "alt_alleles"], + "ANN_Gene_Name": ["variants", "alt_alleles"], + "ANN_HGVS_c": ["variants", "alt_alleles"], + "ANN_HGVS_p": ["variants", "alt_alleles"], + "ANN_Rank": ["variants", "alt_alleles"], + "ANN_Transcript_BioType": ["variants", "alt_alleles"], + "ANN_cDNA_length": ["variants", "alt_alleles"], + "ANN_cDNA_pos": ["variants", "alt_alleles"], + "AS_BaseQRankSum": ["variants", "alt_alleles"], + "AS_FS": ["variants", "alt_alleles"], + "AS_InbreedingCoeff": ["variants", "alt_alleles"], + "AS_MQ": ["variants", "alt_alleles"], + "AS_MQRankSum": ["variants", "alt_alleles"], + "AS_QD": ["variants", "alt_alleles"], + "AS_ReadPosRankSum": ["variants", "alt_alleles"], + "AS_SOR": ["variants", "alt_alleles"], + "BaseQRankSum": ["variants"], + "DP": ["variants"], + "DS": ["variants"], + "END": ["variants"], + "ExcessHet": ["variants"], + "FILTER_Apicoplast": ["variants"], + "FILTER_Centromere": ["variants"], + "FILTER_InternalHypervariable": ["variants"], + "FILTER_LowQual": ["variants"], + "FILTER_Low_VQSLOD": ["variants"], + "FILTER_Mitochondrion": ["variants"], + "FILTER_SubtelomericHypervariable": ["variants"], + "FILTER_SubtelomericRepeat": ["variants"], + "FILTER_VQSRTrancheINDEL99.50to99.60": ["variants"], + "FILTER_VQSRTrancheINDEL99.60to99.80": ["variants"], + "FILTER_VQSRTrancheINDEL99.80to99.90": ["variants"], + "FILTER_VQSRTrancheINDEL99.90to99.95": ["variants"], + "FILTER_VQSRTrancheINDEL99.95to100.00+": ["variants"], + "FILTER_VQSRTrancheINDEL99.95to100.00": ["variants"], + "FILTER_VQSRTrancheSNP99.50to99.60": ["variants"], + "FILTER_VQSRTrancheSNP99.60to99.80": ["variants"], + "FILTER_VQSRTrancheSNP99.80to99.90": ["variants"], + "FILTER_VQSRTrancheSNP99.90to99.95": ["variants"], + "FILTER_VQSRTrancheSNP99.95to100.00+": ["variants"], + "FILTER_VQSRTrancheSNP99.95to100.00": ["variants"], + "FS": ["variants"], + "ID": ["variants"], + "InbreedingCoeff": ["variants"], + "LOF": ["variants"], + "MLEAC": ["variants", "alt_alleles"], + "MLEAF": ["variants", "alt_alleles"], + "MQ": ["variants"], + "MQRankSum": ["variants"], + "NEGATIVE_TRAIN_SITE": ["variants"], + "NMD": ["variants"], + "POSITIVE_TRAIN_SITE": ["variants"], + "QD": ["variants"], + "QUAL": ["variants"], + "RAW_MQandDP": ["variants", "ploidy"], + "ReadPosRankSum": ["variants"], + "RegionType": ["variants"], + "SOR": ["variants"], + "VQSLOD": ["variants"], + "culprit": ["variants"], + "set": ["variants"] + } + } diff --git a/malariagen_data/plasmodium.py b/malariagen_data/plasmodium.py index 30dfd675c..72b00a77f 100644 --- a/malariagen_data/plasmodium.py +++ b/malariagen_data/plasmodium.py @@ -62,7 +62,9 @@ def sample_metadata(self): if self._cache_sample_metadata is None: path = os.path.join(self._path, self.CONF["metadata_path"]) with self._fs.open(path) as f: - self._cache_sample_metadata = pd.read_csv(f, sep="\t", na_values="") + self._cache_sample_metadata = pd.read_csv( + f, sep="\t", na_values="", low_memory=False + ) return self._cache_sample_metadata def _open_variant_calls_zarr(self): diff --git a/tests/integration/test_pf9.py b/tests/integration/test_pf9.py new file mode 100644 index 000000000..888883f6b --- /dev/null +++ b/tests/integration/test_pf9.py @@ -0,0 +1,362 @@ +import dask.array as da +import numpy as np +import pandas as pd +import pytest +import xarray + +from malariagen_data.pf9 import Pf9 + + +def setup_pf9(url="simplecache::gs://pf9-release/", **storage_kwargs): + if url.startswith("simplecache::"): + storage_kwargs["simplecache"] = dict(cache_storage="gcs_cache") + return Pf9(url, **storage_kwargs) + + +@pytest.mark.parametrize( + "url", + [ + "gs://pf9-release/", + "gcs://pf9-release/", + "gs://pf9-release", + "gcs://pf9-release", + "simplecache::gs://pf9-release/", + "simplecache::gcs://pf9-release/", + ], +) +def test_sample_metadata(url): + pf9 = setup_pf9(url) + df_samples = pf9.sample_metadata() + + expected_cols = ( + "Sample", + "Study", + "Country", + "Admin level 1", + "Admin level 2", + "Curated location", + "Country latitude", + "Country longitude", + "Admin level 1 latitude", + "Admin level 1 longitude", + "Admin level 2 latitude", + "Admin level 2 longitude", + "Location latitude", + "Location longitude", + "Year", + "Month", + "Day", + "Source ENA accession", + "Alternative ID", + "All samples same case", + "Population", + "% callable", + "QC pass", + "Exclusion reason", + "Sample type", + "Sample was in Pf8", + "MalariaGEN ENA project accession", + "MalariaGEN ENA experiment accession", + "MalariaGEN ENA sample accession", + "MalariaGEN ENA run accession", + "MalariaGEN ENA CRAM URL", + "MalariaGEN ENA FASTQ URL", + ) + + assert tuple(df_samples.columns) == expected_cols + expected_len = 53973 + assert len(df_samples) == expected_len + + +@pytest.mark.parametrize("extended", [True, False]) +def test_variant_calls(extended): + pf9 = setup_pf9() + + ds = pf9.variant_calls(extended=extended) + assert isinstance(ds, xarray.Dataset) + + # check fields + if extended: + expected_data_vars = { + "variant_allele", + "variant_filter_pass", + "variant_is_snp", + "variant_numalt", + "variant_CDS", + "call_genotype", + "call_AD", + "call_DP", + "call_GQ", + "call_MIN_DP", + "call_PGT", + "call_PID", + "call_PS", + "call_RGQ", + "call_PL", + "call_SB", + "variant_AC", + "variant_AF", + "variant_AN", + "variant_ANN_AA_length", + "variant_ANN_AA_pos", + "variant_ANN_Allele", + "variant_ANN_Annotation", + "variant_ANN_Annotation_Impact", + "variant_ANN_CDS_length", + "variant_ANN_CDS_pos", + "variant_ANN_Distance", + "variant_ANN_Feature_ID", + "variant_ANN_Feature_Type", + "variant_ANN_Gene_ID", + "variant_ANN_Gene_Name", + "variant_ANN_HGVS_c", + "variant_ANN_HGVS_p", + "variant_ANN_Rank", + "variant_ANN_Transcript_BioType", + "variant_ANN_cDNA_length", + "variant_ANN_cDNA_pos", + "variant_AS_BaseQRankSum", + "variant_AS_FS", + "variant_AS_InbreedingCoeff", + "variant_AS_MQ", + "variant_AS_MQRankSum", + "variant_AS_QD", + "variant_AS_ReadPosRankSum", + "variant_AS_SOR", + "variant_BaseQRankSum", + "variant_DP", + "variant_DS", + "variant_END", + "variant_ExcessHet", + "variant_FILTER_Apicoplast", + "variant_FILTER_Centromere", + "variant_FILTER_InternalHypervariable", + "variant_FILTER_LowQual", + "variant_FILTER_Low_VQSLOD", + "variant_FILTER_Mitochondrion", + "variant_FILTER_SubtelomericHypervariable", + "variant_FILTER_SubtelomericRepeat", + "variant_FILTER_VQSRTrancheINDEL99.50to99.60", + "variant_FILTER_VQSRTrancheINDEL99.60to99.80", + "variant_FILTER_VQSRTrancheINDEL99.80to99.90", + "variant_FILTER_VQSRTrancheINDEL99.90to99.95", + "variant_FILTER_VQSRTrancheINDEL99.95to100.00+", + "variant_FILTER_VQSRTrancheINDEL99.95to100.00", + "variant_FILTER_VQSRTrancheSNP99.50to99.60", + "variant_FILTER_VQSRTrancheSNP99.60to99.80", + "variant_FILTER_VQSRTrancheSNP99.80to99.90", + "variant_FILTER_VQSRTrancheSNP99.90to99.95", + "variant_FILTER_VQSRTrancheSNP99.95to100.00+", + "variant_FILTER_VQSRTrancheSNP99.95to100.00", + "variant_FS", + "variant_ID", + "variant_InbreedingCoeff", + "variant_LOF", + "variant_MLEAC", + "variant_MLEAF", + "variant_MQ", + "variant_MQRankSum", + "variant_NEGATIVE_TRAIN_SITE", + "variant_NMD", + "variant_POSITIVE_TRAIN_SITE", + "variant_QD", + "variant_QUAL", + "variant_RAW_MQandDP", + "variant_ReadPosRankSum", + "variant_RegionType", + "variant_SOR", + "variant_VQSLOD", + "variant_culprit", + "variant_set", + } + dimensions = { + "alleles", + "ploidy", + "samples", + "variants", + "alt_alleles", + "genotypes", + "sb_statistics", + } + dim_variant_alt_allele_variable = [ + "variant_AC", + "variant_AF", + "variant_ANN_AA_length", + "variant_ANN_AA_pos", + "variant_ANN_Allele", + "variant_ANN_Annotation", + "variant_ANN_Annotation_Impact", + "variant_ANN_CDS_length", + "variant_ANN_CDS_pos", + "variant_ANN_Distance", + "variant_ANN_Feature_ID", + "variant_ANN_Feature_Type", + "variant_ANN_Gene_ID", + "variant_ANN_Gene_Name", + "variant_ANN_HGVS_c", + "variant_ANN_HGVS_p", + "variant_ANN_Rank", + "variant_ANN_Transcript_BioType", + "variant_ANN_cDNA_length", + "variant_ANN_cDNA_pos", + "variant_AS_BaseQRankSum", + "variant_AS_FS", + "variant_AS_InbreedingCoeff", + "variant_AS_MQ", + "variant_AS_MQRankSum", + "variant_AS_QD", + "variant_AS_ReadPosRankSum", + "variant_AS_SOR", + "variant_MLEAC", + "variant_MLEAF", + ] + else: + expected_data_vars = { + "variant_allele", + "variant_filter_pass", + "variant_is_snp", + "variant_numalt", + "variant_CDS", + "call_genotype", + "call_AD", + } + dimensions = {"alleles", "ploidy", "samples", "variants"} + assert set(ds.data_vars) == expected_data_vars + + expected_coords = { + "variant_position", + "variant_chrom", + "sample_id", + } + + assert set(ds.coords) == expected_coords + + # check dimensions + assert set(ds.dims) == dimensions + + # check dim lengths + df_samples = pf9.sample_metadata() + n_samples = len(df_samples) + n_variants = ds.sizes["variants"] + assert ds.sizes["samples"] == n_samples + assert ds.sizes["ploidy"] == 2 + assert ds.sizes["alleles"] == 7 + + # check shapes + for f in expected_coords | expected_data_vars: + x = ds[f] + assert isinstance(x, xarray.DataArray) + assert isinstance(x.data, da.Array) + + if f == "variant_allele": + assert x.ndim, f == 2 + assert x.shape == (n_variants, 7) + assert x.dims == ("variants", "alleles") + elif extended and f in dim_variant_alt_allele_variable: + assert x.ndim, f == 1 + assert x.shape == (n_variants, 6) + assert x.dims == ("variants", "alt_alleles") + elif f == "variant_RAW_MQandDP": + assert x.ndim, f == 1 + assert x.shape == (n_variants, 2) + assert x.dims == ("variants", "ploidy") + elif f.startswith("variant_"): + assert x.ndim, f == 1 + assert x.shape == (n_variants,) + assert x.dims == ("variants",) + elif f == "call_genotype": + assert x.ndim == 3 + assert x.dims == ("variants", "samples", "ploidy") + assert x.shape == (n_variants, n_samples, 2) + elif f == "call_AD": + assert x.ndim == 3 + assert x.dims == ("variants", "samples", "alleles") + assert x.shape == (n_variants, n_samples, 7) + elif f == "call_PL": + assert x.ndim == 3 + assert x.dims == ("variants", "samples", "genotypes") + assert x.shape == (n_variants, n_samples, 3) + elif f == "call_SB": + assert x.ndim == 3 + assert x.dims == ("variants", "samples", "sb_statistics") + assert x.shape == (n_variants, n_samples, 4) + elif f.startswith("call_"): + assert x.ndim, f == 2 + assert x.dims == ("variants", "samples") + assert x.shape == (n_variants, n_samples) + elif f.startswith("sample_"): + assert x.ndim == 1 + assert x.dims == ("samples",) + assert x.shape == (n_samples,) + # check variant_filter_pass + filter_pass = ds["variant_filter_pass"].values + n_pass = np.count_nonzero(filter_pass) + assert n_pass < n_variants + + # check can setup computations + d1 = ds["variant_position"] > 10_000 + assert isinstance(d1, xarray.DataArray) + d2 = ds["call_AD"].sum(axis=(1, 2)) + assert isinstance(d2, xarray.DataArray) + + +@pytest.mark.parametrize( + "region", + [ + "Pf3D7_01_v3", + "*", + ["Pf3D7_07_v3", "Pf3D7_02_v3", "Pf3D7_03_v3"], + ["Pf3D7_07_v3", "Pf3D7_02_v3:15-20", "Pf3D7_03_v3:40-50"], + "PF3D7_0920900.1", + ], +) +def test_genome_sequence(region): + pf9 = setup_pf9() + + seq = pf9.genome_sequence(region=region) + assert isinstance(seq, da.Array) + assert seq.dtype == "|S1" + + +@pytest.mark.parametrize( + "attributes", + [ + ("ID", "Parent", "Name"), + "*", + ["ID"], + ], +) +def test_genome_features(attributes): + pf9 = setup_pf9() + + default_columns = [ + "contig", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + ] + # check fields + df = pf9.genome_features(attributes=attributes) + assert isinstance(df, pd.DataFrame) + if attributes == "*": + additional_columns = [ + "ID", + "Name", + "Note", + "Parent", + "description", + "gene_id", + "protein_source_id", + ] + expected_columns = default_columns + additional_columns + else: + expected_columns = default_columns + list(attributes) + assert list(df.columns) == expected_columns + + # check dimensions + expected_len = 50070 + assert df.shape == (expected_len, len(expected_columns))