Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions malariagen_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .anopheles import AnophelesDataResource, Region
from .pf7 import Pf7
from .pf8 import Pf8
from .pf9 import Pf9
from .pv4 import Pv4
from .util import SiteClass

Expand Down
43 changes: 43 additions & 0 deletions malariagen_data/pf9.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import os

from .plasmodium import PlasmodiumDataResource


class Pf9(PlasmodiumDataResource):
    """Provides access to data from the Pf9 release.

    Parameters
    ----------
    url : str, optional
        Base path to data. Default uses Google Cloud Storage "gs://pf9-release/",
        or specify a local path on your file system if data have been downloaded.
    data_config : str, optional
        Path to config for structure of Pf9 data resource. Defaults to the
        "pf9_config.json" file located next to this module in the
        malariagen_data package.
    **kwargs
        Forwarded unchanged to the ``PlasmodiumDataResource`` initialiser;
        intended for fsspec when setting up file system access.

    Examples
    --------
    Access data from Google Cloud Storage (default):

    >>> import malariagen_data
    >>> pf9 = malariagen_data.Pf9()

    Access data downloaded to a local file system:

    >>> pf9 = malariagen_data.Pf9("/local/path/to/pf9-release/")

    """

    def __init__(
        self,
        url=None,
        data_config=None,
        **kwargs,
    ):
        # Fall back to the config bundled alongside this module when the
        # caller does not supply one (any falsy value triggers the fallback).
        if not data_config:
            package_dir = os.path.dirname(os.path.abspath(__file__))
            data_config = os.path.join(package_dir, "pf9_config.json")

        # Forward **kwargs so the documented pass-through actually happens;
        # previously they were accepted here and then silently discarded.
        # NOTE(review): assumes PlasmodiumDataResource.__init__ accepts
        # **kwargs -- confirm against plasmodium.py.
        super().__init__(data_config=data_config, url=url, **kwargs)
118 changes: 118 additions & 0 deletions malariagen_data/pf9_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
{
"default_url": "gs://pf9-release/",
"metadata_path": "metadata/Pf9_samples.txt",
"reference_path": "reference/PlasmoDB-54-Pfalciparum3D7-Genome.zarr/",
"reference_contigs": [
"Pf3D7_01_v3",
"Pf3D7_02_v3",
"Pf3D7_03_v3",
"Pf3D7_04_v3",
"Pf3D7_05_v3",
"Pf3D7_06_v3",
"Pf3D7_07_v3",
"Pf3D7_08_v3",
"Pf3D7_09_v3",
"Pf3D7_10_v3",
"Pf3D7_11_v3",
"Pf3D7_12_v3",
"Pf3D7_13_v3",
"Pf3D7_14_v3",
"Pf3D7_API_v3",
"Pf3D7_MIT_v3"
],
"annotations_path": "annotations/PlasmoDB-55_Pfalciparum3D7.gff.gz",
"variant_calls_zarr_path": "zarr/",
"default_variant_variables": {
"FILTER_PASS": ["variants"],
"is_snp": ["variants"],
"numalt": ["variants"],
"CDS": ["variants"]
},
"extended_calldata_variables": {
"DP": ["variants", "samples"],
"GQ": ["variants", "samples"],
"MIN_DP": ["variants", "samples"],
"PGT": ["variants", "samples"],
"PID": ["variants", "samples"],
"PS": ["variants", "samples"],
"RGQ": ["variants", "samples"],
"PL": ["variants", "samples", "genotypes"],
"SB": ["variants", "samples", "sb_statistics"]
},
"extended_variant_fields": {
"AC": ["variants", "alt_alleles"],
"AF": ["variants", "alt_alleles"],
"AN": ["variants"],
"ANN_AA_length": ["variants", "alt_alleles"],
"ANN_AA_pos": ["variants", "alt_alleles"],
"ANN_Allele": ["variants", "alt_alleles"],
"ANN_Annotation": ["variants", "alt_alleles"],
"ANN_Annotation_Impact": ["variants", "alt_alleles"],
"ANN_CDS_length": ["variants", "alt_alleles"],
"ANN_CDS_pos": ["variants", "alt_alleles"],
"ANN_Distance": ["variants", "alt_alleles"],
"ANN_Feature_ID": ["variants", "alt_alleles"],
"ANN_Feature_Type": ["variants", "alt_alleles"],
"ANN_Gene_ID": ["variants", "alt_alleles"],
"ANN_Gene_Name": ["variants", "alt_alleles"],
"ANN_HGVS_c": ["variants", "alt_alleles"],
"ANN_HGVS_p": ["variants", "alt_alleles"],
"ANN_Rank": ["variants", "alt_alleles"],
"ANN_Transcript_BioType": ["variants", "alt_alleles"],
"ANN_cDNA_length": ["variants", "alt_alleles"],
"ANN_cDNA_pos": ["variants", "alt_alleles"],
"AS_BaseQRankSum": ["variants", "alt_alleles"],
"AS_FS": ["variants", "alt_alleles"],
"AS_InbreedingCoeff": ["variants", "alt_alleles"],
"AS_MQ": ["variants", "alt_alleles"],
"AS_MQRankSum": ["variants", "alt_alleles"],
"AS_QD": ["variants", "alt_alleles"],
"AS_ReadPosRankSum": ["variants", "alt_alleles"],
"AS_SOR": ["variants", "alt_alleles"],
"BaseQRankSum": ["variants"],
"DP": ["variants"],
"DS": ["variants"],
"END": ["variants"],
"ExcessHet": ["variants"],
"FILTER_Apicoplast": ["variants"],
"FILTER_Centromere": ["variants"],
"FILTER_InternalHypervariable": ["variants"],
"FILTER_LowQual": ["variants"],
"FILTER_Low_VQSLOD": ["variants"],
"FILTER_Mitochondrion": ["variants"],
"FILTER_SubtelomericHypervariable": ["variants"],
"FILTER_SubtelomericRepeat": ["variants"],
"FILTER_VQSRTrancheINDEL99.50to99.60": ["variants"],
"FILTER_VQSRTrancheINDEL99.60to99.80": ["variants"],
"FILTER_VQSRTrancheINDEL99.80to99.90": ["variants"],
"FILTER_VQSRTrancheINDEL99.90to99.95": ["variants"],
"FILTER_VQSRTrancheINDEL99.95to100.00+": ["variants"],
"FILTER_VQSRTrancheINDEL99.95to100.00": ["variants"],
"FILTER_VQSRTrancheSNP99.50to99.60": ["variants"],
"FILTER_VQSRTrancheSNP99.60to99.80": ["variants"],
"FILTER_VQSRTrancheSNP99.80to99.90": ["variants"],
"FILTER_VQSRTrancheSNP99.90to99.95": ["variants"],
"FILTER_VQSRTrancheSNP99.95to100.00+": ["variants"],
"FILTER_VQSRTrancheSNP99.95to100.00": ["variants"],
"FS": ["variants"],
"ID": ["variants"],
"InbreedingCoeff": ["variants"],
"LOF": ["variants"],
"MLEAC": ["variants", "alt_alleles"],
"MLEAF": ["variants", "alt_alleles"],
"MQ": ["variants"],
"MQRankSum": ["variants"],
"NEGATIVE_TRAIN_SITE": ["variants"],
"NMD": ["variants"],
"POSITIVE_TRAIN_SITE": ["variants"],
"QD": ["variants"],
"QUAL": ["variants"],
"RAW_MQandDP": ["variants", "ploidy"],
"ReadPosRankSum": ["variants"],
"RegionType": ["variants"],
"SOR": ["variants"],
"VQSLOD": ["variants"],
"culprit": ["variants"],
"set": ["variants"]
}
}
4 changes: 3 additions & 1 deletion malariagen_data/plasmodium.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ def sample_metadata(self):
if self._cache_sample_metadata is None:
path = os.path.join(self._path, self.CONF["metadata_path"])
with self._fs.open(path) as f:
self._cache_sample_metadata = pd.read_csv(f, sep="\t", na_values="")
self._cache_sample_metadata = pd.read_csv(
f, sep="\t", na_values="", low_memory=False
)
return self._cache_sample_metadata

def _open_variant_calls_zarr(self):
Expand Down
Loading
Loading