Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions docs/source/As1.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
As1
=====

This page provides a curated list of functions and properties available in the ``malariagen_data`` API
for data on *Anopheles stephensi* species mosquitoes.

To set up the API, use the following code::

    import malariagen_data
    as1 = malariagen_data.As1()

All the functions below can then be accessed as methods on the ``as1`` object. E.g., to call the
``sample_metadata()`` function, do::

    df_samples = as1.sample_metadata()

For more information about the data and terms of use, please see the
`MalariaGEN website <https://www.malariagen.net/data>`_ or contact support@malariagen.net.

.. currentmodule:: malariagen_data.as1.As1

Basic data access
-----------------
.. autosummary::
    :toctree: generated/

    releases
    sample_sets
    lookup_release
    lookup_study

Reference genome data access
----------------------------
.. autosummary::
    :toctree: generated/

    contigs
    genome_sequence
    genome_features
    plot_transcript
    plot_genes

Sample metadata access
----------------------
.. autosummary::
    :toctree: generated/

    sample_metadata
    add_extra_metadata
    clear_extra_metadata
    lookup_sample
    count_samples
    plot_samples_bar
    plot_samples_interactive_map
    plot_sample_location_mapbox
    plot_sample_location_geo
    wgs_data_catalog
    cohorts

SNP data access
---------------
.. autosummary::
    :toctree: generated/

    site_mask_ids
    snp_calls
    snp_allele_counts
    plot_snps
    site_annotations
    is_accessible
    biallelic_snp_calls
    biallelic_diplotypes
    biallelic_snps_to_plink

Integrative genomics viewer (IGV)
---------------------------------
.. autosummary::
    :toctree: generated/

    igv
    view_alignments

SNP frequency analysis
----------------------
.. autosummary::
    :toctree: generated/

    snp_allele_frequencies
    snp_allele_frequencies_advanced
    aa_allele_frequencies
    aa_allele_frequencies_advanced
    plot_frequencies_heatmap
    plot_frequencies_time_series
    plot_frequencies_interactive_map

Principal components analysis (PCA)
-----------------------------------
.. autosummary::
    :toctree: generated/

    pca
    plot_pca_variance
    plot_pca_coords
    plot_pca_coords_3d

Genetic distance and neighbour-joining trees (NJT)
--------------------------------------------------
.. autosummary::
    :toctree: generated/

    plot_njt
    njt
    biallelic_diplotype_pairwise_distances

Heterozygosity analysis
-----------------------
.. autosummary::
    :toctree: generated/

    plot_heterozygosity
    roh_hmm
    plot_roh

Diversity analysis
------------------
.. autosummary::
    :toctree: generated/

    cohort_diversity_stats
    diversity_stats
    plot_diversity_stats

Diplotype clustering
--------------------
.. autosummary::
    :toctree: generated/

    plot_diplotype_clustering
    plot_diplotype_clustering_advanced

Fst analysis
------------
.. autosummary::
    :toctree: generated/

    average_fst
    pairwise_average_fst
    plot_pairwise_average_fst
    fst_gwss
    plot_fst_gwss
1 change: 1 addition & 0 deletions malariagen_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .af1 import Af1
from .ag3 import Ag3
from .amin1 import Amin1
from .as1 import As1
from .anopheles import AnophelesDataResource, Region
from .pf7 import Pf7
from .pf8 import Pf8
Expand Down
237 changes: 237 additions & 0 deletions malariagen_data/as1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
import sys

import plotly.express as px # type: ignore

import malariagen_data
from .anopheles import AnophelesDataResource

# Release identification: the major version number, the matching release
# path, and the name of the release configuration file. All three are handed
# to the base class by As1.__init__.
MAJOR_VERSION_NUMBER = 1
MAJOR_VERSION_PATH = "v1.0"
CONFIG_PATH = "v1.0-config.json"

# Google Cloud Storage locations of the release data. The default and public
# URLs carry a trailing slash; the per-region entries do not.
GCS_DEFAULT_URL = "gs://vo_aste_release_master_us_central1/"
GCS_DEFAULT_PUBLIC_URL = "gs://vo_aste_release_master_us_central1/"
GCS_REGION_URLS = {"us-central1": "gs://vo_aste_release_master_us_central1"}

# Plot colours per taxon, drawn from plotly's qualitative "Plotly" palette.
TAXON_PALETTE = px.colors.qualitative.Plotly
TAXON_COLORS = {"stephensi": TAXON_PALETTE[0]}

# Names under which longer-running analyses are stored in the results cache.
XPEHH_GWSS_CACHE_NAME = "as1_xpehh_gwss_v1"
IHS_GWSS_CACHE_NAME = "as1_ihs_gwss_v1"
ROH_HMM_CACHE_NAME = "as1_roh_hmm_v1"


class As1(AnophelesDataResource):
    """Provides access to data from As1.0 releases.

    Parameters
    ----------
    url : str, optional
        Base path to data. Defaults to use Google Cloud Storage, or can
        be a local path on your file system if data have been downloaded.
    public_url : str, optional
        Public base path to data, forwarded to the base class. Defaults to
        the public Google Cloud Storage location for this resource.
    bokeh_output_notebook : bool, optional
        If True (default), configure bokeh to output plots to the notebook.
    results_cache : str, optional
        Path to directory on local file system to save results.
    log : str or stream, optional
        File path or stream output for logging messages.
    debug : bool, optional
        Set to True to enable debug level logging.
    show_progress : bool, optional
        If True, show a progress bar during longer-running computations.
        The default can be overridden using an environmental variable
        named MGEN_SHOW_PROGRESS.
    check_location : bool, optional
        If True, use ipinfo to check the location of the client system.
    cohorts_analysis : str, optional
        Cohorts analysis version.
    site_filters_analysis : str, optional
        Site filters analysis version.
    discordant_read_calls_analysis : str, optional
        Discordant read calls analysis version.
    pre : bool, optional
        Forwarded unchanged to ``AnophelesDataResource``; see the base
        class for its meaning. Defaults to False.
    tqdm_class : optional
        Forwarded unchanged to ``AnophelesDataResource``; see the base
        class for its meaning.
    unrestricted_use_only : bool, optional
        If True, data are filtered to unrestricted use only.
    surveillance_use_only : bool, optional
        If True, data are filtered to surveillance use only.
    **storage_options
        Passed through to fsspec when setting up file system access.

    Examples
    --------
    Access data from Google Cloud Storage (default):

        >>> import malariagen_data
        >>> as1 = malariagen_data.As1()

    Access data downloaded to a local file system:

        >>> as1 = malariagen_data.As1("/local/path/to/vo_aste_release/")

    Access data from Google Cloud Storage, with caching on the local file system
    in a directory named "gcs_cache":

        >>> as1 = malariagen_data.As1(
        ...     "simplecache::gs://vo_aste_release_master_us_central1",
        ...     simplecache=dict(cache_storage="gcs_cache"),
        ... )

    Set up caching of some longer-running computations on the local file system,
    in a directory named "results_cache":

        >>> as1 = malariagen_data.As1(results_cache="results_cache")

    """

    # Cache names used for this resource's longer-running analyses.
    _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
    _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
    _roh_hmm_cache_name = ROH_HMM_CACHE_NAME

    def __init__(
        self,
        url=None,
        public_url=GCS_DEFAULT_PUBLIC_URL,
        bokeh_output_notebook=True,
        results_cache=None,
        log=sys.stdout,
        debug=False,
        show_progress=None,
        check_location=True,
        cohorts_analysis=None,
        site_filters_analysis=None,
        discordant_read_calls_analysis=None,
        pre=False,
        tqdm_class=None,
        unrestricted_use_only=False,
        surveillance_use_only=False,
        **storage_options,
    ):
        # Everything is delegated to the generic Anopheles resource; this
        # class only pins the As1-specific configuration.
        super().__init__(
            url=url,
            public_url=public_url,
            config_path=CONFIG_PATH,
            cohorts_analysis=cohorts_analysis,
            # No ancestry-informative-marker (AIM) settings are supplied
            # for this resource.
            aim_analysis=None,
            aim_metadata_dtype=None,
            aim_ids=None,
            aim_palettes=None,
            site_filters_analysis=site_filters_analysis,
            discordant_read_calls_analysis=discordant_read_calls_analysis,
            default_site_mask="stephensi",
            default_phasing_analysis="stephensi",
            default_coverage_calls_analysis="stephensi",
            bokeh_output_notebook=bokeh_output_notebook,
            results_cache=results_cache,
            log=log,
            debug=debug,
            show_progress=show_progress,
            check_location=check_location,
            pre=pre,
            gcs_default_url=GCS_DEFAULT_URL,
            gcs_region_urls=GCS_REGION_URLS,
            major_version_number=MAJOR_VERSION_NUMBER,
            major_version_path=MAJOR_VERSION_PATH,
            gff_gene_type="protein_coding_gene",
            gff_gene_name_attribute="Note",
            gff_default_attributes=("ID", "Parent", "Note", "description"),
            storage_options=storage_options,
            tqdm_class=tqdm_class,
            taxon_colors=TAXON_COLORS,
            # No virtual contigs or inversion tags are configured here.
            virtual_contigs=None,
            inversion_tag_path=None,
            unrestricted_use_only=unrestricted_use_only,
            surveillance_use_only=surveillance_use_only,
        )

    def __repr__(self):
        """Return a plain-text summary of this client's configuration."""
        text = (
            f"<MalariaGEN As1 API client>\n"
            f"Storage URL : {self._url}\n"
            f"Data releases available : {', '.join(self._available_releases)}\n"
            f"Results cache : {self._results_cache}\n"
            f"Cohorts analysis : {self._cohorts_analysis}\n"
            f"Site filters analysis : {self._site_filters_analysis}\n"
            f"Software version : malariagen_data {malariagen_data.__version__}\n"
            f"Client location : {self.client_location}\n"
            f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
            f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
            f"Relevant data releases : {', '.join(self.releases)}\n"
            f"---\n"
            f"Please note that data are subject to terms of use,\n"
            f"for more information see https://www.malariagen.net/data\n"
            f"or contact support@malariagen.net. For API documentation see \n"
            f"https://malariagen.github.io/malariagen-data-python/v{malariagen_data.__version__}/As1.html"
        )
        return text

    def _repr_html_(self):
        """Return an HTML table summarising this client's configuration.

        Reports the same fields as ``__repr__``, as an HTML table.
        """
        html = f"""
            <table class="malariagen-as1">
                <thead>
                    <tr>
                        <th style="text-align: left" colspan="2">MalariaGEN As1 API client</th>
                    </tr>
                    <tr><td colspan="2" style="text-align: left">
                        Please note that data are subject to terms of use,
                        for more information see <a href="https://www.malariagen.net/data">
                        the MalariaGEN website</a> or contact support@malariagen.net.
                        See also the <a href="https://malariagen.github.io/malariagen-data-python/v{malariagen_data.__version__}/As1.html">As1 API docs</a>.
                    </td></tr>
                </thead>
                <tbody>
                    <tr>
                        <th style="text-align: left">
                            Storage URL
                        </th>
                        <td>{self._url}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Data releases available
                        </th>
                        <td>{", ".join(self._available_releases)}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Results cache
                        </th>
                        <td>{self._results_cache}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Cohorts analysis
                        </th>
                        <td>{self._cohorts_analysis}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Site filters analysis
                        </th>
                        <td>{self._site_filters_analysis}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Software version
                        </th>
                        <td>malariagen_data {malariagen_data.__version__}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Client location
                        </th>
                        <td>{self.client_location}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Data filtered for unrestricted use only
                        </th>
                        <td>{self._unrestricted_use_only}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Data filtered for surveillance use only
                        </th>
                        <td>{self._surveillance_use_only}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Relevant data releases
                        </th>
                        <td>{", ".join(self.releases)}</td>
                    </tr>
                </tbody>
            </table>
        """
        return html
Loading
Loading