diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/codonfm_ckpt_te_conversion.py b/bionemo-recipes/recipes/codonfm_ptl_te/codonfm_ckpt_te_conversion.py index dc811a87fc..6bf5c501ba 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/codonfm_ckpt_te_conversion.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/codonfm_ckpt_te_conversion.py @@ -26,14 +26,39 @@ import argparse import logging +import os import torch +from safetensors.torch import save_file as safetensors_save_file from src.utils.load_checkpoint import load_checkpoint logger = logging.getLogger(__name__) +ALLOWED_HYPERPARAMETER_KEYS = ( + "vocab_size", + "hidden_size", + "num_hidden_layers", + "num_attention_heads", + "intermediate_size", + "hidden_act", + "hidden_dropout_prob", + "attention_probs_dropout_prob", + "initializer_range", + "layer_norm_eps", + "pad_token_id", + "position_embedding_type", + "classifier_dropout", + "rotary_theta", + "ignore_index", + "loss_type", + "lora", + "lora_alpha", + "lora_r", + "lora_dropout", +) + # PYTorch -> TE keymap PYTORCH_TO_TE_KEYMAP = { "model.layers.*.pre_attn_layer_norm.weight": "model.layers.*.self_attention.layernorm_qkv.layer_norm_weight", @@ -300,6 +325,11 @@ def convert_state_dict(src: dict, keymap: dict): return dst_state_dict +def filter_hyper_parameters(hyper_parameters: dict) -> dict: + """Keep only conversion-compatible hyperparameter keys.""" + return {key: value for key, value in hyper_parameters.items() if key in ALLOWED_HYPERPARAMETER_KEYS} + + def main(): """Main function.""" logging.basicConfig(level=logging.INFO) @@ -325,6 +355,7 @@ def main(): # Load source checkpoint (automatically detects format) logger.info(f"Loading checkpoint from {args.src}") src_checkpoint = load_checkpoint(args.src, map_location="cpu") + src_checkpoint["hyper_parameters"] = filter_hyper_parameters(src_checkpoint["hyper_parameters"]) # Perform conversion based on direction if args.direction == "pytorch2te": @@ -341,11 +372,19 @@ def main(): dst_state_dict = 
split_qkv(converted_state_dict, src_checkpoint["hyper_parameters"]) # Prepare final checkpoint - dst_checkpoint = {"state_dict": dst_state_dict, "hyper_parameters": src_checkpoint["hyper_parameters"]} + dst_checkpoint = { + "state_dict": dst_state_dict, + "hyper_parameters": src_checkpoint["hyper_parameters"], + } # Save the converted checkpoint in pickled format torch.save(dst_checkpoint, args.dst) - logger.info(f"Successfully converted checkpoint from {args.src} to {args.dst}") + logger.info(f"Successfully converted checkpoint saved to {args.dst}") + + # Save the state_dict in safetensors format alongside the .ckpt file + safetensors_path = os.path.splitext(args.dst)[0] + ".safetensors" + safetensors_save_file(dst_state_dict, safetensors_path) + logger.info(f"Successfully saved safetensors checkpoint to {safetensors_path}") if __name__ == "__main__": diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py index 525a426c31..1354b2aa8f 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py @@ -15,6 +15,7 @@ # %% +import argparse import json import sys from pathlib import Path @@ -23,41 +24,52 @@ from tqdm import tqdm -sys.path.append("/workspace/codon_fm") +sys.path.append("/workspace/codonfm") from src.tokenizer import Tokenizer -data_path = Path("/data/ncbi/processed_unfiltered") -tax_ids_to_remove = json.load(open("/data/ncbi/taxids_to_remove.json")) -metadata = json.load(open(data_path / "metadata.json")) -tokenizer = Tokenizer() - - -groups = set([x["file_name"][:-4] for x in metadata["file_metadata"]]) # noqa: C403 -counts = {g: np.zeros(tokenizer.vocab_size) for g in groups} -for fm, cm in tqdm(zip(metadata["file_metadata"], metadata["chunks"]), total=len(metadata["file_metadata"])): - group = fm["file_name"][:-4] - if group in 
tax_ids_to_remove: - curr_taxids_to_remove = set(tax_ids_to_remove[group]) - else: - curr_taxids_to_remove = set() - mmap = np.memmap( - data_path / cm["sequences"]["path"], - dtype=cm["sequences"]["dtype"], - mode="r", - shape=tuple(cm["sequences"]["shape"]), - ) - idx_mmap = np.memmap( - data_path / cm["index"]["path"], dtype=cm["index"]["dtype"], mode="r", shape=tuple(cm["index"]["shape"]) - ) - for start, end, taxid in idx_mmap: - if taxid in curr_taxids_to_remove: - continue - seq = mmap[start:end] - idx, count = np.unique(seq, return_counts=True) - counts[group][idx] += count +def main(pretraining_processed_data_dir: Path, data_dir: Path): + """Check codon frequency.""" + tax_ids_to_remove = json.load(open(data_dir / Path("taxids_to_remove.json"))) + metadata = json.load(open(pretraining_processed_data_dir / "metadata.json")) + tokenizer = Tokenizer() -# %% -for g in counts: - counts[g] = counts[g].tolist() -json.dump(counts, open("/data/ncbi/codon_counts_nopathogen.json", "w")) + groups = set([x["file_name"][:-4] for x in metadata["file_metadata"]]) # noqa: C403 + counts = {g: np.zeros(tokenizer.vocab_size) for g in groups} + for fm, cm in tqdm(zip(metadata["file_metadata"], metadata["chunks"]), total=len(metadata["file_metadata"])): + group = fm["file_name"][:-4] + if group in tax_ids_to_remove: + curr_taxids_to_remove = set(tax_ids_to_remove[group]) + else: + curr_taxids_to_remove = set() + mmap = np.memmap( + pretraining_processed_data_dir / cm["sequences"]["path"], + dtype=cm["sequences"]["dtype"], + mode="r", + shape=tuple(cm["sequences"]["shape"]), + ) + idx_mmap = np.memmap( + pretraining_processed_data_dir / cm["index"]["path"], + dtype=cm["index"]["dtype"], + mode="r", + shape=tuple(cm["index"]["shape"]), + ) + for start, end, taxid in idx_mmap: + if taxid in curr_taxids_to_remove: + continue + seq = mmap[start:end] + idx, count = np.unique(seq, return_counts=True) + counts[group][idx] += count + + # %% + for g in counts: + counts[g] = 
counts[g].tolist() + json.dump(counts, open(data_dir / "codon_counts_nopathogen.json", "w")) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Check codon frequency") + parser.add_argument("--pretraining_processed_data_dir", type=str, required=True) + parser.add_argument("--data_dir", type=str, required=True) + args = parser.parse_args() + main(Path(args.pretraining_processed_data_dir), Path(args.data_dir)) diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py index e7f4b8b1bb..9a34b66a80 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py @@ -17,6 +17,7 @@ import argparse import json import os +import sys from multiprocessing import Pool, cpu_count import numpy as np @@ -24,6 +25,8 @@ import pyarrow.parquet as pq from tqdm import tqdm + +sys.path.append("/workspace/codonfm") from src.tokenizer import Tokenizer diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb index f01734c922..65503c1939 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb +++ b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb @@ -7,16 +7,41 @@ "source": [ "# Mutation Datasets Preprocessing\n", "\n", - "This notebook preprocesses four mutation variant datasets for downstream analysis. 
For each dataset, we extract coding sequence (CDS) context from reference genome annotations, annotate variants with transcript information, codon changes, and amino acid translations, and save the processed data in a standardized format.\n", + "This notebook preprocesses six mutation variant datasets for downstream analysis. For each dataset, we extract coding sequence (CDS) context from reference genome annotations, annotate variants with transcript information, codon changes, and amino acid translations, and save the processed data in a standardized format.\n", + "\n", + "---\n", + "## 📋 Table of Contents\n", + "\n", + "1. **[DDD / ASD Dataset](#1-ddd-asd-dataset)** - Developmental disorder and autism spectrum disorder variants (hg19)\n", + "2. **[ClinVar AlphaMissense](#2-clinvar-alphamissense-dataset)** - ClinVar missense variants with AlphaMissense scores (hg38)\n", + "3. **[Cancer Hotspot](#3-cancer-hotspot)** - Cancer hotspot mutations with AlphaMissense annotations (hg38)\n", + "4. **[ClinVar Synonymous](#4-clinvar-synonymous)** - ClinVar synonymous variants with conservation features (hg38)\n", + "5. **[CHD Missense Dataset](#5-chd-missense-dataset)** - Congenital heart disease rare mutations with DDD/ASD controls (hg19)\n", + "6. **[COSMIC Synonymous](#6-cosmic-synonymous-analyses-data)** - COSMIC synonymous analyses data (hg38)\n", + " - **[COSMIC](#cosmic)** - COSMIC mutant census variants\n", + " - **[gnomAD Common Variants](#gnomad-common-variants)** - gnomAD common variants for comparison\n", + "\n", + "---\n", "\n", "## Required Pre-processing Steps\n", "\n", - "Before generation the mutation sequences for zero-shot benchmarks, ensure that the following files are downloaded/processed and saved at `/data/ncbi`\n", + "Before generating the mutation sequences for zero-shot benchmarks, ensure that the following files are downloaded/processed.\n", + "\n", + "### 1. Open-source Data Download\n", "\n", - "#### 1. 
Open-source Data Download\n", + "There are two ways to obtain the data used by this notebook:\n", "\n", + "a. **Manual:**\n", + " - Use the links in the tables below to download each file individually.\n", + " - Use the [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) to export the required tables as TSV.\n", + " - Save them into the corresponding subdirectories under `DATA_DIR` (matching the filenames in the directory structure section below).\n", "\n", - "##### Reference Files\n", + "b. **Automatic (recommended):**\n", + " - Create a UCSC account: [hgLogin](https://genome.ucsc.edu/cgi-bin/hgLogin)\n", + " - Generate an API key: [hgHubConnect](https://genome.ucsc.edu/cgi-bin/hgHubConnect) → click **\"generate key\"**\n", + " - Paste the key into `UCSC_API_KEY` in the download cell below, then run the cell.\n", + "\n", + "#### 1.a. Manual Download - Reference Files\n", "| File | Origin |\n", "|----------------|-------- |\n", "| `hg19.fa` | [Download](https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz) |\n", @@ -27,6 +52,7 @@ "| Annotation File | Origin | Table |\n", "|----------------|--------|-------|\n", "| `gencode.v47lift37.basic.annotation.gtf` | [GENCODE Release 47lift37](https://www.gencodegenes.org/human/release_47lift37.html) | - |\n", + "| `gencode.v47.basic.annotation.gtf.gz` | [GENCODE Release 47](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/gencode.v47.basic.annotation.gtf.gz) | - |\n", "| `ucsc_gencodev32_hg38.tsv` | [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) | `wgEncodeGencodeCompV32` |\n", "| `ucsc_refseq_hg38.tsv` | [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) | `ncbiRefSeq` |\n", "| `ucsc_refseq_hist_hg38.tsv` | [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) | `ncbiRefSeqHistorical` |\n", @@ -40,89 +66,434 @@ "| `ddd_other` | [Zhou et al. 
2022](https://www.nature.com/articles/s41588-022-01148-2) | [Download](https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01148-2/MediaObjects/41588_2022_1148_MOESM7_ESM.xlsx) | Supplementary Table 7 |\n", "| `AlphaMissense ClinVar` | [Cheng et al. 2023](https://www.science.org/doi/10.1126/science.adg7492) | [Download](https://www.science.org/doi/suppl/10.1126/science.adg7492/suppl_file/science.adg7492_data_s1_to_s9.zip) | Data S5 |\n", "| `AlphaMissense CancerHotspot` | [Cheng et al. 2023](https://www.science.org/doi/10.1126/science.adg7492) | [Download](https://www.science.org/doi/suppl/10.1126/science.adg7492/suppl_file/science.adg7492_data_s1_to_s9.zip) | Data S6 |\n", + "| `chd_rare_mutation.csv` | [Jin et al. 2017](https://pmc.ncbi.nlm.nih.gov/articles/PMC5675000/) | [Download](https://pmc.ncbi.nlm.nih.gov/articles/instance/5675000/bin/NIHMS906719-supplement-supp_datasets.xlsx) | Table S9 |\n", + "| `chd_mutation_ctrl.csv` | [Jin et al. 2017](https://pmc.ncbi.nlm.nih.gov/articles/PMC5675000/) | [Download](https://pmc.ncbi.nlm.nih.gov/articles/instance/5675000/bin/NIHMS906719-supplement-supp_datasets.xlsx) | Table S10 |\n", + "| `Cosmic_Sample_v102_GRCh38.tsv.gz` | [COSMIC](https://cancer.sanger.ac.uk/cosmic) | [Download](https://cancer.sanger.ac.uk/cosmic/download) | Requires registration |\n", + "| `Cosmic_MutantCensus_v102_GRCh38.tsv.gz` | [COSMIC](https://cancer.sanger.ac.uk/cosmic) | [Download](https://cancer.sanger.ac.uk/cosmic/download) | Requires registration |\n", + "| `gnomad.exomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [S3 VCFs](https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/) | Per-chromosome VCFs converted to TSV via `bcftools` (see gnomAD section below) |\n", + "| `gnomad.genomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [S3 VCFs](https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/) | Per-chromosome VCFs converted 
to TSV via `bcftools` (see gnomAD section below) |\n", + "\n", + "\n", "##### ClinVar Synonymous Matching Features\n", "\n", "| File | Source | URL |\n", "|------|--------|-----|\n", "| `hg38.phyloP447way.bw` | UCSC Genome Browser | [Download](https://hgdownload.soe.ucsc.edu/goldenPath/hg38/phyloP447way/hg38.phyloP447way.bw) |\n", + "| `hg19.100way.phyloP100way.bw` | UCSC Genome Browser | [Download](https://hgdownload.soe.ucsc.edu/goldenPath/hg19/phyloP100way/hg19.100way.phyloP100way.bw) |\n", "| `ucsc_pliByGene_hg38.tsv` | UCSC Genome Browser → Table Browser | [Download](https://genome.ucsc.edu/cgi-bin/hgTables) (table: `pliByGene`) |\n", - "| `variant_summary.txt.gz` | NCBI ClinVar (FTP) | [Download](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz) |\n", + "| `gnomad.v2.1.1.lof_metrics.by_transcript.txt` | gnomAD | [Download](https://gnomad.broadinstitute.org/downloads) |\n", + "| `variant_summary.txt.gz` | NCBI ClinVar (FTP) | [Download](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz) |\n" + ] + }, + { + "cell_type": "markdown", + "id": "74ade3be", + "metadata": {}, + "source": [ + "### 1.b. Automatic Download\n", "\n", - "#### 2. Data Scripts\n", + "If you choose **Automatic**:\n", + " 1. Set the `DATA_DIR` where the files should be saved.\n", + " 2. Set the `UCSC_API_KEY` to download the tables from the UCSC table browser.\n", + " 3. Run the next cell to download the required datasets into `DATA_DIR`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "713c7737", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reference genomes\n", + " [skip] reference/hg19/hg19.fa\n", + " [skip] reference/hg38/hg38.fa\n", + "GENCODE annotation\n", + " [skip] reference/gencode.v47lift37.basic.annotation.gtf.gz\n", + " [skip] reference/gencode.v47.basic.annotation.gtf.gz\n", + "DDD / ASD variant files\n", + " [skip] ddd_asd_zhouetal/asd_discov.csv\n", + " [skip] ddd_asd_zhouetal/asd_rep.csv\n", + " [skip] ddd_asd_zhouetal/ddd_other.csv\n", + "ClinVar variant summary\n", + " [skip] clinvar_syn/variant_summary.txt.gz\n", + " Downloading → reference/gnomad.v2.1.1.lof_metrics.by_gene.txt.txt ...\n", + "phyloP447way conservation scores\n", + " [skip] reference/hg38.phyloP447way.bw\n", + "hg19.100way.phyloP100way.bw conservation scores\n", + " Downloading → reference/hg19.100way.phyloP100way.bw ...\n", + "UCSC Table Browser downloads\n", + " [skip] reference/ucsc_gencodev32_hg38.tsv\n", + " [skip] reference/ucsc_refseq_hg38.tsv\n", + " [skip] reference/ucsc_refseq_hist_hg38.tsv\n", + " [skip] reference/ucsc_pliByGene_hg38.tsv\n", + "\n", + "Done.\n" + ] + } + ], + "source": [ + "import gzip\n", + "import os\n", + "import shutil\n", + "import urllib.request\n", "\n", - "Before running this notebook, ensure the following preprocessing scripts have been executed:\n", + "import pandas as pd\n", + "import requests\n", "\n", - "| File | Purpose | How to Generate |\n", - "|------|---------|-----------------| \n", - "| `codon_counts_nopathogen.json` | Codon counts by taxonomic group (used for codon frequency features) | Run `python data_scripts/check_codon_frequency.py` after completing NCBI preprocessing in `data_scripts/data_curation/`. Place or symlink the produced file at `/data/ncbi/codon_counts_nopathogen.json`. 
|\n", - "| `gencode.v47lift37.basic.annotation.processed.tsv` | Processed GTF annotation with CDS coordinates | Run `python data_scripts/process_gtf.py` on the downloaded GENCODE GTF file `gencode.v47lift37.basic.annotation.gtf`. |\n", "\n", + "# ── Set data directory ───────────────────────────────────────\n", + "DATA_DIR = \"/data/ncbi\" # <-- change this to your preferred data root\n", + "OUTPUT_DIR = \"/data/ncbi/mutation_datasets\" # output directory where all processed datasets will be saved\n", + "UCSC_API_KEY = \"\" # <-- set your UCSC API key for Table Browser downloads\n", + "# ─────────────────────────────────────────────────────────────\n", + "\n", + "# Create output directory\n", + "os.makedirs(OUTPUT_DIR, exist_ok=True)\n", + "\n", + "for subdir in [\n", + " \"reference/hg19\",\n", + " \"reference/hg38\",\n", + " \"alphamissense_data\",\n", + " \"ddd_asd_zhouetal\",\n", + " \"clinvar_syn\",\n", + "]:\n", + " os.makedirs(os.path.join(DATA_DIR, subdir), exist_ok=True)\n", + "\n", + "\n", + "def download_file(url, dest, decompress_gz=False):\n", + " \"\"\"Download *url* → *dest*, optionally gunzipping in place. Skips if target already exists.\"\"\"\n", + " final = dest[:-3] if decompress_gz and dest.endswith(\".gz\") else dest\n", + " if os.path.exists(final):\n", + " print(f\" [skip] {os.path.relpath(final, DATA_DIR)}\")\n", + " return\n", + " print(f\" Downloading → {os.path.relpath(final, DATA_DIR)} ...\")\n", + " urllib.request.urlretrieve(url, dest)\n", + " if decompress_gz and dest.endswith(\".gz\"):\n", + " with gzip.open(dest, \"rb\") as f_in, open(final, \"wb\") as f_out:\n", + " shutil.copyfileobj(f_in, f_out)\n", + " os.remove(dest)\n", + "\n", + "\n", + "# ── 1. 
Reference genomes ────────────────────────────────────\n", + "print(\"Reference genomes\")\n", + "download_file(\n", + " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz\",\n", + " os.path.join(DATA_DIR, \"reference/hg19/hg19.fa.gz\"),\n", + " decompress_gz=True,\n", + ")\n", + "download_file(\n", + " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz\",\n", + " os.path.join(DATA_DIR, \"reference/hg38/hg38.fa.gz\"),\n", + " decompress_gz=True,\n", + ")\n", + "\n", + "# ── 2. GENCODE annotation (GTF) ─────────────────────────────\n", + "print(\"GENCODE annotation\")\n", + "download_file(\n", + " \"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/GRCh37_mapping/gencode.v47lift37.basic.annotation.gtf.gz\",\n", + " os.path.join(DATA_DIR, \"reference/gencode.v47lift37.basic.annotation.gtf.gz\"),\n", + " decompress_gz=False,\n", + ")\n", + "\n", + "download_file(\n", + " \"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/gencode.v47.basic.annotation.gtf.gz\",\n", + " os.path.join(DATA_DIR, \"reference/gencode.v47.basic.annotation.gtf.gz\"),\n", + " decompress_gz=False,\n", + ")\n", + "\n", + "\n", + "# ── 3. DDD / ASD variant files (Zhou et al. 
2022, xlsx → csv)\n", + "print(\"DDD / ASD variant files\")\n", + "xlsx_sources = {\n", + " \"asd_discov\": \"https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01148-2/MediaObjects/41588_2022_1148_MOESM5_ESM.xlsx\",\n", + " \"asd_rep\": \"https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01148-2/MediaObjects/41588_2022_1148_MOESM6_ESM.xlsx\",\n", + " \"ddd_other\": \"https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01148-2/MediaObjects/41588_2022_1148_MOESM7_ESM.xlsx\",\n", + "}\n", + "\n", + "for name, url in xlsx_sources.items():\n", + " csv_path = os.path.join(DATA_DIR, \"ddd_asd_zhouetal\", f\"{name}.csv\")\n", + " if os.path.exists(csv_path):\n", + " print(f\" [skip] ddd_asd_zhouetal/{name}.csv\")\n", + " continue\n", + " xlsx_path = csv_path.replace(\".csv\", \".xlsx\")\n", + " download_file(url, xlsx_path)\n", + " print(f\" Converting {name}.xlsx → csv ...\")\n", + " pd.read_excel(xlsx_path).to_csv(csv_path, index=False)\n", + "\n", + "# ── 4. ClinVar variant summary ──────────────────────────────\n", + "print(\"ClinVar variant summary\")\n", + "download_file(\n", + " \"https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz\",\n", + " os.path.join(DATA_DIR, \"clinvar_syn/variant_summary.txt.gz\"),\n", + ")\n", + "\n", + "# ── 5. ClinVar gnomAD ──────────────────────────────\n", + "download_file(\n", + " \"https://gnomad-public-us-east-1.s3.amazonaws.com/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz\",\n", + " os.path.join(DATA_DIR, \"reference/gnomad.v2.1.1.lof_metrics.by_gene.txt.txt\"),\n", + " decompress_gz=True,\n", + ")\n", + "\n", + "\n", + "# ── 6. 
phyloP conservation scores ───────────────────────────\n", + "print(\"phyloP447way conservation scores\")\n", + "download_file(\n", + " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/phyloP447way/hg38.phyloP447way.bw\",\n", + " os.path.join(DATA_DIR, \"reference/hg38.phyloP447way.bw\"),\n", + ")\n", + "\n", + "print(\"hg19.100way.phyloP100way.bw conservation scores\")\n", + "download_file(\n", + " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/phyloP100way/hg19.100way.phyloP100way.bw\",\n", + " os.path.join(DATA_DIR, \"reference/hg19.100way.phyloP100way.bw\"),\n", + ")\n", + "\n", + "# ── 7. UCSC Table Browser downloads ─────────────────────────\n", + "UCSC_URL = \"https://genome.ucsc.edu/cgi-bin/hgTables\"\n", + "UCSC_TABLES = {\n", + " \"wgEncodeGencodeCompV32\": {\n", + " \"filename\": \"ucsc_gencodev32_hg38.tsv\",\n", + " \"subdir\": \"reference\",\n", + " \"form\": {\n", + " \"hgsid\": \"3727160771_KywqrMbVutzoVUyr47py53TcxZMg\", # pragma: allowlist secret\n", + " \"clade\": \"mammal\",\n", + " \"org\": \"Human\",\n", + " \"db\": \"hg38\",\n", + " \"hgta_group\": \"allTables\",\n", + " \"hgta_track\": \"hg38\",\n", + " \"hgta_table\": \"wgEncodeGencodeCompV32\",\n", + " \"hgta_regionType\": \"genome\",\n", + " \"position\": \"chr7:155,799,529-155,812,871\",\n", + " \"hgta_outSep\": \"tab\",\n", + " \"hgta_doTopSubmit\": \"Get output\",\n", + " },\n", + " },\n", + " \"ncbiRefSeq\": {\n", + " \"filename\": \"ucsc_refseq_hg38.tsv\",\n", + " \"subdir\": \"reference\",\n", + " \"form\": {\n", + " \"hgsid\": \"3727549177_A4TjXykIK1JRVnpjZ0HKtMVnKWw0\", # pragma: allowlist secret\n", + " \"clade\": \"mammal\",\n", + " \"org\": \"Human\",\n", + " \"db\": \"hg38\",\n", + " \"hgta_group\": \"allTables\",\n", + " \"hgta_track\": \"hg38\",\n", + " \"hgta_table\": \"ncbiRefSeq\",\n", + " \"hgta_regionType\": \"genome\",\n", + " \"position\": \"chr7:155,799,529-155,812,871\",\n", + " \"hgta_outSep\": \"tab\",\n", + " \"hgta_doTopSubmit\": \"Get output\",\n", + " 
},\n", + " },\n", + " \"ncbiRefSeqHistorical\": {\n", + " \"filename\": \"ucsc_refseq_hist_hg38.tsv\",\n", + " \"subdir\": \"reference\",\n", + " \"form\": {\n", + " \"hgsid\": \"3727803393_8Oali1duOyVJT7DtAateRwtkg7Y0\", # pragma: allowlist secret\n", + " \"clade\": \"mammal\",\n", + " \"org\": \"Human\",\n", + " \"db\": \"hg38\",\n", + " \"hgta_group\": \"allTables\",\n", + " \"hgta_track\": \"hg38\",\n", + " \"hgta_table\": \"ncbiRefSeqHistorical\",\n", + " \"hgta_regionType\": \"genome\",\n", + " \"position\": \"chr7:155,799,529-155,812,871\",\n", + " \"hgta_outSep\": \"tab\",\n", + " \"hgta_doTopSubmit\": \"Get output\",\n", + " },\n", + " },\n", + " \"pliByGene\": {\n", + " \"filename\": \"ucsc_pliByGene_hg38.tsv\",\n", + " \"subdir\": \"reference\",\n", + " \"form\": {\n", + " \"hgsid\": \"3727823409_x06fwXO5XFeWrbFjKlSQTfU3I6F3\", # pragma: allowlist secret\n", + " \"clade\": \"mammal\",\n", + " \"org\": \"Human\",\n", + " \"db\": \"hg38\",\n", + " \"hgta_group\": \"varRep\",\n", + " \"hgta_track\": \"gnomadPLI\",\n", + " \"hgta_table\": \"pliByGene\",\n", + " \"hgta_regionType\": \"genome\",\n", + " \"position\": \"chr7:155,799,529-155,812,871\",\n", + " \"hgta_outSep\": \"tab\",\n", + " \"hgta_doTopSubmit\": \"Get output\",\n", + " },\n", + " },\n", + "}\n", + "\n", + "print(\"UCSC Table Browser downloads\")\n", + "if not UCSC_API_KEY:\n", + " print(\" UCSC_API_KEY is not set — skipping automatic download.\")\n", + " print(\" Download these tables manually from https://genome.ucsc.edu/cgi-bin/hgTables:\")\n", + " for tbl_name, tbl_cfg in UCSC_TABLES.items():\n", + " dest_dir = os.path.join(DATA_DIR, tbl_cfg[\"subdir\"]) if tbl_cfg[\"subdir\"] else DATA_DIR\n", + " dest = os.path.join(dest_dir, tbl_cfg[\"filename\"])\n", + " status = \"found\" if os.path.exists(dest) else \"MISSING\"\n", + " rel = os.path.join(tbl_cfg[\"subdir\"], tbl_cfg[\"filename\"]) if tbl_cfg[\"subdir\"] else tbl_cfg[\"filename\"]\n", + " print(f\" [{status}] {rel} (table: 
{tbl_name})\")\n", + "else:\n", + " for tbl_name, tbl_cfg in UCSC_TABLES.items():\n", + " dest_dir = os.path.join(DATA_DIR, tbl_cfg[\"subdir\"]) if tbl_cfg[\"subdir\"] else DATA_DIR\n", + " os.makedirs(dest_dir, exist_ok=True)\n", + " dest = os.path.join(dest_dir, tbl_cfg[\"filename\"])\n", + "\n", + " if os.path.exists(dest):\n", + " print(f\" [skip] {os.path.relpath(dest, DATA_DIR)}\")\n", + " continue\n", "\n", - "## Table of Contents\n", + " print(f\" Downloading {tbl_name} → {os.path.relpath(dest, DATA_DIR)} ...\")\n", + " form = {**tbl_cfg[\"form\"], \"apiKey\": UCSC_API_KEY}\n", + " resp = requests.post(UCSC_URL, data=form, timeout=300)\n", + " resp.raise_for_status()\n", "\n", - "| Section | Dataset | Description | Required Data Files |\n", - "|---------|---------|-------------|---------------------|\n", - "| **1** | [DDD / ASD Dataset](#1-ddd-asd-dataset) | Developmental disorder and autism spectrum disorder variants | `ddd_asd_zhouetal/asd_discov.csv`
`ddd_asd_zhouetal/asd_rep.csv`
`ddd_asd_zhouetal/ddd_other.csv`
`gencode.v47lift37.basic.annotation.processed.tsv`
`alphamissense_data/AlphaMissense_hg19.tsv.gz`
`reference/hg19/hg19.fa` |\n", - "| **2** | [ClinVar AlphaMissense](#2-clinvar-alphamissense-dataset) | ClinVar missense variants with AlphaMissense pathogenicity predictions | `alphamissense_data/alphamissense_clinvar.csv`
`ucsc_gencodev32_hg38.tsv`
`hg38/hg38.fa` |\n", - "| **3** | [Cancer Hotspot](#3-cancer-hotspot) | Cancer hotspot mutations with AlphaMissense scores | `alphamissense_data/alphamissense_cancer_hotspot.csv`
`ucsc_gencodev32_hg38.tsv`
`reference/hg38/hg38.fa` |\n", - "| **4** | [ClinVar Synonymous](#4-clinvar-synonymous) | Extract synonymous variants from ClinVar (benign and pathogenic labels) with optional additional features | `clinvar_syn/variant_summary.txt.gz`
`clinvar_syn/ucsc_refseq_hg38.tsv`
`clinvar_syn/ucsc_refseq_hist_hg38.tsv`
`reference/hg38/hg38.fa` |\n", + " if \"\" in resp.text:\n", + " raise RuntimeError(f\"UCSC returned an error for {tbl_name}. Re-run the cell to retry.\")\n", "\n", - "---\n" + " lines = resp.text.splitlines(keepends=True)\n", + " while lines:\n", + " tail = lines[-1].strip()\n", + " if not tail or tail.startswith(\"---\") or \"cookie\" in tail.lower():\n", + " lines.pop()\n", + " else:\n", + " break\n", + "\n", + " with open(dest, \"w\") as f:\n", + " f.writelines(lines)\n", + " print(f\" [done] {os.path.relpath(dest, DATA_DIR)} ({len(lines):,} lines)\")\n", + "\n", + "# ── 8. gnomAD v4.1 VCF files (exomes + genomes, chr1-22, X, Y) ──\n", + "GNOMAD_S3 = \"https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf\"\n", + "GNOMAD_CHROMS = [f\"chr{i}\" for i in range(1, 23)] + [\"chrX\", \"chrY\"]\n", + "gnomad_datasets = {\n", + " \"exomes\": os.path.join(DATA_DIR, \"gnomad\", \"gnomad.exomes.v4.1\"),\n", + " \"genomes\": os.path.join(DATA_DIR, \"gnomad\", \"gnomad.genomes.v4.1\"),\n", + "}\n", + "\n", + "for ds_type, out_dir in gnomad_datasets.items():\n", + " os.makedirs(out_dir, exist_ok=True)\n", + " print(f\"gnomAD {ds_type} VCFs\")\n", + " for chrom in GNOMAD_CHROMS:\n", + " vcf_name = f\"gnomad.{ds_type}.v4.1.sites.{chrom}.vcf.bgz\"\n", + " download_file(\n", + " f\"{GNOMAD_S3}/{ds_type}/{vcf_name}\",\n", + " os.path.join(out_dir, vcf_name),\n", + " )\n", + "\n", + "print(\"\\nDone.\")" ] }, { "cell_type": "markdown", - "id": "8d094b99", + "id": "3ba6d77d", "metadata": {}, "source": [ - "# Imports and Paths setup" + "### 2. Download AlphaMissense Data\n", + "\n", + "The **AlphaMissense** data can only be downloaded manually due to the webiste's bot protection. 
[Download the zip file](https://www.science.org/doi/suppl/10.1126/science.adg7492/suppl_file/science.adg7492_data_s1_to_s9.zip) in the `DATA_DIR/alphamissense_data` and run the next cell:" ] }, { "cell_type": "code", - "execution_count": null, - "id": "8dfcfbe1", + "execution_count": 2, + "id": "8741cb10", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AlphaMissense data\n", + " Extracting zip → ['science.adg7492_data_captions.pdf', 'science.adg7492_data_s1_to_s4_and_s9.xlsx', 'science.adg7492_data_s5.csv', 'science.adg7492_data_s6.csv', 'science.adg7492_data_s7.csv', 'science.adg7492_data_s8.zip']\n", + " Renamed science.adg7492_data_s5.csv -> alphamissense_clinvar.csv\n", + " Renamed science.adg7492_data_s6.csv -> alphamissense_cancer_hotspot.csv\n" + ] + } + ], "source": [ - "# Uncomment to install PyBigWig\n", - "!pip install pyBigWig" + "import zipfile\n", + "\n", + "\n", + "print(\"AlphaMissense data\")\n", + "\n", + "am_data_dir = os.path.join(DATA_DIR, \"alphamissense_data\")\n", + "am_zip_path = os.path.join(am_data_dir, \"science.adg7492_data_s1_to_s9.zip\")\n", + "am_clinvar_path = os.path.join(am_data_dir, \"alphamissense_clinvar.csv\")\n", + "am_hotspot_path = os.path.join(am_data_dir, \"alphamissense_cancer_hotspot.csv\")\n", + "\n", + "if not os.path.exists(am_zip_path):\n", + " raise FileNotFoundError(\n", + " f\"Required file not found: {am_zip_path}\\n\"\n", + " \"Please manually download science.adg7492_data_s1_to_s9.zip into DATA_DIR/alphamissense_data/.\"\n", + " )\n", + "\n", + "with zipfile.ZipFile(am_zip_path, \"r\") as zf:\n", + " print(f\" Extracting zip → {zf.namelist()}\")\n", + " zf.extractall(am_data_dir)\n", + "\n", + "rename_map = {\n", + " \"science.adg7492_data_s5.csv\": am_clinvar_path,\n", + " \"science.adg7492_data_s6.csv\": am_hotspot_path,\n", + "}\n", + "\n", + "for src_name, dst_path in rename_map.items():\n", + " src_path = os.path.join(am_data_dir, 
src_name)\n", + " if os.path.exists(src_path):\n", + " os.replace(src_path, dst_path)\n", + " print(f\" Renamed {src_name} -> {os.path.basename(dst_path)}\")\n", + " elif os.path.exists(dst_path):\n", + " print(f\" [skip] {os.path.basename(dst_path)} already present\")\n", + " else:\n", + " raise FileNotFoundError(f\"Expected file not found after extraction: {src_path}\")" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "5dae6998", + "cell_type": "markdown", + "id": "b95805e7", "metadata": {}, - "outputs": [], "source": [ - "import ast\n", - "import json\n", - "import os\n", - "import warnings\n", + "### 3. Data Scripts\n", "\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import polars as pl\n", - "import pyBigWig\n", - "import pyfaidx\n", - "import seaborn as sns\n", - "from Bio.Seq import Seq\n", - "from matplotlib.ticker import LogLocator\n", - "from tqdm import tqdm\n", + "Before running this notebook, ensure the following preprocessing scripts have been executed:\n", "\n", + "| File | Purpose | How to Generate |\n", + "|------|---------|-----------------| \n", + "| `codon_counts_nopathogen.json` | Codon counts by taxonomic group (used for codon frequency features) | Run `python data_scripts/check_codon_frequency.py` after completing NCBI preprocessing in `data_scripts/data_curation/`. Place or symlink the produced file at `/data/ncbi/codon_counts_nopathogen.json`. |\n", + "| `gencode.v47lift37.basic.annotation.processed.tsv` | Processed GTF annotation with CDS coordinates | Run `000-Annotation-File-Processing.ipynb` on the downloaded GENCODE GTF file `gencode.v47lift37.basic.annotation.gtf`. |\n", + "| `gencode.v47.basic.annotation.processed.filtered.tsv` | Filtered transcripts with CDS sequences (hg38) | Run `000-Annotation-File-Processing.ipynb` Part 1 on the GENCODE v47 GTF file. 
|\n", "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e544a031-fec7-4765-8a33-2f26c415b5ac", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 68/68 [1:09:44<00:00, 61.53s/it]\n" + ] + } + ], + "source": [ + "%run ../data_scripts/check_codon_frequency.py --pretraining_processed_data_dir $DATA_DIR/pretraining/postprocessed/ --data_dir $DATA_DIR" ] }, { "cell_type": "markdown", - "id": "01a546f6", + "id": "ffb9ba7a", "metadata": {}, "source": [ - "Before setting the `DATA_DIR` path, ensure the following directory structure (containing the files from the required pre-processing steo) is in place:\n", + "### 4. Downloaded Data Integrity Check\n", + "\n", + "Run the following cell to ensure that the `DATA_DIR` path structure (containing the files from the required pre-processing step) is in place:\n", "\n", "```\n", "📁 DATA_DIR/\n", @@ -135,35 +506,141 @@ "│ ├── asd_rep.csv\n", "│ └── ddd_other.csv\n", "├── 📁 clinvar_syn/\n", - "│ ├── variant_summary.txt.gz\n", - "│ ├── ucsc_refseq_hg38.tsv\n", - "│ └── ucsc_refseq_hist_hg38.tsv\n", + "│ └── variant_summary.txt.gz\n", "├── 📁 reference/\n", + "│ ├── 📄 gencode.v47lift37.basic.annotation.processed.tsv\n", + "│ ├── 📄 gencode.v47.basic.annotation.processed.filtered.tsv\n", + "│ ├── 📄 ucsc_gencodev32_hg38.tsv\n", + "│ ├── 📄 ucsc_pliByGene_hg38.tsv\n", + "│ ├── 📄 hg38.phyloP447way.bw\n", + "│ ├── 📄 hg19.100way.phyloP100way.bw\n", + "│ ├── 📄 gnomad.v2.1.1.lof_metrics.by_transcript.txt\n", + "│ ├── ucsc_refseq_hg38.tsv\n", + "│ ├── ucsc_refseq_hist_hg38.tsv\n", "│ ├── hg19/\n", "│ │ ├── hg19.fa\n", - "│ │ └── hg19.fa.fai\n", "│ └── hg38/\n", "│ ├── hg38.fa\n", "│ └── hg38.fa.fai\n", "├── 📄 codon_counts_nopathogen.json\n", - "├── 📄 gencode.v47lift37.basic.annotation.processed.tsv\n", - "├── 📄 ucsc_gencodev32_hg38.tsv\n", - "├── 📄 ucsc_pliByGene_hg38.tsv\n", - "└── 📄 hg38.phyloP447way.bw\n",
- "```\n" + "├── 📁 cosmic/\n", + "│ └── 📁 cosmic_raw/\n", + "│ ├── Cosmic_Sample_v102_GRCh38.tsv.gz\n", + "│ └── Cosmic_MutantCensus_v102_GRCh38.tsv.gz\n", + "└── 📁 gnomad/\n", + " ├── 📁 gnomad.exomes.v4.1/\n", + " │ └── {chrom}.tsv.gz (chr1-22, chrX, chrY)\n", + " └── 📁 gnomad.genomes.v4.1/\n", + " └── {chrom}.tsv.gz (chr1-22, chrX, chrY)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b28b4e2d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5 file(s) missing from /data/balvisio/ncbi:\n", + " ✗ alphamissense_data/AlphaMissense_hg19.tsv.gz\n", + " ✗ reference/hg19/hg19.fa.fai\n", + " ✗ reference/hg38/hg38.fa.fai\n", + " ✗ codon_counts_nopathogen.json\n", + " ✗ gencode.v47lift37.basic.annotation.processed.tsv\n" + ] + }, + { + "ename": "FileNotFoundError", + "evalue": "5 required file(s) missing — see list above.", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 27\u001b[39m\n\u001b[32m 25\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m missing:\n\u001b[32m 26\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m ✗ \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mf\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m27\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(missing)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m required file(s) missing — see list above.\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 28\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 
29\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mAll \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(expected_files)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m required files found in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mDATA_DIR\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m)\n", + "\u001b[31mFileNotFoundError\u001b[39m: 5 required file(s) missing — see list above." + ] + } + ], + "source": [ + "expected_files = [\n", + " \"alphamissense_data/AlphaMissense_hg19.tsv.gz\",\n", + " \"alphamissense_data/alphamissense_cancer_hotspot.csv\",\n", + " \"alphamissense_data/alphamissense_clinvar.csv\",\n", + " \"ddd_asd_zhouetal/asd_discov.csv\",\n", + " \"ddd_asd_zhouetal/asd_rep.csv\",\n", + " \"ddd_asd_zhouetal/ddd_other.csv\",\n", + " \"clinvar_syn/variant_summary.txt.gz\",\n", + " \"reference/ucsc_refseq_hg38.tsv\",\n", + " \"reference/ucsc_refseq_hist_hg38.tsv\",\n", + " \"reference/hg19/hg19.fa\",\n", + " \"reference/hg38/hg38.fa\",\n", + " \"reference/hg38/hg38.fa.fai\",\n", + " \"codon_counts_nopathogen.json\",\n", + " \"gencode.v47lift37.basic.annotation.processed.tsv\",\n", + " \"ucsc_gencodev32_hg38.tsv\",\n", + " \"ucsc_pliByGene_hg38.tsv\",\n", + " \"hg38.phyloP447way.bw\",\n", + "]\n", + "\n", + "missing = [f for f in expected_files if not os.path.exists(os.path.join(DATA_DIR, f))]\n", + "if missing:\n", + " print(f\"{len(missing)} file(s) missing from {DATA_DIR}:\")\n", + " for f in missing:\n", + " print(f\" ✗ {f}\")\n", + " raise FileNotFoundError(f\"{len(missing)} required file(s) missing — see list above.\")\n", + "else:\n", + " print(f\"All {len(expected_files)} required files found in {DATA_DIR}.\")" + ] + }, + { + "cell_type": "markdown", + "id": "8d094b99", + "metadata": {}, + "source": [ + "# Imports and Paths setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "8dfcfbe1", + "metadata": {}, + "outputs": [], + "source": [ + 
"# Uncomment to install PyBigWig\n", + "# !pip install pyBigWig" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "48d31ff4", + "execution_count": 3, + "id": "5dae6998", "metadata": {}, "outputs": [], "source": [ - "DATA_DIR = \"/data/ncbi/\" # set this to the path of your data directory\n", + "import ast\n", + "import json\n", + "import os\n", + "import warnings\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import polars as pl\n", + "import pyBigWig\n", + "import pyfaidx\n", + "import seaborn as sns\n", + "from Bio.Data import CodonTable\n", + "from Bio.Seq import Seq\n", + "from matplotlib.ticker import LogLocator\n", + "from tqdm import tqdm\n", + "\n", "\n", - "OUTPUT_DIR = \"/data/processed/mutation_datasets_latest\" # output directory where all processed datasets will be saved\n", - "os.makedirs(OUTPUT_DIR, exist_ok=True)" + "warnings.filterwarnings(\"ignore\")" ] }, { @@ -176,79 +653,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 53, "id": "42a905da", "metadata": {}, "outputs": [], "source": [ - "dna_code = {\n", - " \"ATA\": \"I\",\n", - " \"ATC\": \"I\",\n", - " \"ATT\": \"I\",\n", - " \"ATG\": \"M\",\n", - " \"ACA\": \"T\",\n", - " \"ACC\": \"T\",\n", - " \"ACG\": \"T\",\n", - " \"ACT\": \"T\",\n", - " \"AAC\": \"N\",\n", - " \"AAT\": \"N\",\n", - " \"AAA\": \"K\",\n", - " \"AAG\": \"K\",\n", - " \"AGC\": \"S\",\n", - " \"AGT\": \"S\",\n", - " \"AGA\": \"R\",\n", - " \"AGG\": \"R\",\n", - " \"CTA\": \"L\",\n", - " \"CTC\": \"L\",\n", - " \"CTG\": \"L\",\n", - " \"CTT\": \"L\",\n", - " \"CCA\": \"P\",\n", - " \"CCC\": \"P\",\n", - " \"CCG\": \"P\",\n", - " \"CCT\": \"P\",\n", - " \"CAC\": \"H\",\n", - " \"CAT\": \"H\",\n", - " \"CAA\": \"Q\",\n", - " \"CAG\": \"Q\",\n", - " \"CGA\": \"R\",\n", - " \"CGC\": \"R\",\n", - " \"CGG\": \"R\",\n", - " \"CGT\": \"R\",\n", - " \"GTA\": \"V\",\n", - " \"GTC\": \"V\",\n", - " \"GTG\": \"V\",\n", - " \"GTT\": 
\"V\",\n", - " \"GCA\": \"A\",\n", - " \"GCC\": \"A\",\n", - " \"GCG\": \"A\",\n", - " \"GCT\": \"A\",\n", - " \"GAC\": \"D\",\n", - " \"GAT\": \"D\",\n", - " \"GAA\": \"E\",\n", - " \"GAG\": \"E\",\n", - " \"GGA\": \"G\",\n", - " \"GGC\": \"G\",\n", - " \"GGG\": \"G\",\n", - " \"GGT\": \"G\",\n", - " \"TCA\": \"S\",\n", - " \"TCC\": \"S\",\n", - " \"TCG\": \"S\",\n", - " \"TCT\": \"S\",\n", - " \"TTC\": \"F\",\n", - " \"TTT\": \"F\",\n", - " \"TTA\": \"L\",\n", - " \"TTG\": \"L\",\n", - " \"TAC\": \"Y\",\n", - " \"TAT\": \"Y\",\n", - " \"TAA\": \"*\",\n", - " \"TAG\": \"*\",\n", - " \"TGC\": \"C\",\n", - " \"TGT\": \"C\",\n", - " \"TGA\": \"*\",\n", - " \"TGG\": \"W\",\n", - "}\n", - "\n", - "\n", "def translate(seq):\n", " \"\"\"\n", " Translate an RNA sequence into a protein sequence.\n", @@ -258,11 +667,30 @@ " for i in range(0, len(seq) - 2, 3):\n", " codon = seq[i : i + 3]\n", " # Look up the codon in the genetic code dictionary.\n", - " amino_acid = dna_code.get(codon, \"?\")\n", - " protein += amino_acid\n", + " amino_acid = codon_to_aa(codon)\n", + " protein += amino_acid if amino_acid is not None else \"?\"\n", " return protein\n", "\n", "\n", + "def codon_to_aa(codon):\n", + " \"\"\"\n", + " Translate a single codon to its corresponding amino acid using BioPython's CodonTable.\n", + "\n", + " Parameters:\n", + " codon (str): A 3-nucleotide DNA codon.\n", + "\n", + " Returns:\n", + " str or None: The single-letter amino acid code, '*' for stop codons, or None if invalid.\n", + " \"\"\"\n", + " standard_table = CodonTable.unambiguous_dna_by_name[\"Standard\"]\n", + " codon = codon.upper().replace(\"U\", \"T\")\n", + " if len(codon) != 3 or any(base not in \"ATGC\" for base in codon):\n", + " return None\n", + " if codon in standard_table.stop_codons:\n", + " return \"*\"\n", + " return standard_table.forward_table.get(codon, None)\n", + "\n", + "\n", "def reverse_complement_dna(seq):\n", " \"\"\"\n", " Return the reverse complement of a DNA sequence.\n", 
@@ -277,7 +705,7 @@ " KeyError: If the sequence contains lowercase letters or invalid characters.\n", " \"\"\"\n", " complement = {\"A\": \"T\", \"T\": \"A\", \"G\": \"C\", \"C\": \"G\", \"N\": \"N\"}\n", - " return \"\".join(complement[base] for base in seq[::-1])\n", + " return \"\".join(complement[base] for base in seq[::-1].upper())\n", "\n", "\n", "def process_gtf(gtf_path, fasta_path):\n", @@ -349,10 +777,18 @@ " gtf[\"cds_length\"] = lengths\n", " gtf[\"cds\"] = seqs # sequence is strand-aware (always gene 5'->3')\n", "\n", - " gtf_s = gtf[\n", - " [\"name\", \"chrom\", \"strand\", \"cdsStart\", \"cdsEnd\", \"cds_starts\", \"cds_ends\", \"cds_length\", \"cds\"]\n", - " ].copy()\n", - " gtf_s[\"name\"] = gtf_s[\"name\"].str.split(\".\").str[0]\n", + " # remove version numbers from identifiers\n", + " gtf[\"name\"] = gtf[\"name\"].str.split(\".\").str[0]\n", + "\n", + " # Build output columns - gene_id and gene_name are optional\n", + " output_cols = [\"name\", \"chrom\", \"strand\", \"cdsStart\", \"cdsEnd\", \"cds_starts\", \"cds_ends\", \"cds_length\", \"cds\"]\n", + " if \"gene_name\" in gtf.columns:\n", + " output_cols.append(\"gene_name\")\n", + " if \"gene_id\" in gtf.columns:\n", + " gtf[\"gene_id\"] = gtf[\"gene_id\"].str.split(\".\").str[0]\n", + " output_cols.append(\"gene_id\")\n", + "\n", + " gtf_s = gtf[output_cols].copy()\n", " # Sort transcripts by chromosome, start, and end coordinates, they're in the forward direction and 0-based.\n", " gtf_s = gtf_s.sort_values(by=[\"chrom\", \"cdsStart\", \"cdsEnd\"]).reset_index(drop=True).copy()\n", "\n", @@ -586,12 +1022,6 @@ " return df.apply(_check, axis=1)\n", "\n", "\n", - "def get_reverse_complement(seq):\n", - " \"\"\"Get reverse complement of a sequence\"\"\"\n", - " complement = {\"A\": \"T\", \"T\": \"A\", \"G\": \"C\", \"C\": \"G\", \"N\": \"N\"}\n", - " return \"\".join(complement[base] for base in seq[::-1].upper())\n", - "\n", - "\n", "def extract_cds_sequence(row, fasta):\n", " \"\"\"Extract 
CDS sequence for a transcript based on exon coordinates and CDS boundaries.\"\"\"\n", " chrom = row[\"chrom\"]\n", @@ -618,7 +1048,7 @@ "\n", " # Reverse complement if on negative strand\n", " if strand == \"-\":\n", - " cds_sequence = get_reverse_complement(cds_sequence)\n", + " cds_sequence = reverse_complement_dna(cds_sequence)\n", "\n", " return cds_sequence\n", "\n", @@ -663,7 +1093,7 @@ " dset = dset.with_columns(\n", " pl.col(\"tx\").map_elements(lambda x: tx_to_name[x], return_dtype=pl.String).alias(\"gene_name\")\n", " )\n", - " pli = pl.read_csv(f\"{DATA_DIR}/ucsc_pliByGene_hg38.tsv\", separator=\"\\t\")\n", + " pli = pl.read_csv(f\"{DATA_DIR}/reference/ucsc_pliByGene_hg38.tsv\", separator=\"\\t\")\n", " gene_to_pli = {row[\"geneName\"]: row[\"_pli\"] for row in pli.rows(named=True)}\n", " dset = dset.with_columns(\n", " pl.col(\"gene_name\").map_elements(lambda x: gene_to_pli.get(x, -1000), return_dtype=pl.Float64).alias(\"pli\")\n", @@ -673,7 +1103,7 @@ " dset = dset.filter(pl.col(\"pli\") != -1000)\n", " dset = dset.with_columns((pl.col(\"pli\") * 10).cast(pl.Int32).alias(\"pli_bin\"))\n", "\n", - " bw = pyBigWig.open(f\"{DATA_DIR}/hg38.phyloP447way.bw\")\n", + " bw = pyBigWig.open(f\"{DATA_DIR}/reference/hg38.phyloP447way.bw\")\n", " phylop = []\n", " for row in tqdm(dset.rows(named=True)):\n", " phylop.append(bw.values(row[\"chrom\"], row[\"pos\"] - 1, row[\"pos\"])[0])\n", @@ -698,7 +1128,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "f4bd8e89", "metadata": {}, "outputs": [ @@ -967,7 +1397,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "b7a9dd69", "metadata": {}, "outputs": [ @@ -977,7 +1407,7 @@ "2933" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -989,7 +1419,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "9a2bd343", "metadata": {}, "outputs": [ @@ -1027,7 +1457,7 @@ }, { 
"cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "185bfa35", "metadata": {}, "outputs": [ @@ -1092,7 +1522,7 @@ "1 1:874817:C:T chr1 874817 C T Affected asd" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1118,7 +1548,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "510f1010", "metadata": {}, "outputs": [ @@ -1126,14 +1556,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing transcripts: 100%|██████████| 65158/65158 [00:14<00:00, 4392.38it/s]\n" + "Processing transcripts: 100%|██████████| 64779/64779 [00:09<00:00, 6511.43it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Processed 65158 GTF CDS sequences\n" + "Processed 64779 GTF CDS sequences\n" ] }, { @@ -1213,7 +1643,8 @@ "assembly = \"hg19\"\n", "all_results = []\n", "gtf_s, fasta = process_gtf(\n", - " f\"{DATA_DIR}/gencode.v47lift37.basic.annotation.processed.tsv\", f\"{DATA_DIR}/reference/{assembly}/hg19.fa\"\n", + " f\"{DATA_DIR}/reference/gencode.v47lift37.basic.annotation.processed.tsv\",\n", + " f\"{DATA_DIR}/reference/{assembly}/hg19.fa\",\n", ")\n", "print(f\"Processed {gtf_s.shape[0]} GTF CDS sequences\")\n", "display(gtf_s[[\"name\", \"chrom\", \"strand\", \"cdsStart\", \"cdsEnd\", \"cds_starts\", \"cds_ends\", \"cds_length\"]].head(2))" @@ -1319,7 +1750,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing chromosomes: 100%|██████████| 23/23 [00:03<00:00, 7.11it/s]\n" + "Processing chromosomes: 0%| | 0/23 [00:00" ] @@ -1993,7 +2431,7 @@ "outputs": [ { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAB8YAAAGGCAYAAAAJj+sGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABn6ElEQVR4nO3deVhU5f//8dcAgsrmDu6YpokpmJK55JIWmlluaaYFWGY1brmUVm64W5lmU35aXDIt01yz3HArM8UFLVFzTcstNxBMEDi/P/o5XydQGZlxEJ6P65rr4tznnvu85pwZpnxz38dkGIYhAAAAAAAAAAAAAADyKDdXBwAAAAAAAAAAAAAAwJkojAMAAAAAAAAAAAAA8jQK4wAAAAAAAAAAAACAPI3COAAAAAAAAAAAAAAgT6MwDgAAAAAAAAAAAADI0yiMAwAAAAAAAAAAAADyNArjAAAAAAAAAAAAAIA8jcI4AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAwF1mxIgRMplMd+RYTZs2VdOmTa3b69evl8lk0oIFC+7I8SMjIxUUFHRHjnW7kpKS9OKLLyowMFAmk0n9+vVzdSSXO3r0qEwmk2bOnOnqKPnSfz+3AAAAAAAK4wAAAADgUjNnzpTJZLI+ChYsqDJlyig8PFwffPCBLl265JDjnDhxQiNGjFBcXJxDxnOk3JwtO8aOHauZM2fqlVde0ezZs/Xcc8/dtO/ixYvvXLi70OXLlzVixAitX7/e1VFwE99//71GjBjh6hgAAAAAkG0mwzAMV4cAAAAAgPxq5syZioqKUnR0tCpVqqSrV6/q1KlTWr9+vVavXq0KFSpo6dKlqlWrlvU5aWlpSktLU8GCBbN9nG3btiksLEwzZsxQZGRktp+XmpoqSfL09JT074zxZs2aaf78+erYsWO2x7ndbFevXlVGRoa8vLwccixneOihh+Th4aGffvrpln19fHzUsWPHPD+T2jAMpaSkqECBAnJ3d7fruWfPnlXJkiU1fPhwCq+36b+fW2fo1auXLBaL+GclAAAAAHcLD1cHAAAAAABIrVq1Ut26da3bQ4YM0dq1a/XEE0/oySef1N69e1WoUCFJkoeHhzw8nPu/c5cvX1bhwoWdWljLjgIFCrj0+Nlx5swZBQcHO3zc5ORkeXt7O3xcZ0pLS1NGRoY8PT3t+sONvMAwDF25csX6OXWF3PK5BQAAAIDciKXUAQAAACCXeuSRRzR06FD98ccf+vLLL63tWd1jfPXq1WrUqJGKFCkiHx8fVatWTW+++aakf2d5h4WFSZKioqKsy7Zfm7XctGlT3X///dq+fbsaN26swoULW597o3sVp6en680331RgYKC8vb315JNP6vjx4zZ9goKCspydfv2Yt8qW1T3Gk5OTNWDAAJUvX15eXl6qVq2a3n333UwzV00mk3r16qXFixfr/vvvl5eXl2rUqKEVK1ZkfcL/48yZM3rhhRcUEBCgggULKiQkRLNmzbLuv3a/9SNHjmj58uXW7EePHs1yPJPJpOTkZM2aNcva99r5uXZN4+Pj9eyzz6po0aJq1KiRJGn37t2KjIzUPffco4IFCyowMFDdu3fXuXPnbMa/NsbBgwcVGRmpIkWKyN/fX1FRUbp8+bJN35u9X665cuWKRowYoapVq6pgwYIqXbq02rdvr0OHDkn6v/uIv/vuu5o8ebIqV64sLy8vxcfHZ3mP8cjISPn4+Ojw4cMKDw+Xt7e3ypQpo+joaOu1O3r0qEqWLClJGjlypPU8XZs5furUKUVFRalcuXLy8vJS6dKl9dRTT93wnNtz7GsyMjI0efJk1ahRQwULFlRAQIB69uypCxcu2PQLCgrSE088oZUrV6pu3boqVKiQ/ve//2V5/F69esnHxyfTdZCkLl26KDAwUOnp6ZKkJUuWqHXr1ipTpoy8vLxUuXJljRo1yrr/Gns+t6mpqRo2bJjq1Kkjf39/eXt76+GHH9a
6detsxrz+mn7yySfWaxoWFqbY2Fib82mxWCTJ5lYQAAAAAJCbMWMcAAAAAHKx5557Tm+++aZWrVqlHj16ZNlnz549euKJJ1SrVi1FR0fLy8tLBw8e1KZNmyRJ1atXV3R0tIYNG6aXXnpJDz/8sCSpQYMG1jHOnTunVq1a6ZlnnlG3bt0UEBBw01xjxoyRyWTSG2+8oTNnzmjy5Mlq0aKF4uLi7Joxm51s1zMMQ08++aTWrVunF154QaGhoVq5cqUGDRqkv/76S++//75N/59++kkLFy7Uq6++Kl9fX33wwQfq0KGDjh07puLFi98w1z///KOmTZvq4MGD6tWrlypVqqT58+crMjJSFy9eVN++fVW9enXNnj1br732msqVK6cBAwZIkrWw+1+zZ8/Wiy++qAcffFAvvfSSJKly5co2fZ5++mnde++9Gjt2rLVgu3r1ah0+fFhRUVEKDAzUnj179Mknn2jPnj365ZdfMhUkO3XqpEqVKmncuHHasWOHPvvsM5UqVUoTJkyQdOv3i/TvHz488cQTiomJ0TPPPKO+ffvq0qVLWr16tX777Teb3DNmzNCVK1f00ksvycvLS8WKFVNGRkaW5yA9PV0tW7bUQw89pIkTJ2rFihUaPny40tLSFB0drZIlS+rjjz/WK6+8onbt2ql9+/aSZL2VQIcOHbRnzx717t1bQUFBOnPmjFavXq1jx45l+gMKe499Tc+ePa23OOjTp4+OHDmiDz/8UDt37tSmTZtsVjHYv3+/unTpop49e6pHjx6qVq1alsfu3LmzLBaLli9frqefftrafvnyZS1btkyRkZHWJednzpwpHx8f9e/fXz4+Plq7dq2GDRumxMREvfPOOzbjZvdzm5iYqM8++0xdunRRjx49dOnSJX3++ecKDw/X1q1bFRoaatN/7ty5unTpknr27CmTyaSJEyeqffv2Onz4sAoUKKCePXvqxIkTWr16tWbPnn3T8w4AAAAAuYYBAAAAAHCZGTNmGJKM2NjYG/bx9/c3ateubd0ePny4cf3/zr3//vuGJOPvv/++4RixsbGGJGPGjBmZ9jVp0sSQZEybNi3LfU2aNLFur1u3zpBklC1b1khMTLS2f/PNN4YkY8qUKda2ihUrGhEREbcc82bZIiIijIoVK1q3Fy9ebEgyRo8ebdOvY8eOhslkMg4ePGhtk2R4enratO3atcuQZEydOjXTsa43efJkQ5Lx5ZdfWttSU1ON+vXrGz4+PjavvWLFikbr1q1vOt413t7eWZ6Ta9e0S5cumfZdvnw5U9tXX31lSDI2btyYaYzu3bvb9G3Xrp1RvHhx63Z23i/Tp083JBmTJk3KtC8jI8MwDMM4cuSIIcnw8/Mzzpw5Y9Pn2r7rr2lERIQhyejdu7fNWK1btzY8PT2tef7++29DkjF8+HCbMS9cuGBIMt55550b5r6R7B77xx9/NCQZc+bMsXn+ihUrMrVXrFjRkGSsWLHilsfPyMgwypYta3To0MGm/drn5vrrmNX17tmzp1G4cGHjypUr1jZ7PrdpaWlGSkqKTZ8LFy4YAQEBNu+Xa9etePHixvnz563tS5YsMSQZy5Yts7aZzWaDf1YCAAAAcDdhKXUAAAAAyOV8fHx06dKlG+4vUqSIpH+XYL7RTN1b8fLyUlRUVLb7P//88/L19bVud+zYUaVLl9b3339/W8fPru+//17u7u7q06ePTfuAAQNkGIZ++OEHm/YWLVrYzG6uVauW/Pz8dPjw4VseJzAwUF26dLG2FShQQH369FFSUpI2bNjggFeT2csvv5yp7foZ+FeuXNHZs2f10EMPSZJ27NhxyzEefvhhnTt3TomJiZKy93759ttvVaJECfXu3TvTvv/OUO/QocMNZ8lnpVevXjZj9erVS6mpqVqzZs1Nn1eoUCF5enpq/fr1mZY1d9Sx58+fL39/fz366KM6e/as9VGnTh35+PhkWnq8UqVKCg8Pv+VxTSaTnn76aX3//fdKSkq
yts+bN09ly5a1Lpt/7XVec+nSJZ09e1YPP/ywLl++rH379tmMm93Prbu7u/W+4xkZGTp//rzS0tJUt27dLN9DnTt3VtGiRa3b11ZyuNXnBgAAAAByMwrjAAAAAJDLJSUl2RSh/6tz585q2LChXnzxRQUEBOiZZ57RN998Y1eRvGzZstbCWXbce++9Ntsmk0lVqlS55b2ec+qPP/5QmTJlMp2P6tWrW/dfr0KFCpnGKFq06C0Lq3/88YfuvfdeubnZ/m/zjY7jKJUqVcrUdv78efXt21cBAQEqVKiQSpYsae2XkJCQqf9/X/O1Aue115yd98uhQ4dUrVo1eXjc+g5sWWW+ETc3N91zzz02bVWrVpWkW753vLy8NGHCBP3www8KCAhQ48aNNXHiRJ06dcphxz5w4IASEhJUqlQplSxZ0uaRlJSkM2fO2DzfntfeuXNn/fPPP1q6dKmkfz/X33//vZ5++mmbPzbYs2eP2rVrJ39/f/n5+alkyZLq1q2bpMzX257P7axZs1SrVi0VLFhQxYsXV8mSJbV8+fLbeg8BAAAAwN2Ie4wDAAAAQC72559/KiEhQVWqVLlhn0KFCmnjxo1at26dli9frhUrVmjevHl65JFHtGrVKuu9i2/GnvuCZ9d/ZxZfk56enq1MjnCj4xj///7duU1W16FTp076+eefNWjQIIWGhsrHx0cZGRlq2bJlln/8cKvX7Ij3y60yO0u/fv3Upk0bLV68WCtXrtTQoUM1btw4rV27VrVr187x+BkZGSpVqpTmzJmT5f7/zoy357U/9NBDCgoK0jfffKNnn31Wy5Yt0z///KPOnTtb+1y8eFFNmjSRn5+foqOjVblyZRUsWFA7duzQG2+8kel6Z/f4X375pSIjI9W2bVsNGjRIpUqVkru7u8aNG6dDhw5l6n+3fW4AAAAAIDsojAMAAABALjZ79mxJuuVyzW5ubmrevLmaN2+uSZMmaezYsXrrrbe0bt06tWjR4oZF6tt14MABm23DMHTw4EHVqlXL2la0aFFdvHgx03P/+OMPm5m79mSrWLGi1qxZo0uXLtnMGr+2xHTFihWzPdatjrN7925lZGTYzBrP6XHsvQ4XLlxQTEyMRo4cqWHDhlnb/3v+7XWr90vlypW1ZcsWXb16VQUKFMjRsa6XkZGhw4cPW2dqS9Lvv/8uSQoKCpJ063NUuXJlDRgwQAMGDNCBAwcUGhqq9957T19++WWOj125cmWtWbNGDRs2dErBv1OnTpoyZYoSExM1b948BQUFWZfFl6T169fr3LlzWrhwoRo3bmxtP3LkSI6Ou2DBAt1zzz1auHChzfkdPnz4bY/p6N8pAAAAAOBsLKUOAAAAALnU2rVrNWrUKFWqVEldu3a9Yb/z589nagsNDZUkpaSkSJK8vb0lKctC9e344osvbO57vmDBAp08eVKtWrWytlWuXFm//PKLUlNTrW3fffedjh8/bjOWPdkef/xxpaen68MPP7Rpf//992UymWyOnxOPP/64Tp06pXnz5lnb0tLSNHXqVPn4+KhJkya3Na63t7dd1+DazN3/ztSdPHnybR1fyt77pUOHDjp79mym85xVFntdP6ZhGPrwww9VoEABNW/eXJJUuHBhSZnfD5cvX9aVK1ds2ipXrixfX19r7pweu1OnTkpPT9eoUaMyPTctLS3Hn5/OnTsrJSVFs2bN0ooVK9SpUyeb/Vld79TUVH300Uc5Om5W427ZskWbN2++7TEd/TsFAAAAAJyNGeMAAAAAkAv88MMP2rdvn9LS0nT69GmtXbtWq1evVsWKFbV06VIVLFjwhs+Njo7Wxo0b1bp1a1WsWFFnzpzRRx99pHLlyqlRo0aS/i0gFilSRNOmTZOvr6+8vb1Vr149u+6RfL1ixYqpUaNGioqK0unTpzV58mRVqVJFPXr0sPZ58cUXtWDBArVs2VKdOnXSoUOH9OWXX6py5co2Y9mTrU2bNmrWrJneeustHT16VCEhIVq
1apWWLFmifv36ZRr7dr300kv63//+p8jISG3fvl1BQUFasGCBNm3apMmTJ9/0nu83U6dOHa1Zs0aTJk1SmTJlVKlSJdWrV++G/f38/Kz30r569arKli2rVatW5WgGcXbeL88//7y++OIL9e/fX1u3btXDDz+s5ORkrVmzRq+++qqeeuqp2zp2wYIFtWLFCkVERKhevXr64YcftHz5cr355pvWZcoLFSqk4OBgzZs3T1WrVlWxYsV0//33Ky0tTc2bN1enTp0UHBwsDw8PLVq0SKdPn9YzzzzjkGM3adJEPXv21Lhx4xQXF6fHHntMBQoU0IEDBzR//nxNmTJFHTt2vK3XLkkPPPCAqlSporfeekspKSk2y6hLUoMGDVS0aFFFRESoT58+MplMmj17do7/GOGJJ57QwoUL1a5dO7Vu3VpHjhzRtGnTFBwcrKSkpNsas06dOpKkPn36KDw8XO7u7tm6DgAAAADgKhTGAQAAACAXuLZMtqenp4oVK6aaNWtq8uTJioqKumUR9sknn9TRo0c1ffp0nT17ViVKlFCTJk00cuRI+fv7S5IKFCigWbNmaciQIXr55ZeVlpamGTNm3HZh/M0339Tu3bs1btw4Xbp0Sc2bN9dHH31kne0r/bv8+3vvvadJkyapX79+qlu3rr777jsNGDDAZix7srm5uWnp0qUaNmyY5s2bpxkzZigoKEjvvPNOpnFzolChQlq/fr0GDx6sWbNmKTExUdWqVdOMGTMUGRl52+NOmjRJL730kt5++239888/1iLtzcydO1e9e/eWxWKRYRh67LHH9MMPP6hMmTK3lSE77xd3d3d9//33GjNmjObOnatvv/1WxYsXV6NGjVSzZs3bOu61cVesWKFXXnlFgwYNkq+vr4YPH26zTLwkffbZZ+rdu7dee+01paamavjw4erdu7e6dOmimJgYzZ49Wx4eHrrvvvv0zTffqEOHDg479rRp01SnTh3973//05tvvikPDw8FBQWpW7duatiw4W2/9ms6d+6sMWPGqEqVKnrggQds9hUvXtz6GXn77bdVtGhRdevWTc2bN7/l7RRuJjIyUqdOndL//vc/rVy5UsHBwfryyy81f/58rV+//rbGbN++vXr37q2vv/5aX375pQzDoDAOAAAAIFczGTn9s2MAAAAAAIBbiIyM1IIFC257hvLdemwAAAAAQO7APcYBAAAAAAAAAAAAAHkahXEAAAAAAAAAAAAAQJ5GYRwAAAAAAAAAAAAAkKdxj3EAAAAAAAAAAAAAQJ7GjHEAAAAAAAAAAAAAQJ5GYRzAHdO7d2+VL19efn5+Klu2rPr166fU1FRJUtOmTeXl5SUfHx/r48SJEzccKzExUc8++6z8/PwUEBCgUaNG2ezfvn27GjVqJD8/P91zzz364osvrPtSUlLUtGlTlSpVSn5+frrvvvv0ySefOOdFAwAAAAAAAAAAwOXyfWHcMAwlJiaKFeUB53v11Ve1b98+JSYmateuXdq1a5cmTpxo3T9hwgQlJSVZH2XKlLnhWL1799b58+d17Ngx/fjjj/r000+txe+LFy/q8ccfV7du3XThwgV99dVX6t27t3766SdJkoeHh6ZOnaoTJ04oMTFRCxcu1NChQ/Xjjz869wQAAAAAAAAAAADAJfJ9YfzSpUvy9/fXpUuXXB0FyPOqV68ub29vSf/+UYqbm5sOHDhg9ziXL1/W119/rdGjR6tIkSKqWrWqevfurc8//1yS9PPPP8vLy0svv/yy3N3dVa9ePbVv316fffaZJMnd3V01a9aUh4eHJMlkMslkMungwYMOeqUAAAAAAAAAAADITfJ9YRzAnTV+/Hj5+PioVKlS2rVrl3r37m3dN3r0aBUrVky1a9e2Wfr8v/bv36/U1FSFhoZa20JDQ7V7925JUkZGRqZVIDIyMqz7r3niiSdUsGBBBQcHKyAgQO3atXPAKwQAAAAAAAAAAEBuQ2EcwB01ePBgJSUlKT4+Xi+//LI
CAwMlSePGjdOhQ4d0+vRpjR8/Xr1799aiRYuyHCMpKUne3t7WGd+SVKRIEevKD/Xr11dycrI+/PBDXb16VZs2bdKiRYuUmJhoM853332n5ORkrV+/Xh06dFChQoWc9KoBAAAAAAAAAADgShTGAbhE9erVFRISosjISEn/FrP9/f1VoEABhYeHq2fPnpo3b16Wz/Xx8dHly5eVlpZmbUtISJCvr68kqXjx4lq2bJnmzp2rwMBADR48WFFRUSpevHimsdzd3dWkSROdPn1a77zzjuNfKAAAAAAAAAAAAFyOwjgAl7l69eoN7zHu5nbjX0/VqlVTgQIFtGvXLmtbXFycatasad1u2LChfv75Z507d04//vijTp06pSZNmtxWFgAAAAAAAAAAANzd8m1h3GKxKDg4WGFhYa6OAuQLSUlJmjFjhi5evCjDMPTrr79q9OjRCg8P18WLF/X999/r8uXLSk9PV0xMjKZNm6YOHTpYnx8ZGWmdXV64cGF17txZQ4cOVUJCgg4cOKCpU6fqxRdftPbfuXOnUlJS9M8//+jTTz/V+vXr1a9fP0n/FtFXr16tf/75R2lpaVq+fLnmzJmj8PDwO3lKAAAAAAAAAAAAcIeYDMMwXB3ClRITE+Xv76+EhAT5+fm5Og6QZyUnJ6tt27basWOHUlJSVKpUKXXo0EEjR45UcnKynnjiCe3du1eSFBQUpH79+ql79+7W5z/yyCPq0qWLevToIenfz27Pnj313XffqVChQurVq5eGDRtm7R8VFaVFixYpLS1NDRo00Pvvv68aNWpIkrZt26ZXXnlF+/fvl8lkUlBQkF599VX17NnzDp4RAAAAAAAAAAAA3CkUximMA7leSkqKatWqpd9++00FChRwdRwAAAAAAAAAAADcZTxcHQAAbsXLy0v79+93dQwAAAAAAAAAAADcpfLtPcYBAAAAAAAAAAAAAPkDhXEAAAAAAAAAAAAAQJ6WbwvjFotFwcHBCgsLc3UUAAAAAAAAAAAAAIATmQzDMFwdwpUSExPl7++vhIQE+fn5uToOAAAAAAAAAAAAAMDB8u2McQAAAAAAAAAAAABA/uDh6gDIuY4R0a6OAAC4wxbMGubqCAAAAAAAAAAA3DWYMQ4AAAAAAAAAAAAAyNMojAMAAAAAAAAAAAAA8jQK4wAAAAAAAAAAAACAPC3fFsYtFouCg4MVFhbm6igAAAAAAAAAAAAAACfKt4Vxs9ms+Ph4xcbGujoKAAAAAAAAAAAAAMCJ8m1hHAAAAAAAAAAAAACQP1AYBwAAAAAAAAAAAADkaRTGAQAAAAAAAAAAAAB5GoVxAAAAAAAAAAAAAECeRmEcAAAAAAAAAAAAAJCnURgHAAAAAAAAAAAAAORpFMYBAAAAAAAAAAAAAHkahXEAAAAAAAAAAAAAQJ5GYRwAAAAAAAAAAAAAkKfl28K4xWJRcHCwwsLCXB0FAAAAAAAAAAAAAOBE+bYwbjabFR8fr9jYWFdHAQAAAAAAAAAAAAA4Ub4tjAMAAAAAAAAAAAAA8gcK4wAAAAAAAAAAAACAPI3COAAAAAAAAAAAAAAgT6MwDgAAAAAAAAAAAADI0yiMAwAAAAAAAAAAAADyNArjAAAAAAAAAAAAAIA8jcI4AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPI0CuMAAAAAAAAAAAAAgDwtzxTGL1++rIoVK2rgwIGujgIAAAAAAAAAAAAAyEXyTGF8zJgxeuihh1wdAwAAAAAAAAAAAACQy+SJwviBAwe0b98+tWrVytVRAAAAAAAAAAAAAAC5jMsL4xs3blSbNm1UpkwZmUwmLV68OFMfi8WioKAgFSxYUPXq1dPWrVtt9g8cOFDjxo27Q4kBAAAAAAAAAAAAAHcTlxfGk5OTFRISIovFkuX+efPmqX///ho+fLh27NihkJAQhYeH68yZM5KkJUuWqGrVqqpateqdjA0AAAAAAAAAAAA
AuEt4uDpAq1atbroE+qRJk9SjRw9FRUVJkqZNm6bly5dr+vTpGjx4sH755Rd9/fXXmj9/vpKSknT16lX5+flp2LBhWY6XkpKilJQU63ZiYqJjXxAAAAAAAAAAAAAAIFdx+Yzxm0lNTdX27dvVokULa5ubm5tatGihzZs3S5LGjRun48eP6+jRo3r33XfVo0ePGxbFr/X39/e3PsqXL+/01wEAAAAAAAAAAAAAcJ1cXRg/e/as0tPTFRAQYNMeEBCgU6dO3daYQ4YMUUJCgvVx/PhxR0QFAAAAAAAAAAAAAORSLl9K3ZEiIyNv2cfLy0teXl7ODwMAAAAAAAAAAAAAyBVy9YzxEiVKyN3dXadPn7ZpP336tAIDA3M0tsViUXBwsMLCwnI0DgAAAAAAAAAAAAAgd8vVhXFPT0/VqVNHMTEx1raMjAzFxMSofv36ORrbbDYrPj5esbGxOY0JAAAAAAAAAAAAAMjFXL6UelJSkg4ePGjdPnLkiOLi4lSsWDFVqFBB/fv3V0REhOrWrasHH3xQkydPVnJysqKiolyYGgAAAAAAAAAAAABwt3B5YXzbtm1q1qyZdbt///6SpIiICM2cOVOdO3fW33//rWHDhunUqVMKDQ3VihUrFBAQkKPjWiwWWSwWpaen52gcAAAAAAAAAAAAAEDuZjIMw3B1CFdKTEyUv7+/EhIS5Ofn5+o4t6VjRLSrIwAA7rAFs4a5OgIAAAAAAAAAAHeNXH2PcQAAAAAAAAAAAAAAcorCOAAAAAAAAAAAAAAgT8u3hXGLxaLg4GCFhYW5OgoAAAAAAAAAAAAAwInybWHcbDYrPj5esbGxro4CAAAAAAAAAAAAAHCifFsYBwAAAAAAAAAAAADkDxTGAQAAAAAAAAAAAAB5mt2F8R07dujXX3+1bi9ZskRt27bVm2++qdTUVIeGcybuMQ4AAAAAAAAAAAAA+YPdhfGePXvq999/lyQdPnxYzzzzjAoXLqz58+fr9ddfd3hAZ+Ee4wAAAAAAAAAAAACQP9hdGP/9998VGhoqSZo/f74aN26suXPnaubMmfr2228dnQ8AAAAAAAAAAAAAgByxuzBuGIYyMjIkSWvWrNHjjz8uSSpfvrzOnj3r2HQAAAAAAAAAAAAAAOSQ3YXxunXravTo0Zo9e7Y2bNig1q1bS5KOHDmigIAAhwcEAAAAAAAAAAAAACAn7C6Mv//++9qxY4d69eqlt956S1WqVJEkLViwQA0aNHB4QGexWCwKDg5WWFiYq6MAAAAAAAAAAAAAAJzIZBiG4YiBrly5Ig8PD3l4eDhiuDsmMTFR/v7+SkhIkJ+fn6vj3JaOEdGujgAAuMMWzBrm6ggAAAAAAAAAANw17J4xfs899+jcuXOZ2q9cuaKqVas6JBQAAAAAAAAAAAAAAI5id2H86NGjSk9Pz9SekpKiP//80yGhAAAAAAAAAAAAAABwlGyve7506VLrzytXrpS/v791Oz09XTExMapUqZJj0wEAAAAAAAAAAAAAkEPZLoy3bdtWkmQymRQREWGzr0CBAgoKCtJ7773n0HAAAAAAAAAAAAAAAORUtgvjGRkZkqRKlSopNjZWJUqUcFqoO8FischisWS5LDwAAAAAAAAAAAAAIO+w+x7jR44cueuL4pJkNpsVHx+v2NhYV0cBAAAAAAAAAAAAADhRtmeMXy8mJkYxMTE6c+aMdSb5NdOnT3dIMAAAAAAAAAAAAAAAHMHuwvjIkSMVHR2tunXrqnTp0jKZTM7IBQAAAAAAAAAAAACAQ9hdGJ82bZpmzpyp5557zhl5AAAAAAAAAAAAAABwKLvvMZ6amqoGDRo4IwsAAAAAAAAAAAAAAA5nd2H8xRdf1Ny5c52RBQAAAAAAAAAAAAAAh7N7KfUrV67ok08+0Zo1a1SrVi0VKFDAZv+kSZMcFs6ZLBaLLBaL0tPTXR0FAAAAAAAAAAAAAOBEdhfGd+/erdDQUEnSb7/9ZrPPZDI5JNSdYDa
bZTablZiYKH9/f1fHAQAAAAAAAAAAAAA4id2F8XXr1jkjBwAAAAAAAAAAAAAATmH3PcYBAAAAAAAAAAAAALib2D1jXJK2bdumb775RseOHVNqaqrNvoULFzokGAAAAAAAAAAAAAAAjmD3jPGvv/5aDRo00N69e7Vo0SJdvXpVe/bs0dq1a7lXNwAAAAAAAAAAAAAg17G7MD527Fi9//77WrZsmTw9PTVlyhTt27dPnTp1UoUKFZyREQAAAAAAAAAAAACA22Z3YfzQoUNq3bq1JMnT01PJyckymUx67bXX9Mknnzg8IAAAAAAAAAAAAAAAOWF3Ybxo0aK6dOmSJKls2bL67bffJEkXL17U5cuXHZsOAAAAAAAAAAAAAIAc8rD3CY0bN9bq1atVs2ZNPf300+rbt6/Wrl2r1atXq3nz5s7ICAAAAAAAAAAAAADAbbO7MP7hhx/qypUrkqS33npLBQoU0M8//6wOHTro7bffdnhAAAAAAAAAAAAAAABywu7CeLFixaw/u7m5afDgwQ4NdKdYLBZZLBalp6e7OgoAAAAAAAAAAAAAwImyVRhPTEyUn5+f9eebudYvtzObzTKbzUpMTJS/v7+r4wAAAAAAAAAAAAAAnCRbhfGiRYvq5MmTKlWqlIoUKSKTyZSpj2EYMplMzMAGAAAAAAAAAAAAAOQq2SqMr1271rqE+rp165waCAAAAAAAAAAAAAAAR8pWYbxJkyaSpLS0NG3YsEHdu3dXuXLlnBoMAAAAAAAAAAAAAABHcLOns4eHh9555x2lpaU5Kw8AAAAAAAAAAAAAAA5lV2Fckh555BFt2LDBGVkAAAAAAAAAAAAAAHC4bC2lfr1WrVpp8ODB+vXXX1WnTh15e3vb7H/yyScdFg4AAAAAAAAAAAAAgJyyuzD+6quvSpImTZqUaZ/JZFJ6enrOUwEAAAAAAAAAAAAA4CB2F8YzMjKckQMAAAAAAAAAAAAAAKew+x7jAAAAAAAAAAAAAADcTeyeMS5JycnJ2rBhg44dO6bU1FSbfX369HFIMAAAAAAAAAAAAAAAHMHuwvjOnTv1+OOP6/Lly0pOTlaxYsV09uxZFS5cWKVKlbrjhfGLFy+qRYsWSktLU1pamvr27asePXrc0QwAAAAAAAAAAAAAgNzL7qXUX3vtNbVp00YXLlxQoUKF9Msvv+iPP/5QnTp19O677zoj4035+vpq48aNiouL05YtWzR27FidO3fujucAAAAAAAAAAAAAAOROdhfG4+LiNGDAALm5ucnd3V0pKSkqX768Jk6cqDfffNMZGW/K3d1dhQsXliSlpKTIMAwZhnHHcwAAAAAAAAAAAAAAcie7C+MFChSQm9u/TytVqpSOHTsmSfL399fx48ftDrBx40a1adNGZcqUkclk0uLFizP1sVgsCgoKUsGCBVWvXj1t3brVZv/FixcVEhKicuXKadCgQSpRooTdOQAAAAAAAAAAAAAAeZPdhfHatWsrNjZWktSkSRMNGzZMc+bMUb9+/XT//ffbHSA5OVkhISGyWCxZ7p83b5769++v4cOHa8eOHQoJCVF4eLjOnDlj7VOkSBHt2rVLR44c0dy5c3X69Gm7cwAAAAAAAAAAAAAA8ia7C+Njx45V6dKlJUljxoxR0aJF9corr+jvv//WJ598YneAVq1aafTo0WrXrl2W+ydNmqQePXooKipKwcHBmjZtmgoXLqzp06dn6hsQEKCQkBD9+OOPNzxeSkqKEhMTbR4AAAAAAAAAAAAAgLzL7sJ43bp11axZM0n/LqW+YsUKJSYmavv27QoJCXFouNTUVG3fvl0tWrSwtrm5ualFixbavHmzJOn06dO6dOmSJCkhIUEbN25UtWrVbjjmuHHj5O/vb32UL1/eoZkBAAAAAAAAAAAAALmL3YXx0aNH68iRI87IksnZs2eVnp6ugIAAm/aAgACdOnVKkvTHH3/o4YcfVkhIiB5++GH17t1bNWvWvOGYQ4YMUUJCgvV
xO/dFBwAAAAAAAAAAAADcPTzsfcL8+fM1fPhw1atXT926dVOnTp1UokQJZ2TLlgcffFBxcXHZ7u/l5SUvLy/nBQIAAAAAAAAAAAAA5Cp2zxjftWuXdu/eraZNm+rdd99VmTJl1Lp1a82dO1eXL192aLgSJUrI3d1dp0+ftmk/ffq0AgMDczS2xWJRcHCwwsLCcjQOAAAAAAAAAAAAACB3s7swLkk1atTQ2LFjdfjwYa1bt05BQUHq169fjovV/+Xp6ak6deooJibG2paRkaGYmBjVr18/R2ObzWbFx8crNjY2pzEBAAAAAAAAAAAAALmY3Uup/5e3t7cKFSokT09PXbp0ye7nJyUl6eDBg9btI0eOKC4uTsWKFVOFChXUv39/RUREqG7dunrwwQc1efJkJScnKyoqKqfRAQAAAAAAAAAAAAD5wG0Vxo8cOaK5c+dq7ty52r9/v5o0aaKRI0eqY8eOdo+1bds2NWvWzLrdv39/SVJERIRmzpypzp076++//9awYcN06tQphYaGasWKFQoICLid6FYWi0UWi0Xp6ek5GgcAAAAAAAAAAAAAkLuZDMMw7HnCQw89pNjYWNWqVUtdu3ZVly5dVLZsWWflc7rExET5+/srISFBfn5+ro5zWzpGRLs6AgDgDlswa5irIwAAAAAAAAAAcNewe8Z48+bNNX36dAUHBzsjDwAAAAAAAAAAAAAADmV3YXzMmDHOyAEAAAAAAAAAAAAAgFO4uTqAq1gsFgUHByssLMzVUQAAAAAAAAAAAAAATpRvC+Nms1nx8fGKjY11dRQAAAAAAAAAAAAAgBPl28I4AAAAAAAAAAAAACB/oDAOAAAAAAAAAAAAAMjTbqsw/uOPP6pbt26qX7++/vrrL0nS7Nmz9dNPPzk0nDNxj3EAAAAAAAAAAAAAyB/sLox/++23Cg8PV6FChbRz506lpKRIkhISEjR27FiHB3QW7jEOAAAAAAAAAAAAAPmD3YXx0aNHa9q0afr0009VoEABa3vDhg21Y8cOh4YDAAAAAAAAAAAAACCn7C6M79+/X40bN87U7u/vr4sXLzoiEwAAAAAAAAAAAAAADmN3YTwwMFAHDx7M1P7TTz/pnnvucUgoAAAAAAAAAAAAAAAcxe7CeI8ePdS3b19t2bJFJpNJJ06c0Jw5czRw4EC98sorzsjoFBaLRcHBwQoLC3N1FAAAAAAAAAAAAACAE3nY+4TBgwcrIyNDzZs31+XLl9W4cWN5eXlp4MCB6t27tzMyOoXZbJbZbFZiYqL8/f1dHQcAAAAAAAAAAAAA4CR2F8ZNJpPeeustDRo0SAcPHlRSUpKCg4Pl4+PjjHwAAAAAAAAAAAAAAOSI3YXxazw9PRUcHOzILAAAAAAAAAAAAAAAOJzdhfFmzZrJZDLdcP/atWtzFAgAAAAAAAAAAAAAAEeyuzAeGhpqs3316lXFxcXpt99+U0REhKNyOZ3FYpHFYlF6erqrowAAAAAAAAAAAAAAnMjuwvj777+fZfuIESOUlJSU40B3itlsltlsVmJiovz9/V0dBwAAAAAAAAAAAADgJG6OGqhbt26aPn26o4YDAAAAAAAAAAAAAMAhHFYY37x5swoWLOio4QAAAAAAAAAAAAAAcAi7l1Jv3769zbZhGDp58qS2bdumoUOHOiwYAAAAAAAAAAAAAACOYHdh/L/343Zzc1O1atUUHR2txx57zGHBAAAAAAAAAAAAAABwBLsL4zNmzHBGDgAAAAAAAAAAAAAAnMJh9xgHAAAAAAAAAAAAACA3snvGeNGiRWUymbLV9/z583YHulMsFossFovS09NdHQUAAAAAAAAAAAAA4ER2F8aHDh2q0aNHKzw8XPXr15ckbd68WStXrtTQoUNVrFgxh4d0BrPZLLPZrMTExEz3TQcAAAAAAAAAAAAA5B12F8Y3bdqk6Oho9erVy9rWp08fffjhh1qzZo0WL17syHwAAAAAAAAAAAAAAOSI3fcYX7lypVq
2bJmpvWXLllqzZo1DQgEAAAAAAAAAAAAA4Ch2F8aLFy+uJUuWZGpfsmSJihcv7pBQAAAAAAAAAAAAAAA4it1LqY8cOVIvvvii1q9fr3r16kmStmzZohUrVujTTz91eEAAAAAAAAAAAAAAAHLC7sJ4ZGSkqlevrg8++EALFy6UJFWvXl0//fSTtVAOAAAAAAAAAAAAAEBuYXdhXJLq1aunOXPmODoLAAAAAAAAAAAAAAAOl63CeGJiovz8/Kw/38y1fgAAAAAAAAAAAAAA5AbZKowXLVpUJ0+eVKlSpVSkSBGZTKZMfQzDkMlkUnp6usNDAgAAAAAAAAAAAABwu7JVGF+7dq2KFSsmSVq3bp1TAwEAAAAAAAAAAAAA4EjZKow3adIky5/vZhaLRRaLhRnuAAAAAAAAAAAAAJDHud3Oky5evKhVq1bpyy+/1BdffGHzuFuYzWbFx8crNjbW1VEAAAAAl/nwww9Vt25deXl5qW3btjb7EhMT9eyzz8rPz08BAQEaNWqUdd+ZM2fUtWtXlStXTn5+fqpdu7aWLl16h9MDAAAAAAAA2ZOtGePXW7Zsmbp27aqkpCT5+fnZ3G/cZDLp+eefd2hAAAAAAM5TpkwZvf3221qzZo3+/PNPm329e/fW+fPndezYMZ05c0YtWrRQxYoV9fzzzyspKUm1a9fWhAkTVKZMGS1fvlzPPPOMYmNjFRwc7KJXAwAAAAAAAGTN7hnjAwYMUPfu3ZWUlKSLFy/qwoUL1sf58+edkREAAACAk7Rv315t27ZViRIlbNovX76sr7/+WqNHj1aRIkVUtWpV9e7dW59//rkk6Z577tHAgQNVrlw5ubm5qU2bNqpWrZp++eUXV7wMAAAAAAAA4KbsLoz/9ddf6tOnjwoXLuyMPAAAAABygf379ys1NVWhoaHWttDQUO3evTvL/mfOnNHevXtVq1atO5QQAAAAAAAAyD67C+Ph4eHatm2bM7IAAAAAyCWSkpLk7e0tD4//u/tSkSJFdOnSpUx9U1NT9cwzz6hTp06qW7funYwJAAAAAAAAZIvd9xhv3bq1Bg0apPj4eNWsWVMFChSw2f/kk086LBwAAAAA1/Dx8dHly5eVlpZmLY4nJCTI19fXpl9qaqo6duyowoUL69NPP3VFVAAAAAAAAOCW7C6M9+jRQ5IUHR2daZ/JZFJ6enrOUwEAAABwqWrVqqlAgQLatWuX6tSpI0mKi4tTzZo1rX1SU1P19NNPKzU1VUuWLJGnp6er4gIAAAAAAAA3ZfdS6hkZGTd8UBQHAAAA7i5paWm6cuWK0tLSlJGRoStXrig1NVWFCxdW586dNXToUCUkJOjAgQOaOnWqXnzxRUnS1atX1alTJyUnJ2vx4sXy8vJy8SsBAAAAAAAAbszuwjgAAACAvGP06NEqVKiQxowZo2XLlqlQoUJ67LHHJEkffvih/P39Va5cOTVs2FAvvPCCnn/+eUnSzz//rCVLlmjTpk0qUaKEfHx85OPjo7Fjx7ry5QAAAAAAAABZMhmGYdjzhKyWUL/esGHDchToTktMTJS/v78SEhLk5+fn6ji3pWPEza8JACDvWTDr7vq+BQAAAAAAAADAley+x/iiRYtstq9evaojR47Iw8NDlStXvusK4wAAAAAAAAAAAACAvM3uwvjOnTsztSUmJioyMlLt2rVzSCh7HD9+XM8995zOnDkjDw8PDR06VE8//fQdzwEAAAAAAAAAAAAAyJ0cco9xPz8/jRw5UkOHDnXEcHbx8PDQ5MmTFR8fr1WrVqlfv35KTk6+4zkAAAAAAAAAAAAAALmT3TPGbyQhIUEJCQmOGi7bSpcurdKlS0uSAgMDVaJECZ0/f17e3t53PAsAAAAAAAAAAAAAIPexuzD+wQcf2GwbhqGTJ09q9uzZatWqld0BNm7cqHfeeUfbt2/XyZMntWjRIrVt29amj8Vi0TvvvKNTp04pJCREU6dO1YMPPphprO3btys9PV3ly5e3Owc
AAAAAAAAAAAAAIG+yuzD+/vvv22y7ubmpZMmSioiI0JAhQ+wOkJycrJCQEHXv3l3t27fPtH/evHnq37+/pk2bpnr16mny5MkKDw/X/v37VapUKWu/8+fP6/nnn9enn35qdwYAAAAAAAAAAAAAQN5ld2H8yJEjDg3QqlWrm840nzRpknr06KGoqChJ0rRp07R8+XJNnz5dgwcPliSlpKSobdu2Gjx4sBo0aHDT46WkpCglJcW6nZiY6IBXAQAAAAAAAAAAAADIrRx2j3FnSE1N1fbt221moru5ualFixbavHmzpH+Xco+MjNQjjzyi55577pZjjhs3TiNHjnRaZgAAkLd1jIh2dQQAwB22YNYwV0cAAAAAAAA55ObqADdz9uxZpaenKyAgwKY9ICBAp06dkiRt2rRJ8+bN0+LFixUaGqrQ0FD9+uuvNxxzyJAhSkhIsD6OHz/u1NcAAAAAAAAAAAAAAHCtXD1jPDsaNWqkjIyMbPf38vKSl5eXExMBAAAAAAAAAAAAAHKTXD1jvESJEnJ3d9fp06dt2k+fPq3AwMAcjW2xWBQcHKywsLAcjQMAAAAAAAAAAAAAyN2yVRh/4IEHdOHCBUlSdHS0Ll++7NRQ13h6eqpOnTqKiYmxtmVkZCgmJkb169fP0dhms1nx8fGKjY3NaUwAAAAAAAAAAAAAQC6WrcL43r17lZycLEkaOXKkkpKSHBYgKSlJcXFxiouLkyQdOXJEcXFxOnbsmCSpf//++vTTTzVr1izt3btXr7zyipKTkxUVFeWwDAAAAAAAAAAAAACAvCtb9xgPDQ1VVFSUGjVqJMMw9O6778rHxyfLvsOGDbMrwLZt29SsWTPrdv/+/SVJERERmjlzpjp37qy///5bw4YN06lTpxQaGqoVK1YoICDAruP8l8VikcViUXp6eo7GAQAAAAAAAAAAAADkbibDMIxbddq/f7+GDx+uQ4cOaceOHQoODpaHR+aauslk0o4dO5wS1FkSExPl7++vhIQE+fn5uTrObekYEe3qCACAO2zBLPv+EA2Ow/cuAOQ/fO8CAAAAAHD3y9aM8WrVqunrr7+WJLm5uSkmJkalSpVyajAAAAAAAAAAAAAAABwhW4Xx62VkZDgjBwAAAAAAAAAAAAAATmF3YVySDh06pMmTJ2vv3r2SpODgYPXt21eVK1d2aDhn4h7jAAAAAAAAAAAAAJA/uNn7hJUrVyo4OFhbt25VrVq1VKtWLW3ZskU1atTQ6tWrnZHRKcxms+Lj4xUbG+vqKAAAAAAAAAAAAAAAJ7J7xvjgwYP12muvafz48Zna33jjDT366KMOCwcAAAAAAAAAAAAAQE7ZPWN87969euGFFzK1d+/eXfHx8Q4JBQAAAAAAAAAAAACAo9hdGC9ZsqTi4uIytcfFxalUqVKOyHRHWCwWBQcHKywszNVRAAAAAAAAAAAAAABOZPdS6j169NBLL72kw4cPq0GDBpKkTZs2acKECerfv7/DAzqL2WyW2WxWYmKi/P39XR0HAAAAAAAAAAAAAOAkdhfGhw4dKl9fX7333nsaMmSIJKlMmTIaMWKE+vTp4/CAAAAAAAAAAAAAAADkhN2FcZPJpNdee02vvfaaLl26JEny9fV1eDAAAAAAAAAAAAAAABzB7nuMX8/X1/euLYpzj3EAAAAAAAAAAAAAyB9yVBi/m5nNZsXHxys2NtbVUQAAAAAAAAAAAAAATpRvC+MAAAAAAAAAAAAAgPyBwjgAAAAAAAAAAAAAIE+zqzB+9epVNW/eXAcOHHBWHgAAAAAAAAAAAAAAHMquwniBAgW0e/duZ2UBAAAAAAAAAAAAAMDh7F5KvVu3bvr888+dkeWOslgsCg4OVlhYmKujAAAAAAAAAAAAAACcyMPeJ6SlpWn69Olas2aN6tSpI29vb5v9kyZNclg4ZzKbzTKbzUpMTJS/v7+r4wAAAAAAAAAAAAAAnMTuwvhvv/2mBx54QJL0+++/2+wzmUyOSQU
AAAAAAAAAAAAAgIPYXRhft26dM3IAAAAAAAAAAAAAAOAUdt9j/JqDBw9q5cqV+ueffyRJhmE4LBQAAAAAAAAAAAAAAI5id2H83Llzat68uapWrarHH39cJ0+elCS98MILGjBggMMDAgAAAAAAAAAAAACQE3YXxl977TUVKFBAx44dU+HCha3tnTt31ooVKxwaDgAAAAAAAAAAAACAnLL7HuOrVq3SypUrVa5cOZv2e++9V3/88YfDggEAAAAAAAAAAAAA4Ah2zxhPTk62mSl+zfnz5+Xl5eWQUHeCxWJRcHCwwsLCXB0FAAAAAAAAAAAAAOBEdhfGH374YX3xxRfWbZPJpIyMDE2cOFHNmjVzaDhnMpvNio+PV2xsrKujAAAAAAAAAAAAAACcyO6l1CdOnKjmzZtr27ZtSk1N1euvv649e/bo/Pnz2rRpkzMyAgAAAAAAAAAAAABw2+yeMX7//ffr999/V6NGjfTUU08pOTlZ7du3186dO1W5cmVnZAQAAAAAAAAAAAAA4LbZPWNckvz9/fXWW285OgsAAAAAAAAAAAAAAA53W4XxCxcu6PPPP9fevXslScHBwYqKilKxYsUcGg4AAAAAAAAAAAAAgJyyeyn1jRs3KigoSB988IEuXLigCxcu6IMPPlClSpW0ceNGZ2QEAAAAAAAAAAAAAOC22T1j3Gw2q3Pnzvr444/l7u4uSUpPT9err74qs9msX3/91eEhAQAAAAAAAAAAAAC4XXbPGD948KAGDBhgLYpLkru7u/r376+DBw86NBwAAAAAAAAAAAAAADlld2H8gQcesN5b/Hp79+5VSEiIQ0IBAAAAAAAAAAAAAOAo2VpKfffu3daf+/Tpo759++rgwYN66KGHJEm//PKLLBaLxo8f75yUAAAAAAAAAAAAAADcpmwVxkNDQ2UymWQYhrXt9ddfz9Tv2WefVefOnR2XzoksFossFovS09NdHQUAAAAAAAAAAAAA4ETZKowfOXLE2TnuOLPZLLPZrMTERPn7+7s6DgAAAAAAAAAAAADASbJVGK9YsaKzcwAAAAAAAAAAAAAA4BTZKoz/14kTJ/TTTz/pzJkzysjIsNnXp08fhwQDAAAAAAAAAAAAAMAR7C6Mz5w5Uz179pSnp6eKFy8uk8lk3WcymSiMAwAAAAAAAAAAAAByFbsL40OHDtWwYcM0ZMgQubm5OSMTAAAAAAAAAAAAAAAOY3dl+/Lly3rmmWcoigMAAAAAAAAAAAAA7gp2V7dfeOEFzZ8/3xlZAAAAAAAAAAAAAABwOLuXUh83bpyeeOIJrVixQjVr1lSBAgVs9k+aNMlh4QAAAAAAAAAAAAAAyKnbKoyvXLlS1apVkySZTCbrvut/BgAAAAAAAAAAAAAgN7C7MP7ee+9p+vTpioyMdEIcAAAAAAAAAAAAAAAcy+57jHt5ealhw4bOyAIAAAAAAAAAAAAAgMPZXRjv27evpk6d6owst61du3YqWrSoOnbs6OooAAAAAAAAAAAAAIBcxu6l1Ldu3aq1a9fqu+++U40aNVSgQAGb/QsXLnRYuOzq27evunfvrlmzZt3xYwMAAAAAAAAAAAAAcje7C+NFihRR+/btnZHltjVt2lTr1693dQwAAAAAAAAAAAAAQC5kd2F8xowZDg2wceNGvfPOO9q+fbtOnjypRYsWqW3btjZ9LBaL3nnnHZ06dUohISGaOnWqHnzwQYfmAAAAAAAAAAAAAADkTXbfY9zRkpOTFRISIovFkuX+efPmqX///ho+fLh27NihkJAQhYeH68yZM3c4KQAAAAAAAAAAAADgbmT3jPFKlSrJZDLdcP/hw4ftGq9Vq1Zq1arVDfdPmjRJPXr0UFRUlCRp2rRpWr58uaZPn67BgwfbdSxJSklJUUpKinU7MTHR7jEAAAAAAAAAAAAAAHcPuwvj/fr1s9m+evWqdu7cqRUrVmjQoEGOyiVJSk1N1fbt2zVkyBBrm5ubm1q0aKHNmzff1pj
jxo3TyJEjHRURAAAAAAAAAAAAAJDL2V0Y79u3b5btFotF27Zty3Gg6509e1bp6ekKCAiwaQ8ICNC+ffus2y1atNCuXbuUnJyscuXKaf78+apfv36WYw4ZMkT9+/e3bicmJqp8+fIOzQ0AAAAAAAAAAAAAyD3sLozfSKtWrTRkyBDNmDHDUUNm25o1a7Ld18vLS15eXk5MAwAAAAAAAAAAAADITdwcNdCCBQtUrFgxRw0nSSpRooTc3d11+vRpm/bTp08rMDAwR2NbLBYFBwcrLCwsR+MAAAAAAAAAAAAAAHI3u2eM165dWyaTybptGIZOnTqlv//+Wx999JFDw3l6eqpOnTqKiYlR27ZtJUkZGRmKiYlRr169cjS22WyW2WxWYmKi/P39HZAWAAAAAAAAAAAAAJAb2V0Yv1agvsbNzU0lS5ZU06ZNdd9999kdICkpSQcPHrRuHzlyRHFxcSpWrJgqVKig/v37KyIiQnXr1tWDDz6oyZMnKzk5WVFRUXYfCwAAAAAAAAAAAACQ/9hdGB8+fLhDA2zbtk3NmjWzbvfv31+SFBERoZkzZ6pz5876+++/NWzYMJ06dUqhoaFasWKFAgICcnRci8Uii8Wi9PT0HI0DAAAAAAAAAAAAAMjdTIZhGK4O4UrXllJPSEiQn5+fq+Pclo4R0a6OAAC4wxbMGubqCPkW37sAkP/wvQsAAAAAwN0v2zPG3dzcbO4tnhWTyaS0tLQchwIAAAAAAAAAAAAAwFGyXRhftGjRDfdt3rxZH3zwgTIyMhwS6k5gKXUAAAAAAAAAAAAAyB+yXRh/6qmnMrXt379fgwcP1rJly9S1a1dFR989S4uazWaZzWbrUuoAAAAAAAAAAAAAgLzJ7XaedOLECfXo0UM1a9ZUWlqa4uLiNGvWLFWsWNHR+QAAAAAAAAAAAAAAyBG7CuMJCQl64403VKVKFe3Zs0cxMTFatmyZ7r//fmflAwAAAAAAAAAAAAAgR7K9lPrEiRM1YcIEBQYG6quvvspyaXUAAAAAAAAAAAAAAHKbbBfGBw8erEKFCqlKlSqaNWuWZs2alWW/hQsXOiycM1ksFlksFqWnp7s6CgAAAAAAAAAAAADAibJdGH/++edlMpmcmeWOMpvNMpvNSkxMlL+/v6vjAAAAAAAAAAAAAACcJNuF8ZkzZzoxBgAAAAAAAABJ6t27txYvXqyEhAT5+vrq6aef1sSJE+Xp6enqaAAAAMBdy83VAQAAAAAAAAD8n1dffVX79u1TYmKidu3apV27dmnixImujgUAAADc1fJtYdxisSg4OFhhYWGujgIAAAAAAABYVa9eXd7e3pIkwzDk5uamAwcOuDgVAAAAcHfLt4Vxs9ms+Ph4xcbGujoKAAAAAAAAYGP8+PHy8fFRqVKltGvXLvXu3dvVkQAAAIC7Wr4tjAMAAAAAAAC51eDBg5WUlKT4+Hi9/PLLCgwMdHUkAAAA4K5GYRwAAAAAAADIpapXr66QkBBFRka6OgoAAABwV6MwDgAAAAAAAORiV69e5R7jAAAAQA5RGAcAAAAAAAByiaSkJM2YMUMXL16UYRj69ddfNXr0aIWHh7s6GgAAAHBXy7eFcYvFouDgYIWFhbk6CgAAAAAAACBJMplMmjt3ripXrixfX1899dRTat26tSZPnuzqaAAAAMBdzcPVAVzFbDbLbDYrMTFR/v7+ro4DAAAAAAAAyNvbW6tXr3Z1DAAAACDPybczxgEAAAAAAAAAAAAA+QOFcQAAAAAAAAAAAABAnkZhHAAAAAAAAAAAAACQp1EYBwAAAAAAAAAAAADkaRTGAQAAAAAAAAAAAAB5GoVxAAAAAAAAAAAAAECe5uHqAK5isVhksViUnp7u6igAAAAAAOAGOkZEuzoCAOAOWzBrmKsjAACAPCjfzhg3m82Kj49XbGysq6MAAAAAAAAAAAAAAJwo3xbGAQAAAAAAAAAAAAD5A4VxAAAAAAAAAAAAAECeRmEcAAAAAAA
AAAAAAJCnURgHAAAAAAAAAAAAAORpFMYBAAAAAAAAAAAAAHkahXEAAAAAAAAAAAAAQJ5GYRwAAAAAAAAAAAAAkKdRGAcAAAAAAAAAAAAA5GkUxgEAAAAAAAAAAAAAeVq+LYxbLBYFBwcrLCzM1VEAAAAAAAAAAAAAAE6UbwvjZrNZ8fHxio2NdXUUAAAAAAAAAAAAAIAT5dvCOAAAAAAAAAAAAPK3Dz/8UHXr1pWXl5fatm3r6jgAnMjD1QEAAAAAAAAAAAAAVyhTpozefvttrVmzRn/++aer4wBwIgrjAAAAAAAAAAAAyJfat28vSYqLi6MwDuRxLKUOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPI0CuMAAAAAAAAAAAAAgDyNwjgAAAAAAAAAAAAAIE/zcHUAAAAAAAAAAAAAwBXS0tKsj4yMDF25ckVubm7y9PR0dTQADkZhHAAAAAAAAAAAAPnS6NGjNXLkSOt2oUKF1KRJE61fv951oQA4BUupAwAAAAAAAAAAIF8aMWKEDMOweVAUB/KmPFEY/+6771StWjXde++9+uyzz1wdBwAAAAAAAAAAAACQi9z1S6mnpaWpf//+Wrdunfz9/VWnTh21a9dOxYsXd3U0AAAAAAAAAAAAAEAucNfPGN+6datq1KihsmXLysfHR61atdKqVatcHQsAAAAAAAAAAAAAkEu4vDC+ceNGtWnTRmXKlJHJZNLixYsz9bFYLAoKClLBggVVr149bd261brvxIkTKlu2rHW7bNmy+uuvv+5EdAAAAAAAAAAAAADAXcDlhfHk5GSFhITIYrFkuX/evHnq37+/hg8frh07digkJETh4eE6c+bMHU4KAAAAAAAAAAAAALgbubww3qpVK40ePVrt2rXLcv+kSZPUo0cPRUVFKTg4WNOmTVPhwoU1ffp0SVKZMmVsZoj/9ddfKlOmzA2Pl5KSosTERJsHAAAAAAAAAAAAACDv8nB1gJtJTU3V9u3bNWTIEGubm5ubWrRooc2bN0uSHnzwQf3222/666+/5O/vrx9++EFDhw694Zjjxo3TyJEjnZ4dAAAAAAAAAIC7TceIaFdHAADcYQtmDXN1hDvC5TPGb+bs2bNKT09XQECATXtAQIBOnTolSfLw8NB7772nZs2aKTQ0VAMGDFDx4sVvOOaQIUOUkJBgfRw/ftyprwEAAAAAAAAAAAAA4Fq5esZ4dj355JN68skns9XXy8tLXl5eTk4EAAAAAAAAAAAAAMgtcvWM8RIlSsjd3V2nT5+2aT99+rQCAwNzNLbFYlFwcLDCwsJyNA4AAAAAAAAAAAAAIHfL1YVxT09P1alTRzExMda2jIwMxcTEqH79+jka22w2Kz4+XrGxsTmNCQAAAAAAAAAAAADIxVy+lHpSUpIOHjxo3T5y5Iji4uJUrFgxVahQQf3791dERITq1q2rBx98UJMnT1ZycrKioqJcmBoAAAAAAAAAAAAAcLdweWF827ZtatasmXW7f//+kqSIiAjNnDlTnTt31t9//61hw4bp1KlTCg0N1YoVKxQQEJCj41osFlksFqWnp+doHAAAAAAAAAAAAABA7ubywnjTpk1lGMZN+/Tq1Uu9evVy6HHNZrPMZrMSExPl7+/v0LEBAAAAAAAAAAAAALlHrr7HOAAAAAAAAAAAAAAAOZVvC+MWi0XBwcEKCwtzdRQAAAAAAAAAAAAAgBPl28K42WxWfHy8YmNjXR0FAAAAAAAAAAAAAOBE+bYwDgAAAAAAAAAAAADIHyiMAwAAAAAAAAAAAADyNA9XB3AVi8Uii8WitLQ0SVJiYqKLE92+q6lXXB0BAHCH3c3fW3c7vncBIP/he9e1+O4FgPyH717X4rsXAPKfvPDd6+vrK5PJdNM+JsMwjDuUJ1f6888/Vb58eVfHAAAAAAAAAAAAAADchoSEBPn5+d20T74vjGdkZOjEiRPZ+isCALlHYmKiypcvr+PHj9/yFx0AAMg5vnsBALi
z+O4FAODO4rsXuLtlp9abb5dSv8bNzU3lypVzdQwAt8nPz4//SAEA4A7iuxcAgDuL714AAO4svnuBvMvN1QEAAAAAAAAAAAAAAHAmCuMAAAAAAAAAAAAAgDyNwjiAu5KXl5eGDx8uLy8vV0cBACBf4LsXAIA7i+9eAADuLL57gbzPZBiG4eoQAAAAAAAAAAAAAAA4CzPGAQAAAAAAAAAAAAB5GoVxAAAAAAAAAAAAAECeRmEcAAAAAAAAAAAAAJCnURgHcFfZuHGj2rRpozJlyshkMmnx4sWujgQAQJ42btw4hYWFydfXV6VKlVLbtm21f/9+V8cCACDPunTpkvr166eKFSuqUKFCatCggWJjY10dCwCAPOFW/74cGRkpk8lk82jZsqVrwgJwOArjAO4qycnJCgkJkcVicXUUAADyhQ0bNshsNuuXX37R6tWrdfXqVT322GNKTk52dTQAAPKkF198UatXr9bs2bP166+/6rHHHlOLFi30119/uToaAAB3vez8+3LLli118uRJ6+Orr766gwkBOJPJMAzD1SEA4HaYTCYtWrRIbdu2dXUUAADyjb///lulSpXShg0b1LhxY1fHAQAgT/nnn3/k6+urJUuWqHXr1tb2OnXqqFWrVho9erQL0wEAkLdk9e/LkZGRunjxIiuVAnkUM8YBAAAAZFtCQoIkqVixYi5OAgBA3pOWlqb09HQVLFjQpr1QoUL66aefXJQKAID8Zf369SpVqpSqVaumV155RefOnXN1JAAOQmEcAAAAQLZkZGSoX79+atiwoe6//35XxwEAIM/x9fVV/fr1NWrUKJ04cULp6en68ssvtXnzZp08edLV8QAAyPNatmypL774QjExMZowYYI2bNigVq1aKT093dXRADiAh6sDAAAAALg7mM1m/fbbb8xYAwDAiWbPnq3u3burbNmycnd31wMPPKAuXbpo+/btro4GAECe98wzz1h/rlmzpmrVqqXKlStr/fr1at68uQuTAXAEZowDAAAAuKVevXrpu+++07p161SuXDlXxwEAIM+qXLmyNmzYoKSkJB0/flxbt27V1atXdc8997g6GgAA+c4999yjEiVK6ODBg66OAsABKIwDAAAAuCHDMNSrVy8tWrRIa9euVaVKlVwdCQCAfMHb21ulS5fWhQsXtHLlSj311FOujgQAQL7z559/6ty5cypdurSrowBwAJZSB3BXSUpKsvnrvCNHjiguLk7FihVThQoVXJgMAIC8yWw2a+7cuVqyZIl8fX116tQpSZK/v78KFSrk4nQAAOQ9K1eulGEYqlatmg4ePKhBgwbpvvvuU1RUlKujAQBw17vZvy8XK1ZMI0eOVIcOHRQYGKhDhw7p9ddfV5UqVRQeHu7C1AAcxWQYhuHqEACQXevXr1ezZs0ytUdERGjmzJl3PhAAAHmcyWTKsn3GjBmKjIy8s2EAAMgHvvnmGw0ZMkR//vmnihUrpg4dOmjMmDHy9/d3dTQAAO56N/v35Y8//lht27bVzp07dfHiRZUpU0aPPfaYRo0apYCAABekBeBoFMYBAAAAAAAAAAAAAHka9xgHAAAAAAAAAAAAAORpFMYBAAAAAAAAAAAAAHkahXEAAAAAAAAAAAAAQJ5GYRwAAAAAAAAAAAAAkKdRGAcAAAAAAAAAAAAA5GkUxgEAAAAAAAAAAAAAeRqFcQAAAAAAAAAAAABAnkZhHAAAAAAAAAAAAACQp1EYBwAAAAAXOHr0qEwmk+Li4lwdxWrfvn166KGHVLBgQYWGhro6jtNERkaqbdu2ro6RqwQFBWny5MkuOfbMmTNVpEgRh43XtGlT9evX744e83Y463dAbn5/r1+/XiaTSRcvXnR1FAAAAAD5EIVxAAAAAPlSZGSkTCaTxo8fb9O+ePFimUwmF6VyreHDh8vb21v79+9XTExMln2yU3TM7aZMmaKZM2fa9RxXFo7vhNjYWL300kvZ7p8bCss3snDhQo0aNcq6nVuvXfn
y5XXy5Endf//9ro5yxzRo0EAnT56Uv7+/Q8c1mUxavHixQ8cEAAAAkPdQGAcAAACQbxUsWFATJkzQhQsXXB3FYVJTU2/7uYcOHVKjRo1UsWJFFS9e/LbHMQxDaWlpt/18Z0lPT1dGRob8/f1zbVHXVUqWLKnChQu7OoZDFCtWTL6+vq6OcUvu7u4KDAyUh4eHq6PcEVevXpWnp6cCAwPz7R8fAQAAAHAtCuMAAAAA8q0WLVooMDBQ48aNu2GfESNGZFpWfPLkyQoKCrJuX1u6eOzYsQoICFCRIkUUHR2ttLQ0DRo0SMWKFVO5cuU0Y8aMTOPv27dPDRo0UMGCBXX//fdrw4YNNvt/++03tWrVSj4+PgoICNBzzz2ns2fPWvc3bdpUvXr1Ur9+/VSiRAmFh4dn+ToyMjIUHR2tcuXKycvLS6GhoVqxYoV1v8lk0vbt2xUdHS2TyaQRI0ZkGiMyMlIbNmzQlClTZDKZZDKZdPToUevyyD/88IPq1KkjLy8v/fTTTzp06JCeeuopBQQEyMfHR2FhYVqzZo3NmEFBQRo7dqy6d+8uX19fVahQQZ988ol1f2pqqnr16qXSpUurYMGCqlixos31unjxonr27KmAgADrOfzuu+8k/d+s5qVLlyo4OFheXl46duxYpqWmr53DXr16yd/fXyVKlNDQoUNlGIZ1/x9//KHXXnvN+rol6Y8//lCbNm1UtGhReXt7q0aNGvr++++zPP+SNHv2bNWtW1e+vr4KDAzUs88+qzNnzlj3X7hwQV27dlXJkiVVqFAh3Xvvvdb3zK3Ow7Fjx/TUU0/Jx8dHfn5+6tSpk06fPm1z/GXLliksLEwFCxZUiRIl1K5dO5vrcP2s6kmTJqlmzZry9vZW+fLl9eqrryopKUnSv8thR0VFKSEhwXo+rr1fUlJSNHDgQJUtW1be3t6qV6+e1q9fb5Nj5syZqlChggoXLqx27drp3LlzNzxnktSxY0f16tXLut2vXz+ZTCbt27fPem68vb2t763rVzW40bW7ZuXKlapevbp8fHzUsmVLnTx58oY50tPT9cILL6hSpUoqVKiQqlWrpilTptw0+82u6X+XUr/2OYqJiVHdunVVuHBhNWjQQPv377cZc/To0SpVqpR8fX314osvavDgwTe99UFGRobGjRtnzR0SEqIFCxbcsP+bb76pevXqZWoPCQlRdHS0pH9XGHj00UdVokQJ+fv7q0mTJtqxY4dNf5PJpI8//lhPPvmkvL29NWbMmExLqZ87d05dunRR2bJlVbhwYdWsWVNfffWVzThNmzZVnz599Prrr6tYsWIKDAy0+f107Xdxu3btZDKZbH43AwAAAMD1KIwDAAAAyLfc3d01duxYTZ06VX/++WeOxlq7dq1OnDihjRs3atKkSRo+fLieeOIJFS1aVFu2bNHLL7+snj17ZjrOoEGDNGDAAO3cuVP169dXmzZtrIXCixcv6pFHHlHt2rW1bds2rVixQqdPn1anTp1sxpg1a5Y8PT21adMmTZs2Lct8U6ZM0Xvvvad3331Xu3fvVnh4uJ588kkdOHBAknTy5EnVqFFDAwYM0MmTJzVw4MAsx6hfv7569OihkydP6uTJkypfvrx1/+DBgzV+/Hjt3btXtWrVUlJSkh5//HHFxMRo586datmypdq0aaNjx47ZjPvee++pbt262rlzp1599VW98sor1mLgBx98oKVLl+qbb77R/v37NWfOHGvhKyMjQ61atdKmTZv05ZdfKj4+XuPHj5e7u7t17MuXL2vChAn67LPPtGfPHpUqVSrL8zNr1ix5eHho69atmjJliiZNmqTPPvtM0r9Lc5crV07R0dHW1y1JZrNZKSkp2rhxo3799VdNmDBBPj4+WY4v/TtjdtSoUdq1a5cWL16so0ePKjIy0rp/6NChio+P1w8//KC9e/fq448/VokSJbJ1Hp566imdP39eGzZs0OrVq3X48GF17tzZOvby5cvVrl07Pf7449q
5c6diYmL04IMP3jCrm5ubPvjgA+3Zs0ezZs3S2rVr9frrr0v6dznsyZMny8/Pz3o+rr1fevXqpc2bN+vrr7/W7t279fTTT6tly5bW99mWLVv0wgsvqFevXoqLi1OzZs00evToG+aQpCZNmtgU1zds2KASJUpY22JjY3X16lU1aNAg03NvdO2kf98b7777rmbPnq2NGzfq2LFjWb7vr8nIyFC5cuU0f/58xcfHa9iwYXrzzTf1zTff3PA5N7umN/LWW2/pvffe07Zt2+Th4aHu3btb982ZM0djxozRhAkTtH37dlWoUEEff/zxTccbN26cvvjiC02bNk179uzRa6+9pm7dumX6I5xrunbtqq1bt+rQoUPWtj179mj37t169tlnJUmXLl1SRESEfvrpJ/3yyy+699579fjjj+vSpUs2Y40YMULt2rXTr7/+avM6rrly5Yrq1Kmj5cuX67ffftNLL72k5557Tlu3brXpN2vWLHl7e2vLli2aOHGioqOjtXr1akn/Xn9JmjFjhk6ePGndBgAAAIBMDAAAAADIhyIiIoynnnrKMAzDeOihh4zu3bsbhmEYixYtMq7/X6Xhw4cbISEhNs99//33jYoVK9qMVbFiRSM9Pd3aVq1aNePhhx+2bqelpRne3t7GV199ZRiGYRw5csSQZIwfP97a5+rVq0a5cuWMCRMmGIZhGKNGjTIee+wxm2MfP37ckGTs37/fMAzDaNKkiVG7du1bvt4yZcoYY8aMsWkLCwszXn31Vet2SEiIMXz48JuO06RJE6Nv3742bevWrTMkGYsXL75ljho1ahhTp061blesWNHo1q2bdTsjI8MoVaqU8fHHHxuGYRi9e/c2HnnkESMjIyPTWCtXrjTc3Nys5+K/ZsyYYUgy4uLibNqvv/bXXlP16tVtjvHGG28Y1atXt8n5/vvv24xTs2ZNY8SIEbd8zTcSGxtrSDIuXbpkGIZhtGnTxoiKisqy783Ow6pVqwx3d3fj2LFj1rY9e/YYkoytW7cahmEY9evXN7p27XrDLFm9vuvNnz/fKF68uHV7xowZhr+/v02fP/74w3B3dzf++usvm/bmzZsbQ4YMMQzDMLp06WI8/vjjNvs7d+6caazr7d692zCZTMaZM2eM8+fPG56ensaoUaOMzp07G4ZhGKNHjzYaNGhg7f/f92hWr+3ae+PgwYPWNovFYgQEBNwwR1bMZrPRoUOHG+6/2TW99jtg586dhmH83+dozZo11j7Lly83JBn//POPYRiGUa9ePcNsNtuM07BhQ5vfUde/v69cuWIULlzY+Pnnn22e88ILLxhdunS5Ye6QkBAjOjrauj1kyBCjXr16N+yfnp5u+Pr6GsuWLbO2STL69etn0+/aa7xw4cINx2rdurUxYMAA63aTJk2MRo0a2fQJCwsz3njjDZtjLVq06IZjAgAAAIBhGAYzxgEAAADkexMmTNCsWbO0d+/e2x6jRo0acnP7v//FCggIUM2aNa3b7u7uKl68uM3S2ZJUv359688eHh6qW7euNceuXbu0bt06+fj4WB/33XefJNnM5qxTp85NsyUmJurEiRNq2LChTXvDhg1z9Jr/q27dujbbSUlJGjhwoKpXr64iRYrIx8dHe/fuzTRjvFatWtafTSaTAgMDrecpMjJScXFxqlatmvr06aNVq1ZZ+8bFxalcuXKqWrXqDTN5enrajH8jDz30kM0y2/Xr19eBAweUnp5+w+f06dNHo0ePVsOGDTV8+HDt3r37psfYvn272rRpowoVKsjX11dNmjSRJOv5eOWVV/T1118rNDRUr7/+un7++Wfrc292Hvbu3avy5cvbzN4PDg5WkSJFrNc3Li5OzZs3v+V5uGbNmjVq3ry5ypYtK19fXz333HM6d+6cLl++fMPn/Prrr0pPT1fVqlVt3rMbNmywvl/37t2baZnu6z8DWbn//vtVrFgxbdiwQT/++KNq166tJ554wjrjecOGDWratGm2X9s1hQs
XVuXKla3bpUuXzvT5/C+LxaI6deqoZMmS8vHx0SeffJLp/Xy9m13TG7n+/Vq6dGlJsubav39/ppn+N5v5f/DgQV2+fFmPPvqozTX54osvbH6H/FfXrl01d+5cSZJhGPrqq6/UtWtX6/7Tp0+rR48euvfee+Xv7y8/Pz8lJSVlOhf//Z3wX+np6Ro1apRq1qypYsWKycfHRytXrrzp7wgpe9cKAAAAAP6LwjgAAACAfK9x48YKDw/XkCFDMu1zc3Oz3mv6mqtXr2bqV6BAAZttk8mUZVtGRka2cyUlJalNmzaKi4uzeRw4cECNGze29vP29s72mM703xwDBw7UokWLNHbsWP3444+Ki4tTzZo1lZqaatPvZufpgQce0JEjRzRq1Cj9888/6tSpkzp27ChJKlSo0C0zFSpUKNN9pR3lxRdf1OHDh/Xcc8/p119/Vd26dTV16tQs+yYnJys8PFx+fn6aM2eOYmNjtWjRIkmyno9WrVpZ74d94sQJNW/e3Lq0983OQ3Zk51xdc/ToUT3xxBOqVauWvv32W23fvl0Wi8Uma1aSkpLk7u6u7du327xf9+7de8t7cd+MyWRS48aNtX79emsRvFatWkpJSdFvv/2mn3/+2fpHBvbI6n3338/69b7++msNHDhQL7zwglatWqW4uDhFRUXd9Jzc7JpmJ9e19649vzeud+2+8MuXL7e5JvHx8Te9z3iXLl20f/9+7dixQz///LOOHz9uszR/RESE4uLiNGXKFP3888+Ki4tT8eLFM52LW/1ueueddzRlyhS98cYbWrduneLi4hQeHm7X7wgAAAAAyC4K4wAAAAAgafz48Vq2bJk2b95s016yZEmdOnXKpmAWFxfnsOP+8ssv1p/T0tK0fft2Va9eXdK/xdA9e/YoKChIVapUsXnYUwz38/NTmTJltGnTJpv2TZs2KTg42K68np6eN51F/d/xIyMj1a5dO9WsWVOBgYE6evSoXceT/s3fuXNnffrpp5o3b56+/fZbnT9/XrVq1dKff/6p33//3e4x/2vLli0229fum3ztfuU3et3ly5fXyy+/rIULF2rAgAH69NNPsxx/3759OnfunMaPH6+HH35Y9913X5YzXkuWLKmIiAh9+eWXmjx5sj755BPrvhudh+rVq+v48eM6fvy4tW98fLwuXrxovb61atVSTExMts7F9u3blZGRoffee08PPfSQqlatqhMnTtj0yep81K5dW+np6Tpz5kym92tgYKAkqXr16lme61u5dp/x9evXq2nTpnJzc1Pjxo31zjvvKCUlJdNqCLfKejs2bdqkBg0a6NVXX1Xt2rVVpUqVm866vuZm19Re1apVy3QP7ZvdUzs4OFheXl46duxYpmty/QoD/1WuXDk1adJEc+bM0Zw5c/Too4+qVKlS1v2bNm1Snz599Pjjj6tGjRry8vLS2bNn7X49mzZt0lNPPaVu3bopJCRE99xzz219ngsUKOCQawwAAAAgb6MwDgAAAACSatasqa5du+qDDz6waW/atKn+/vtvTZw4UYcOHZLFYtEPP/zgsONaLBYtWrRI+/btk9ls1oULF9S9e3dJktls1vnz59WlSxfFxsbq0KFDWrlypaKiouwuAg0aNEgTJkzQvHnztH//fg0ePFhxcXHq27evXeMEBQVpy5YtOnr0qM6ePXvTWZv33nuvFi5cqLi4OO3atUvPPvus3bM8J02apK+++kr79u3T77//rvnz5yswMFBFihRRkyZN1LhxY3Xo0EGrV6/WkSNH9MMPP2jFihV2HUP6dznz/v37a//+/frqq680depUm3MTFBSkjRs36q+//rIWAPv166eVK1fqyJEj2rFjh9atW2f9o4b/qlChgjw9PTV16lQdPnxYS5cu1ahRo2z6DBs2TEuWLNHBgwe1Z88efffdd9bxbnYeWrRoYX3/7tixQ1u3btXzzz+vJk2aWJeyHj58uL766isNHz5ce/fu1a+//qoJEyZkmbVKlSq
6evWqNevs2bM1bdo0mz5BQUFKSkpSTEyMzp49q8uXL6tq1arq2rWrnn/+eS1cuFBHjhzR1q1bNW7cOC1fvlzSv8vPr1ixQu+++64OHDigDz/8MFvXq2nTpoqPj9eePXvUqFEja9ucOXNUt27dm/6hSFbX7nbce++92rZtm1auXKnff/9dQ4cOvWlRWrr5Nb0dvXv31ueff65Zs2bpwIEDGj16tHbv3n3DVRF8fX01cOBAvfbaa5o1a5YOHTqkHTt2aOrUqZo1a9ZNj9W1a1d9/fXXmj9/vs0y6tK/52L27Nnau3evtmzZoq5du9q1KsH146xevVo///yz9u7dq549e+r06dN2jxMUFKSYmBidOnVKFy5csPv5AAAAAPIHCuMAAAAA8P9FR0dnKtxWr15dH330kSwWi0JCQrR169ZbLoVsj/Hjx2v8+PEKCQnRTz/9pKVLl6pEiRKSZJ3lnZ6erscee0w1a9ZUv379VKRIEZv7mWdHnz591L9/fw0YMEA1a9bUihUrtHTpUt177712jTNw4EC5u7srODhYJUuWvOn9lSdNmqSiRYuqQYMGatOmjcLDw/XAAw/YdTxfX19NnDhRdevWVVhYmI4eParvv//e+vq//fZbhYWFqUuXLgoODtbrr79+WzNHn3/+ef3zzz968MEHZTab1bdvX7300kvW/dHR0Tp69KgqV66skiVLSvr3/shms1nVq1dXy5YtVbVqVX300UdZjl+yZEnNnDlT8+fPV3BwsMaPH693333Xpo+np6eGDBmiWrVqqXHjxnJ3d9fXX399y/NgMpm0ZMkSFS1aVI0bN1aLFi10zz33aN68edaxmzZtqvnz52vp0qUKDQ3VI488oq1bt2aZNSQkRJMmTdKECRN0//33a86cORo3bpxNnwYNGujll19W586dVbJkSU2cOFGSNGPGDD3//PMaMGCAqlWrprZt2yo2NlYVKlSQ9O+93D/99FNNmTJFISEhWrVqld5+++1bXp+aNWuqSJEiCg0NlY+Pj/U1paen3/L+4lldu9vRs2dPtW/fXp07d1a9evV07tw5vfrqqzd9zs2u6e3o2rWrhgwZooEDB1qX14+MjFTBggVv+JxRo0Zp6NChGjdunPW9unz5clWqVOmmx+rYsaP1vvJt27a12ff555/rwoULeuCBB/Tcc8+pT58+NjPKs+vtt9/WAw88oPDwcDVt2lSBgYGZjpUd7733nlavXq3y5curdu3adj8fAAAAQP5gMm52Ay0AAAAAAPK4pk2bKjQ0VJMnT3Z1FMBujz76qAIDAzV79mxXRwEAAACAXM3D1QEAAAAAAABwa5cvX9a0adMUHh4ud3d3ffXVV1qzZo1Wr17t6mgAAAAAkOtRGAcAAAAAALgLmEwmff/99xozZoyuXLmiatWq6dtvv1WLFi1cHQ0AAAAAcj2WUgcAAAAAAAAAAAAA5Glurg4AAAAAAAAAAAAAAIAzURgHAAAAAAAAAAAAAORpFMYBAAAAAAAAAAAAAHkahXEAAAAAAAAAAAAAQJ5GYRwAAAAAAAAAAAAAkKdRGAcAAAAAAAAAAAAA5GkUxgEAAAAAAAAAAAAAeRqFcQAAAAAAAAAAAABAnkZhHAAAAAAAAAAAAACQp/0/l+Z9bE2aRhsAAAAASUVORK5CYII=", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAB8YAAAGGCAYAAAAJj+sGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABoCElEQVR4nO3deVhU5f//8dcAgsrmDu6YpkkpmJKp5ZIWmpmappkmYJnVuGullRvuVqbZlJ8Wl0zLNNcsN9zKTHFBTdRc03LLDQRSBM7vj37O1wlURmYchOfjuua6OPe55z6vOWeGKd/c9zEZhmEIAAAAAAAAAAAAAIA8ys3VAQAAAAAAAAAAAAAAcCYK4wAAAAAAAAAAAACAPI3COAAAAAAAAAAAAAAgT6MwDgAAAAAAAAAAAADI0yiMAwAAAAAAAAAAAADyNArjAAAAAAAAAAAAAIA8jcI4AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPI0CuMAAAAAcJcZPny4TCbTHTlW48aN1bhxY+v2unXrZDKZNH/+/Dty/MjISAUFBd2RY92upKQkvfTSSwoMDJTJZFLfvn1dHcnljh49KpPJpBkzZrg6Sr70388tAAAAAIDCOAAAAAC41IwZM2QymayPggULqkyZMgoPD9eHH36oS5cuOeQ4J06c0PDhwxUXF+eQ8RwpN2fLjjFjxmjGjBl69dVXNWvWLL3wwgs37bto0aI7F+4ulJKSouHDh2vdunWujoKb+OGHHzR8+HBXxwAAAACAbDMZhmG4OgQAAAAA5FczZsxQVFSUoqOjValSJV29elWnTp3SunXrtGrVKlWoUEFLlixRzZo1rc9JS0tTWlqaChYsmO3jbN26VWFhYZo+fboiIyOz/bzU1FRJkqenp6R/Z4w3adJE8+bNU/v27bM9zu1mu3r1qjIyMuTl5eWQYznDww8/LA8PD/3888+37Ovj46P27dvn+ZnUhmHoypUrKlCggNzd3e167tmzZ1WyZEkNGzaMwutt+u/n1hl69uwpi8Ui/lkJAAAAwN3Cw9UBAAAAAABSixYtVKdOHev24MGDtWbNGj311FN6+umntXfvXhUqVEiS5OHhIQ8P5/7vXEpKigoXLuzUwlp2FChQwKXHz44zZ84oODjY4eMmJyfL29vb4eM6U1pamjIyMuTp6WnXH27kBYZh6PLly9bPqSvkls8tAAAAAORGLKUOAAAAALnUY489piFDhuiPP/7QV199ZW3P6h7jq1at0iOPPKIiRYrIx8dH1apV01tvvSXp31neYWFhkqSoqCjrsu3XZi03btxYDzzwgLZt26aGDRuqcOHC1ufe6F7F6enpeuuttxQYGChvb289/fTTOn78uE2foKCgLGenXz/mrbJldY/x5ORkDRgwQOXLl5eXl5eqVaum9957L9PMVZPJpJ49e2rRokV64IEH5OXlpfvvv1/Lly/P+oT/x5kzZ/Tiiy8qICBABQsWVEhIiGbOnGndf+1+60eOHNGyZcus2Y8ePZrleCaTScnJyZo5c6a177Xzc+2axsfH6/nnn1fRokX1yCOPSJJ27dqlyMhI3XPPPSpYsKACAwPVrVs3nTt3zmb8a2McPHhQkZGRKlKkiPz9/RUVFaWUlBSbvjd7v1xz+fJlDR8+XFWrVlXBggVVunRpPfPMMzp06JCk/7uP+HvvvadJkyapcuXK8vLyUnx8fJb3GI+MjJSPj48OHz6s8PBweXt7q0yZMoqOjrZeu6NHj6pkyZKSpBEjRljP07WZ46dOnVJUVJTKlSsnLy8vlS5dWq1bt77hObfn2NdkZGRo0qRJuv/++1WwYEEFBASoR48eunDhgk2/oKAgPfXUU1qxYoXq1KmjQoUK6X//+1+Wx+/Zs6d8fHwyXQdJ6tSpkwIDA5Weni5JWrx4sVq2bKkyZcrIy8tLlStX1siRI637r7Hnc5uamqqhQ4eqdu3a8vf3l7e3tx599FG
tXbvWZszrr+mnn35qvaZhYWGKjY21OZ8Wi0WSbG4FAQAAAAC5GTPGAQAAACAXe+GFF/TWW29p5cqV6t69e5Z99uzZo6eeeko1a9ZUdHS0vLy8dPDgQW3cuFGSVL16dUVHR2vo0KF6+eWX9eijj0qS6tevbx3j3LlzatGihZ577jl16dJFAQEBN801evRomUwmvfnmmzpz5owmTZqkZs2aKS4uzq4Zs9nJdj3DMPT0009r7dq1evHFFxUaGqoVK1bo9ddf119//aUPPvjApv/PP/+sBQsW6LXXXpOvr68+/PBDtWvXTseOHVPx4sVvmOuff/5R48aNdfDgQfXs2VOVKlXSvHnzFBkZqYsXL6pPnz6qXr26Zs2apX79+qlcuXIaMGCAJFkLu/81a9YsvfTSS3rooYf08ssvS5IqV65s0+fZZ5/VvffeqzFjxlgLtqtWrdLhw4cVFRWlwMBA7dmzR59++qn27NmjX3/9NVNBskOHDqpUqZLGjh2r7du36/PPP1epUqU0fvx4Sbd+v0j//uHDU089pZiYGD333HPq06ePLl26pFWrVum3336zyT19+nRdvnxZL7/8sry8vFSsWDFlZGRkeQ7S09PVvHlzPfzww5owYYKWL1+uYcOGKS0tTdHR0SpZsqQ++eQTvfrqq2rbtq2eeeYZSbLeSqBdu3bas2ePevXqpaCgIJ05c0arVq3SsWPHMv0Bhb3HvqZHjx7WWxz07t1bR44c0UcffaQdO3Zo48aNNqsY7N+/X506dVKPHj3UvXt3VatWLctjd+zYURaLRcuWLdOzzz5rbU9JSdHSpUsVGRlpXXJ+xowZ8vHxUf/+/eXj46M1a9Zo6NChSkxM1LvvvmszbnY/t4mJifr888/VqVMnde/eXZcuXdIXX3yh8PBwbdmyRaGhoTb958yZo0uXLqlHjx4ymUyaMGGCnnnmGR0+fFgFChRQjx49dOLECa1atUqzZs266XkHAAAAgFzDAAAAAAC4zPTp0w1JRmxs7A37+Pv7G7Vq1bJuDxs2zLj+f+c++OADQ5Lx999/33CM2NhYQ5Ixffr0TPsaNWpkSDKmTp2a5b5GjRpZt9euXWtIMsqWLWskJiZa27/99ltDkjF58mRrW8WKFY2IiIhbjnmzbBEREUbFihWt24sWLTIkGaNGjbLp1759e8NkMhkHDx60tkkyPD09bdp27txpSDKmTJmS6VjXmzRpkiHJ+Oqrr6xtqampRr169QwfHx+b116xYkWjZcuWNx3vGm9v7yzPybVr2qlTp0z7UlJSMrV9/fXXhiRjw4YNmcbo1q2bTd+2bdsaxYsXt25n5/0ybdo0Q5IxceLETPsyMjIMwzCMI0eOGJIMPz8/48yZMzZ9ru27/ppGREQYkoxevXrZjNWyZUvD09PTmufvv/82JBnDhg2zGfPChQuGJOPdd9+9Ye4bye6xf/rpJ0OSMXv2bJvnL1++PFN7xYoVDUnG8uXLb3n8jIwMo2zZska7du1s2q99bq6/jlld7x49ehiFCxc2Ll++bG2z53OblpZmXLlyxabPhQsXjICAAJv3y7XrVrx4ceP8+fPW9sWLFxuSjKVLl1rbzGazwT8rAQAAALibsJQ6AAAAAORyPj4+unTp0g33FylSRNK/SzDfaKburXh5eSkqKirb/bt27SpfX1/rdvv27VW6dGn98MMPt3X87Prhhx/k7u6u3r1727QPGDBAhmHoxx9/tGlv1qyZzezmmjVrys/PT4cPH77lcQIDA9WpUydrW4ECBdS7d28lJSVp/fr1Dng1mb3yyiuZ2q6fgX/58mWdPXtWDz/8sCRp+/bttxzj0Ucf1blz55SYmCgpe++X7777TiVKlFCvXr0y7fvvDPV27drdcJZ8Vnr27GkzVs+ePZWamqrVq1ff9HmFChWSp6en1q1bl2lZc0cde968efL399fjjz+us2fPWh+1a9eWj49PpqXHK1WqpPDw8Fse12Qy6dlnn9UPP/ygpKQ
ka/vcuXNVtmxZ67L5117nNZcuXdLZs2f16KOPKiUlRfv27bMZN7ufW3d3d+t9xzMyMnT+/HmlpaWpTp06Wb6HOnbsqKJFi1q3r63kcKvPDQAAAADkZhTGAQAAACCXS0pKsilC/1fHjh3VoEEDvfTSSwoICNBzzz2nb7/91q4iedmyZa2Fs+y49957bbZNJpOqVKlyy3s959Qff/yhMmXKZDof1atXt+6/XoUKFTKNUbRo0VsWVv/44w/de++9cnOz/d/mGx3HUSpVqpSp7fz58+rTp48CAgJUqFAhlSxZ0tovISEhU///vuZrBc5rrzk775dDhw6pWrVq8vC49R3Yssp8I25ubrrnnnts2qpWrSpJt3zveHl5afz48frxxx8VEBCghg0basKECTp16pTDjn3gwAElJCSoVKlSKlmypM0jKSlJZ86csXm+Pa+9Y8eO+ueff7RkyRJJ/36uf/jhBz377LM2f2ywZ88etW3bVv7+/vLz81PJkiXVpUsXSZmvtz2f25kzZ6pmzZoqWLCgihcvrpIlS2rZsmW39R4CAAAAgLsR9xgHAAAAgFzszz//VEJCgqpUqXLDPoUKFdKGDRu0du1aLVu2TMuXL9fcuXP12GOPaeXKldZ7F9+MPfcFz67/ziy+Jj09PVuZHOFGxzH+//27c5usrkOHDh30yy+/6PXXX1doaKh8fHyUkZGh5s2bZ/nHD7d6zY54v9wqs7P07dtXrVq10qJFi7RixQoNGTJEY8eO1Zo1a1SrVq0cj5+RkaFSpUpp9uzZWe7/78x4e177ww8/rKCgIH377bd6/vnntXTpUv3zzz/q2LGjtc/FixfVqFEj+fn5KTo6WpUrV1bBggW1fft2vfnmm5mud3aP/9VXXykyMlJt2rTR66+/rlKlSsnd3V1jx47VoUOHMvW/2z43AAAAAJAdFMYBAAAAIBebNWuWJN1yuWY3Nzc1bdpUTZs21cSJEzVmzBi9/fbbWrt2rZo1a3bDIvXtOnDggM22YRg6ePCgatasaW0rWrSoLl68mOm5f/zxh83MXXuyVaxYUatXr9alS5dsZo1fW2K6YsWK2R7rVsfZtWuXMjIybGaN5/Q49l6HCxcuKCYmRiNGjNDQoUOt7f89//a61fulcuXK2rx5s65evaoCBQrk6FjXy8jI0OHDh60ztSXp999/lyQFBQVJuvU5qly5sgYMGKABAwbowIEDCg0N1fvvv6+vvvoqx8euXLmyVq9erQYNGjil4N+hQwdNnjxZiYmJmjt3roKCgqzL4kvSunXrdO7cOS1YsEANGza0th85ciRHx50/f77uueceLViwwOb8Dhs27LbHdPTvFAAAAABwNpZSBwAAAIBcas2aNRo5cqQqVaqkzp0737Df+fPnM7WFhoZKkq5cuSJJ8vb2lqQsC9W348svv7S57/n8+fN18uRJtWjRwtpWuXJl/frrr0pNTbW2ff/99zp+/LjNWPZke/LJJ5Wenq6PPvrIpv2DDz6QyWSyOX5OPPnkkzp16pTmzp1rbUtLS9OUKVPk4+OjRo0a3da43t7edl2DazN3/ztTd9KkSbd1fCl775d27drp7Nmzmc5zVlnsdf2YhmHoo48+UoECBdS0aVNJUuHChSVlfj+kpKTo8uXLNm2VK1eWr6+vNXdOj92hQwelp6dr5MiRmZ6blpaW489Px44ddeXKFc2cOVPLly9Xhw4dbPZndb1TU1P18ccf5+i4WY27efNmbdq06bbHdPTvFAAAAABwNmaMAwAAAEAu8OOPP2rfvn1KS0vT6dOntWbNGq1atUoVK1bUkiVLVLBgwRs+Nzo6Whs2bFDLli1VsWJFnTlzRh9//LHKlSunRx55RNK/BcQiRYpo6tSp8vX1lbe3t+rWrWvXPZKvV6xYMT3yyCOKiorS6dOnNWnSJFWpUkXdu3e39nnppZc0f/58NW/eXB06dNChQ4f01VdfqXLlyjZj2ZOtVatWatKkid5++20dPXpUISE
hWrlypRYvXqy+fftmGvt2vfzyy/rf//6nyMhIbdu2TUFBQZo/f742btyoSZMm3fSe7zdTu3ZtrV69WhMnTlSZMmVUqVIl1a1b94b9/fz8rPfSvnr1qsqWLauVK1fmaAZxdt4vXbt21Zdffqn+/ftry5YtevTRR5WcnKzVq1frtddeU+vWrW/r2AULFtTy5csVERGhunXr6scff9SyZcv01ltvWZcpL1SokIKDgzV37lxVrVpVxYoV0wMPPKC0tDQ1bdpUHTp0UHBwsDw8PLRw4UKdPn1azz33nEOO3ahRI/Xo0UNjx45VXFycnnjiCRUoUEAHDhzQvHnzNHnyZLVv3/62XrskPfjgg6pSpYrefvttXblyxWYZdUmqX7++ihYtqoiICPXu3Vsmk0mzZs3K8R8jPPXUU1qwYIHatm2rli1b6siRI5o6daqCg4OVlJR0W2PWrl1bktS7d2+Fh4fL3d09W9cBAAAAAFyFwjgAAAAA5ALXlsn29PRUsWLFVKNGDU2aNElRUVG3LMI+/fTTOnr0qKZNm6azZ8+qRIkSatSokUaMGCF/f39JUoECBTRz5kwNHjxYr7zyitLS0jR9+vTbLoy/9dZb2rVrl8aOHatLly6padOm+vjjj62zfaV/l39///33NXHiRPXt21d16tTR999/rwEDBtiMZU82Nzc3LVmyREOHDtXcuXM1ffp0BQUF6d133800bk4UKlRI69at06BBgzRz5kwlJiaqWrVqmj59uiIjI2973IkTJ+rll1/WO++8o3/++cdapL2ZOXPmqFevXrJYLDIMQ0888YR+/PFHlSlT5rYyZOf94u7urh9++EGjR4/WnDlz9N1336l48eJ65JFHVKNGjds67rVxly9frldffVWvv/66fH19NWzYMJtl4iXp888/V69evdSvXz+lpqZq2LBh6tWrlzp16qSYmBjNmjVLHh4euu+++/Ttt9+qXbt2Djv21KlTVbt2bf3vf//TW2+9JQ8PDwUFBalLly5q0KDBbb/2azp27KjRo0erSpUqevDBB232FS9e3PoZeeedd1S0aFF16dJFTZs2veXtFG4mMjJSp06d0v/+9z+tWLFCwcHB+uqrrzRv3jytW7futsZ85pln1KtXL33zzTf66quvZBgGhXEAAAAAuZrJyOmfHQMAAAAAANxCZGSk5s+ff9szlO/WYwMAAAAAcgfuMQ4AAAAAAAAAAAAAyNMojAMAAAAAAAAAAAAA8jQK4wAAAAAAAAAAAACAPI17jAMAAAAAAAAAAAAA8jRmjAMAAAAAAAAAAAAA8jQK4wDumF69eql8+fLy8/NT2bJl1bdvX6WmpkqSGjduLC8vL/n4+FgfJ06cuOFYiYmJev755+Xn56eAgACNHDnSZn98fLyaNm2qokWLKjAwUC+//LJSUlIkSWfOnFHnzp1Vrlw5+fn5qVatWlqyZInzXjgAAAAAAAAAAABcKt8Xxg3DUGJiolhRHnC+1157Tfv27VNiYqJ27typnTt3asKECdb948ePV1JSkvVRpkyZG47Vq1cvnT9/XseOHdNPP/2kzz77TF9++aV1//PPP69q1arp9OnT2r17t3bu3GktniclJalWrVr69ddfdfHiRUVHR6tTp06Kj4933osHAAAAAAAAAACAy+T7wvilS5fk7++vS5cuuToKkOdVr15d3t7ekv79oxQ3NzcdOHDA7nFSUlL0zTffaNSoUSpSpIiqVq2qXr166YsvvrD2OXz4sLp06SJPT0+VLFlSTz/9tHbv3i1JuueeezRw4ECVK1dObm5uatWqlapVq6Zff/3VMS8UAAAAAAAAAAAAuUq+L4wDuLPGjRsnHx8flSpVSjt37lSvXr2s+0aNGqVixYqpVq1aNrO//2v//v1KTU1VaGiotS00NFS7du2ybg8cOFBffvml/vnnH506dUoLFy5Uq1atshzvzJkz2rt3r2rWrJnzFwgAAAAAAAAAAIBch8I4gDtq0KBBSkpKUnx
8vF555RUFBgZKksaOHatDhw7p9OnTGjdunHr16qWFCxdmOUZSUpK8vb3l4eFhbStSpIjNyg8tWrTQzz//LF9fX5UuXVrly5dXt27dMo2Vmpqq5557Th06dFCdOnUc/GoBAAAAAAAAAACQG1AYB+AS1atXV0hIiCIjIyVJ9erVk7+/vwoUKKDw8HD16NFDc+fOzfK5Pj4+SklJUVpamrUtISFBvr6+kqQLFy6oWbNm6t69u1JSUnT+/Hl5e3urS5cuNuOkpqaqffv2Kly4sD777DPnvFAAAAAAAAAAAAC4HIVxAC5z9erVG95j3M3txr+eqlWrpgIFCmjnzp3Wtri4ONWoUUOSdOjQIf3zzz/q3bu3PD09VbRoUfXo0UPLli2z9k9NTdWzzz6r1NRUfffdd/L09HTQqwIAAAAAAAAAAEBuk28L4xaLRcHBwQoLC3N1FCBfSEpK0vTp03Xx4kUZhqHdu3dr1KhRCg8P18WLF/XDDz8oJSVF6enpiomJ0dSpU9WuXTvr8yMjI62zywsXLqyOHTtqyJAhSkhI0IEDBzRlyhS99NJLkqT77rtPPj4++vjjj5WWlqZLly7ps88+U61atST9W5Dv0KGDkpOTtWjRInl5ed3x8wEAAAAAAAAAAIA7J98Wxs1ms+Lj4xUbG+vqKEC+YDKZNGfOHFWuXFm+vr5q3bq1WrZsqUmTJunq1asaMWKEAgMDVbRoUfXr108TJ07Us88+a33+sWPH1KBBA+v2Rx99JH9/f5UrV04NGjTQiy++qK5du0r6d6n1pUuX6uuvv1aJEiUUFBSkixcvaubMmZKkX375RYsXL9bGjRtVokQJ+fj4yMfHR2PGjLmzJwUAAAAAAAAAAAB3hMkwDMPVIVwpMTFR/v7+SkhIkJ+fn6vjAMjClStXVLNmTf32228qUKCAq+MAAAAAAAAAAADgLuPh6gAAcCteXl7av3+/q2MAAAAAAAAAAADgLpVvl1IHAAAAAAAAAAAAAOQPFMYBAAAAAAAAAAAAAHlavi2MWywWBQcHKywszNVRAAAAAAAAAAAAAABOZDIMw3B1CFdKTEyUv7+/EhIS5Ofn5+o4AAAAAAAAAAAAAAAHy7czxgEAAAAAAAAAAAAA+YOHqwMg59pHRLs6AgDgDps/c6irIwAAAAAAAAAAcNdgxjgAAAAAAAAAAAAAIE+jMA4AAAAAAAAAAAAAyNMojAMAAAAAAAAAAAAA8rR8Wxi3WCwKDg5WWFiYq6MAAAAAAAAAAAAAAJwo3xbGzWaz4uPjFRsb6+ooAAAAAAAAAAAAAAAnyreFcQAAAAAAAAAAAABA/kBhHAAAAAAAAAAAAACQp1EYBwAAAAAAAAAAAADkaRTGAQAAAAAAAAAAAAB5GoVxAAAAAAAAAAAAAECeRmEcAAAAAAAAAAAAAJCnURgHAAAAAAAAAAAAAORpFMYBAAAAAAAAAAAAAHkahXEAAAAAAAAAAAAAQJ6WbwvjFotFwcHBCgsLc3UUAAAAAAAAAAAAAIAT5dvCuNlsVnx8vGJjY10dBQAAAAAAAAAAAADgRPm2MA4AAAAAAAAAAAAAyB8ojAMAAAAAAAAAAAAA8jQK4wAAAAAAAAAAAACAPI3COAAAAAAAAAAAAAAgT6MwDgAAAAAAAAAAAADI0yiMAwAAAAAAAAAAAADyNArjAAAAAAAAAAAAAIA8jcI4AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPK0PFMYT0lJUcWKFTVw4EBXRwEAAAAAAAAAAAAA5CJ5pjA+evRoPfzww66OAQAAAAAAAAAAAADIZfJEYfzAgQPat2+fWrRo4eooAAAAAAAAAAAAAIBcxuWF8Q0bNqhVq1YqU6aMTCaTFi1alKmPxWJRUFCQChYsqLp162rLli02+wcOHKixY8feocQAAAAAAAAAAAAAgLuJywvjycnJCgkJkcViyXL/3Llz1b9/fw0bNkzbt29XSEi
IwsPDdebMGUnS4sWLVbVqVVWtWvVOxgYAAAAAAAAAAAAA3CU8XB2gRYsWN10CfeLEierevbuioqIkSVOnTtWyZcs0bdo0DRo0SL/++qu++eYbzZs3T0lJSbp69ar8/Pw0dOjQLMe7cuWKrly5Yt1OTEx07AsCAAAAAAAAAAAAAOQqLp8xfjOpqanatm2bmjVrZm1zc3NTs2bNtGnTJknS2LFjdfz4cR09elTvvfeeunfvfsOi+LX+/v7+1kf58uWd/joAAAAAAAAAAAAAAK6TqwvjZ8+eVXp6ugICAmzaAwICdOrUqdsac/DgwUpISLA+jh8/7oioAAAAAAAAAAAAAIBcyuVLqTtSZGTkLft4eXnJy8vL+WEAAAAAAAAAAAAAALlCrp4xXqJECbm7u+v06dM27adPn1ZgYGCOxrZYLAoODlZYWFiOxgEAAAAAAAAAAAAA5G65ujDu6emp2rVrKyYmxtqWkZGhmJgY1atXL0djm81mxcfHKzY2NqcxAQAAAAAAAAAAAAC5mMuXUk9KStLBgwet20eOHFFcXJyKFSumChUqqH///oqIiFCdOnX00EMPadKkSUpOTlZUVJQLUwMAAAAAAAAAAAAA7hYuL4xv3bpVTZo0sW73799fkhQREaEZM2aoY8eO+vvvvzV06FCdOnVKoaGhWr58uQICAnJ0XIvFIovFovT09ByNAwAAAAAAAAAAAADI3UyGYRiuDuFKiYmJ8vf3V0JCgvz8/Fwd57a0j4h2dQQAwB02f+ZQV0cAAAAAAAAAAOCukavvMQ4AAAAAAAAAAAAAQE5RGAcAAAAAAAAAAAAA5Gn5tjBusVgUHByssLAwV0cBAAAAAAAAAAAAADhRvi2Mm81mxcfHKzY21tVRAAAAAAAAAAAAAABOlG8L4wAAAAAAAAAAAACA/IHCOAAAAAAAAAAAAAAgT7O7ML59+3bt3r3bur148WK1adNGb731llJTUx0azpm4xzgAAAAAAAAAAAAA5A92F8Z79Oih33//XZJ0+PBhPffccypcuLDmzZunN954w+EBnYV7jAMAAAAAAAAAAABA/mB3Yfz3339XaGioJGnevHlq2LCh5syZoxkzZui7775zdD4AAAAAAAAAAAAAAHLE7sK4YRjKyMiQJK1evVpPPvmkJKl8+fI6e/asY9MBAAAAAAAAAAAAAJBDdhfG69Spo1GjRmnWrFlav369WrZsKUk6cuSIAgICHB4QAAAAAAAAAAAAAICcsLsw/sEHH2j79u3q2bOn3n77bVWpUkWSNH/+fNWvX9/hAZ3FYrEoODhYYWFhro4CAAAAAAAAAAAAAHAik2EYhiMGunz5sjw8POTh4eGI4e6YxMRE+fv7KyEhQX5+fq6Oc1vaR0S7OgIA4A6bP3OoqyMAAAAAAAAAAHDXsHvG+D333KNz585lar98+bKqVq3qkFAAAAAAAAAAAAAAADiK3YXxo0ePKj09PVP7lStX9OeffzokFAAAAAAAAAAAAAAAjpLtdc+XLFli/XnFihXy9/e3bqenpysmJkaVKlVybDoAAAAAAAAAAAAAAHIo24XxNm3aSJJMJpMiIiJs9hUoUEBBQUF6//33HRoOAAAAAAAAAAAAAICcynZhPCMjQ5JUqVIlxcbGqkSJEk4LdSdYLBZZLJYsl4UHAAAAAAAAAAAAAOQddt9j/MiRI3d9UVySzGaz4uPjFRsb6+ooAAAAAAAAAAAAAAAnyvaM8evFxMQoJiZGZ86csc4kv2batGkOCQYAAAAAAAAAAAAAgCPYXRgfMWKEoqOjVadOHZUuXVomk8kZuQAAAAAAAAAAAAAAcAi7C+NTp07VjBkz9MILLzgjDwAAAAAAAAAAAAAADmX3PcZTU1NVv359Z2QBAAAAAAAAAAAAAMDh7C6Mv/TSS5ozZ44zsgAAAAAAAAAAAAAA4HB2L6V++fJlffrpp1q9erVq1qypAgUK2OyfOHGiw8I5k8VikcViUXp6uqujAAAAAAA
AAAAAAACcyO7C+K5duxQaGipJ+u2332z2mUwmh4S6E8xms8xmsxITE+Xv7+/qOAAAAAAAAAAAAAAAJ7G7ML527Vpn5AAAAAAAAAAAAAAAwCnsvsc4AAAAAAAAAAAAAAB3E7tnjEvS1q1b9e233+rYsWNKTU212bdgwQKHBAMAAAAAAAAAAAAAwBHsnjH+zTffqH79+tq7d68WLlyoq1evas+ePVqzZg336gYAAAAAAAAAAAAA5Dp2F8bHjBmjDz74QEuXLpWnp6cmT56sffv2qUOHDqpQoYIzMgIAAAAAAAAAAAAAcNvsLowfOnRILVu2lCR5enoqOTlZJpNJ/fr106effurwgAAAAAAAAAAAAAAA5ITdhfGiRYvq0qVLkqSyZcvqt99+kyRdvHhRKSkpjk0HAAAAAAAAAAAAAEAOedj7hIYNG2rVqlWqUaOGnn32WfXp00dr1qzRqlWr1LRpU2dkBAAAAAAAAAAAAADgttldGP/oo490+fJlSdLbb7+tAgUK6JdfflG7du30zjvvODwgAAAAAAAAAAAAAAA5YXdhvFixYtaf3dzcNGjQIIcGulMsFossFovS09NdHQUAAAAAAAAAAAAA4ETZKownJibKz8/P+vPNXOuX25nNZpnNZiUmJsrf39/VcQAAAAAAAAAAAAAATpKtwnjRokV18uRJlSpVSkWKFJHJZMrUxzAMmUwmZmADAAAAAAAAAAAAAHKVbBXG16xZY11Cfe3atU4NBAAAAAAAAAAAAACAI2WrMN6oUSNJUlpamtavX69u3bqpXLlyTg0GAAAAAAAAAAAAAIAjuNnT2cPDQ++++67S0tKclQcAAAAAAAAAAAAAAIeyqzAuSY899pjWr1/vjCwAAAAAAAAAAAAAADhctpZSv16LFi00aNAg7d69W7Vr15a3t7fN/qefftph4QAAAAAAAAAAAAAAyCm7C+OvvfaaJGnixImZ9plMJqWnp+c8FQAAAAAAAAAAAAAADmJ3YTwjI8MZOQAAAAAAAAAAAAAAcAq77zEOAAAAAAAAAAAAAMDdxO4Z45KUnJys9evX69ixY0pNTbXZ17t3b4cEAwAAAAAAAAAAAADAEewujO/YsUNPPvmkUlJSlJycrGLFiuns2bMqXLiwSpUqdccL4xcvXlSzZs2UlpamtLQ09enTR927d7+jGQAAAAAAAAAAAAAAuZfdS6n369dPrVq10oULF1SoUCH9+uuv+uOPP1S7dm299957zsh4U76+vtqwYYPi4uK0efNmjRkzRufOnbvjOQAAAAAAAAAAAAAAuZPdhfG4uDgNGDBAbm5ucnd315UrV1S+fHlNmDBBb731ljMy3pS7u7sKFy4sSbpy5YoMw5BhGHc8BwAAAAAAAAAAAAAgd7K7MF6gQAG5uf37tFKlSunYsWOSJH9/fx0/ftzuABs2bFCrVq1UpkwZmUwmLVq0KFMfi8WioKAgFSxYUHXr1tWWLVts9l+8eFEhISEqV66cXn/9dZUoUcLuHAAAAAAAAAAAAACAvMnuwnitWrUUGxsrSWrUqJGGDh2q2bNnq2/fvnrggQfsDpCcnKyQkBBZLJYs98+dO1f9+/fXsGHDtH37doWEhCg8PFxnzpyx9ilSpIh27typI0eOaM6cOTp9+rTdOQAAAAAAAAAAAAAAeZPdhfExY8aodOnSkqTRo0eraNGievXVV/X333/r008/tTtAixYtNGrUKLVt2zbL/RMnTlT37t0VFRWl4OBgTZ06VYULF9a0adMy9Q0ICFBISIh++umnGx7vypUrSkxMtHkAAAAAAAAAAAAAAPIuuwvjderUUZMmTST9u5T68uXLlZiYqG3btikkJMSh4VJTU7Vt2zY1a9bM2ubm5qZmzZpp06ZNkqTTp0/r0qVLkqSEhARt2LBB1apVu+GYY8eOlb+/v/VRvnx5h2YGAAAAAAAAAAAAAOQudhfGR40apSNHjjgjSyZnz55Venq6AgICbNoDAgJ06tQpSdI
ff/yhRx99VCEhIXr00UfVq1cv1ahR44ZjDh48WAkJCdbH7dwXHQAAAAAAAAAAAABw9/Cw9wnz5s3TsGHDVLduXXXp0kUdOnRQiRIlnJEtWx566CHFxcVlu7+Xl5e8vLycFwgAAAAAAAAAAAAAkKvYPWN8586d2rVrlxo3bqz33ntPZcqUUcuWLTVnzhylpKQ4NFyJEiXk7u6u06dP27SfPn1agYGBORrbYrEoODhYYWFhORoHAAAAAAAAAAAAAJC72V0Yl6T7779fY8aM0eHDh7V27VoFBQWpb9++OS5W/5enp6dq166tmJgYa1tGRoZiYmJUr169HI1tNpsVHx+v2NjYnMYEAAAAAAAAAAAAAORidi+l/l/e3t4qVKiQPD09denSJbufn5SUpIMHD1q3jxw5ori4OBUrVkwVKlRQ//79FRERoTp16uihhx7SpEmTlJycrKioqJxGBwAAAAAAAAAAAADkA7dVGD9y5IjmzJmjOXPmaP/+/WrUqJFGjBih9u3b2z3W1q1b1aRJE+t2//79JUkRERGaMWOGOnbsqL///ltDhw7VqVOnFBoaquXLlysgIOB2oltZLBZZLBalp6fnaBwAAAAAAAAAAAAAQO5mMgzDsOcJDz/8sGJjY1WzZk117txZnTp1UtmyZZ2Vz+kSExPl7++vhIQE+fn5uTrObWkfEe3qCACAO2z+zKGujgAAAAAAAAAAwF3D7hnjTZs21bRp0xQcHOyMPAAAAAAAAAAAAAAAOJTdhfHRo0c7IwcAAAAAAAAAAAAAAE7h5uoArmKxWBQcHKywsDBXRwEAAAAAAAAAAAAAOFG+LYybzWbFx8crNjbW1VEAAAAAAAAAAAAAAE6UbwvjAAAAAAAAAAAAAID8gcI4AAAAAAAAAAAAACBPu63C+E8//aQuXbqoXr16+uuvvyRJs2bN0s8//+zQcM7EPcYBAAAAAAAAAAAAIH+wuzD+3XffKTw8XIUKFdKOHTt05coVSVJCQoLGjBnj8IDOwj3GAQAAAAAAAAAAACB/sLswPmrUKE2dOlWfffaZChQoYG1v0KCBtm/f7tBwAAAAAAAAAAAAAADklN2F8f3796thw4aZ2v39/XXx4kVHZAIAAAAAAAAAAAAAwGHsLowHBgbq4MGDmdp//vln3XPPPQ4JBQAAAAAAAAAAAACAo9hdGO/evbv69OmjzZs3y2Qy6cSJE5o9e7YGDhyoV1991RkZncJisSg4OFhhYWGujgIAAAAAAAAAAAAAcCIPe58waNAgZWRkqGnTpkpJSVHDhg3l5eWlgQMHqlevXs7I6BRms1lms1mJiYny9/d3dRwAAAAAAAAAAAAAgJPYXRg3mUx6++239frrr+vgwYNKSkpScHCwfHx8nJEPAAAAAAAAAAAAAIAcsbswfo2np6eCg4MdmQUAAAAAAAAAAAAAAIezuzDepEkTmUymG+5fs2ZNjgIBAAAAAAAAAAAAAOBIdhfGQ0NDbbavXr2quLg4/fbbb4qIiHBULqezWCyyWCxKT093dRQAAAAAAAAAAAAAgBPZXRj/4IMPsmwfPny4kpKSchzoTjGbzTKbzUpMTJS/v7+r4wAAAAAAAAAAAAAAnMTNUQN16dJF06ZNc9RwAAAAAAAAAAAAAAA4hMMK45s2bVLBggUdNRwAAAAAAAAAAAAAAA5h91LqzzzzjM22YRg6efKktm7dqiFDhjgsGAAAAAAAAAAAAAAAjmB3Yfy/9+N2c3NTtWrVFB0drSeeeMJhwQAAAAAAAAAAAAAAcAS7C+PTp093Rg4AAAAAAAAAAAAAAJzCYfcYBwAAAAAAAAAAAAAgN7J7xnjRokVlMpmy1ff8+fN2B7pTLBaLLBaL0tPTXR0FAAAAAAAAAAAAAOBEdhfGhwwZolGjRik8PFz16tWTJG3atEkrVqzQkCFDVKxYMYeHdAaz2Syz2azExMRM900HAAAAAAAAAAAAAOQddhfGN27cqOjoaPXs2dPa1rt3b33
00UdavXq1Fi1a5Mh8AAAAAAAAAAAAAADkiN33GF+xYoWaN2+eqb158+ZavXq1Q0IBAAAAAAAAAAAAAOAodhfGixcvrsWLF2dqX7x4sYoXL+6QUAAAAAAAAAAAAAAAOIrdS6mPGDFCL730ktatW6e6detKkjZv3qzly5frs88+c3hAAAAAAAAAAAAAAABywu7CeGRkpKpXr64PP/xQCxYskCRVr15dP//8s7VQDgAAAAAAAAAAAABAbmF3YVyS6tatq9mzZzs6CwAAAAAAAAAAAAAADpetwnhiYqL8/PysP9/MtX4AAAAAAAAAAAAAAOQG2SqMFy1aVCdPnlSpUqVUpEgRmUymTH0Mw5DJZFJ6errDQwIAAAAAAAAAAAAAcLuyVRhfs2aNihUrJklau3atUwMBAAAAAAAAAAAAAOBI2SqMN2rUKMuf72YWi0UWi4UZ7gAAAAAAAAAAAACQx7ndzpMuXryolStX6quvvtKXX35p87hbmM1mxcfHKzY21tVRAAAAAJf56KOPVKdOHXl5ealNmzY2+xITE/X888/Lz89PAQEBGjlypHXfmTNn1LlzZ5UrV05+fn6qVauWlixZcofTAwAAAAAAANmTrRnj11u6dKk6d+6spKQk+fn52dxv3GQyqWvXrg4NCAAAAMB5ypQpo3feeUerV6/Wn3/+abOvV69eOn/+vI4dO6YzZ86oWbNmqlixorp27aqkpCTVqlVL48ePV5kyZbRs2TI999xzio2NVXBwsIteDQAAAAAAAJA1u2eMDxgwQN26dVNSUpIuXryoCxcuWB/nz593RkYAAAAATvLMM8+oTZs2KlGihE17SkqKvvnmG40aNUpFihRR1apV1atXL33xxReSpHvuuUcDBw5UuXLl5ObmplatWqlatWr69ddfXfEyAAAAAAAAgJuyuzD+119/qXfv3ipcuLAz8gAAAADIBfbv36/U1FSFhoZa20JDQ7Vr164s+585c0Z79+5VzZo171BCAAAAAAAAIPvsLoyHh4dr69atzsgCAAAAIJdISkqSt7e3PDz+7+5LRYoU0aVLlzL1TU1N1XPPPacOHTqoTp06dzImAAAAAAAAkC1232O8ZcuWev311xUfH68aNWqoQIECNvuffvpph4UDAAAA4Bo+Pj5KSUlRWlqatTiekJAgX19fm36pqalq3769ChcurM8++8wVUQEAAAAAAIBbsrsw3r17d0lSdHR0pn0mk0np6ek5TwUAAADApapVq6YCBQpo586dql27tiQpLi5ONWrUsPZJTU3Vs88+q9TUVC1evFienp6uigsAAAAAAADclN1LqWdkZNzwQVEcAAAAuLukpaXp8uXLSktLU0ZGhi5fvqzU1FQVLlxYHTt21JAhQ5SQkKADBw5oypQpeumllyRJV69eVYcOHZScnKxFixbJy8vLxa8EAAAAAAAAuDG7C+MAAAAA8o5Ro0apUKFCGj16tJYuXapChQrpiSeekCR99NFH8vf3V7ly5dSgQQO9+OKL6tq1qyTpl19+0eLFi7Vx40aVKFFCPj4+8vHx0ZgxY1z5cgAAAAAAAIAsmQzDMOx5QlZLqF9v6NChOQp0pyUmJsrf318JCQny8/NzdZzb0j7i5tcEAJD3zJ95d33fAgAAAAAAAADgSnbfY3zhwoU221evXtWRI0fk4eGhypUr33WFcQAAAAAAAAAAAABA3mZ3YXzHjh2Z2hITExUZGam2bds6JJQ9jh8/rhdeeEFnzpyRh4eHhgwZomefffaO5wAAAAAAAAAAAAAA5E4Ouce4n5+fRowYoSFDhjhiOLt4eHho0qRJio+P18qVK9W3b18lJyff8RwAAAAAAAAAAAAAgNzJ7hnjN5KQkKCEhARHDZdtpUuXVunSpSVJgYGBKlGihM6fPy9vb+87ngUAAAAAAAAAAAAAkPvYXRj/8MMPbbYNw9DJkyc1a9YstWjRwu4AGzZs0Lvvvqtt27bp5MmTWrhwodq0aWPTx2Kx6N1
339WpU6cUEhKiKVOm6KGHHso01rZt25Senq7y5cvbnQMAAAAAAAAAAAAAkDfZXRj/4IMPbLbd3NxUsmRJRUREaPDgwXYHSE5OVkhIiLp166Znnnkm0/65c+eqf//+mjp1qurWratJkyYpPDxc+/fvV6lSpaz9zp8/r65du+qzzz6zOwMAAAAAAAAAAAAAIO+yuzB+5MgRhwZo0aLFTWeaT5w4Ud27d1dUVJQkaerUqVq2bJmmTZumQYMGSZKuXLmiNm3aaNCgQapfv/5Nj3flyhVduXLFup2YmOiAVwEAAAAAAAAAAAAAyK0cdo9xZ0hNTdW2bdtsZqK7ubmpWbNm2rRpk6R/l3KPjIzUY489phdeeOGWY44dO1YjRoxwWmYAAJC3tY+IdnUEAMAdNn/mUFdHAAAAAAAAOeTm6gA3c/bsWaWnpysgIMCmPSAgQKdOnZIkbdy4UXPnztWiRYsUGhqq0NBQ7d69+4ZjDh48WAkJCdbH8ePHnfoaAAAAAAAAAAAAAACulatnjGfHI488ooyMjGz39/LykpeXlxMTAQAAAAAAAAAAAAByk1w9Y7xEiRJyd3fX6dOnbdpPnz6twMDAHI1tsVgUHByssLCwHI0DAAAAAAAAAAAAAMjdslUYf/DBB3XhwgVJUnR0tFJSUpwa6hpPT0/Vrl1bMTEx1raMjAzFxMSoXr16ORrbbDYrPj5esbGxOY0JAAAAAAAAAAAAAMjFslUY37t3r5KTkyVJI0aMUFJSksMCJCUlKS4uTnFxcZKkI0eOKC4uTseOHZMk9e/fX5999plmzpypvXv36tVXX1VycrKioqIclgEAAAAAAAAAAAAAkHdl6x7joaGhioqK0iOPPCLDMPTee+/Jx8cny75Dhw61K8DWrVvVpEkT63b//v0lSREREZoxY4Y6duyov//+W0OHDtWpU6cUGhqq5cuXKyAgwK7j/JfFYpHFYlF6enqOxgEAAAAAAAAAAAAA5G4mwzCMW3Xav3+/hg0bpkOHDmn79u0KDg6Wh0fmmrrJZNL27dudEtRZEhMT5e/vr4SEBPn5+bk6zm1pHxHt6ggAgDts/kz7/hANjsP3LgDkP3zvAgAAAABw98vWjPFq1arpm2++kSS5ubkpJiZGpUqVcmowAAAAAAAAAAAAAAAcIVuF8etlZGQ4IwcAAAAAAAAAAAAAAE5hd2Fckg4dOqRJkyZp7969kqTg4GD16dNHlStXdmg4Z+Ie4wAAAAAAAAAAAACQP7jZ+4QVK1YoODhYW7ZsUc2aNVWzZk1t3rxZ999/v1atWuWMjE5hNpsVHx+v2NhYV0cBAAAAAAAAAAAAADiR3TPGBw0apH79+mncuHGZ2t988009/vjjDgsHAAAAAAAAAAAAAEBO2T1jfO/evXrxxRcztXfr1k3x8fEOCQUAAAAAAAAAAAAAgKPYXRgvWbKk4uLiMrXHxcWpVKlSjsh0R1gsFgUHByssLMzVUQAAAAAAAAAAAAAATmT3Uurdu3fXyy+/rMOHD6t+/fqSpI0bN2r8+PHq37+/wwM6i9lsltlsVmJiovz9/V0dBwAAAAAAAAAAAADgJHYXxocMGSJfX1+9//77Gjx4sCSpTJkyGj58uHr37u3wgAAAAAAAAAAAAAAA5ITdhXGTyaR+/fqpX79+unTpkiTJ19fX4cEAAAAAAAAAAAAAAHAEu+8xfj1fX9+7tijOPcYBAAAAAAAAAAAAIH/IUWH8bmY2mxUfH6/Y2FhXRwEAAAAAAAAAAAAAOFG+LYwDAAAAAAAAAAAAAPIHCuMAAAAAAAAAAAAAgDzNrsL41atX1bRpUx04cMBZeQAAAAAAAAAAAAAAcCi7CuMFChTQrl27nJUFAAAAAAAAAAAAAACHs3sp9S5duuiLL75wRpY7ymKxKDg4WGFhYa6OAgAAAAAAAAAAAABwIg97n5CWlqZp06Zp9erVql27try9vW32T5w40WHhnMlsNstsNisxMVH+/v6ujgMAAAA
AAAAAAAAAcBK7C+O//fabHnzwQUnS77//brPPZDI5JhUAAAAAAAAAAAAAAA5id2F87dq1zsgBAAAAAAAAAAAAAIBT2H2P8WsOHjyoFStW6J9//pEkGYbhsFAAAAAAAAAAAAAAADiK3YXxc+fOqWnTpqpataqefPJJnTx5UpL04osvasCAAQ4PCAAAAAAAAAAAAABATthdGO/Xr58KFCigY8eOqXDhwtb2jh07avny5Q4NBwAAAAAAAAAAAABATtl9j/GVK1dqxYoVKleunE37vffeqz/++MNhwQAAAAAAAAAAAAAAcAS7Z4wnJyfbzBS/5vz58/Ly8nJIqDvBYrEoODhYYWFhro4CAAAAAAAAAAAAAHAiuwvjjz76qL788kvrtslkUkZGhiZMmKAmTZo4NJwzmc1mxcfHKzY21tVRAAAAAAAAAAAAAABOZPdS6hMmTFDTpk21detWpaam6o033tCePXt0/vx5bdy40RkZAQAAAAAAAAAAAAC4bXbPGH/ggQf0+++/65FHHlHr1q2VnJysZ555Rjt27FDlypWdkREAAAAAAAAAAAAAgNtm94xxSfL399fbb7/t6CwAAAAAAAAAAAAAADjcbRXGL1y4oC+++EJ79+6VJAUHBysqKkrFihVzaDgAAAAAAAAAAAAAAHLK7qXUN2zYoKCgIH344Ye6cOGCLly4oA8//FCVKlXShg0bnJERAAAAAAAAAAAAAIDbZveMcbPZrI4dO+qTTz6Ru7u7JCk9PV2vvfaazGazdu/e7fCQAAAAAAAAAAAAAADcLrtnjB88eFADBgywFsUlyd3dXf3799fBgwcdGg4AAAAAAAAAAAAAgJyyuzD+4IMPWu8tfr29e/cqJCTEIaEAAAAAAAAAAAAAAHCUbC2lvmvXLuvPvXv3Vp8+fXTw4EE9/PDDkqRff/1VFotF48aNc05KAAAAAAAAAAAAAABuU7YK46GhoTKZTDIMw9r2xhtvZOr3/PPPq2PHjo5L50QWi0UWi0Xp6emujgIAAAAAAAAAAAAAcKJsFcaPHDni7Bx3nNlsltlsVmJiovz9/V0dBwAAAAAAAAAAAADgJNkqjFesWNHZOQAAAAAAAAAAAAAAcIpsFcb/68SJE/r555915swZZWRk2Ozr3bu3Q4IBAAAAAAAAAAAAAOAIdhfGZ8yYoR49esjT01PFixeXyWSy7jOZTBTGAQAAAAAAAAAAAAC5it2F8SFDhmjo0KEaPHiw3NzcnJEJAAAAAAAAAAAAAACHsbuynZKSoueee46iOAAAAAAAAAAAAADgrmB3dfvFF1/UvHnznJEFAAAAAAAAAAAAAACHs3sp9bFjx+qpp57S8uXLVaNGDRUoUMBm/8SJEx0WDgAAAAAAAAAAAACAnLqtwviKFStUrVo1SZLJZLLuu/5nAAAAAAAAAAAAAAByA7sL4++//76mTZumyMhIJ8QBAAAAAAAAAAAAAMCx7L7HuJeXlxo0aOCMLAAAAAAAAAAAAAAAOJzdhfE+ffpoypQpzshy29q2bauiRYuqffv2ro4CAAAAAAAAAAAAAMhl7F5KfcuWLVqzZo2+//573X///SpQoIDN/gULFjgsXHb16dNH3bp108yZM+/4sQEAAAAAAAAAAAAAuZvdhfEiRYromWeecUaW29a4cWOtW7fO1TEAAAAAAAAAAAAAALmQ3YXx6dOnOzTAhg0b9O6772rbtm06efKkFi5cqDZt2tj0sVgsevfdd3Xq1CmFhIRoypQpeuihhxyaAwAAAAAAAAAAAACQN9l9j3FHS05OVkhIiCwWS5b7586dq/79+2vYsGHavn27QkJCFB4erjNnztzhpAAAAAAAAAAAAACAu5HdM8YrVaokk8l0w/2HDx+2a7wWLVqoRYsWN9w/ceJEde/eXVFRUZKkqVOnatmyZZo2bZoGDRpk17Ek6cqVK7py5Yp1OzEx0e4xAAAAAAAAAAAAAAB3D7sL43379rXZvnr1qnbs2KHly5f
r9ddfd1QuSVJqaqq2bdumwYMHW9vc3NzUrFkzbdq06bbGHDt2rEaMGOGoiAAAAAAAAAAAAACAXM7uwnifPn2ybLdYLNq6dWuOA13v7NmzSk9PV0BAgE17QECA9u3bZ91u1qyZdu7cqeTkZJUrV07z5s1TvXr1shxz8ODB6t+/v3U7MTFR5cuXd2huAAAAAAAAAAAAAEDuYXdh/EZatGihwYMHa/r06Y4aMttWr16d7b5eXl7y8vJyYhoAAAAAAAAAAAAAQG7i5qiB5s+fr2LFijlqOElSiRIl5O7urtOnT9u0nz59WoGBgTka22KxKDg4WGFhYTkaBwAAAAAAAAAAAACQu9k9Y7xWrVoymUzWbcMwdOrUKf3999/6+OOPHRrO09NTtWvXVkxMjNq0aSNJysjIUExMjHr27Jmjsc1ms8xmsxITE+Xv7++AtAAAAAAAAAAAAACA3Mjuwvi1AvU1bm5uKlmypBo3bqz77rvP7gBJSUk6ePCgdfvIkSOKi4tTsWLFVKFCBfXv318RERGqU6eOHnroIU2aNEnJycmKioqy+1gAAAAAAAAAAAAAgPzH7sL4sGHDHBpg69atatKkiXW7f//+kqSIiAjNmDFDHTt21N9//62hQ4fq1KlTCg0N1fLlyxUQEJCj41osFlksFqWnp+doHAAAAAAAAAAAAABA7mYyDMNwdQhXuraUekJCgvz8/Fwd57a0j4h2dQQAwB02f+ZQV0fIt/jeBYD8h+9dAAAAAADuftmeMe7m5mZzb/GsmEwmpaWl5TgUAAAAAAAAAAAAAACOku3C+MKFC2+4b9OmTfrwww+VkZHhkFB3AkupAwAAAAAAAAAAAED+kO3CeOvWrTO17d+/X4MGDdLSpUvVuXNnRUffPUuLms1mmc1m61LqAAAAAAAAAAAAAIC8ye12nnTixAl1795dNWrUUFpamuLi4jRz5kxVrFjR0fkAAAAAAAAAAAAAAMgRuwrjCQkJevPNN1WlShXt2bNHMTExWrp0qR544AFn5QMAAAAAAAAAAAAAIEeyvZT6hAkTNH78eAUGBurrr7/Ocml1AAAAAAAAAAAAAABym2wXxgcNGqRChQqpSpUqmjlzpmbOnJllvwULFjgsnDNZLBZZLBalp6e7OgoAAAAAAAAAAAAAwImyXRjv2rWrTCaTM7PcUWazWWazWYmJifL393d1HAAAAAAAAAAAAACAk2S7MD5jxgwnxgAAAAAAAAAgSb169dKiRYuUkJAgX19fPfvss5owYYI8PT1dHQ0AAAC4a7m5OgAAAAAAAACA//Paa69p3759SkxM1M6dO7Vz505NmDDB1bEAAACAu1q+LYxbLBYFBwcrLCzM1VEAAAAAAAAAq+rVq8vb21uSZBiG3NzcdODAARenAgAAAO5u+bYwbjabFR8fr9jYWFdHAQAAAAAAAGyMGzdOPj4+KlWqlHbu3KlevXq5OhIAAABwV8u3hXEAAAAAAAAgtxo0aJCSkpIUHx+vV155RYGBga6OBAAAANzVKIwDAAAAAAAAuVT16tUVEhKiyMhIV0cBAAAA7moUxgEAAAAAAIBc7OrVq9xjHAAAAMghCuMAAAAAAABALpGUlKTp06fr4sWLMgxDu3fv1qhRoxQeHu7qaAAAAMBdLd8Wxi0Wi4KDgxUWFubqKAAAAAAAAIAkyWQyac6cOapcubJ8fX3VunVrtWzZUpMmTXJ1NAAAAOCu5uHqAK5iNptlNpuVmJgof39/V8cBAAAAAAAA5O3trVWrVrk6BgAAAJDn5NsZ4wAAAAAAAAAAAACA/IHCOAAAAAAAAAAAAAAgT6MwDgAAAAAAAAAAAADI0yiMAwAAAAAAAAAAAADyNArjAAAAAAAAAAAAAIA8jcI4AAAAAAAAAAAAACBP83B1AFexWCyyWCxKT093dRQAAAAAAHAD7SOiXR0BAHCHzZ851NURAABAHpRvZ4ybzWbFx8crNjbW1VEAAAAAAAAAAAAAAE6UbwvjAAA
AAAAAAAAAAID8gcI4AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPI0CuMAAAAAAAAAAAAAgDyNwjgAAAAAAAAAAAAAIE+jMA4AAAAAAAAAAAAAyNMojAMAAAAAAAAAAAAA8jQK4wAAAAAAAAAAAACAPC3fFsYtFouCg4MVFhbm6igAAAAAAAAAAAAAACfKt4Vxs9ms+Ph4xcbGujoKAAAAAAAAAAAAAMCJ8m1hHAAAAAAAAAAAAPnbRx99pDp16sjLy0tt2rRxdRwATuTh6gAAAAAAAAAAAACAK5QpU0bvvPOOVq9erT///NPVcQA4EYVxAAAAAAAAAAAA5EvPPPOMJCkuLo7COJDHsZQ6AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPI0CuMAAAAAAAAAAAAAgDzNw9UBAAAAAAAAAAAAAFdIS0uzPjIyMnT58mW5ubnJ09PT1dEAOBiFcQAAAAAAAAAAAORLo0aN0ogRI6zbhQoVUqNGjbRu3TrXhQLgFCylDgAAAAAAAAAAgHxp+PDhMgzD5kFRHMib8kRh/Pvvv1e1atV077336vPPP3d1HAAAAAAAAAAAAABALnLXL6Welpam/v37a+3atfL391ft2rXVtm1bFS9e3NXRAAAAAAAAAAAAAAC5wF0/Y3zLli26//77VbZsWfn4+KhFixZauXKlq2MBAAAAAAAAAAAAAHIJlxfGN2zYoFatWqlMmTIymUxatGhRpj4Wi0VBQUEqWLCg6tatqy1btlj3nThxQmXLlrVuly1bVn/99dediA4AAAAAAAAAAAAAuAu4vDCenJyskJAQWSyWLPfPnTtX/fv317Bhw7R9+3aFhIQoPDxcZ86cucNJAQAAAAAAAAAAAAB3I5cXxlu0aKFRo0apbdu2We6fOHGiunfvrqioKAUHB2vq1KkqXLiwpk2bJkkqU6aMzQzxv/76S2XKlLnh8a5cuaLExESbBwAAAAAAAAAAAAAg7/JwdYCbSU1N1bZt2zR48GBrm5ubm5o1a6ZNmzZJkh566CH99ttv+uuvv+Tv768ff/xRQ4YMueGYY8eO1YgRI5yeHQAAAAAAAACAu037iGhXRwAA3GHzZw51dYQ7wuUzxm/m7NmzSk9PV0BAgE17QECATp06JUny8PDQ+++/ryZNmig0NFQDBgxQ8eLFbzjm4MGDlZCQYH0cP37cqa8BAAAAAAAAAAAAAOBauXrGeHY9/fTTevrpp7PV18vLS15eXk5OBAAAAAAAAAAAAADILXL1jPESJUrI3d1dp0+ftmk/ffq0AgMDczS2xWJRcHCwwsLCcjQOAAAAAAAAAAAAACB3y9WFcU9PT9WuXVsxMTHWtoyMDMXExKhevXo5GttsNis+Pl6xsbE5jQkAAAAAAAAAAAAAyMVcvpR6UlKSDh48aN0+cuSI4uLiVKxYMVWoUEH9+/dXRESE6tSpo4ceekiTJk1ScnKyoqKiXJgaAAAAAAAAAAAAAHC3cHlhfOvWrWrSpIl1u3///pKkiIgIzZgxQx07dtTff/+toUOH6tSpUwoNDdXy5csVEBCQo+NaLBZZLBalp6fnaBwAAAAAAAAAAAAAQO7m8sJ448aNZRjGTfv07NlTPXv2dOhxzWazzGazEhMT5e/v79CxAQAAAAAAAAAAAAC5R66+xzgAAAAAAAAAAAAAADmVbwvjFotFwcHBCgsLc3UUAAAAAAAAAAAAAIAT5dvCuNlsVnx8vGJjY10dBQAAAAAAAAAAAADgRPm2MA4AAAAAAAAAAAAAyB8ojAMAAAAAAAAAAAAA8jQPVwdwFYvFIovForS0NElSYmKiixPdvqupl10dAQBwh93N31t3O753ASD/4XvXtfjuBYD8h+9e1+K7FwDyn7zw3evr6yuTyXTTPibDMIw7lCdX+vPPP1W+fHlXxwAAAAAAAAAAAAAA3IaEhAT5+fndtE++L4xnZGT
oxIkT2forAgC5R2JiosqXL6/jx4/f8hcdAADIOb57AQC4s/juBQDgzuK7F7i7ZafWm2+XUr/Gzc1N5cqVc3UMALfJz8+P/0gBAOAO4rsXAIA7i+9eAADuLL57gbzLzdUBAAAAAAAAAAAAAABwJgrjAAAAAAAAAAAAAIA8jcI4gLuSl5eXhg0bJi8vL1dHAQAgX+C7FwCAO4vvXgAA7iy+e4G8z2QYhuHqEAAAAAAAAAAAAAAAOAszxgEAAAAAAAAAAAAAeRqFcQAAAAAAAAAAAABAnkZhHAAAAAAAAAAAAACQp1EYB3BX2bBhg1q1aqUyZcrIZDJp0aJFro4EAECeNnbsWIWFhcnX11elSpVSmzZttH//flfHAgAgz7p06ZL69u2rihUrqlChQqpfv75iY2NdHQsAgDzhVv++HBkZKZPJZPNo3ry5a8ICcDgK4wDuKsnJyQoJCZHFYnF1FAAA8oX169fLbDbr119/1apVq3T16lU98cQTSk5OdnU0AADypJdeekmrVq3SrFmztHv3bj3xxBNq1qyZ/vrrL1dHAwDgrpedf19u3ry5Tp48aX18/fXXdzAhAGcyGYZhuDoEANwOk8mkhQsXqk2bNq6OAgBAvvH333+rVKlSWr9+vRo2bOjqOAAA5Cn//POPfH19tXjxYrVs2dLaXrt2bbVo0UKjRo1yYToAAPKWrP59OTIyUhcvXmSlUiCPYsY4AAAAgGxLSEiQJBUrVszFSQAAyHvS0tKUnp6uggUL2rQXKlRIP//8s4tSAQCQv6xbt06lSpVStWrV9Oqrr+rcuXOujgTAQSiMAwAAAMiWjIwM9e3bVw0aNNADDzzg6jgAAOQ5vr6+qlevnkaOHKkTJ04oPT1dX331lTZt2qSTJ0+6Oh4AAHle8+bN9eWXXyomJkbjx4/X+vXr1aJFC6Wnp7s6GgAH8HB1AAAAAAB3B7PZrN9++40ZawAAONGsWbPUrVs3lS1bVu7u7nrwwQfVqVMnbdu2zdXRAADI85577jnrzzVq1FDNmjVVuXJlrVu3Tk2bNnVhMgCOwIxxAAAAALfUs2dPff/991q7dq3KlSvn6jgAAORZlStX1vr165WUlKTjx49ry5Ytunr1qu655x5XRwMAIN+55557VKJECR08eNDVUQA4AIVxAAAAADdkGIZ69uyphQsXas2aNapUqZKrIwEAkC94e3urdOnSunDhglasWKHWrVu7OhIAAPnOn3/+qXPnzql06dKujgLAAVhKHcBdJSkpyeav844cOaK4uDgVK1ZMFSpUcGEyAADyJrPZrDlz5mjx4sXy9fXVqVOnJEn+/v4qVKiQi9MBAJD3rFixQoZhqFq1ajp48KBef/113XfffYqKinJ1NAAA7no3+/flYsWKacSIEWrXrp0CAwN16NAhvfHGG6pSpYrCw8NdmBqAo5gMwzBcHQIAsmvdunVq0qRJpvaIiAjNmDHjzgcCACCPM5lMWbZPnz5dkZGRdzYMAAD5wLfffqvBgwfrzz//VLFixdSuXTuNHj1a/v7+ro4GAMBd72b/vvzJJ5+oTZs22rFjhy5evKgyZcroiSee0MiRIxUQEOCCtAAcjcI4AAAAAAAAAAAAACBP4x7jAAAAAAAAAAAAAIA8jcI4AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPI0CuMAAAAAAAAAAAAAgDyNwjgAAAAAAAAAAAAAIE+jMA4AAAAAAAAAAAAAyNMojAMAAAAAAAAAAAAA8jQK4wAAAADgAkePHpXJZFJcXJyro1jt27dPDz/8sAoWLKjQ0FBXx3GayMhItWnTxtUxcpWgoCBNmjTJJceeMWOGihQp4rDxGjdurL59+97RY94OZ/0OyM3v73Xr1slkMunixYuujgIAAAAgH6IwDgAAACBfioyMlMlk0rhx42zaFy1aJJPJ5KJUrjVs2DB5e3tr//79iomJybJPdoqOud3kyZM1Y8YMu57
jysLxnRAbG6uXX3452/1zQ2H5RhYsWKCRI0dat3PrtStfvrxOnjypBx54wNVR7pj69evr5MmT8vf3d+i4JpNJixYtcuiYAAAAAPIeCuMAAAAA8q2CBQtq/PjxunDhgqujOExqauptP/fQoUN65JFHVLFiRRUvXvy2xzEMQ2lpabf9fGdJT09XRkaG/P39c21R11VKliypwoULuzqGQxQrVky+vr6ujnFL7u7uCgwMlIeHh6uj3BFXr16Vp6enAgMD8+0fHwEAAABwLQrjAAAAAPKtZs2aKTAwUGPHjr1hn+HDh2daVnzSpEkKCgqybl9bunjMmDEKCAhQkSJFFB0drbS0NL3++usqVqyYypUrp+nTp2caf9++fapfv74KFiyoBx54QOvXr7fZ/9tvv6lFixby8fFRQECAXnjhBZ09e9a6v3HjxurZs6f69u2rEiVKKDw8PMvXkZGRoejoaJUrV05eXl4KDQ3V8uXLrftNJpO2bdum6OhomUwmDR8+PNMYkZGRWr9+vSZPniyTySSTyaSjR49al0f+8ccfVbt2bXl5eennn3/WoUOH1Lp1awUEBMjHx0dhYWFavXq1zZhBQUEaM2aMunXrJl9fX1WoUEGffvqpdX9qaqp69uyp0qVLq2DBgqpYsaLN9bp48aJ69OihgIAA6zn8/vvvJf3frOYlS5YoODhYXl5eOnbsWKalpq+dw549e8rf318lSpTQkCFDZBiGdf8ff/yhfv36WV+3JP3xxx9q1aqVihYtKm9vb91///364Ycfsjz/kjRr1izVqVNHvr6+CgwM1PPPP68zZ85Y91+4cEGdO3dWyZIlVahQId17773W98ytzsOxY8fUunVr+fj4yM/PTx06dNDp06dtjr906VKFhYWpYMGCKlGihNq2bWtzHa6fVT1x4kTVqFFD3t7eKl++vF577TUlJSVJ+nc57KioKCUkJFjPx7X3y5UrVzRw4ECVLVtW3t7eqlu3rtatW2eTY8aMGapQoYIKFy6stm3b6ty5czc8Z5LUvn179ezZ07rdt29fmUwm7du3z3puvL29re+t61c1uNG1u2bFihWqXr26fHx81Lx5c508efKGOdLT0/Xiiy+qUqVKKlSokKpVq6bJkyffNPvNrul/l1K/9jmKiYlRnTp1VLhwYdWvX1/79++3GXPUqFEqVaqUfH199dJLL2nQoEE3vfVBRkaGxo4da80dEhKi+fPn37D/W2+9pbp162ZqDwkJUXR0tKR/Vxh4/PHHVaJECfn7+6tRo0bavn27TX+TyaRPPvlETz/9tLy9vTV69OhMS6mfO3dOnTp1UtmyZVW4cGHVqFFDX3/9tc04jRs3Vu/evfXGG2+oWLFiCgwMtPn9dO13cdu2bWUymWx+NwMAAADA9SiMAwAAAMi33N3dNWbMGE2ZMkV//vlnjsZas2aNTpw4oQ0bNmjixIkaNmyYnnrqKRUtWlSbN2/WK6+8oh49emQ6zuuvv64BAwZox44dqlevnlq1amUtFF68eFGPPfaYatWqpa1bt2r58uU6ffq0OnToYDPGzJkz5enpqY0bN2rq1KlZ5ps8ebLef/99vffee9q1a5fCw8P19NNP68CBA5KkkydP6v7779eAAQN08uRJDRw4MMsx6tWrp+7du+vkyZM6efKkypcvb90/aNAgjRs3Tnv37lXNmjWVlJSkJ598UjExMdqxY4eaN2+uVq1a6dixYzbjvv/++6pTp4527Nih1157Ta+++qq1GPjhhx9qyZIl+vbbb7V//37Nnj3bWvjKyMhQixYttHHjRn311VeKj4/XuHHj5O7ubh07JSVF48eP1+eff649e/aoVKlSWZ6fmTNnysPDQ1u2bNHkyZM1ceJEff7555L+XZq7XLlyio6Otr5uSTKbzbpy5Yo2bNig3bt3a/z48fLx8clyfOnfGbMjR47Uzp07tWjRIh09elSRkZHW/UOGDFF8fLx+/PFH7d27V5988olKlCiRrfPQunV
rnT9/XuvXr9eqVat0+PBhdezY0Tr2smXL1LZtWz355JPasWOHYmJi9NBDD90wq5ubmz788EPt2bNHM2fO1Jo1a/TGG29I+nc57EmTJsnPz896Pq69X3r27KlNmzbpm2++0a5du/Tss8+qefPm1vfZ5s2b9eKLL6pnz56Ki4tTkyZNNGrUqBvmkKRGjRrZFNfXr1+vEiVKWNtiY2N19epV1a9fP9Nzb3TtpH/fG++9955mzZqlDRs26NixY1m+76/JyMhQuXLlNG/ePMXHx2vo0KF666239O23397wOTe7pjfy9ttv6/3339fWrVvl4eGhbt26WffNnj1bo0eP1vjx47Vt2zZVqFBBn3zyyU3HGzt2rL788ktNnTpVe/bsUb9+/dSlS5dMf4RzTefOnbVlyxYdOnTI2rZnzx7t2rVLzz//vCTp0qVLioiI0M8//6xff/1V9957r5588kldunTJZqzhw4erbdu22r17t83ruOby5cuqXbu2li1bpt9++00vv/yyXnjhBW3ZssWm38yZM+Xt7a3NmzdrwoQJio6O1qpVqyT9e/0lafr06Tp58qR1GwAAAAAyMQAAAAAgH4qIiDBat25tGIZhPPzww0a3bt0MwzCMhQsXGtf/r9KwYcOMkJAQm+d+8MEHRsWKFW3GqlixopGenm5tq1atmvHoo49at9PS0gxvb2/j66+/NgzDMI4cOWJIMsaNG2ftc/XqVaNcuXLG+PHjDcMwjJEjRxpPPPGEzbGPHz9uSDL2799vGIZhNGrUyKhVq9YtX2+ZMmWM0aNH27SFhYUZr732mnU7JCTEGDZs2E3HadSokdGnTx+btrVr1xqSjEWLFt0yx/33329MmTLFul2xYkWjS5cu1u2MjAyjVKlSxieffGIYhmH06tXLeOyxx4yMjIxMY61YscJwc3Oznov/mj59uiHJiIuLs2m//tpfe03Vq1e3Ocabb75pVK9e3SbnBx98YDNOjRo1jOHDh9/yNd9IbGysIcm4dOmSYRiG0apVKyMqKirLvjc7DytXrjTc3d2NY8eOWdv27NljSDK2bNliGIZh1KtXz+jcufMNs2T1+q43b948o3jx4tbt6dOnG/7+/jZ9/vjjD8Pd3d3466+/bNqbNm1qDB482DAMw+jUqZPx5JNP2uzv2LFjprGut2vXLsNkMhlnzpwxzp8/b3h6ehojR440OnbsaBiGYYwaNcqoX7++tf9/36NZvbZr742DBw9a2ywWixEQEHDDHFkxm81Gu3btbrj/Ztf02u+AHTt2GIbxf5+j1atXW/ssW7bMkGT8888/hmEYRt26dQ2z2WwzToMGDWx+R13//r58+bJRuHBh45dffrF5zosvvmh06tTphrlDQkKM6Oho6/bgwYONunXr3rB/enq64evrayxdutTaJsno27evTb9rr/HChQs3HKtly5bGgAEDrNuNGjUyHnnkEZs+YWFhxptvvmlzrIULF95wTAAAAAAwDMNgxjgAAACAfG/8+PGaOXOm9u7de9tj3H///XJz+7//xQoICFCNGjWs2+7u7ipevLjN0tmSVK9ePevPHh4eqlOnjjXHzp07tXbtWvn4+Fgf9913nyTZzOasXbv2TbMlJibqxIkTatCggU17gwYNcvSa/6tOnTo220lJSRo4cKCqV6+uIkWKyMfHR3v37s00Y7xmzZrWn00mkwIDA63nKTIyUnFxcapWrZp69+6tlStXWvvGxcWpXLlyqlq16g0zeXp62ox/Iw8//LDNMtv16tXTgQMHlJ6efsPn9O7dW6NGjVKDBg00bNgw7dq166bH2LZtm1q1aqUKFSrI19dXjRo1kiTr+Xj11Vf1zTffKDQ0VG+88YZ++eUX63Nvdh727t2r8uXL28zeDw4OVpEiRazXNy4uTk2bNr3lebhm9erVatq0qcqWLStfX1+98MILOnfunFJSUm74nN27dys9PV1Vq1a1ec+uX7/e+n7du3dvpmW6r/8MZOWBBx5QsWL
FtH79ev3000+qVauWnnrqKeuM5/Xr16tx48bZfm3XFC5cWJUrV7Zuly5dOtPn878sFotq166tkiVLysfHR59++mmm9/P1bnZNb+T692vp0qUlyZpr//79mWb632zm/8GDB5WSkqLHH3/c5pp8+eWXNr9D/qtz586aM2eOJMkwDH399dfq3Lmzdf/p06fVvXt33XvvvfL395efn5+SkpIynYv//k74r/T0dI0cOVI1atRQsWLF5OPjoxUrVtz0d4SUvWsFAAAAAP9FYRwAAABAvtewYUOFh4dr8ODBmfa5ublZ7zV9zdWrVzP1K1CggM22yWTKsi0jIyPbuZKSktSqVSvFxcXZPA4cOKCGDRta+3l7e2d7TGf6b46BAwdq4cKFGjNmjH766SfFxcWpRo0aSk1Ntel3s/P04IMP6siRIxo5cqT++ecfdejQQe3bt5ckFSpU6JaZChUqlOm+0o7y0ksv6fDhw3rhhRe0e/du1alTR1OmTMmyb3JyssLDw+Xn56fZs2crNjZWCxculCTr+WjRooX1ftgnTpxQ06ZNrUt73+w8ZEd2ztU1R48e1VNPPaWaNWvqu+++07Zt22SxWGyyZiUpKUnu7u7atm2bzft17969t7wX982YTCY1bNhQ69atsxbBa9asqStXrui3337TL7/8Yv0jA3tk9b7772f9et98840GDhyoF198UStXrlRcXJyioqJuek5udk2zk+vae9ee3xvXu3Zf+GXLltlck/j4+JveZ7xTp07av3+/tm/frl9++UXHjx+3WZo/IiJCcXFxmjx5sn755RfFxcWpePHimc7FrX43vfvuu5o8ebLefPNNrV27VnFxcQoPD7frdwQAAAAAZBeFcQAAAACQNG7cOC1dulSbNm2yaS9ZsqROnTplUzCLi4tz2HF//fVX689paWnatm2bqlevLunfYuiePXsUFBSkKlWq2DzsKYb7+fmpTJky2rhxo037xo0bFRwcbFdeT0/Pm86i/u/4kZGRatu2rWrUqKHAwEAdPXrUruNJ/+bv2LGjPvvsM82dO1ffffedzp8/r5o1a+rPP//U77//bveY/7V582ab7Wv3Tb52v/Ibve7y5cvrlVde0YIFCzRgwAB99tlnWY6/b98+nTt3TuPGjdOjjz6q++67L8sZryVLllRERIS++uorTZo0SZ9++ql1343OQ/Xq1XX8+HEdP37c2jc+Pl4XL160Xt+aNWsqJiYmW+di27ZtysjI0Pvvv6+HH35YVatW1YkTJ2z6ZHU+atWqpfT0dJ05cybT+zUwMFCSVL169SzP9a1cu8/4unXr1LhxY7m5ualhw4Z69913deXKlUyrIdwq6+3YuHGj6tevr9dee021atVSlSpVbjrr+pqbXVN7VatWLdM9tG92T+3g4GB5eXnp2LFjma7J9SsM/Fe5cuXUqFEjzZ49W7Nnz9bjjz+uUqVKWfdv3LhRvXv31pNPPqn7779fXl5eOnv2rN2vZ+PGjWrdurW6dOmikJAQ3XPPPbf1eS5QoIBDrjEAAACAvI3COAAAAABIqlGjhjp37qwPP/zQpr1x48b6+++/NWHCBB06dEgWi0U//vijw45rsVi0cOFC7du3T2azWRcuXFC3bt0kSWazWefPn1enTp0UGxurQ4cOacWKFYqKirK7CPT6669r/Pjxmjt3rvbv369BgwYpLi5Offr0sWucoKAgbd68WUePHtXZs2dvOmvz3nvv1YIFCxQXF6edO3fq+eeft3uW58SJE/X1119r3759+v333zVv3jwFBgaqSJEiatSokRo2bKh27dpp1apVOnLkiH788UctX77crmNI/y5n3r9/f+3fv19ff/21pkyZYnNugoKCtGHDBv3111/WAmDfvn21YsUKHTlyRNu3b9fatWutf9TwXxUqVJCnp6emTJmiw4cPa8mSJRo5cqRNn6FDh2rx4sU6ePCg9uzZo++//9463s3OQ7Nmzazv3+3bt2vLli3q2rWrGjV
qZF3KetiwYfr66681bNgw7d27V7t379b48eOzzFqlShVdvXrVmnXWrFmaOnWqTZ+goCAlJSUpJiZGZ8+eVUpKiqpWrarOnTura9euWrBggY4cOaItW7Zo7NixWrZsmaR/l59fvny53nvvPR04cEAfffRRtq5X48aNFR8frz179uiRRx6xts2ePVt16tS56R+KZHXtbse9996rrVu3asWKFfr99981ZMiQmxalpZtf09vRq1cvffHFF5o5c6YOHDigUaNGadeuXTdcFcHX11cDBw5Uv379NHPmTB06dEjbt2/XlClTNHPmzJseq3Pnzvrmm280b948m2XUpX/PxaxZs7R3715t3rxZnTt3tmtVguvHWbVqlX755Rft3btXPXr00OnTp+0eJygoSDExMTp16pQuXLhg9/MBAAAA5A8UxgEAAADg/4uOjs5UuK1evbo+/vhjWSwWhYSEaMuWLbdcCtke48aN07hx4xQSEqKff/5ZS5YsUYkSJSTJOss7PT1dTzzxhGrUqKG+ffuqSJEiNvczz47evXurf//+GjBggGrUqKHly5dryZIluvfee+0aZ+DAgXJ3d1dwcLBKlix50/srT5w4UUWLFlX9+vXVqlUrhYeH68EHH7TreL6+vpowYYLq1KmjsLAwHT16VD/88IP19X/33XcKCwtTp06dFBwcrDfeeOO2Zo527dpV//zzjx566CGZzWb16dNHL7/8snV/dHS0jh49qsqVK6tkyZKS/r0/stlsVvXq1dW8eXNVrVpVH3/8cZbjlyxZUjNmzNC8efMUHByscePG6b333rPp4+npqcGDB6tmzZpq2LCh3N3d9c0339zyPJhMJi1evFhFixZVw4YN1axZM91zzz2aO3eudezGjRtr3rx5WrJkiUJDQ/XYY49py5YtWWYNCQnRxIkTNX78eD3wwAOaPXu2xo4da9Onfv36euWVV9SxY0eVLFlSEyZMkCRNnz5dXbt21YABA1StWjW1adNGsbGxqlChgqR/7+X+2WefafLkyQoJCdHKlSv1zjvv3PL61KhRQ0WKFFFoaKh8fHysryk9Pf2W9xfP6trdjh49euiZZ55Rx44dVbduXZ07d06vvfbaTZ9zs2t6Ozp37qzBgwdr4MCB1uX1IyMjVbBgwRs+Z+TIkRoyZIjGjh1rfa8uW7ZMlSpVuumx2rdvb72vfJs2bWz2ffHFF7pw4YIefPBBvfDCC+rdu7fNjPLseuedd/Tggw8qPDxcjRs3VmBgYKZjZcf777+vVatWqXz58qpVq5bdzwcAAACQP5iMm91ACwAAAACAPK5x48YKDQ3VpEmTXB0FsNvjjz+uwMBAzZo1y9VRAAAAACBX83B1AAAAAAAAANxaSkqKpk6dqvDwcLm7u+vrr7/W6tWrtWrVKldHAwAAAIBcj8I4AAAAAADAXcBkMumHH37Q6NGjdfnyZVWrVk3fffedmjVr5upoAAAAAJDrsZQ6AAAAAAAAAAAAACBPc3N1AAAAAAAAAAAAAAAAnInCOAAAAAAAAAAAAAAgT6MwDgAAAAAAAAAAAADI0yiMAwAAAAAAAAAAAADyNArjAAAAAAAAAAAAAIA8jcI4AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPI0CuMAAAAAAAAAAAAAgDzt/wHZ8mipKCuUGgAAAABJRU5ErkJggg==", "text/plain": [ "
" ] @@ -2043,7 +2481,7 @@ { "data": { "text/plain": [ - "(35339, 31)" + "(35328, 31)" ] }, "execution_count": 22, @@ -2149,53 +2587,148 @@ "variants.head(2)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "ee3f08ed", - "metadata": {}, - "outputs": [], - "source": [ - "# Get the reference sequence and CDS context annotation of the variants from the GTF annotation and FASTA files\n", - "# Using same annotation file that the authors used\n", - "from tqdm import tqdm\n", - "\n", - "\n", - "# Extract assembly from first variant_id (e.g. chr1_925969_C_T_hg38 -> hg38)\n", - "assembly = variants[\"variant_id\"].iloc[0].split(\"_\")[-1]\n", - "assert assembly == \"hg38\"\n", - "# Extract genomic coordinates from the variant_id\n", - "variants[[\"chrom\", \"pos\", \"ref\", \"alt\"]] = variants[\"variant_id\"].str.extract(\n", - " r\"(chr\\d+|chrX|chrY)_(\\d+)_([ACGT])_([ACGT])\"\n", - ")\n", - "variants[\"pos\"] = variants[\"pos\"].astype(int)\n", - "variants = variants.sort_values(by=[\"chrom\", \"pos\"]).reset_index(drop=True).reset_index()\n", - "# Remove version numbers after dot in transcript_id\n", - "variants[\"transcript_id\"] = variants[\"transcript_id\"].str.split(\".\").str[0]\n", - "gtf_s, fasta = process_gtf(f\"{DATA_DIR}/ucsc_gencodev32_hg38.tsv\", f\"{DATA_DIR}/reference/{assembly}/{assembly}.fa\")\n", - "print(f\"Processed {gtf_s.shape[0]} GTF CDS sequences\")\n", - "display(gtf_s[[\"name\", \"chrom\", \"strand\", \"cdsStart\", \"cdsEnd\", \"cds_starts\", \"cds_ends\", \"cds_length\"]].head(2))" - ] - }, { "cell_type": "code", "execution_count": 26, - "id": "a45df45b", + "id": "ee3f08ed", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Processing chromosomes: 100%|██████████| 24/24 [00:04<00:00, 5.01it/s]\n" + "Processing transcripts: 100%|██████████| 110025/110025 [00:16<00:00, 6700.35it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - " Processed 312994 mutations with CDS 
context:\n" + "Processed 110025 GTF CDS sequences\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namechromstrandcdsStartcdsEndcds_startscds_endscds_length
0ENST00000641515chr1+6556470008(65564, 69036)(65573, 70008)981
1ENST00000335137chr1+6909070008(69090,)(70008,)918
\n", + "
" + ], + "text/plain": [ + " name chrom strand cdsStart cdsEnd cds_starts \\\n", + "0 ENST00000641515 chr1 + 65564 70008 (65564, 69036) \n", + "1 ENST00000335137 chr1 + 69090 70008 (69090,) \n", + "\n", + " cds_ends cds_length \n", + "0 (65573, 70008) 981 \n", + "1 (70008,) 918 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Get the reference sequence and CDS context annotation of the variants from the GTF annotation and FASTA files\n", + "# Using same annotation file that the authors used\n", + "from tqdm import tqdm\n", + "\n", + "\n", + "# Extract assembly from first variant_id (e.g. chr1_925969_C_T_hg38 -> hg38)\n", + "assembly = variants[\"variant_id\"].iloc[0].split(\"_\")[-1]\n", + "assert assembly == \"hg38\"\n", + "# Extract genomic coordinates from the variant_id\n", + "variants[[\"chrom\", \"pos\", \"ref\", \"alt\"]] = variants[\"variant_id\"].str.extract(\n", + " r\"(chr\\d+|chrX|chrY)_(\\d+)_([ACGT])_([ACGT])\"\n", + ")\n", + "variants[\"pos\"] = variants[\"pos\"].astype(int)\n", + "variants = variants.sort_values(by=[\"chrom\", \"pos\"]).reset_index(drop=True).reset_index()\n", + "# Remove version numbers after dot in transcript_id\n", + "variants[\"transcript_id\"] = variants[\"transcript_id\"].str.split(\".\").str[0]\n", + "gtf_s, fasta = process_gtf(\n", + " f\"{DATA_DIR}/reference/ucsc_gencodev32_hg38.tsv\", f\"{DATA_DIR}/reference/{assembly}/{assembly}.fa\"\n", + ")\n", + "print(f\"Processed {gtf_s.shape[0]} GTF CDS sequences\")\n", + "display(gtf_s[[\"name\", \"chrom\", \"strand\", \"cdsStart\", \"cdsEnd\", \"cds_starts\", \"cds_ends\", \"cds_length\"]].head(2))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "a45df45b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing chromosomes: 0%| | 0/24 [00:00\n", " \n", " \n", + " level_0\n", " id\n", " variant_id\n", " transcript_id\n", @@ -2447,13 +2981,12 @@ " chrom\n", " pos\n", 
" ref\n", - " alt\n", " ...\n", - " alt_codon\n", " ref_aa\n", " alt_aa\n", " alt_seq\n", " codon_position\n", + " level_0_y\n", " index_y\n", " transcript_id_y\n", " protein_variant_y\n", @@ -2465,6 +2998,7 @@ " \n", " 0\n", " 0\n", + " 0\n", " chr1_925969_C_T_hg38\n", " ENST00000342066\n", " Q96NU1:P10S\n", @@ -2473,14 +3007,13 @@ " chr1\n", " 925969\n", " C\n", - " T\n", " ...\n", - " TCT\n", " P\n", " S\n", " ATGTCCAAGGGGATCCTGCAGGTGCATTCTCCGATCTGCGACTGCC...\n", " 9\n", " 0\n", + " 0\n", " ENST00000342066\n", " Q96NU1:P10S\n", " 0.967398\n", @@ -2489,6 +3022,7 @@ " \n", " 1\n", " 1\n", + " 1\n", " chr1_930165_G_A_hg38\n", " ENST00000342066\n", " Q96NU1:R28Q\n", @@ -2497,14 +3031,13 @@ " chr1\n", " 930165\n", " G\n", - " A\n", " ...\n", - " CAG\n", " R\n", " Q\n", " ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC...\n", " 27\n", " 1\n", + " 1\n", " ENST00000342066\n", " Q96NU1:R28Q\n", " 0.662765\n", @@ -2512,30 +3045,34 @@ " \n", " \n", "\n", - "

2 rows × 27 columns

\n", + "

2 rows × 29 columns

\n", "" ], "text/plain": [ - " id variant_id transcript_id protein_variant AlphaMissense \\\n", - "0 0 chr1_925969_C_T_hg38 ENST00000342066 Q96NU1:P10S 0.967398 \n", - "1 1 chr1_930165_G_A_hg38 ENST00000342066 Q96NU1:R28Q 0.662765 \n", + " level_0 id variant_id transcript_id protein_variant \\\n", + "0 0 0 chr1_925969_C_T_hg38 ENST00000342066 Q96NU1:P10S \n", + "1 1 1 chr1_930165_G_A_hg38 ENST00000342066 Q96NU1:R28Q \n", + "\n", + " AlphaMissense label chrom pos ref ... ref_aa alt_aa \\\n", + "0 0.967398 0.0 chr1 925969 C ... P S \n", + "1 0.662765 0.0 chr1 930165 G ... R Q \n", "\n", - " label chrom pos ref alt ... alt_codon ref_aa alt_aa \\\n", - "0 0.0 chr1 925969 C T ... TCT P S \n", - "1 0.0 chr1 930165 G A ... CAG R Q \n", + " alt_seq codon_position \\\n", + "0 ATGTCCAAGGGGATCCTGCAGGTGCATTCTCCGATCTGCGACTGCC... 9 \n", + "1 ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC... 27 \n", "\n", - " alt_seq codon_position index_y \\\n", - "0 ATGTCCAAGGGGATCCTGCAGGTGCATTCTCCGATCTGCGACTGCC... 9 0 \n", - "1 ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC... 
27 1 \n", + " level_0_y index_y transcript_id_y protein_variant_y AlphaMissense_y \\\n", + "0 0 0 ENST00000342066 Q96NU1:P10S 0.967398 \n", + "1 1 1 ENST00000342066 Q96NU1:R28Q 0.662765 \n", "\n", - " transcript_id_y protein_variant_y AlphaMissense_y label_y \n", - "0 ENST00000342066 Q96NU1:P10S 0.967398 0.0 \n", - "1 ENST00000342066 Q96NU1:R28Q 0.662765 0.0 \n", + " label_y \n", + "0 0.0 \n", + "1 0.0 \n", "\n", - "[2 rows x 27 columns]" + "[2 rows x 29 columns]" ] }, - "execution_count": 29, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -2546,7 +3083,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "id": "7fbe8ebf", "metadata": {}, "outputs": [ @@ -2567,7 +3104,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "id": "7d6dc737", "metadata": {}, "outputs": [], @@ -2585,7 +3122,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "id": "2b5660bb", "metadata": {}, "outputs": [ @@ -2648,7 +3185,7 @@ "1 1 " ] }, - "execution_count": 32, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -2661,10 +3198,96 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "9c9927e0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing transcripts: 100%|██████████| 110025/110025 [00:16<00:00, 6591.19it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processed 110025 GTF CDS sequences\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namechromstrandcdsStartcdsEndcds_startscds_endscds_length
0ENST00000641515chr1+6556470008(65564, 69036)(65573, 70008)981
1ENST00000335137chr1+6909070008(69090,)(70008,)918
\n", + "
" + ], + "text/plain": [ + " name chrom strand cdsStart cdsEnd cds_starts \\\n", + "0 ENST00000641515 chr1 + 65564 70008 (65564, 69036) \n", + "1 ENST00000335137 chr1 + 69090 70008 (69090,) \n", + "\n", + " cds_ends cds_length \n", + "0 (65573, 70008) 981 \n", + "1 (70008,) 918 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Get the reference sequence and CDS context annotation of the variants from the GTF annotation and FASTA files #\n", "# Using same annotation file that the authors used\n", @@ -2681,14 +3304,16 @@ "## Remove version numbers after dot in transcript_id\n", "variants[\"transcript_id\"] = variants[\"transcript_id\"].str.split(\".\").str[0]\n", "## Get the CDS sequences and annotations from the GTF and FASTA files\n", - "gtf_s, fasta = process_gtf(f\"{DATA_DIR}/ucsc_gencodev32_hg38.tsv\", f\"{DATA_DIR}/reference/{assembly}/{assembly}.fa\")\n", + "gtf_s, fasta = process_gtf(\n", + " f\"{DATA_DIR}/reference/ucsc_gencodev32_hg38.tsv\", f\"{DATA_DIR}/reference/{assembly}/{assembly}.fa\"\n", + ")\n", "print(f\"Processed {gtf_s.shape[0]} GTF CDS sequences\")\n", "display(gtf_s[[\"name\", \"chrom\", \"strand\", \"cdsStart\", \"cdsEnd\", \"cds_starts\", \"cds_ends\", \"cds_length\"]].head(2))" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 39, "id": "89778ae1", "metadata": {}, "outputs": [ @@ -2696,7 +3321,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing chromosomes: 100%|██████████| 23/23 [00:00<00:00, 66.11it/s]" + "Processing chromosomes: 0%| | 0/23 [00:00\n", " \n", " \n", + " level_0\n", " id\n", " variant_id\n", " transcript_id\n", @@ -2931,13 +3564,12 @@ " chrom\n", " pos\n", " ref\n", - " alt\n", " ...\n", - " alt_codon\n", " ref_aa\n", " alt_aa\n", " alt_seq\n", " codon_position\n", + " level_0_y\n", " index_y\n", " transcript_id_y\n", " protein_variant_y\n", @@ -2949,6 +3581,7 @@ " \n", " 0\n", " 0\n", + " 0\n", " chr1_2557810_G_A_hg38\n", " ENST00000355716\n", 
" Q92956:E52K\n", @@ -2957,14 +3590,13 @@ " chr1\n", " 2557810\n", " G\n", - " A\n", " ...\n", - " AAG\n", " E\n", " K\n", " ATGGAGCCTCCTGGAGACTGGGGGCCTCCTCCCTGGAGATCCACCC...\n", " 51\n", " 0\n", + " 0\n", " ENST00000355716\n", " Q92956:E52K\n", " 0.232843\n", @@ -2973,6 +3605,7 @@ " \n", " 1\n", " 1\n", + " 1\n", " chr1_2558346_A_G_hg38\n", " ENST00000355716\n", " Q92956:Y61C\n", @@ -2981,14 +3614,13 @@ " chr1\n", " 2558346\n", " A\n", - " G\n", " ...\n", - " TGT\n", " Y\n", " C\n", " ATGGAGCCTCCTGGAGACTGGGGGCCTCCTCCCTGGAGATCCACCC...\n", " 60\n", " 1\n", + " 1\n", " ENST00000355716\n", " Q92956:Y61C\n", " 0.839032\n", @@ -2996,30 +3628,34 @@ " \n", " \n", "\n", - "

2 rows × 27 columns

\n", + "

2 rows × 29 columns

\n", "" ], "text/plain": [ - " id variant_id transcript_id protein_variant AlphaMissense \\\n", - "0 0 chr1_2557810_G_A_hg38 ENST00000355716 Q92956:E52K 0.232843 \n", - "1 1 chr1_2558346_A_G_hg38 ENST00000355716 Q92956:Y61C 0.839032 \n", + " level_0 id variant_id transcript_id protein_variant \\\n", + "0 0 0 chr1_2557810_G_A_hg38 ENST00000355716 Q92956:E52K \n", + "1 1 1 chr1_2558346_A_G_hg38 ENST00000355716 Q92956:Y61C \n", "\n", - " label chrom pos ref alt ... alt_codon ref_aa alt_aa \\\n", - "0 0 chr1 2557810 G A ... AAG E K \n", - "1 0 chr1 2558346 A G ... TGT Y C \n", + " AlphaMissense label chrom pos ref ... ref_aa alt_aa \\\n", + "0 0.232843 0 chr1 2557810 G ... E K \n", + "1 0.839032 0 chr1 2558346 A ... Y C \n", "\n", - " alt_seq codon_position index_y \\\n", - "0 ATGGAGCCTCCTGGAGACTGGGGGCCTCCTCCCTGGAGATCCACCC... 51 0 \n", - "1 ATGGAGCCTCCTGGAGACTGGGGGCCTCCTCCCTGGAGATCCACCC... 60 1 \n", + " alt_seq codon_position \\\n", + "0 ATGGAGCCTCCTGGAGACTGGGGGCCTCCTCCCTGGAGATCCACCC... 51 \n", + "1 ATGGAGCCTCCTGGAGACTGGGGGCCTCCTCCCTGGAGATCCACCC... 
60 \n", "\n", - " transcript_id_y protein_variant_y AlphaMissense_y label_y \n", - "0 ENST00000355716 Q92956:E52K 0.232843 0 \n", - "1 ENST00000355716 Q92956:Y61C 0.839032 0 \n", + " level_0_y index_y transcript_id_y protein_variant_y AlphaMissense_y \\\n", + "0 0 0 ENST00000355716 Q92956:E52K 0.232843 \n", + "1 1 1 ENST00000355716 Q92956:Y61C 0.839032 \n", "\n", - "[2 rows x 27 columns]" + " label_y \n", + "0 0 \n", + "1 0 \n", + "\n", + "[2 rows x 29 columns]" ] }, - "execution_count": 36, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -3037,7 +3673,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 42, "id": "de460bcf", "metadata": {}, "outputs": [ @@ -3050,7 +3686,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 37, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -3061,7 +3697,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 43, "id": "0d4628a1", "metadata": {}, "outputs": [ @@ -3082,7 +3718,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 44, "id": "cc60a940", "metadata": {}, "outputs": [], @@ -3100,7 +3736,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 48, "id": "e64b85f3", "metadata": {}, "outputs": [ @@ -3130,7 +3766,7 @@ "└──────────┴─────────────────────┴────────────────────┴────────┴───┴───────┴───────────┴─────┴─────┘" ] }, - "execution_count": 7, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -3184,7 +3820,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 49, "id": "9a0f952c", "metadata": {}, "outputs": [ @@ -3198,7 +3834,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 17)
#binnamechromstrandtxStarttxEndcdsStartcdsEndexonCountexonStartsexonEndsscorename2cdsStartStatcdsEndStatexonFramescds_sequence
i64strstrstri64i64i64i64i64strstri64strstrstrstrstr
14"NM_021079.5""chr17""+"4506131645109016450613294510563912"45061316,45081643,45086507,450…"45061460,45081752,45086652,450…0"NMT1""cmpl""cmpl""0,2,0,1,0,2,2,2,0,0,0,0,""ATGGCGGACGAGAGTGAGACAGCAGTGAAG…
1010"NR_026723.1""chr12""-"5575246255817756558177565581775612"55752462,55757382,55760550,557…"55752840,55757553,55760640,557…0"CIP29""none""none""-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,…""
" + "shape: (2, 17)
#binnamechromstrandtxStarttxEndcdsStartcdsEndexonCountexonStartsexonEndsscorename2cdsStartStatcdsEndStatexonFramescds_sequence
i64strstrstri64i64i64i64i64strstri64strstrstrstrstr
1098"NR_136665.1""chr16""+"6724887167272204672722046727220415"67248871,67252541,67255034,672…"67249201,67252805,67255184,672…0"SLC9A5""none""none""-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,…""
1172"NR_169510.1""chr14""-"770315587703420677034206770342062"77031558,77033839,""77033215,77034206,"0"LOC105370579""none""none""-1,-1,"""
" ], "text/plain": [ "shape: (2, 17)\n", @@ -3208,29 +3844,25 @@ "│ i64 ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ --- │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ str │\n", "╞══════╪═════════════╪═══════╪════════╪═══╪══════════════╪════════════╪══════════════╪═════════════╡\n", - "│ 14 ┆ NM_021079.5 ┆ chr17 ┆ + ┆ … ┆ cmpl ┆ cmpl ┆ 0,2,0,1,0,2, ┆ ATGGCGGACGA │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2,2,0,0,0,0, ┆ GAGTGAGACAG │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ CAGTGAAG… │\n", - "│ 1010 ┆ NR_026723.1 ┆ chr12 ┆ - ┆ … ┆ none ┆ none ┆ -1,-1,-1,-1, ┆ │\n", + "│ 1098 ┆ NR_136665.1 ┆ chr16 ┆ + ┆ … ┆ none ┆ none ┆ -1,-1,-1,-1, ┆ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ -1,-1,-1,-1, ┆ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ -1,-1,… ┆ │\n", + "│ 1172 ┆ NR_169510.1 ┆ chr14 ┆ - ┆ … ┆ none ┆ none ┆ -1,-1, ┆ │\n", "└──────┴─────────────┴───────┴────────┴───┴──────────────┴────────────┴──────────────┴─────────────┘" ] }, - "execution_count": 12, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get the reference sequence and CDS context annotation of the variants from the GTF and FASTA files\n", - "refseq = pl.read_csv(f\"{DATA_DIR}/clinvar_syn/ucsc_refseq_hg38.tsv\", separator=\"\\t\")\n", - "refseq.head(2)\n", "\n", "# Build CDS sequences for synonymous variants\n", "valid_chroms = [\"chr\" + str(i) for i in range(1, 23)]\n", - "refseq = pl.read_csv(f\"{DATA_DIR}/clinvar_syn/ucsc_refseq_hg38.tsv\", separator=\"\\t\")\n", - "refseq_hist = pl.read_csv(f\"{DATA_DIR}/clinvar_syn/ucsc_refseq_hist_hg38.tsv\", separator=\"\\t\")\n", + "refseq = pl.read_csv(f\"{DATA_DIR}/reference/ucsc_refseq_hg38.tsv\", separator=\"\\t\")\n", + "refseq_hist = pl.read_csv(f\"{DATA_DIR}/reference/ucsc_refseq_hist_hg38.tsv\", separator=\"\\t\")\n", "refseq = pl.concat([refseq, refseq_hist])\n", "refseq = refseq.filter(pl.col(\"chrom\").is_in(valid_chroms)).unique()\n", "fasta = {}\n", @@ -3248,7 +3880,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 50, "id": "90a4cf3e", "metadata": {}, "outputs": [ @@ -3281,7 
+3913,7 @@ "└──────────┴────────────┴─────────────────┴────────┴───┴─────┴─────┴─────────────┴─────────────────┘" ] }, - "execution_count": 13, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -3351,169 +3983,2700 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "id": "97de04a0", "metadata": {}, - "outputs": [], - "source": [ - "# Process variants per chromosome and add additional features: pLI, PhyloP, codon frequencies\n", - "import re\n", - "\n", - "\n", - "result = []\n", - "\n", - "for row in tqdm(dset.rows(named=True)):\n", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 5%|▌ | 6731/129454 [00:06<01:58, 1032.67it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 178069, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3588C>T (p.Ser1196=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'May 01, 2025', 'RS# (dbSNP)': 200077311, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002313002|RCV001668310|RCV000735080', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900|MedGen:CN169374', 'PhenotypeList': 'Inborn genetic diseases|not provided|not specified', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721196, 'Stop': 50721196, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 5, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA180434', 'SubmitterCategories': 2, 'VariationID': 167684, 'PositionVCF': 50721196, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 
'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000847412|SCV000863275|SCV001882588|SCV004011423|SCV005277460', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721196, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 9%|▉ | 11706/129454 [00:11<01:54, 1025.35it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 237006, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2028G>A (p.Thr676=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 01, 2025', 'RS# (dbSNP)': 73892912, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV000224793|RCV001726056|RCV002315674', 'PhenotypeIDS': 'MedGen:C3661900|MedGen:CN169374|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|not specified|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50704769, 'Stop': 50704769, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 6, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325660', 'SubmitterCategories': 2, 'VariationID': 235319, 'PositionVCF': 50704769, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000280798|SCV000847390|SCV001759587|SCV005331035', 
'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50704769, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 16%|█▌ | 20263/129454 [00:19<01:44, 1047.00it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 346769, 'Type': 'single nucleotide variant', 'Name': 'NM_001379500.1(COL18A1):c.2832A>C (p.Pro944=)', 'GeneID': 80781, 'GeneSymbol': 'COL18A1', 'HGNC_ID': 'HGNC:2195', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Feb 03, 2025', 'RS# (dbSNP)': 751825604, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV000349746|RCV002057775|RCV004549794', 'PhenotypeIDS': 'MONDO:MONDO:0800166,MedGen:C1849409,OMIM:PS267750,Orphanet:1571|MedGen:C3661900|', 'PhenotypeList': 'Knobloch syndrome|not provided|COL18A1-related disorder', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000021.9', 'Chromosome': '21', 'Start': 45504529, 'Stop': 45504529, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '21q22.3', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 5, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10650647', 'SubmitterCategories': 2, 'VariationID': 340260, 'PositionVCF': 45504529, 'ReferenceAlleleVCF': 'A', 'AlternateAlleleVCF': 'C', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000436429|SCV002323680|SCV004146729|SCV004772740|SCV005207582', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr21', 'pos': 45504529, 'ref': 'A', 'alt': 'C', 'tx': 'NM_001379500.1', 
'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 23%|██▎ | 29339/129454 [00:27<01:36, 1038.97it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 431503, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4038C>T (p.Gly1346=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Apr 01, 2024', 'RS# (dbSNP)': 367676023, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV000590960|RCV001662502|RCV002358390', 'PhenotypeIDS': 'Human Phenotype Ontology:HP:0000717,MONDO:MONDO:0005260,MeSH:D001321,MedGen:C0004352,OMIM:209850|MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'Autism|not provided|Inborn genetic diseases', 'Origin': 'germline;unknown', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721646, 'Stop': 50721646, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326151', 'SubmitterCategories': 2, 'VariationID': 437882, 'PositionVCF': 50721646, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001875419|SCV002620213|SCV004155335', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721646, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 28%|██▊ | 36418/129454 
[00:34<01:19, 1168.23it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 486315, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.5043C>G (p.Pro1681=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jul 01, 2024', 'RS# (dbSNP)': 958460783, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV000585318', 'PhenotypeIDS': 'MedGen:C3661900', 'PhenotypeList': 'not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50730934, 'Stop': 50730934, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA325591023', 'SubmitterCategories': 2, 'VariationID': 493352, 'PositionVCF': 50730934, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'G', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000693096|SCV001801058', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50730934, 'ref': 'C', 'alt': 'G', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 35%|███▍ | 45075/129454 [00:42<01:27, 967.44it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 580539, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2118C>T (p.Ile706=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 
'LastEvaluated': 'Sep 01, 2024', 'RS# (dbSNP)': 182897668, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001683645|RCV002313699', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50704859, 'Stop': 50704859, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 3, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325682', 'SubmitterCategories': 2, 'VariationID': 588762, 'PositionVCF': 50704859, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000848960|SCV001903881|SCV004155306', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50704859, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580555, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2820G>A (p.Ala940=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Apr 01, 2024', 'RS# (dbSNP)': 758217731, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001545823|RCV002318678', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720428, 'Stop': 50720428, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 
'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 3, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325923', 'SubmitterCategories': 2, 'VariationID': 589256, 'PositionVCF': 50720428, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000849956|SCV001765227|SCV005041872', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720428, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580575, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3876C>T (p.Asn1292=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'May 01, 2024', 'RS# (dbSNP)': 371876840, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001644789|RCV002316080', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721484, 'Stop': 50721484, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326094', 'SubmitterCategories': 2, 'VariationID': 588161, 'PositionVCF': 50721484, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 
'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000847568|SCV001856550|SCV004155330|SCV005207863', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721484, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580581, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4671G>A (p.Gly1557=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Sep 20, 2021', 'RS# (dbSNP)': 191010623, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001571309|RCV002318819', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50722279, 'Stop': 50722279, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326286', 'SubmitterCategories': 2, 'VariationID': 589408, 'PositionVCF': 50722279, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000850229|SCV001795752', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50722279, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580587, 'Type': 'single nucleotide variant', 'Name': 
'NM_001372044.2(SHANK3):c.5172C>T (p.Pro1724=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jun 01, 2025', 'RS# (dbSNP)': 557669600, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002314485|RCV001531381', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50731063, 'Stop': 50731063, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 3, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326348', 'SubmitterCategories': 2, 'VariationID': 588394, 'PositionVCF': 50731063, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000848147|SCV001746453|SCV001831170', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50731063, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580670, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2268C>G (p.Pro756=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jun 01, 2025', 'RS# (dbSNP)': 61731160, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002314483|RCV001531378|RCV001701434', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900|MedGen:CN169374', 'PhenotypeList': 'Inborn genetic diseases|not provided|not 
specified', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50706085, 'Stop': 50706085, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 5, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325755', 'SubmitterCategories': 2, 'VariationID': 588392, 'PositionVCF': 50706085, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'G', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000848145|SCV001746449|SCV001889731', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50706085, 'ref': 'C', 'alt': 'G', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580674, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2667G>C (p.Pro889=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jun 01, 2016', 'RS# (dbSNP)': 1569114747, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002312412|RCV004704195', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720275, 'Stop': 50720275, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA515258776', 'SubmitterCategories': 2, 'VariationID': 
587929, 'PositionVCF': 50720275, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'C', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000846762|SCV005207860', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720275, 'ref': 'G', 'alt': 'C', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580676, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2535G>A (p.Pro845=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Feb 01, 2025', 'RS# (dbSNP)': 117066889, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002318081|RCV001573322|RCV001701435', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900|MedGen:CN169374', 'PhenotypeList': 'Inborn genetic diseases|not provided|not specified', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50715713, 'Stop': 50715713, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 6, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325884', 'SubmitterCategories': 2, 'VariationID': 589141, 'PositionVCF': 50715713, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000849750|SCV001871856|SCV002496764|SCV005277454', 'SCVsForAggregateSomaticClinicalImpact': '-', 
'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50715713, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580677, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2676G>C (p.Pro892=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Oct 01, 2023', 'RS# (dbSNP)': 1173390690, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002312413|RCV003424305', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720284, 'Stop': 50720284, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA515258805', 'SubmitterCategories': 2, 'VariationID': 587930, 'PositionVCF': 50720284, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'C', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000846764|SCV004155315', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720284, 'ref': 'G', 'alt': 'C', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580680, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2673G>C (p.Pro891=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Nov 01, 2022', 'RS# (dbSNP)': 1569114751, 'nsv/esv 
(dbVar)': '-', 'RCVaccession': 'RCV002312411|RCV001566431', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720281, 'Stop': 50720281, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA515258795', 'SubmitterCategories': 2, 'VariationID': 587928, 'PositionVCF': 50720281, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'C', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000846759|SCV001789944|SCV004155314|SCV005207861', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720281, 'ref': 'G', 'alt': 'C', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580682, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3222C>T (p.Tyr1074=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Nov 13, 2019', 'RS# (dbSNP)': 144470529, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001572895|RCV001700296|RCV002312344', 'PhenotypeIDS': 'MedGen:C3661900|MedGen:CN169374|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|not specified|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720830, 'Stop': 50720830, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': 
'22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 5, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325943', 'SubmitterCategories': 2, 'VariationID': 587855, 'PositionVCF': 50720830, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000846468|SCV001895077|SCV005277457', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720830, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580688, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3372C>T (p.Pro1124=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Mar 04, 2021', 'RS# (dbSNP)': 200572899, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001577348|RCV002312432', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720980, 'Stop': 50720980, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325963', 'SubmitterCategories': 2, 'VariationID': 587951, 'PositionVCF': 50720980, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 
'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000846864|SCV001804705', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720980, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580701, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3762G>A (p.Lys1254=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign', 'ClinSigSimple': 0, 'LastEvaluated': 'May 01, 2025', 'RS# (dbSNP)': 145196448, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001653983|RCV002312271', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721370, 'Stop': 50721370, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326059', 'SubmitterCategories': 2, 'VariationID': 587769, 'PositionVCF': 50721370, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000845947|SCV001868546|SCV002496765|SCV005277461', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721370, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580708, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4569C>T 
(p.His1523=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'May 03, 2020', 'RS# (dbSNP)': 368142005, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001655575|RCV002316168', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50722177, 'Stop': 50722177, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326260', 'SubmitterCategories': 2, 'VariationID': 588251, 'PositionVCF': 50722177, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000847795|SCV001868143', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50722177, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580806, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2296C>T (p.Leu766=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Sep 04, 2020', 'RS# (dbSNP)': 201094179, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002312267|RCV001644784', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 
'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50706113, 'Stop': 50706113, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325759', 'SubmitterCategories': 2, 'VariationID': 587765, 'PositionVCF': 50706113, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000845934|SCV001857925', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50706113, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580808, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2436C>T (p.Ala812=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 01, 2024', 'RS# (dbSNP)': 61729465, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001655577|RCV002314482', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50714993, 'Stop': 50714993, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325846', 'SubmitterCategories': 2, 'VariationID': 588391, 'PositionVCF': 50714993, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 
'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000848142|SCV001861741|SCV004155312|SCV005207858', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50714993, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580824, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3114G>C (p.Ala1038=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jun 01, 2024', 'RS# (dbSNP)': 772152761, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001644788|RCV002312783', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720722, 'Stop': 50720722, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 3, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325936', 'SubmitterCategories': 2, 'VariationID': 588063, 'PositionVCF': 50720722, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'C', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000847249|SCV001858079|SCV002544750', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720722, 'ref': 'G', 'alt': 'C', 'tx': 
'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580832, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3559C>T (p.Leu1187=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 19, 2021', 'RS# (dbSNP)': 376858991, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002312773|RCV001592915', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721167, 'Stop': 50721167, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 3, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326004', 'SubmitterCategories': 2, 'VariationID': 588053, 'PositionVCF': 50721167, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000847222|SCV001823167|SCV005207862', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721167, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 49%|████▊ | 63068/129454 [00:59<01:01, 1071.16it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 728536, 'Type': 'single nucleotide variant', 'Name': 'NM_001401501.2(MUC16):c.44076G>A (p.Glu14692=)', 'GeneID': 94025, 'GeneSymbol': 'MUC16', 'HGNC_ID': 'HGNC:15582', 'ClinicalSignificance': 
'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 01, 2023', 'RS# (dbSNP)': 187392925, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV000891417|RCV003940689', 'PhenotypeIDS': 'MedGen:C3661900|', 'PhenotypeList': 'not provided|MUC16-related disorder', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000019.10', 'Chromosome': '19', 'Start': 8882863, 'Stop': 8882863, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '19p13.2', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA9162760', 'SubmitterCategories': 2, 'VariationID': 718464, 'PositionVCF': 8882863, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001035235|SCV004146564|SCV005309160', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr19', 'pos': 8882863, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001401501.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 60%|██████ | 78067/129454 [01:13<00:49, 1033.55it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1001214, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4083A>G (p.Pro1361=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 01, 2021', 'RS# (dbSNP)': 371543035, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001310466|RCV002366158', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 
'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721691, 'Stop': 50721691, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326158', 'SubmitterCategories': 2, 'VariationID': 1012485, 'PositionVCF': 50721691, 'ReferenceAlleleVCF': 'A', 'AlternateAlleleVCF': 'G', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001500271|SCV002623691', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721691, 'ref': 'A', 'alt': 'G', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 71%|███████▏ | 92335/129454 [01:27<00:38, 974.15it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1173520, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4233G>A (p.Pro1411=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'May 17, 2021', 'RS# (dbSNP)': 369083529, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001540436|RCV002377903', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721841, 'Stop': 50721841, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple 
submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326194', 'SubmitterCategories': 2, 'VariationID': 1182740, 'PositionVCF': 50721841, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001758323|SCV002624534', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721841, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 72%|███████▏ | 93021/129454 [01:28<00:37, 974.89it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1196260, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4104C>T (p.Ser1368=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Feb 01, 2025', 'RS# (dbSNP)': 201793890, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001572736|RCV001701200|RCV002368594', 'PhenotypeIDS': 'MedGen:C3661900|MedGen:CN169374|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|not specified|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721712, 'Stop': 50721712, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 7, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326166', 'SubmitterCategories': 2, 'VariationID': 1205883, 'PositionVCF': 50721712, 'ReferenceAlleleVCF': 'C', 
'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001856828|SCV002624886|SCV004155337|SCV005277462', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721712, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 1199396, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3885G>A (p.Glu1295=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Dec 14, 2020', 'RS# (dbSNP)': 546313986, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001575039|RCV002458542', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721493, 'Stop': 50721493, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326099', 'SubmitterCategories': 2, 'VariationID': 1207144, 'PositionVCF': 50721493, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001801950|SCV002617661', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721493, 'ref': 'G', 'alt': 'A', 'tx': 
'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 72%|███████▏ | 93616/129454 [01:28<00:36, 971.95it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1215858, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.1635C>T (p.Ala545=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 01, 2024', 'RS# (dbSNP)': 780922475, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001609046', 'PhenotypeIDS': 'MedGen:C3661900', 'PhenotypeList': 'not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50697627, 'Stop': 50697627, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 3, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325523', 'SubmitterCategories': 2, 'VariationID': 1227172, 'PositionVCF': 50697627, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001834698|SCV004698485|SCV005277433', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50697627, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 1217481, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2184T>C (p.Gly728=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 
01, 2025', 'RS# (dbSNP)': 747708688, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001608387|RCV001821925|RCV002421229', 'PhenotypeIDS': 'MedGen:C3661900|MedGen:CN169374|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|not specified|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50705026, 'Stop': 50705026, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 5, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325717', 'SubmitterCategories': 2, 'VariationID': 1224929, 'PositionVCF': 50705026, 'ReferenceAlleleVCF': 'T', 'AlternateAlleleVCF': 'C', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001835133|SCV002068754|SCV002718228|SCV004155307|SCV005277443', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50705026, 'ref': 'T', 'alt': 'C', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 1221491, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4392C>T (p.Ser1464=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Aug 01, 2024', 'RS# (dbSNP)': 767710495, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001616570', 'PhenotypeIDS': 'MedGen:C3661900', 'PhenotypeList': 'not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50722000, 'Stop': 50722000, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': 
'22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326213', 'SubmitterCategories': 2, 'VariationID': 1228939, 'PositionVCF': 50722000, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001840328|SCV005330831', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50722000, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 73%|███████▎ | 93923/129454 [01:29<00:35, 1003.77it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1227669, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3705T>C (p.Ala1235=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Mar 03, 2021', 'RS# (dbSNP)': 576803553, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001638944|RCV002334637', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721313, 'Stop': 50721313, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326045', 'SubmitterCategories': 2, 'VariationID': 1238541, 'PositionVCF': 50721313, 
'ReferenceAlleleVCF': 'T', 'AlternateAlleleVCF': 'C', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001848243|SCV002618596', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721313, 'ref': 'T', 'alt': 'C', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 73%|███████▎ | 94337/129454 [01:29<00:34, 1020.26it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1244004, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2745G>A (p.Pro915=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Mar 01, 2022', 'RS# (dbSNP)': 1453397190, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001665211|RCV002425018', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720353, 'Stop': 50720353, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 3, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA515259020', 'SubmitterCategories': 2, 'VariationID': 1254067, 'PositionVCF': 50720353, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 
'SCVsForAggregateGermlineClassification': 'SCV001874947|SCV002742586|SCV004155319', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720353, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 1247765, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4170C>T (p.Asn1390=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Oct 22, 2020', 'RS# (dbSNP)': 558643743, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001666132|RCV002370258', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721778, 'Stop': 50721778, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326178', 'SubmitterCategories': 2, 'VariationID': 1256982, 'PositionVCF': 50721778, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001883431|SCV002626073', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721778, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 73%|███████▎ | 94640/129454 [01:29<00:36, 965.54it/s] " + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "{'AlleleID': 1252960, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4410C>T (p.Thr1470=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Apr 01, 2024', 'RS# (dbSNP)': 376136109, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001671868|RCV002329704', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50722018, 'Stop': 50722018, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326218', 'SubmitterCategories': 2, 'VariationID': 1263051, 'PositionVCF': 50722018, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001887974|SCV002626993|SCV004011424|SCV005277463', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50722018, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 1253191, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2799C>T (p.Gly933=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jul 01, 2023', 'RS# (dbSNP)': 907713706, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001669520', 'PhenotypeIDS': 'MedGen:C3661900', 
'PhenotypeList': 'not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720407, 'Stop': 50720407, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA325578069', 'SubmitterCategories': 2, 'VariationID': 1260703, 'PositionVCF': 50720407, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001887411|SCV004155320', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720407, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 73%|███████▎ | 94955/129454 [01:30<00:33, 1021.17it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1263803, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3144C>T (p.Ser1048=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Aug 21, 2021', 'RS# (dbSNP)': 760688077, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001682504|RCV002440835', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720752, 'Stop': 50720752, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': 
'22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325938', 'SubmitterCategories': 2, 'VariationID': 1275625, 'PositionVCF': 50720752, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001905306|SCV002751012', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720752, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 74%|███████▎ | 95467/129454 [01:30<00:34, 985.75it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1279830, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.5160C>T (p.Pro1720=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jul 30, 2024', 'RS# (dbSNP)': 751652089, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001714909|RCV001775182|RCV002343801', 'PhenotypeIDS': 'MedGen:C3661900|MONDO:MONDO:0011652,MedGen:C1853490,OMIM:606232,Orphanet:48652|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Phelan-McDermid syndrome|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50731051, 'Stop': 50731051, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 
'ClinGen:CA10326347', 'SubmitterCategories': 2, 'VariationID': 1290001, 'PositionVCF': 50731051, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001942614|SCV002011909|SCV002646862|SCV004155346', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50731051, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 74%|███████▍ | 96077/129454 [01:31<00:35, 952.85it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1319371, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2250C>T (p.Arg750=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Feb 11, 2021', 'RS# (dbSNP)': 188450024, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001797316|RCV002422854', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50705092, 'Stop': 50705092, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325737', 'SubmitterCategories': 2, 'VariationID': 1328684, 'PositionVCF': 50705092, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': 
'-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV002038764|SCV002718632', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50705092, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 75%|███████▍ | 96813/129454 [01:32<00:32, 1019.92it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1334709, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2022G>A (p.Thr674=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Sep 06, 2021', 'RS# (dbSNP)': 147941361, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001843666|RCV002406904', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50704763, 'Stop': 50704763, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325659', 'SubmitterCategories': 2, 'VariationID': 1343064, 'PositionVCF': 50704763, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV002102746|SCV002714984', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 
'pos': 50704763, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 85%|████████▌ | 110267/129454 [01:44<00:19, 1003.82it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1798062, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4656C>T (p.Pro1552=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 01, 2025', 'RS# (dbSNP)': 750023626, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002333979|RCV004809819', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50722264, 'Stop': 50722264, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326281', 'SubmitterCategories': 2, 'VariationID': 1740560, 'PositionVCF': 50722264, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV002628299|SCV005434732', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50722264, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 89%|████████▉ | 115017/129454 [01:49<00:13, 1093.56it/s]" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "{'AlleleID': 1845299, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2703G>A (p.Ala901=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'May 01, 2025', 'RS# (dbSNP)': 925909458, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002455586|RCV003427481', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720311, 'Stop': 50720311, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA325578045', 'SubmitterCategories': 2, 'VariationID': 1791866, 'PositionVCF': 50720311, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV002738318|SCV004155317', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720311, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 95%|█████████▍| 122824/129454 [01:57<00:06, 980.15it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 2815385, 'Type': 'single nucleotide variant', 'Name': 'NM_001401501.2(MUC16):c.44484C>T (p.Asn14828=)', 'GeneID': 94025, 'GeneSymbol': 'MUC16', 'HGNC_ID': 'HGNC:15582', 'ClinicalSignificance': 'Likely benign', 
'ClinSigSimple': 0, 'LastEvaluated': 'Mar 01, 2022', 'RS# (dbSNP)': 372141764, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV003423296', 'PhenotypeIDS': 'MedGen:C3661900', 'PhenotypeList': 'not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000019.10', 'Chromosome': '19', 'Start': 8871641, 'Stop': 8871641, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '19p13.2', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA9162557', 'SubmitterCategories': 2, 'VariationID': 2649218, 'PositionVCF': 8871641, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV004146562|SCV005208164', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr19', 'pos': 8871641, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001401501.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 129454/129454 [02:03<00:00, 1048.95it/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Process variants per chromosome and add additional features: pLI, PhyloP, codon frequencies\n", + "import re\n", + "\n", + "\n", + "result = []\n", + "\n", + "for row in tqdm(dset.rows(named=True)):\n", " s = row[\"Name\"].split(\":\")[1].split(\" \")[0]\n", " m = re.fullmatch(r\"c\\.(\\d+)([ACGT])>([ACGT])\", s)\n", " pos_cds, ref_cds, alt_cds = int(m.group(1)), m.group(2), m.group(3)\n", "\n", - " tx = refseq.filter((pl.col(\"name\") == row[\"tx\"]) & (pl.col(\"chrom\") == 
row[\"chrom\"]))[0]\n", - " try:\n", - " pos_cds0 = tx_gposes[(row[\"chrom\"], row[\"tx\"])].index(row[\"pos\"] - 1)\n", - " except:\n", - " continue\n", - " seq = tx[0, \"cds_sequence\"]\n", - " if pos_cds0 + 1 != pos_cds:\n", - " print(str(row))\n", - " assert seq[pos_cds0] == ref_cds\n", - " assert ref_cds == row[\"ref\"] if tx[0, \"strand\"] == \"+\" else get_reverse_complement(row[\"ref\"])\n", - " assert alt_cds == row[\"alt\"] if tx[0, \"strand\"] == \"+\" else get_reverse_complement(row[\"alt\"])\n", + " tx = refseq.filter((pl.col(\"name\") == row[\"tx\"]) & (pl.col(\"chrom\") == row[\"chrom\"]))[0]\n", + " try:\n", + " pos_cds0 = tx_gposes[(row[\"chrom\"], row[\"tx\"])].index(row[\"pos\"] - 1)\n", + " except:\n", + " continue\n", + " seq = tx[0, \"cds_sequence\"]\n", + " if pos_cds0 + 1 != pos_cds:\n", + " print(str(row))\n", + " assert seq[pos_cds0] == ref_cds\n", + " assert ref_cds == row[\"ref\"] if tx[0, \"strand\"] == \"+\" else reverse_complement_dna(row[\"ref\"])\n", + " assert alt_cds == row[\"alt\"] if tx[0, \"strand\"] == \"+\" else reverse_complement_dna(row[\"alt\"])\n", + "\n", + " codon_position = pos_cds0 // 3\n", + " ref_codon = seq[codon_position * 3 : (codon_position + 1) * 3]\n", + " remainder = pos_cds0 % 3\n", + " alt_nuc = list(ref_codon)\n", + " alt_nuc[remainder] = alt_cds\n", + " alt_codon = \"\".join(alt_nuc)\n", + " item = {\n", + " \"chrom\": row[\"chrom\"],\n", + " \"pos\": row[\"pos\"],\n", + " \"ref\": row[\"ref\"],\n", + " \"alt\": row[\"alt\"],\n", + " \"var_rel_dist_in_cds\": pos_cds0,\n", + " \"codon_position\": codon_position,\n", + " \"ref_codon\": ref_codon,\n", + " \"alt_codon\": alt_codon,\n", + " \"tx\": row[\"tx\"],\n", + " \"label\": row[\"ClinicalSignificance\"],\n", + " \"in_splice_junction\": row[\"in_splice_junction\"],\n", + " \"ref_seq\": seq,\n", + " \"alt_seq\": seq[:pos_cds0] + alt_cds + seq[pos_cds0 + 1 :],\n", + " }\n", + " result.append(item)\n", + "\n", + "\n", + "result_df = 
pl.from_dicts(result).with_row_index(\"id\")\n", + "frame = result_df.to_pandas()\n", + "(frame[\"ref_seq\"].apply(lambda x: len(x) == 0)).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "f7ecb7ae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Adding additional features (pLI, PhyloP, codon frequencies)...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 129384/129384 [00:02<00:00, 45206.96it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset with additional features: 129384 variants\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (2, 27)
idchromposrefaltvar_rel_dist_in_cdscodon_positionref_codonalt_codontxlabelin_splice_junctionref_seqalt_seqref_aaalt_aaref_codon_freqalt_codon_freqcodon_freq_ratiogene_nameplipli_binphylopphylop_bincds_lengthcds_offset_fraccds_offset_frac_bin
u64stri64strstri64i64strstrstrstrboolstrstrstrstrf64f64f64strf64i32f64i32u32f64i32
0"chr1"45015006"G""A"941313"GAG""GAA""NM_000374.5""Likely pathogenic"true"ATGGAAGCGAATGGGTTGGGACCTCAGGGT…"ATGGAAGCGAATGGGTTGGGACCTCAGGGT…"E""E"4.6414453e73.7827281e70.20458"UROD"0.007.998811040.8523558
1"chr10"124400865"G""A"1133377"AAC""AAT""NM_000274.4""Benign"false"ATGTTTTCCAAACTAGCACATTTGCAGAGG…"ATGTTTTCCAAACTAGCACATTTGCAGAGG…"N""N"2.0900468e72.0353876e70.0265"OAT"0.00-2.351-213200.8583338
" + ], + "text/plain": [ + "shape: (2, 27)\n", + "┌─────┬───────┬───────────┬─────┬───┬────────────┬────────────┬─────────────────┬──────────────────┐\n", + "│ id ┆ chrom ┆ pos ┆ ref ┆ … ┆ phylop_bin ┆ cds_length ┆ cds_offset_frac ┆ cds_offset_frac_ │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ bin │\n", + "│ u64 ┆ str ┆ i64 ┆ str ┆ ┆ i32 ┆ u32 ┆ f64 ┆ --- │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ i32 │\n", + "╞═════╪═══════╪═══════════╪═════╪═══╪════════════╪════════════╪═════════════════╪══════════════════╡\n", + "│ 0 ┆ chr1 ┆ 45015006 ┆ G ┆ … ┆ 8 ┆ 1104 ┆ 0.852355 ┆ 8 │\n", + "│ 1 ┆ chr10 ┆ 124400865 ┆ G ┆ … ┆ -2 ┆ 1320 ┆ 0.858333 ┆ 8 │\n", + "└─────┴───────┴───────────┴─────┴───┴────────────┴────────────┴─────────────────┴──────────────────┘" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Adding additional features (pLI, PhyloP, codon frequencies)...\")\n", + "dset = process_dset(result_df, refseq, remove_non_pli=False)\n", + "print(f\"Dataset with additional features: {dset.shape[0]} variants\")\n", + "dset.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "a6fe62c7", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAnYAAAHDCAYAAACpu1eiAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABTsUlEQVR4nO3deVwVZf//8TcgB3BBQAXEUFHLfUncMFNLBJU0y1zKysw0CyuzrGxRRLs1LbfcslJb9E7NtFIzSS1LyS3NNTNvva27wDsVcAWE6/eHvzO3R8AFUHC+r+fjwUPPzDUz1+fMnDPvMzNnjpsxxggAAAA3PPei7gAAAAAKB8EOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMHOplauXKlGjRrJ29tbbm5uSklJKeouIRdVq1bVI488UtTdwDV0pa/FuLg4ubm56e+//76+HSwkhb0tL1y4UAEBATp58mShzdNu3NzcFBcXV9TduKy2bduqXr16Rd2NG0qvXr3Uo0ePfE1bKMFuxowZ6t69uypXriw3N7c8X9zr1q1Tly5dFBoaKm9vbwUHB6tDhw5av359ru03bNigVq1aqWTJkgoODtbTTz+d40W+efNmDRo0SHXr1lWpUqVUuXJl9ejRQ7/++muO+bm5ueX51759+wI/D1drw4YNiouLK/TQdfToUfXo0UM+Pj6aNm2aPvroI5UqVSrXtidPntSIESPUoUMHBQQEyM3NTXPnzs3RLjs7W3PnzrXWX6lSpVSvXj2NHj1aZ8+ezdE+OTlZffv2VWBgoHx8fNS4cWMtWrTosn1v37693NzcNGjQoBzj8lp3Y8eOdWm3ZMkSRUdHKyQkRF5eXrrpppt03333adeuXZddvh3Nnz9fkyZNKupuFGvF4bWI/8nKytKIESP01FNPqXTp0kXalz///FNxcXHavn37NV/W9OnTc33/vRFcz+cpv652PyZJ77//vmrXri1vb2/dfPPNevvtty+7nEvtx6Tz+8fHH39clSpVkre3t6pWrap+/fq5tHnxxRe1ePFi/fzzz1ddZ4mrniIXb7zxhk6cOKFmzZrpr7/+yrPdr7/+Knd3dw0cOFDBwcE6fvy4Pv74Y7Vu3VrLly9Xhw4drLbbt29Xu3btVLt2bU2YMEF//PGH3nzzTe3fv19fffWVy7LXr1+v7t27q0GDBkpKStLUqVPVuHFj/fjjjy6fEj766KMcfdqyZYsmT56sqKiowngqrsqGDRs0cuRIPfLII/Lz8yu0+W7evFknTpzQqFGjFBkZecm2f//9t+Lj41W5cmU1bNhQ3377ba7tTp8+rb59+6pFixYaOHCgAgMDlZiYqBEjRmj16tVas2aN3NzcJElpaWlq1aqVkpOT9cwzzyg4OFgLFy5Ujx49NG/ePD3wwAO5LuOzzz5TYmLiJfvbvn17Pfzwwy7Dbr31VpfHO3fulL+/v5555hmVL19eSUlJmj17tpo1a6bExEQ1bNjwksu4nvbt2yd392t74Hz+/PnatWuXBg8efE2XcyMrDq9F/M+XX36pffv2acCAAUXdFf35558aOXKkqlatqkaNGl3TZU2fPl3ly5e/IY/iX8/nKb+uZj8mSe+8844GDhyobt26aciQIfr+++/19NNP6/Tp03rxxRdzXcbl9mO///67brvtNknSwIEDValSJf3555/atGmTS7tbb71VTZo00VtvvaUPP/zw6go1heDQoUMmOzvbGGNMqVKlTJ8+fa542lOnTpmgoCATHR3tMrxjx46mYsWKJjU11Rr27rvvGknm66+/toatX7/epKenu0z766+/Gi8vL9O7d+/LLr9fv37Gzc3N/P7771fc58Iyfvx4I8kcPHiwUOf7wQcfGElm8+bNl2179uxZ89dffxljjNm8ebORZObMmZOjXXp6ulm/fn2O4SNHjjSSTEJCgjVs3LhxRpJZvXq1NSwrK8s0bdrUBAc
H51hfxhhz5swZU7VqVRMfH28kmdjY2Bxt8hp+JZKSkkyJEiXM448/nq/pC1N2drY5ffr0dVteTEyMqVKlynVb3qVkZmbmuv6LWnF4LY4YMcJIMv/9738LtQ/XS5UqVa7qvf9SunTpYlq1alUo8yqoS70vFra6deuaNm3aXHF7SWbEiBHXrD9X41LPU5s2bUzdunWvf6cucjX7sdOnT5ty5cqZmJgYl7a9e/c2pUqVMseOHcsxnyvZj3Xs2NGEhYWZv//++7L9ffPNN02pUqXMiRMnrqQ8S6EEuwtdbbAzxph69eqZ5s2bW49TU1NNiRIlzNChQ13apaenm9KlS5t+/fpddp6NGzc2jRs3vmSbs2fPGj8/P9O2bdsr6ufJkyfNkCFDzE033WQcDoe55ZZbzPjx461Qa4wxBw8ezHPjvvBF6HwTv/jvcjuWhQsXmsaNGxtvb29Trlw507t3b/PHH39Y49u0aZNjnle6PvLzBrZjxw4jyUyZMsUa1rlzZ1OhQoUcbZ07z1WrVuUYN3LkSFO5cmVz+vTpywa706dPmzNnzlxxH405H6Z8fX1Nz549L9kuJibGhIWF5TquRYsWJjw83Ho8e/Zsc8cdd5gKFSoYh8NhateubaZPn55juipVqpiYmBizcuVKEx4ebry8vMzEiROtcReun6NHj5rnnnvO1KtXz5QqVcqUKVPGdOjQwWzfvt1lnmvXrjWSzIIFC8zo0aNNpUqVjJeXl7nzzjvN/v37rXa5bQ8XhrwpU6aYOnXqGB8fH+Pn52fCw8PNvHnzLvkcpaenm9dee800btzY+Pr6mpIlS5pWrVqZNWvWuLRzvhbGjx9vJk6caKpVq2bc3d3Ntm3bjDHG7N2713Tr1s34+/sbLy8vEx4ebj7//PNLLtvJjq9FZz/27t1runfvbsqUKWMCAgLM008/nWN7z8zMNPHx8aZatWrG4XCYKlWqmGHDhpmzZ8/mWeeFLt7u5syZYySZH374wTz77LOmfPnypmTJkqZr167myJEjLtNmZ2ebUaNGmUqVKhkfHx/Ttm1bs2vXrhzzzMjIMHFxcaZGjRrGy8vLBAQEmNtuuy3X1/+Fzpw5YxwOh4mLi8t1/EcffWSaNm1qbbO33367y4d9Y4yZNm2aqVOnjnE4HKZixYrmySefNMePH3dp4wwbu3fvNm3btjU+Pj4mJCTEvPHGG1Yb5+vs4r8Lt6kff/zRREdHG19fX+Pj42Nat25tfvjhB2v8nj17jLe3t3nooYdclv/9998bd3d388ILLxhjzq+Ti5dzuZCX2/r9448/TN++fU1gYKBxOBymTp065v3333dpc6XvH05Tp041YWFhxtvb2zRt2tSsW7fOtGnTxurf5Z6nK3mui1Ju+7Hly5cbSWb58uUubTds2GAkmY8++ijHfC63H9u7d6+RZO0nzpw5YzIyMvLs188//2wkmc8+++yq6imUU7FXKy0tTRkZGfr777/14YcfateuXXr55Zet8Tt37tS5c+fUpEkTl+kcDocaNWqkbdu2XXL+xhglJyerbt26l2y3YsUKpaSkqHfv3pftszFGXbp00dq1a9WvXz81atRIX3/9tYYOHar//Oc/mjhx4mXncaF7771Xv/76q/75z39q4sSJKl++vCSpQoUKeU4zd+5c9e3bV02bNtWYMWOUnJysyZMna/369dq2bZv8/Pz0yiuvqGbNmpo1a5bi4+MVFham6tWrX1XfrkZSUpIkWf2XpPT0dPn4+ORoW7JkSUnS1q1bXa5pPHz4sMaOHavZs2fnOt2F5s6dq+nTp8sYo9q1a+vVV1/N89RuSkqKMjMzlZSUpEmTJiktLU3t2rW75Px79uyphx9+WJs3b1bTpk2t4f/+97/1448/avz48dawGTNmqG7duurSpYtKlCihL7/8Uk8++aSys7MVGxvrMt99+/bp/vvv1+OPP67+/furZs2auS7/X//6l5YuXaru3bsrLCxMycnJeuedd9SmTRv
t2bNHISEhLu3Hjh0rd3d3Pf/880pNTdW4cePUu3dvbdy4UZL0yiuvKDU1VX/88Ye1jTqvWXr33Xf19NNP67777tMzzzyjs2fPaseOHdq4cWOez6l0/vX73nvv6f7771f//v114sQJvf/++4qOjtamTZtynIaZM2eOzp49qwEDBsjLy0sBAQHavXu3brvtNlWqVEkvvfSSSpUqpYULF6pr165avHix7rnnnjyXb/fXYo8ePVS1alWNGTNGP/74o6ZMmaLjx4+7nI557LHH9MEHH+i+++7Tc889p40bN2rMmDHau3evlixZclX1X+ipp56Sv7+/RowYoUOHDmnSpEkaNGiQFixYYLUZPny4Ro8erU6dOqlTp0766aefFBUVpYyMDJd5xcXFacyYMXrsscfUrFkzpaWlacuWLfrpp58ueU3z1q1blZGRocaNG+cYN3LkSMXFxally5aKj4+Xw+HQxo0btWbNGutymri4OI0cOVKRkZF64okntG/fPs2YMUObN2/W+vXr5enpac3v+PHj6tChg+6991716NFDn376qV588UXVr19fHTt2VO3atRUfH6/hw4drwIABuv322yVJLVu2lCStWbNGHTt2VHh4uEaMGCF3d3fNmTNHd955p77//ns1a9ZMtWvX1qhRozR06FDdd9996tKli06dOqVHHnlEtWrVUnx8vCRp0qRJ1jWFr7zyiiQpKCjoqtZfcnKyWrRoYV3fVaFCBX311Vfq16+f0tLSclyOcbn3D+n8+9ygQYN0++2369lnn9WhQ4fUtWtX+fv766abbpKkyz5PV/JcX0pqaqoyMzMvW7+3t3e+rsnMbT/mzBkX55Dw8HC5u7tr27ZtevDBB63hV7If++abbySdX6/t2rXTmjVr5OHhofbt22vGjBmqWrWqS/s6derIx8dH69evv+R7Yg5XFQOvwJUcsYuOjrYSvcPhMI8//rjLJ9JFixYZSWbdunU5pu3evbsJDg6+5Pw/+ugjIynHp5SLdevWzXh5eeX4JJebpUuXGklm9OjRLsPvu+8+4+bmZn777TdjzJUfJTDm6k7/ZGRkmMDAQFOvXj2X52rZsmVGkhk+fLg1zPnp+0pO/1woP0fsIiMjja+vr8tz+NRTTxl3d3dz6NAhl7a9evUyksygQYNcht93332mZcuW1mPlccSuZcuWZtKkSebzzz83M2bMMPXq1XP59HOxmjVrWttZ6dKlzauvvmqysrIuWU9qaqrx8vIyzz33nMvwcePGGTc3N/Pvf//bGpbb6dTo6GhTrVo1l2HOT+IrV67M0f7ioxxnz57N0ceDBw8aLy8vEx8fbw1zfkKuXbu2y6nNyZMnG0lm586d1rC8TsXefffd+To9cu7cuRynU48fP26CgoLMo48+6tJvScbX1zfHUZ927dqZ+vXruxxhys7ONi1btjQ333zzJZdv19ei84hdly5dXIY/+eSTRpL5+eefjTHGbN++3Ugyjz32mEu7559/3khyOXJ6cZ1OeR2xi4yMdDnq+eyzzxoPDw+TkpJijDHmyJEjxuFwmJiYGJd2L7/8co4jkg0bNsxxGutKvPfeezm2YWOM2b9/v3F3dzf33HNPjteIsy/O/kVFRbm0mTp1qpFkZs+ebQ1zHlH98MMPrWHp6ekmODjYdOvWzRqW1/tidna2ufnmm010dLTLc3H69GkTFhZm2rdvbw3LysoyrVq1MkFBQebvv/82sbGxpkSJEjm2i4Keiu3Xr5+pWLFijtN8vXr1MmXLlrXes670/SM9Pd2UK1fONG3a1GRmZlrt5s6dm+OI4uVOxV7Jc52X3I5+5/aX30sBctuPxcbGGg8Pj1zbV6hQwfTq1ctl2JXsx55++mkjyZQrV8506NDBLFiwwIwfP96ULl3aVK9e3Zw6dSrHsm655RbTsWPHq6qnSG53MnbsWK1atUrvv/++WrRooYyMDJ07d84af+bMGUmSl5dXjmm9vb2t8bn55ZdfFBsbq4iICPX
p0yfPdmlpaVq+fLk6dep0RRdLr1ixQh4eHnr66addhj/33HMyxrh8oeNa2LJli44cOaInn3xS3t7e1vCYmBjVqlVLy5cvv6bLz80//vEPffPNNxo7dqzLc/jYY4/Jw8NDPXr00IYNG3TgwAGNGTPGOpJw4fpbu3atFi9efEXf2ly/fr2eeeYZdenSRQMHDtTWrVtVr149vfzyy7luE3PmzNHKlSs1ffp01a5dW2fOnFFWVtYll+Hr66uOHTtq4cKFOv/aPG/BggVq0aKFKleubA278FNZamqq/v77b7Vp00b/+te/lJqa6jLfsLAwRUdHX7ZGLy8v68sUWVlZOnr0qEqXLq2aNWvqp59+ytG+b9++cjgc1mPnJ+V//etfl12Wn5+f/vjjD23evPmybS/k4eFhLTM7O1vHjh2zjrDn1sdu3bq5HP06duyY1qxZox49eujEiRP6+++/9ffff+vo0aOKjo7W/v379Z///CfP5dv9tXjx0d6nnnpK0vm6L/x3yJAhLu2ee+45SSrQ8gcMGOBy8fjtt9+urKws/fvf/5Z0/ohDRkaGnnrqKZd2uX0xx8/PT7t379b+/fuvqg9Hjx6VJPn7+7sMX7p0qbKzszV8+PAcXzhy9sXZv8GDB7u06d+/v3x9fXM8N6VLl3Y56uJwONSsWbMrev1s375d+/fv1wMPPKCjR49a2/GpU6fUrl07rVu3TtnZ2ZIkd3d3zZ07VydPnlTHjh01ffp0DRs2LMfRoIIwxmjx4sXq3LmzjDFWf/7++29FR0crNTU1x+vzcu8fW7Zs0dGjR9W/f3+VKPG/E3y9e/fOsX4upyDP9VtvvaWEhITL/r3wwgtX1Scp7/3YmTNnXJ6bC12cQ650P+a8q0dwcLCWL1+uHj166Pnnn9e7776rAwcOaP78+Tmm8ff3v+pbIBXJqdgLT9U8+OCDaty4sR555BF9+umnkv63w0xPT88x7dmzZ/M8zJmUlKSYmBiVLVtWn376qTw8PPLsw+LFi3X27NkrOg0rnT8VFxISojJlyrgMr127tjX+WnLOP7dTeLVq1dIPP/xwTZd/sQULFujVV19Vv3799MQTT7iMa9CggebPn6+BAwda3/4JDg7WpEmT9MQTT1iHys+dO6enn35aDz30kMtpzyvlcDg0aNAgK+S1atXKZXxERIT1/169elnr6s0337zkfHv27KmlS5cqMTFRLVu21IEDB7R169YcL9r169drxIgRSkxM1OnTp13GpaamqmzZstbjsLCwK6opOztbkydP1vTp03Xw4EGXIFquXLkc7S8MmtL/dobHjx+/7LJefPFFffPNN2rWrJlq1KihqKgoPfDAA9Y6u5QPPvhAb731ln755ReXUyS51XnxsN9++03GGL322mt67bXXcp3/kSNHVKlSpVzH2f21ePPNN7s8rl69utzd3XXo0CFr+e7u7qpRo4ZLu+DgYPn5+RWo/sttT855X9zHChUq5NjRx8fH6+6779Ytt9yievXqqUOHDnrooYfUoEGDK+rLhR+sJOnAgQNyd3dXnTp18pwmr3XjcDhUrVq1HM/NTTfd5BJQpfM179ix47L9cwbWSx1ASE1NtZ6X6tWrKy4uTkOHDlW9evXy3Pbz67///a9SUlI0a9YszZo1K9c2R44ccXl8pev74m2tRIkSOU4bXk5Bnuvw8PCrWtaVutR+zMfHJ8flBU4X5pCr2Y85p+nRo4fLB4/u3bvroYce0oYNG/TYY4+5TGOMyfG8XU6RBLsLORwOdenSRWPHjtWZM2fk4+OjihUrSlKut07566+/clxnJJ1/AXXs2FEpKSn6/vvvc21zoXnz5qls2bK66667CqeQ/y+vFXC5I0U3koSEBD388MOKiYnRzJkzc23jvJbk559/VlZWlho3bmzdSuWWW26RJH344Yfat2+f3nnnHWun5XTixAkdOnRIgYGB1rV5uQkNDZV0/ijQpfj7++vOO+/UvHnzLhvsOnfurJIlS2r
hwoVq2bKlFi5cKHd3d3Xv3t1qc+DAAbVr1061atXShAkTFBoaKofDoRUrVmjixInWJ3Wny1076PSPf/xDr732mh599FGNGjVKAQEBcnd31+DBg3PMU1KeH14u3inmpnbt2tq3b5+WLVumlStXavHixZo+fbqGDx+ukSNH5jndxx9/rEceeURdu3bV0KFDFRgYKA8PD40ZM0YHDhzI0f7i2p11PP/883kexbx4R5Ifdnkt5lXH1b7ZXyiv56Ag29PFWrdurQMHDujzzz/XqlWr9N5772nixImaOXNmjp3XhZwfYI4fP25dw3WtFKRe53Y8fvz4PG/vcfH1XqtWrZJ0/tYgR48eVXBw8FX09sr68+CDD+YZNi8O1YW5vi+nIMs6duxYniHrQj4+Pi4fqC/lcvuxihUrKisrS0eOHFFgYKA1PCMjQ0ePHrUyxtXsx5zTXHztpIeHh8qVK5frB/Ljx4/n+CB1OUUe7KTzhzyNMTpx4oR8fHxUr149lShRQlu2bHG583JGRoa2b9+e427MZ8+eVefOnfXrr7/qm2++ueQnOul8OFy7dq0eeeSRXE/35qZKlSr65ptvdOLECZcjBb/88os1XvrfJ56Lb3Sa26foq3ljds5/3759uvPOO13G7du3zxp/rW3cuFH33HOPmjRpooULF7ocnr+Yw+Fw+QTjvHDUeT+vw4cPKzMzM9cjRB9++KE+/PBDLVmyRF27ds1zGc7D+Je60N3pzJkzOU6R5qZUqVK66667tGjRIk2YMEELFizQ7bff7vJh4csvv1R6erq++OILl0+9a9euvez8L+XTTz/VHXfcoffff99leEpKisuFvVfjUttZqVKl1LNnT/Xs2VMZGRm699579frrr2vYsGEupxkv7mO1atX02Wefucx7xIgRV9SfatWqSZI8PT3zdW83u78W9+/f73KU87ffflN2drZ1hKRKlSrKzs7W/v37raOU0vkL51NSUlyW7+/vn6P+jIyMS95v9FKc896/f7+1HqXzR4ty2ykFBASob9++6tu3r06ePKnWrVsrLi7uksGuVq1akqSDBw+qfv361vDq1asrOztbe/bsyTNIXbhuLuxfRkaGDh48mK/tLa9tw/lFGF9f3yua78yZM5WQkKDXX39dY8aM0eOPP67PP//8ipZ1JSpUqKAyZcooKyur0O6Z6Hw+f/vtN91xxx3W8HPnzunQoUMuQbEgfb+ce++9V999991l2/Xp0+eKbvB8Jfsx5za2ZcsWderUyRq+ZcsWZWdnW+OvZj/mPPJ48aUmzi+TXrwfO3funH7//Xd16dLlsjVd6LpeY3fxYWDp/Jvu4sWLFRoaaqXismXLKjIyUh9//LFOnDhhtf3oo4908uRJlyMnWVlZ6tmzpxITE7Vo0SKX0295+eSTT5SdnX3Fp2ElqVOnTsrKytLUqVNdhk+cOFFubm7Wt3p8fX1Vvnx5rVu3zqXd9OnTc8zTeQf6K7nbfZMmTRQYGKiZM2e6nKL+6quvtHfvXsXExFxxLfnlXE7VqlW1bNmyKz4KJZ3fEcycOVN33XWXdcSuV69eWrJkSY4/6fzzvWTJEjVv3lzS+R3HxU6cOKFJkyapfPnyLofqc9vODh06pNWrV1/xNS09e/bUn3/+qffee08///yzevbs6TLe+enzwk+bqampmjNnzhXNPy8eHh45PsEuWrToktecXU6pUqVyDbTOa5mcHA6H6tSpI2PMJb+BllvtGzduvOzNpZ0CAwPVtm1bvfPOO7kGjNzW9YXs/lqcNm2ay2Pnne6ddTl3MhdfGjBhwgRJcll+9erVc9Q/a9asfB+1jIyMlKenp95++22X9Z/btUUXb1+lS5dWjRo1cr3E5kLh4eFyOBzasmWLy/CuXbvK3d1d8fHxOY5eO/sSGRkph8OhKVOmuPTv/fffV2pqar7WTV7bRnh4uKpXr64333wz1589u3A7PnjwoIYOHapu3brp5Zdf1ptvvqkvvvgix41nS5U
qle9fP/Hw8FC3bt20ePHiXH9l53Kvq9w0adJE5cqV07vvvutyHfy8efNyBPmreQ1drcK8xu5K92N33nmnAgICNGPGDJfhM2bMUMmSJa1t6Wr2Y23btlVgYKDmzZvn8ksXc+fOVVZWVo5vi+/Zs0dnz551+XbxlSiUI3Zffvml9bMXmZmZ2rFjh0aPHi1J6tKli5XqO3bsqJtuuknNmzdXYGCgDh8+rDlz5ujPP/90+Tq9JL3++utq2bKl2rRpowEDBuiPP/7QW2+9paioKJdfqHjuuef0xRdfqHPnzjp27Jg+/vhjl/lceLGm07x58xQSEqK2bdtecY2dO3fWHXfcoVdeeUWHDh1Sw4YNtWrVKn3++ecaPHiwy20MHnvsMY0dO1aPPfaYmjRponXr1uX6E2fOMPLKK6+oV69e8vT0VOfOnXP9ySFPT0+98cYb6tu3r9q0aaP777/fusVC1apV9eyzz15xLRebOnWqUlJS9Oeff0o6vz7/+OMPSecv3C5btqxOnDih6OhoHT9+XEOHDs1xEXL16tVdQnWdOnWsn5k7ePCgZsyYoYCAAJdD3rVq1bI+nV8sLCzM5UjdtGnTtHTpUnXu3FmVK1fWX3/9pdmzZ+vw4cP66KOPXC5yrV+/vtq1a6dGjRrJ399f+/fv1/vvv6/MzMwcPz+Wl06dOqlMmTJ6/vnnrTfMC0VFRcnhcKhz5856/PHHdfLkSb377rsKDAzM99EQSbrrrrsUHx+vvn37qmXLltq5c6fmzZvncvThaoWHh2vBggUaMmSImjZtqtKlS6tz586KiopScHCwbrvtNgUFBWnv3r2aOnWqYmJicly/dnEfP/vsM91zzz2KiYnRwYMHNXPmTNWpU+eKf9dz2rRpatWqlerXr6/+/furWrVqSk5OVmJiov74449L/oyOnV+L0vkQ0KVLF3Xo0EGJiYn6+OOP9cADD1i/mNKwYUP16dNHs2bNUkpKitq0aaNNmzbpgw8+UNeuXV2OrDz22GPWnfPbt2+vn3/+WV9//XW+j/5WqFBBzz//vMaMGaO77rpLnTp10rZt2/TVV1/lmGedOnXUtm1bhYeHKyAgQFu2bNGnn36a588sOXl7eysqKkrffPONdSsQ6fzp+VdeeUWjRo3S7bffrnvvvVdeXl7avHmzQkJCNGbMGFWoUEHDhg3TyJEj1aFDB3Xp0kX79u3T9OnT1bRp01z3B5dTvXp1+fn5aebMmSpTpoxKlSql5s2bKywsTO+99546duyounXrqm/fvqpUqZL+85//aO3atfL19dWXX34pY4weffRR+fj4WCHh8ccf1+LFi/XMM88oMjLSOhsQHh6uGTNmaPTo0apRo4YCAwNzHBW+lLFjx2rt2rVq3ry5+vfvrzp16ujYsWP66aef9M0331z2kpWLORwOxcXF6amnntKdd96pHj166NChQ5o7d66qV6/ucpTuUs9TQRXWNXZXsx/z8fHRqFGjFBsbq+7duys6Olrff/+9Pv74Y73++usKCAiQdHX7MS8vL40fP159+vRR69at9dBDD+nw4cOaPHmytU1fKCEhQSVLlrz6nzy9qu/Q5qFPnz55fv34wq8+T5061bRq1cqUL1/elChRwlSoUMF07tw519uaGHP+Bo4tW7Y03t7epkKFCiY2NtakpaW5tLnc16Av9ssvvxhJZsiQIVdd54kTJ8yzzz5rQkJCjKenp7n55ptz3BTVmPNfd+/Xr58pW7asKVOmjOnRo4c5cuRIrrcecN7o093d/Yput7BgwQJz6623Wjf8vPimqMZc/e1OcrsxpvPP2R/nrSPy+rv4a+a9evUyoaGhxuFwmJCQEDNw4ECTnJx8Rf1RLl8TX7VqlWnfvr0JDg42np6exs/Pz0RFRbn8uoXTiBEjTJMmTYy/v78pUaKECQkJMb169TI7duy4ouU79e7d27oFRG6++OIL06BBA+Pt7W2qVq1q3njjDTN79uwc69F5g+Lc5Ha7k+eee85
UrFjR+Pj4mNtuu80kJia63AzUmP/drmDRokUu88vtFh8nT540DzzwgPHz8zPS/25Q/M4775jWrVubcuXKGS8vL1O9enUzdOhQl197yU12drb5xz/+YapUqWK8vLzMrbfeapYtW2b69OnjcluVC29QnJsDBw6Yhx9+2FqnlSpVMnfddZf59NNPL7l8Y+z5WnTe7mTPnj3mvvvuM2XKlDH+/v5m0KBBud6geOTIkSYsLMx4enqa0NDQXG9QnJWVZV588UXrhsPR0dHmt99+y/N2Jxf307mdrV271mWeI0eOtLbRvG5QPHr0aNOsWTPj5+dnfHx8TK1atczrr79+yRuyOn322WfGzc3NHD58OMe42bNnW8+7v7+/adOmjcsvBhhzfl9Tq1Yt4+npaYKCgswTTzyR5w2KL3bxdmyMMZ9//rmpU6eOKVGiRI7X17Zt28y9995rvY6qVKlievToYb03OW8hsnjxYpd5Hj582Pj6+ppOnTpZw5KSkkxMTIwpU6ZMjtuJ5Ca37Tg5OdnExsaa0NBQ4+npaYKDg027du3MrFmzrDZX8/5hzPkbmTtf782aNTPr16834eHhpkOHDlf0PF3Nc30tXe1+zBhjZs2aZWrWrGkcDoepXr26mThxYo73mdzkth9z+uc//2kaNmxovLy8TFBQkBk0aFCObGOMMc2bNzcPPvjgVdfp9v87AABAsZCVlaU6deqoR48eGjVqVFF3BxfJzs5WhQoVdO+99+rdd98t6u7Y0vbt29W4cWP99NNPV/3bu0VyHzsAAPLi4eGh+Ph4TZs27YpP7+PaOHv2bI7rfj/88EMdO3bsqi5nwtUZO3as7rvvvqsOdZLEETsAAJCrb7/9Vs8++6y6d++ucuXK6aefftL777+v2rVra+vWrXnexBdFp1jc7gQAABQ/VatWVWhoqKZMmaJjx44pICBADz/8sMaOHUuoK6Y4YgcAAGATXGMHAABgEwQ7AAAAm+AaO5vLzs7Wn3/+qTJlylzTn3wBABRv5v//dGdISIjLj9DDXgh2Nvfnn38qNDS0qLsBACgmfv/9d910001F3Q1cIwQ7m3P+NNTvv/8uX1/fIu5N/mRmZmrVqlWKioqSp6dnUXenwOxUj51qkainuKOegklLS1NoaOglfzIQNz6Cnc05T7/6+vre0MGuZMmS8vX1tc2buV3qsVMtEvUUd9RTOLgsx944yQ4AAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyiRFF3ADeOqi8tL5LlenkYjWsm1Yv7WulZbpKkQ2NjiqQvAAAUZxyxAwAAsAmCXS7WrVunzp07KyQkRG5ublq6dKnLeGOMhg8frooVK8rHx0eRkZHav3+/S5tjx46pd+/e8vX1lZ+fn/r166eTJ0+6tNmxY4duv/12eXt7KzQ0VOPGjcvRl0WLFqlWrVry9vZW/fr1tWLFikKvFwAA2APBLhenTp1Sw4YNNW3atFzHjxs3TlOmTNHMmTO1ceNGlSpVStHR0Tp79qzVpnfv3tq9e7cSEhK0bNkyrVu3TgMGDLDGp6WlKSoqSlWqVNHWrVs1fvx4xcXFadasWVabDRs26P7771e/fv20bds2de3aVV27dtWuXbuuXfEAAOCGxTV2uejYsaM6duyY6zhjjCZNmqRXX31Vd999tyTpww8/VFBQkJYuXapevXpp7969WrlypTZv3qwmTZpIkt5++2116tRJb775pkJCQjRv3jxlZGRo9uzZcjgcqlu3rrZv364JEyZYAXDy5Mnq0KGDhg4dKkkaNWqUEhISNHXqVM2cOfM6PBMAAOBGQrC7SgcPHlRSUpIiIyOtYWXLllXz5s2VmJioXr16KTExUX5+flaok6TIyEi5u7tr48aNuueee5SYmKjWrVvL4XBYbaK
jo/XGG2/o+PHj8vf3V2JiooYMGeKy/Ojo6Bynhi+Unp6u9PR063FaWpokKTMzU5mZmQWq3cvDFGj6fC/X3bj8K6nAtRQlZ99v5Bqc7FSLRD3FHfUUzvJgbwS7q5SUlCRJCgoKchkeFBRkjUtKSlJgYKDL+BIlSiggIMClTVhYWI55OMf5+/srKSnpksvJzZgxYzRy5Mgcw1etWqWSJUteSYl5GtesQJMX2Kgm2db/7XCtYUJCQlF3odDYqRaJeoo76smf06dPX5floGgR7Gxm2LBhLkf50tLSFBoaqqioKPn6+hZo3vXivi5o9/LFy91oVJNsvbbFXenZ5293sisuukj6UhgyMzOVkJCg9u3by9PTs6i7UyB2qkWinuKOegrGeQYH9kawu0rBwcGSpOTkZFWsWNEanpycrEaNGlltjhw54jLduXPndOzYMWv64OBgJScnu7RxPr5cG+f43Hh5ecnLyyvHcE9PzwK/cTjvIVdU0rPdrD7Y4U29MNZJcWGnWiTqKe6oJ//Lgf3xrdirFBYWpuDgYK1evdoalpaWpo0bNyoiIkKSFBERoZSUFG3dutVqs2bNGmVnZ6t58+ZWm3Xr1rlc85CQkKCaNWvK39/fanPhcpxtnMsBAAC4EMEuFydPntT27du1fft2See/MLF9+3YdPnxYbm5uGjx4sEaPHq0vvvhCO3fu1MMPP6yQkBB17dpVklS7dm116NBB/fv316ZNm7R+/XoNGjRIvXr1UkhIiCTpgQcekMPhUL9+/bR7924tWLBAkydPdjmN+swzz2jlypV666239MsvvyguLk5btmzRoEGDrvdTAgAAbgCcis3Fli1bdMcdd1iPnWGrT58+mjt3rl544QWdOnVKAwYMUEpKilq1aqWVK1fK29vbmmbevHkaNGiQ2rVrJ3d3d3Xr1k1TpkyxxpctW1arVq1SbGyswsPDVb58eQ0fPtzlXnctW7bU/Pnz9eqrr+rll1/WzTffrKVLl6pevXrX4VkAAAA3GoJdLtq2bStj8r61h5ubm+Lj4xUfH59nm4CAAM2fP/+Sy2nQoIG+//77S7bp3r27unfvfukOAwAAiFOxAAAAtkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBLt8ysrK0muvvaawsDD5+PioevXqGjVqlIwxVhtjjIYPH66KFSvKx8dHkZGR2r9/v8t8jh07pt69e8vX11d+fn7q16+fTp486dJmx44duv322+Xt7a3Q0FCNGzfuutQIAABuLAS7fHrjjTc0Y8YMTZ06VXv37tUbb7yhcePG6e2337bajBs3TlOmTNHMmTO1ceNGlSpVStHR0Tp79qzVpnfv3tq9e7cSEhK0bNkyrVu3TgMGDLDGp6WlKSoqSlWqVNHWrVs1fvx4xcXFadasWde1XgAAUPyVKOoO3Kg2bNigu+++WzExMZKkqlWr6p///Kc2bdok6fzRukmTJunVV1/V3XffLUn68MMPFRQUpKVLl6pXr17au3evVq5cqc2bN6tJkya
SpLfffludOnXSm2++qZCQEM2bN08ZGRmaPXu2HA6H6tatq+3bt2vChAkuARAAAIBgl08tW7bUrFmz9Ouvv+qWW27Rzz//rB9++EETJkyQJB08eFBJSUmKjIy0pilbtqyaN2+uxMRE9erVS4mJifLz87NCnSRFRkbK3d1dGzdu1D333KPExES1bt1aDofDahMdHa033nhDx48fl7+/v0u/0tPTlZ6ebj1OS0uTJGVmZiozM7NANXt5mMs3uga83I3Lv5IKXEtRcvb9Rq7ByU61SNRT3FFP4SwP9kawy6eXXnpJaWlpqlWrljw8PJSVlaXXX39dvXv3liQlJSVJkoKCglymCwoKssYlJSUpMDDQZXyJEiUUEBDg0iYsLCzHPJzjLg52Y8aM0ciRI3P0d9WqVSpZsmR+y5UkjWtWoMkLbFSTbOv/K1asKMKeFI6EhISi7kKhsVMtEvUUd9STP6dPn74uy0HRItjl08KFCzVv3jzNnz/fOj06ePBghYSEqE+fPkXWr2HDhmnIkCHW47S0NIWGhioqKkq+vr4Fmne9uK8L2r188XI3GtUkW69tcVd6tpskaVdcdJH0pTBkZmYqISFB7du3l6enZ1F3p0DsVItEPcUd9RSM8wwO7I1gl09Dhw7VSy+9pF69ekmS6tevr3//+98aM2aM+vTpo+DgYElScnKyKlasaE2XnJysRo0aSZKCg4N15MgRl/meO3dOx44ds6YPDg5WcnKySxvnY2ebC3l5ecnLyyvHcE9PzwK/caRnuRVo+oJKz3az+mCHN/XCWCfFhZ1qkainuKOe/C8H9se3YvPp9OnTcnd3ffo8PDyUnX3+dGFYWJiCg4O1evVqa3xaWpo2btyoiIgISVJERIRSUlK0detWq82aNWuUnZ2t5s2bW23WrVvncm1EQkKCatasmeM0LAAA+L+NYJdPnTt31uuvv67ly5fr0KFDWrJkiSZMmKB77rlHkuTm5qbBgwdr9OjR+uKLL7Rz5049/PDDCgkJUdeuXSVJtWvXVocOHdS/f39t2rRJ69ev16BBg9SrVy+FhIRIkh544AE5HA7169dPu3fv1oIFCzR58mSX060AAAASp2Lz7e2339Zrr72mJ598UkeOHFFISIgef/xxDR8+3Grzwgsv6NSpUxowYIBSUlLUqlUrrVy5Ut7e3labefPmadCgQWrXrp3c3d3VrVs3TZkyxRpftmxZrVq1SrGxsQoPD1f58uU1fPhwbnUCAAByINjlU5kyZTRp0iRNmjQpzzZubm6Kj49XfHx8nm0CAgI0f/78Sy6rQYMG+v777/PbVQAA8H8Ep2IBAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgXwn//8Rw8++KDKlSsnHx8f1a9fX1u2bLHGG2M0fPhwVaxYUT4+PoqMjNT+/ftd5nHs2DH17t1bvr6+8vPzU79+/XTy5EmXNjt27NDtt98ub29vhYaGaty4cdelPgAAcGMh2OXT8ePHddttt8nT01NfffWV9uz
Zo7feekv+/v5Wm3HjxmnKlCmaOXOmNm7cqFKlSik6Olpnz5612vTu3Vu7d+9WQkKCli1bpnXr1mnAgAHW+LS0NEVFRalKlSraunWrxo8fr7i4OM2aNeu61gsAAIq/EkXdgRvVG2+8odDQUM2ZM8caFhYWZv3fGKNJkybp1Vdf1d133y1J+vDDDxUUFKSlS5eqV69e2rt3r1auXKnNmzerSZMmkqS3335bnTp10ptvvqmQkBDNmzdPGRkZmj17thwOh+rWravt27drwoQJLgEQAACAYJdPX3zxhaKjo9W9e3d99913qlSpkp588kn1799fknTw4EElJSUpMjLSmqZs2bJq3ry5EhMT1atXLyUmJsrPz88KdZIUGRkpd3d3bdy4Uffcc48SExPVunVrORwOq010dLTeeOMNHT9+3OUIoSSlp6crPT3depyWliZJyszMVGZmZoFq9vIwBZo+38t1Ny7/SipwLUXJ2fcbuQYnO9UiUU9xRz2FszzYG8Eun/71r39pxowZGjJkiF5++WVt3rxZTz/9tBwOh/r06aOkpCRJUlBQkMt0QUFB1rikpCQFBga6jC9RooQCAgJc2lx4JPDCeSYlJeUIdmPGjNHIkSNz9HfVqlUqWbJkASqWxjUr0OQFNqpJtvX/FStWFGFPCkdCQkJRd6HQ2KkWiXqKO+rJn9OnT1+X5aBoEezyKTs7W02aNNE//vEPSdKtt96qXbt2aebMmerTp0+R9WvYsGEaMmSI9TgtLU2hoaGKioqSr69vgeZdL+7rgnYvX7zcjUY1ydZrW9yVnu0mSdoVF10kfSkMmZmZSkhIUPv27eXp6VnU3SkQO9UiUU9xRz0F4zyDA3sj2OVTxYoVVadOHZdhtWvX1uLFiyVJwcHBkqTk5GRVrFjRapOcnKxGjRpZbY4cOeIyj3PnzunYsWPW9MHBwUpOTnZp43zsbHMhLy8veXl55Rju6elZ4DeO9Cy3Ak1fUOnZblYf7PCmXhjrpLiwUy0S9RR31JP/5cD++FZsPt12223at2+fy7Bff/1VVapUkXT+ixTBwcFavXq1NT4tLU0bN25URESEJCkiIkIpKSnaunWr1WbNmjXKzs5W8+bNrTbr1q1zuTYiISFBNWvWzHEaFgAA/N9GsMunZ599Vj/++KP+8Y9/6LffftP8+fM1a9YsxcbGSpLc3Nw0ePBgjR49Wl988YV27typhx9+WCEhIeratauk80f4OnTooP79+2vTpk1av369Bg0apF69eikkJESS9MADD8jhcKhfv37avXu3FixYoMmTJ7ucbgUAAJA4FZtvTZs21ZIlSzRs2DDFx8crLCxMkyZNUu/eva02L7zwgk6dOqUBAwYoJSVFrVq10sqVK+Xt7W21mTdvngYNGqR27drJ3d1d3bp105QpU6zxZcuW1apVqxQbG6vw8HCVL19ew4cP51YnAAAgB4JdAdx1112666678hzv5uam+Ph4xcfH59kmICBA8+fPv+RyGjRooO+//z7f/QQAAP83cCoWAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBME
OAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYFcIxo4dKzc3Nw0ePNgadvbsWcXGxqpcuXIqXbq0unXrpuTkZJfpDh8+rJiYGJUsWVKBgYEaOnSozp0759Lm22+/VePGjeXl5aUaNWpo7ty516EiAABwIyLYFdDmzZv1zjvvqEGDBi7Dn332WX355ZdatGiRvvvuO/3555+69957rfFZWVmKiYlRRkaGNmzYoA8++EBz587V8OHDrTYHDx5UTEyM7rjjDm3fvl2DBw/WY489pq+//vq61QcAAG4cBLsCOHnypHr37q13331X/v7+1vDU1FS9//77mjBhgu68806Fh4drzpw52rBhg3788UdJ0qpVq7Rnzx59/PHHatSokTp27KhRo0Zp2rRpysjIkCTNnDlTYWFheuutt1S7dm0NGjRI9913nyZOnFgk9QIAgOKtRFF34EYWGxurmJgYRUZGavTo0dbwrVu3KjMzU5GRkdawWrVqqXLlykpMTFSLFi2UmJio+vXrKygoyGoTHR2tJ554Qrt379att96qxMREl3k421x4yvdi6enpSk9Ptx6npaVJkjIzM5WZmVmger08TIGmz/dy3Y3Lv5IKXEtRcvb9Rq7ByU61SNRT3FFP4SwP9kawy6dPPvlEP/30kzZv3pxjXFJSkhwOh/z8/FyGBwUFKSkpyWpzYahzjneOu1SbtLQ0nTlzRj4+PjmWPWbMGI0cOTLH8FWrVqlkyZJXXmAuxjUr0OQFNqpJtvX/FStWFGFPCkdCQkJRd6HQ2KkWiXqKO+rJn9OnT1+X5aBoEezy4ffff9czzzyjhIQEeXt7F3V3XAwbNkxDhgyxHqelpSk0NFRRUVHy9fUt0LzrxRXNtX1e7kajmmTrtS3uSs92kyTtiosukr4UhszMTCUkJKh9+/by9PQs6u4UiJ1qkainuKOegnGewYG9EezyYevWrTpy5IgaN25sDcvKytK6des0depUff3118rIyFBKSorLUbvk5GQFBwdLkoKDg7Vp0yaX+Tq/NXthm4u/SZucnCxfX99cj9ZJkpeXl7y8vHIM9/T0LPAbR3qWW4GmL6j0bDerD3Z4Uy+MdVJc2KkWiXqKO+rJ/3Jgf3x5Ih/atWunnTt3avv27dZfkyZN1Lt3b+v/np6eWr16tTXNvn37dPjwYUVEREiSIiIitHPnTh05csRqk5CQIF9fX9WpU8dqc+E8nG2c8wAAALgQR+zyoUyZMqpXr57LsFKlSqlcuXLW8H79+mnIkCEKCAiQr6+vnnrqKUVERKhFixaSpKioKNWpU0cPPfSQxo0bp6SkJL366quKjY21jrgNHDhQU6dO1QsvvKBHH31Ua9as0cKFC7V8+fLrWzAAALghEOyukYkTJ8rd3V3dunVTenq6oqOjNX36dGu8h4eHli1bpieeeEIREREqVaqU+vTpo/j4eKtNWFiYli9frmeffVaTJ0/WTTfdpPfee0/R0Tfu9WUAAODaIdgVkm+//dblsbe3t6ZNm6Zp06blOU2VKlUu++3Otm3batu2bYXRRQAAYHNcYwcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAAD
AJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwy6cxY8aoadOmKlOmjAIDA9W1a1ft27fPpc3Zs2cVGxurcuXKqXTp0urWrZuSk5Nd2hw+fFgxMTEqWbKkAgMDNXToUJ07d86lzbfffqvGjRvLy8tLNWrU0Ny5c691eQAA4AZEsMun7777TrGxsfrxxx+VkJCgzMxMRUVF6dSpU1abZ599Vl9++aUWLVqk7777Tn/++afuvfdea3xWVpZiYmKUkZGhDRs26IMPPtDcuXM1fPhwq83BgwcVExOjO+64Q9u3b9fgwYP12GOP6euvv76u9QIAgOKvRFF34Ea1cuVKl8dz585VYGCgtm7dqtatWys1NVXvv/++5s+frzvvvFOSNGfOHNWuXVs//vijWrRooVWrVmnPnj365ptvFBQUpEaNGmnUqFF68cUXFRcXJ4fDoZkzZyosLExvvfWWJKl27dr64YcfNHHiREVHR1/3ugEAQPFFsCskqampkqSAgABJ0tatW5WZmanIyEirTa1atVS5cmUlJiaqRYsWSkxMVP369RUUFGS1iY6O1hNPPKHdu3fr1ltvVWJioss8nG0GDx6caz/S09OVnp5uPU5LS5MkZWZmKjMzs0A1enmYAk2f7+W6G5d/JRW4lqLk7PuNXIOTnWqRqKe4o57CWR7sjWBXCLKzszV48GDddtttqlevniQpKSlJDodDfn5+Lm2DgoKUlJRktbkw1DnHO8ddqk1aWprOnDkjHx8fl3FjxozRyJEjc/Rx1apVKlmyZP6LlDSuWYEmL7BRTbKt/69YsaIIe1I4EhISiroLhcZOtUjUU9xRT/6cPn36uiwHRYtgVwhiY2O1a9cu/fDDD0XdFQ0bNkxDhgyxHqelpSk0NFRRUVHy9fUt0LzrxRXNdX1e7kajmmTrtS3uSs92kyTtirtxT0NnZmYqISFB7du3l6enZ1F3p0DsVItEPcUd9RSM8wwO7I1gV0CDBg3SsmXLtG7dOt10003W8ODgYGVkZCglJcXlqF1ycrKCg4OtNps2bXKZn/Nbsxe2ufibtMnJyfL19c1xtE6SvLy85OXllWO4p6dngd840rPcCjR9QaVnu1l9sMObemGsk+LCTrVI1FPcUU/+lwP741ux+WSM0aBBg7RkyRKtWbNGYWFhLuPDw8Pl6emp1atXW8P27dunw4cPKyIiQpIUERGhnTt36siRI1abhIQE+fr6qk6dOlabC+fhbOOcBwAAgBNH7PIpNjZW8+fP1+eff64yZcpY18SVLVtWPj4+Klu2rPr166chQ4YoICBAvr6+euqppxQREaEWLVpIkqKiolSnTh099NBDGjdunJKSkvTqq68qNjbWOuo2cOBATZ06VS+88IIeffRRrVmzRgsXLtTy5cuLrHYAAFA8ccQun2bMmKHU1FS1bdtWFStWtP4WLFhgtZk4caLuuusudevWTa1bt1ZwcLA+++wza7yHh4eWLVsmDw8PRURE6MEHH9TDDz+s+Ph4q01YWJiWL1+uhIQENWzYUG+99Zbee+89bnUCAABy4IhdPhlz+Vt/eHt7a9q0aZo2bVqebapUqXLZb3i2bdtW27Ztu+o+AgCA/1s4YgcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGA
TBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbKFHUHQDyo+pLy4u6C5ZDY2OKugsAAEjiiB0AAIBtcMQOAFBsXOpovJeH0bhmUr24r5We5XbN+8LReNyICHYA/k+4ktP31zs4XGt2qwfA5RHsAFwzxelaSOBqXevt92qCN0cPcaW4xg4AAMAmOGIH2Mz1PErGqT4AKF44YgcAAGATBDsAAACbINgBAADYBNfYAQV0tde0cV0aAOBa4YjdDWLatGmqWrWqvL291bx5c23atKmouwQAAIoZgt0NYMGCBRoyZIhGjBihn376SQ0bNlR0dLSOHDlS1F0DAADFCMHuBjBhwgT1799fffv2VZ06dTRz5kyVLFlSs2fPLuquAQCAYoRr7Iq5jIwMbd26VcOGDbOGubu7KzIyUomJiTnap6enKz093XqcmpoqSTp27JgyMzML1JcS504VaPp8Lzfb6PTpbJXIdFdW9o1/TZqd6rFTLRL1FHf/l+s5evRogZd34sQJSZIxpsDzQvFFsCvm/v77b2VlZSkoKMhleFBQkH755Zcc7ceMGaORI0fmGB4WFnbN+ng9PFDUHShkdqrHTrVI1FPc/V+tp/xbhbfMEydOqGzZsoU3QxQrBDubGTZsmIYMGWI9zs7O1rFjx1SuXDm5ud2Yn3DT0tIUGhqq33//Xb6+vkXdnQKzUz12qkWinuKOegrGGKMTJ04oJCTkmi8LRYdgV8yVL19eHh4eSk5OdhmenJys4ODgHO29vLzk5eXlMszPz+9advG68fX1tcWbuZOd6rFTLRL1FHfUk38cqbM/vjxRzDkcDoWHh2v16tXWsOzsbK1evVoRERFF2DMAAFDccMTuBjBkyBD16dNHTZo0UbNmzTRp0iSdOnVKffv2LequAQCAYoRgdwPo2bOn/vvf/2r48OFKSkpSo0aNtHLlyhxfqLArLy8vjRgxIscp5huVneqxUy0S9RR31ANcnpvhe88AAAC2wDV2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYIdrLi4uTm5ubi5/tWrVssafPXtWsbGxKleunEqXLq1u3brluCHz4cOHFRMTo5IlSyowMFBDhw7VuXPnXNp8++23aty4sby8vFSjRg3NnTu3UPq/bt06de7cWSEhIXJzc9PSpUtdxhtjNHz4cFWsWFE+Pj6KjIzU/v37XdocO3ZMvXv3lq+vr/z8/NSvXz+dPHnSpc2OHTt0++23y9vbW6GhoRo3blyOvixatEi1atWSt7e36tevrxUrVhR6PY888kiO9dWhQ4diWc+YMWPUtGlTlSlTRoGBgeratav27dvn0uZ6bl/Tpk1T1apV5e3trebNm2vTpk2FXk/btm1zrJ+BAwcWy3pmzJihBg0aWDfgjYiI0FdffWWNv5HWzZXUcyOtG9iYAa6xESNGmLp165q//vrL+vvvf/9rjR84cKAJDQ01q1evNlu2bDEtWrQwLVu2tMafO3fO1KtXz0RGRppt27aZFStWmPLly5thw4ZZbf71r3+ZkiVLmiFDhpg9e/aYt99+23h4eJiVK1cWuP8rVqwwr7zyivnss8+MJLNkyRKX8WPHjjVly5Y1S5cuNT///LPp0qWLCQsLM2fOnLHadOjQwTRs2ND8+OOP5vvvvzc1atQw999/vzU+NTXVBAUFmd69e5tdu3aZf/7zn8bHx8e88847Vpv169cbDw8PM27cOLNnzx7z6quvGk9PT7Nz585CradPnz6mQ4cOLuvr2LFjLm2KSz3R0dFmzpw5ZteuXWb79u2mU6dOpnLlyubkyZNWm+u1fX3yySfG4XCY2bNnm927d5v+/fsbPz8/k5ycXKj1tGnTxvTv399l/aSmphbLer744guzfPly8+uvv5p9+/aZl19+2Xh6eppdu3YZY26sdXMl9dxI6wb2RbDDNTdixAjTsGHDXMelpKQYT09Ps2jRImv
Y3r17jSSTmJhojDkfRNzd3U1SUpLVZsaMGcbX19ekp6cbY4x54YUXTN26dV3m3bNnTxMdHV2otVwchLKzs01wcLAZP368S01eXl7mn//8pzHGmD179hhJZvPmzVabr776yri5uZn//Oc/xhhjpk+fbvz9/a16jDHmxRdfNDVr1rQe9+jRw8TExLj0p3nz5ubxxx8vtHqMOR/s7r777jynKc71HDlyxEgy3333nTHm+m5fzZo1M7GxsdbjrKwsExISYsaMGVNo9RhzPjw888wzeU5TnOsxxhh/f3/z3nvv3fDr5uJ6jLnx1w3sgVOxuC7279+vkJAQVatWTb1799bhw4clSVu3blVmZqYiIyOttrVq1VLlypWVmJgoSUpMTFT9+vVdbsgcHR2ttLQ07d6922pz4TycbZzzuFYOHjyopKQkl2WXLVtWzZs3d+m/n5+fmjRpYrWJjIyUu7u7Nm7caLVp3bq1HA6HS//37dun48ePW22uV43ffvutAgMDVbNmTT3xxBM6evSoNa4415OamipJCggIkHT9tq+MjAxt3brVpY27u7siIyMLtR6nefPmqXz58qpXr56GDRum06dPW+OKaz1ZWVn65JNPdOrUKUVERNzw6+biepxuxHUDe+GXJ3DNNW/eXHPnzlXNmjX1119/aeTIkbr99tu1a9cuJSUlyeFwyM/Pz2WaoKAgJSUlSZKSkpJy/MqG8/Hl2qSlpenMmTPy8fG5JrU5l5/bsi/sW2BgoMv4EiVKKCAgwKVNWFhYjnk4x/n7++dZo3MehaVDhw669957FRYWpgMHDujll19Wx44dlZiYKA8Pj2JbT3Z2tgYPHqzbbrtN9erVs5Z1Pbav48ePKysrK9c2v/zyS6HVI0kPPPCAqlSpopCQEO3YsUMvvvii9u3bp88++6xY1rNz505FRETo7NmzKl26tJYsWaI6depo+/btN+S6yase6cZbN7Angh2uuY4dO1r/b9CggZo3b64qVapo4cKF1yxwIf969epl/b9+/fpq0KCBqlevrm+//Vbt2rUrwp5dWmxsrHbt2qUffvihqLtSKPKqZ8CAAdb/69evr4oVK6pdu3Y6cOCAqlevfr27eVk1a9bU9u3blZqaqk8//VR9+vTRd999V9Tdyre86qlTp84Nt25gT5yKxXXn5+enW265Rb/99puCg4OVkZGhlJQUlzbJyckKDg6WJAUHB+f4ppzz8eXa+Pr6XtPw6Fx+bsu+sG9HjhxxGX/u3DkdO3asUGp0jr9WqlWrpvLly+u3336z+lHc6hk0aJCWLVumtWvX6qabbrKGX6/tq3z58vLw8Ljm9eSmefPmkuSyfopTPQ6HQzVq1FB4eLjGjBmjhg0bavLkyTfsusmrntwU93UDeyLY4bo7efKkDhw4oIoVKyo8PFyenp5avXq1NX7fvn06fPiwdd1KRESEdu7c6RImEhIS5Ovra50CiYiIcJmHs82F175cC2FhYQoODnZZdlpamjZu3OjS/5SUFG3dutVqs2bNGmVnZ1tv/BEREVq3bp0yMzNd+l+zZk35+/tbbYqixj/++ENHjx5VxYoVi109xhgNGjRIS5Ys0Zo1a3Kc/r1e25fD4VB4eLhLm+zsbK1evbpQ68nN9u3bJcll/RSXenKTnZ2t9PT0G27dXK6e3Nxo6wY2UdTf3oD9Pffcc+bbb781Bw8eNOvXrzeRkZGmfPny5siRI8aY87c8qFy5slmzZo3ZsmWLiYiIMBEREdb0zlsEREVFme3bt5uVK1eaChUq5HqLgKFDh5q9e/eaadOmFdrtTk6cOGG2bdtmtm3bZiSZCRMmmG3btpl///vfxpjztzvx8/Mzn3/+udmxY4e5++67c73dya233mo2btxofvjhB3PzzTe73B4kJSXFBAUFmYceesjs2rXLfPLJJ6ZkyZI5bg9SokQJ8+abb5q9e/eaESNG5Ot2J5eq58SJE+b55583iYmJ5uDBg+abb74xjRs3NjfffLM
5e/ZssavniSeeMGXLljXffvutyy0mTp8+bbW5XtvXJ598Yry8vMzcuXPNnj17zIABA4yfn5/LNyALWs9vv/1m4uPjzZYtW8zBgwfN559/bqpVq2Zat25dLOt56aWXzHfffWcOHjxoduzYYV566SXj5uZmVq1adcOtm8vVc6OtG9gXwQ7XXM+ePU3FihWNw+EwlSpVMj179jS//fabNf7MmTPmySefNP7+/qZkyZLmnnvuMX/99ZfLPA4dOmQ6duxofHx8TPny5c1zzz1nMjMzXdqsXbvWNGrUyDgcDlOtWjUzZ86cQun/2rVrjaQcf3369DHGnL/lyWuvvWaCgoKMl5eXadeundm3b5/LPI4ePWruv/9+U7p0aePr62v69u1rTpw44dLm559/Nq1atTJeXl6mUqVKZuzYsTn6snDhQnPLLbcYh8Nh6tata5YvX16o9Zw+fdpERUWZChUqGE9PT1OlShXTv3//HDuM4lJPbnVIcln313P7evvtt03lypWNw+EwzZo1Mz/++GOh1nP48GHTunVrExAQYLy8vEyNGjXM0KFDXe6VVpzqefTRR02VKlWMw+EwFSpUMO3atbNCnTE31rq5XD032rqBfbkZY8z1Oz4IAACAa4Vr7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYxP8DF+7GYo5LDP8AAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "context_to_check = 2046\n", + "checks = check_mutation_positions(result_df.to_pandas(), context_to_check)\n", + "checks[checks[\"out_of_bounds\"]].codon_position.hist(figsize=(5, 5))\n", + "plt.title(\n", + " f\" {checks['out_of_bounds'].sum()} out of {len(checks)} variants are out of bounds (context length = {context_to_check})\"\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "6b85ad06", + "metadata": {}, + "outputs": [], + "source": [ + "# Save processed results, dset, and refseq tables\n", + "dset.write_csv(f\"{OUTPUT_DIR}/clinvar_synom.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "98d131bc", + "metadata": {}, + "source": [ + "# 5. CHD missense dataset" + ] + }, + { + "cell_type": "markdown", + "id": "6e59856c", + "metadata": {}, + "source": [ + "- Download the variant tables from the publication [Jin et al. Contribution of rare inherited and de novo variants in 2,871 congenital heart disease probands](https://pmc.ncbi.nlm.nih.gov/articles/PMC5675000/#SD1). \n", + "\n", + "- The excel table with variants information can be downloaded from this [link](https://pmc.ncbi.nlm.nih.gov/articles/instance/5675000/bin/NIHMS906719-supplement-supp_datasets.xlsx) \n", + "\n", + "- We saved the `S9` table (cases) as `chd_rare_mutation.csv`, and `S10` table (controls) as `chd_mutation_ctrl.csv` \n" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "fcca5699", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- 1. Loading and Filtering Variants ---\n", + "Initial: 2776, Missense: 1773, Removed 4 duplicates, Final: 1769\n", + "class\n", + "chd 1769\n", + "Name: count, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Blinded IDchromposrefaltensembl_gene_idgene_nameclassclassificationpLI ScoreAA_changeRadialSVM_scorevariant_id
01-01849chr1898217CTENSG00000187961KLHL17chdmis0.0p.P321L-0.492chr1_898217_C_T
11-03030chr11425984CGENSG00000160072ATAD3Bchdmis0.0p.S516W-0.917chr1_1425984_C_G
\n", + "
" + ], + "text/plain": [ + " Blinded ID chrom pos ref alt ensembl_gene_id gene_name class \\\n", + "0 1-01849 chr1 898217 C T ENSG00000187961 KLHL17 chd \n", + "1 1-03030 chr1 1425984 C G ENSG00000160072 ATAD3B chd \n", + "\n", + " classification pLI Score AA_change RadialSVM_score variant_id \n", + "0 mis 0.0 p.P321L -0.492 chr1_898217_C_T \n", + "1 mis 0.0 p.S516W -0.917 chr1_1425984_C_G " + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def load_and_filter_variants(chd_path: str) -> pd.DataFrame:\n", + " \"\"\"\n", + " Load CHD and control cases, clean, merge, and filter to keep only missense mutations.\n", + " \"\"\"\n", + " print(\"--- 1. Loading and Filtering Variants ---\")\n", + " cols = [\n", + " \"Blinded ID\",\n", + " \"CHROM\",\n", + " \"POS\",\n", + " \"REF\",\n", + " \"ALT\",\n", + " \"Ensemble_GENEID\",\n", + " \"Gene\",\n", + " \"class\",\n", + " \"Variant_Class\",\n", + " \"pLI Score\",\n", + " \"AA_change\",\n", + " \"RadialSVM_score\",\n", + " ]\n", + "\n", + " # Load and clean pathogenic (CHD)\n", + " pathogenic = pd.read_csv(chd_path, header=1).dropna()\n", + " pathogenic[\"class\"] = \"chd\"\n", + " pathogenic.rename(columns={\"pLI score\": \"pLI Score\", \"AA change\": \"AA_change\"}, inplace=True)\n", + "\n", + " # Concatenate and standardize columns\n", + " variants = pathogenic[cols]\n", + " variants = variants.sort_values(by=[\"CHROM\", \"POS\"]).reset_index(drop=True).copy()\n", + " variants.rename(\n", + " columns={\n", + " \"CHROM\": \"chrom\",\n", + " \"POS\": \"pos\",\n", + " \"REF\": \"ref\",\n", + " \"ALT\": \"alt\",\n", + " \"Ensemble_GENEID\": \"ensembl_gene_id\",\n", + " \"Gene\": \"gene_name\",\n", + " \"Variant_Class\": \"classification\",\n", + " },\n", + " inplace=True,\n", + " )\n", + "\n", + " # Format chrom and pos\n", + " variants[\"chrom\"] = \"chr\" + variants[\"chrom\"].astype(str)\n", + " variants[\"pos\"] = variants[\"pos\"].astype(int)\n", + "\n", + " # 
Filter missense mutations only (single base change and classification)\n", + " initial_count = variants.shape[0]\n", + " variants = variants.loc[\n", + " (variants[\"ref\"].str.len() == 1) & (variants[\"alt\"].str.len() == 1)\n", + " ].copy() # single mutation only\n", + " variants = variants.loc[variants[\"classification\"].isin([\"misD\", \"mis\"])] # only missense mutations\n", + " final_count = variants.shape[0]\n", + "\n", + " variants[\"variant_id\"] = (\n", + " variants[\"chrom\"] + \"_\" + variants[\"pos\"].astype(str) + \"_\" + variants[\"ref\"] + \"_\" + variants[\"alt\"]\n", + " )\n", + "\n", + " # Remove duplicate variants from published data\n", + " pre_dedup = variants.shape[0]\n", + " variants = variants.drop_duplicates(subset=[\"variant_id\", \"gene_name\"], keep=\"first\").reset_index(drop=True)\n", + " print(\n", + " f\"Initial: {initial_count}, Missense: {final_count}, Removed {pre_dedup - variants.shape[0]} duplicates, Final: {variants.shape[0]}\"\n", + " )\n", + " print(variants[\"class\"].value_counts())\n", + "\n", + " return variants\n", + "\n", + "\n", + "chd_path = f\"{DATA_DIR}/chd_rare_mutation.csv\"\n", + "variants = load_and_filter_variants(chd_path)\n", + "variants.head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "88428334", + "metadata": {}, + "source": [ + "- Load fasta and annotation file, filter the gtf table by gene names and canonical transcripts" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "ebafa2bf", + "metadata": {}, + "outputs": [], + "source": [ + "def filter_canonical_gtf(gtf_s: pd.DataFrame, gtf_path: str) -> pd.DataFrame:\n", + " \"\"\"\n", + " Filter a GTF-derived DataFrame to keep only canonical transcripts.\n", + " Only keeps genes with explicit canonical annotations (Ensembl_canonical or MANE_Select).\n", + " Genes without canonical annotations are dropped.\n", + " \"\"\"\n", + " print(\"Filtering GTF for canonical transcripts...\")\n", + " canonical_map_pl = pl.read_csv(\n", + " 
gtf_path,\n", + " comment_prefix=\"#\",\n", + " separator=\"\\t\",\n", + " has_header=False,\n", + " columns=[2, 8],\n", + " new_columns=[\"feature\", \"attrs\"],\n", + " ).filter(pl.col(\"feature\") == \"transcript\")\n", + "\n", + " # Extract gene and transcript, then filter for canonical tags in full attrs string\n", + " # (GTF files can have multiple tag entries, so we check the full string)\n", + " canonical_map_pl = (\n", + " canonical_map_pl.with_columns(\n", + " [\n", + " pl.col(\"attrs\").str.extract(r'gene_name \"([^\"]+)\"', 1).alias(\"gene\"),\n", + " pl.col(\"attrs\").str.extract(r'transcript_id \"([^\"]+)\"', 1).alias(\"transcript\"),\n", + " ]\n", + " )\n", + " # Filter for explicit canonical tags in the full attrs string\n", + " .filter(pl.col(\"attrs\").str.contains(\"Ensembl_canonical\") | pl.col(\"attrs\").str.contains(\"MANE_Select\"))\n", + " )\n", + "\n", + " # Prioritize MANE_Select over Ensembl_canonical if both exist for a gene\n", + " canonical_map_pl = (\n", + " canonical_map_pl.with_columns(\n", + " pl.when(pl.col(\"attrs\").str.contains(\"MANE_Select\"))\n", + " .then(2)\n", + " .when(pl.col(\"attrs\").str.contains(\"Ensembl_canonical\"))\n", + " .then(1)\n", + " .otherwise(0)\n", + " .alias(\"priority\")\n", + " )\n", + " .sort(\"priority\", descending=True)\n", + " .group_by(\"gene\")\n", + " .first()\n", + " .select([\"gene\", \"transcript\"])\n", + " )\n", + "\n", + " genes_with_canonical = canonical_map_pl.shape[0]\n", + " print(f\"Found {genes_with_canonical} genes with explicit canonical transcripts\")\n", + "\n", + " canonical_map_df = canonical_map_pl.to_pandas()\n", + " canonical_map_df[\"transcript\"] = canonical_map_df[\"transcript\"].str.split(\".\").str[0] # remove version\n", + " gtf_s[\"transcript_id\"] = gtf_s[\"name\"].str.split(\".\").str[0]\n", + "\n", + " original_shape = gtf_s.shape[0]\n", + " original_genes = gtf_s[\"gene_name\"].nunique()\n", + "\n", + " gtf_filtered = gtf_s.merge(\n", + " canonical_map_df, 
left_on=[\"gene_name\", \"transcript_id\"], right_on=[\"gene\", \"transcript\"], how=\"inner\"\n", + " ).drop(columns=[\"gene\", \"transcript\"])\n", + "\n", + " filtered_genes = gtf_filtered[\"gene_name\"].nunique()\n", + " dropped_genes = original_genes - filtered_genes\n", + "\n", + " print(f\"GTF size before canonical filter: {original_shape} entries from {original_genes} genes\")\n", + " print(f\"GTF size after canonical filter: {gtf_filtered.shape[0]} entries from {filtered_genes} genes\")\n", + " print(f\"Dropped {dropped_genes} genes without canonical annotations\")\n", + "\n", + " return gtf_filtered\n", + "\n", + "\n", + "def prepare_annotations(variants: pd.DataFrame, gtf_path: str, fasta_path: str):\n", + " \"\"\"\n", + " Load GTF/FASTA, subset GTF to genes in variant table, filter for CDS length, and canonical transcripts.\n", + " \"\"\"\n", + " print(\"\\n--- 2. Preparing Annotations (GTF & FASTA) ---\")\n", + " # Get reference and annotation files (hg19 assembly)\n", + " gtf_s, fasta = process_gtf(gtf_path, fasta_path)\n", + " # Subset GTF to genes present in the variant table (using ENSEMBL ID)\n", + " variant_gene_ids = variants[\"ensembl_gene_id\"].unique()\n", + " gtf_gene_ids = gtf_s[\"gene_id\"].unique()\n", + " missing_gene_ids = set(variant_gene_ids) - set(gtf_gene_ids)\n", + " if missing_gene_ids:\n", + " print(f\"⚠️ Warning: {len(missing_gene_ids)} variant Ensembl IDs are missing from the GTF table.\")\n", + " # Printing IDs is less useful, but we can print the corresponding gene names if needed\n", + " missing_names = variants[variants[\"ensembl_gene_id\"].isin(missing_gene_ids)][\"gene_name\"].unique()\n", + " print(f\" Missing {len(missing_names)} names: {list(missing_names)}\")\n", + " gtf_subset = gtf_s[gtf_s[\"gene_id\"].isin(variant_gene_ids)].copy()\n", + " print(f\"GTF subset to variant genes (by Ensembl ID): {gtf_subset.shape[0]} rows.\")\n", + " # Check 2: Filter for CDS length multiple of 3\n", + " gtf_subset = 
gtf_subset[gtf_subset[\"cds\"].str.len() % 3 == 0]\n", + " print(f\"After filtering CDS length multiple of 3: {gtf_subset.shape[0]}\")\n", + " # Check 3: Filter for canonical transcripts only\n", + " gtf_filtered = filter_canonical_gtf(gtf_subset, gtf_path=gtf_path.replace(\".processed.tsv\", \".gtf.gz\"))\n", + " # Check 4: Validate reference allele in fasta matches variants (hg19 assembly)\n", + " print(\"\\nRunning FASTA reference allele validation...\")\n", + " for i in range(variants.shape[0]):\n", + " t = variants.iloc[i]\n", + " chrom = t[\"chrom\"]\n", + " pos = t[\"pos\"]\n", + " ref = t[\"ref\"]\n", + " try:\n", + " hg19_ref = fasta[chrom][pos - 1]\n", + " if hg19_ref != ref:\n", + " print(f\"Mismatch at {chrom}:{pos}, {ref} (variants) != {hg19_ref} (fasta), {t['variant_id']}\")\n", + " except KeyError:\n", + " print(f\"Warning: Chromosome {chrom} not found in FASTA.\")\n", + " print(\"FASTA reference allele validation complete.\")\n", + " return gtf_filtered, fasta" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "4f3c68c1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- 2. 
Preparing Annotations (GTF & FASTA) ---\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing transcripts: 100%|██████████| 64779/64779 [00:10<00:00, 6443.67it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⚠️ Warning: 10 variant Ensembl IDs are missing from the GTF table.\n", + " Missing 10 names: ['OTUD7B', 'OR8B4', 'SLCO1B7', 'CYFIP1', 'LENG9', 'ADRA2B', 'TPTE', 'SSTR3', 'ATP6AP1L', 'SLC25A53']\n", + "GTF subset to variant genes (by Ensembl ID): 6200 rows.\n", + "After filtering CDS length multiple of 3: 6181\n", + "Filtering GTF for canonical transcripts...\n", + "Found 64705 genes with explicit canonical transcripts\n", + "GTF size before canonical filter: 6181 entries from 1548 genes\n", + "GTF size after canonical filter: 1544 entries from 1544 genes\n", + "Dropped 4 genes without canonical annotations\n", + "\n", + "Running FASTA reference allele validation...\n", + "FASTA reference allele validation complete.\n" + ] + } + ], + "source": [ + "GTF_PROCESSED_PATH = f\"{DATA_DIR}/reference/gencode.v47lift37.basic.annotation.processed.tsv\"\n", + "FASTA_PATH = f\"{DATA_DIR}/reference/hg19/hg19.fa\"\n", + "gtf_filtered, fasta = prepare_annotations(variants=variants, gtf_path=GTF_PROCESSED_PATH, fasta_path=FASTA_PATH)" + ] + }, + { + "cell_type": "markdown", + "id": "82fc6cb2", + "metadata": {}, + "source": [ + "- Get missense variant table with CDS sequences for ref and alt codons, filtered to canonical transcripts:" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "d43ada2e", + "metadata": {}, + "outputs": [], + "source": [ + "def get_results_and_filter_canonical(\n", + " variants: pd.DataFrame, gtf_filtered: pd.DataFrame, filter_canonical: bool = True\n", + "):\n", + " \"\"\"\n", + " Run annotation processing, merge with variant data, and filter for the canonical transcript\n", + " for each unique variant.\n", + " \"\"\"\n", + " print(\"\\n--- 3. 
Running Annotation and Canonical Transcript Filtering ---\")\n", + " all_results = []\n", + " chroms = variants[\"chrom\"].unique()\n", + " # Annotation loop\n", + " for chrom in tqdm(chroms, desc=\"Processing chromosomes\", total=len(chroms)):\n", + " curr_variants = (\n", + " variants[variants[\"chrom\"] == chrom][[\"variant_id\", \"chrom\", \"pos\", \"ref\", \"alt\"]].drop_duplicates().copy()\n", + " )\n", + " chrom_gtf = gtf_filtered[gtf_filtered[\"chrom\"] == chrom]\n", + " chrom_results = process_a_chrom(curr_variants, chrom_gtf, return_alt_cds=True) # Assumed defined elsewhere\n", + " all_results.append(chrom_results)\n", + " all_results = pd.concat(all_results).reset_index(drop=True)\n", + " all_results.insert(0, \"id\", np.arange(all_results.shape[0])) # Add row ID\n", + " # Merge with original variant metadata\n", + " assembly = \"hg19\"\n", + " all_results[\"variant_id\"] = all_results[\"variant_id\"] + \"_\" + assembly\n", + " all_results = all_results.merge(variants.drop(\"variant_id\", axis=1), on=[\"chrom\", \"pos\", \"ref\", \"alt\"])\n", + " if filter_canonical:\n", + " print(f\"Total results rows before canonical filtering: {all_results.shape[0]}\")\n", + " # Since gtf_filtered is already canonical, we can use it to filter the results.\n", + " canonical_transcripts = gtf_filtered[[\"gene_name\", \"name\"]].copy()\n", + " canonical_transcripts.rename(columns={\"name\": \"transcript_name\"}, inplace=True)\n", + " all_results = all_results.merge(\n", + " canonical_transcripts,\n", + " left_on=[\"gene_name\", \"tx_name\"], # Assumes 'transcript_name' in all_results\n", + " right_on=[\"gene_name\", \"transcript_name\"],\n", + " how=\"inner\",\n", + " )\n", + " print(f\"Total results rows after canonical filtering: {all_results.shape[0]}\")\n", + " # Keep only variants where ref_aa != alt_aa\n", + " all_results = all_results[all_results[\"ref_aa\"] != all_results[\"alt_aa\"]]\n", + " print(f\"Total results rows after filtering ref_aa != alt_aa: 
{all_results.shape[0]}\")\n", + "\n", + " # Remove nonsense variants: ref_aa != \"*\" & alt_aa !=\"*\"\n", + " all_results = all_results[(all_results[\"ref_aa\"] != \"*\") & (all_results[\"alt_aa\"] != \"*\")]\n", + " print(f\"Total results rows after filtering nonsense variants: {all_results.shape[0]}\")\n", + "\n", + " # Remove variants where ref_aa or alt_aa are different from AA_change\n", + " all_results = all_results[all_results[\"ref_aa\"] == all_results[\"AA_change\"].apply(lambda x: x[2])]\n", + " all_results = all_results[all_results[\"alt_aa\"] == all_results[\"AA_change\"].apply(lambda x: x[-1])]\n", + " print(\n", + " f\"Total results rows after filtering variants where ref_aa or alt_aa are different from provided AA_change: {all_results.shape[0]}\"\n", + " )\n", + " return all_results" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "eda9517b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- 3. 
Running Annotation and Canonical Transcript Filtering ---\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing chromosomes: 100%|██████████| 23/23 [00:00<00:00, 324.53it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total results rows before canonical filtering: 1721\n", + "Total results rows after canonical filtering: 1628\n", + "Total results rows after filtering ref_aa != alt_aa: 1624\n", + "Total results rows after filtering nonsense variants: 1624\n", + "Total results rows after filtering variants where ref_aa or alt_aa are different from provided AA_change: 1623\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "final_results = get_results_and_filter_canonical(variants=variants, gtf_filtered=gtf_filtered, filter_canonical=True)" + ] + }, + { + "cell_type": "markdown", + "id": "092cb6d7", + "metadata": {}, + "source": [ + "- Add alpha missense scores" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "1c14a684", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variants with AlphaMissense scores: 1114 / 1623\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idchromposvariant_idrefalttx_namecdsStartcdsEndtx_strand...ensembl_gene_idgene_nameclassclassificationpLI ScoreAA_changeRadialSVM_scoretranscript_nameAlphaMissenseam_class
00chr1898217chr1_898217_C_T_hg19CTENST00000338591896073900571+...ENSG00000187961KLHL17chdmis0.0p.P321L-0.492ENST000003385910.8814pathogenic
22chr13389648chr1_3389648_C_G_hg19CGENST0000037837833796483397151+...ENSG00000130762ARHGEF16chdmis0.0p.F343L-0.963ENST000003783780.8865pathogenic
\n", + "

2 rows × 29 columns

\n", + "
" + ], + "text/plain": [ + " id chrom pos variant_id ref alt tx_name \\\n", + "0 0 chr1 898217 chr1_898217_C_T_hg19 C T ENST00000338591 \n", + "2 2 chr1 3389648 chr1_3389648_C_G_hg19 C G ENST00000378378 \n", + "\n", + " cdsStart cdsEnd tx_strand ... ensembl_gene_id gene_name class \\\n", + "0 896073 900571 + ... ENSG00000187961 KLHL17 chd \n", + "2 3379648 3397151 + ... ENSG00000130762 ARHGEF16 chd \n", + "\n", + " classification pLI Score AA_change RadialSVM_score transcript_name \\\n", + "0 mis 0.0 p.P321L -0.492 ENST00000338591 \n", + "2 mis 0.0 p.F343L -0.963 ENST00000378378 \n", + "\n", + " AlphaMissense am_class \n", + "0 0.8814 pathogenic \n", + "2 0.8865 pathogenic \n", + "\n", + "[2 rows x 29 columns]" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "af_hg19 = pl.read_csv(f\"{DATA_DIR}/alphamissense_data/AlphaMissense_hg19.tsv.gz\", separator=\"\\t\", skip_rows=3)\n", + "af_hg19 = af_hg19.rename({\"#CHROM\": \"chrom\", \"POS\": \"pos\", \"REF\": \"ref\", \"ALT\": \"alt\"})\n", + "af_hg19 = af_hg19.with_columns(\n", + " pl.concat_str(\n", + " [pl.col(\"chrom\"), pl.col(\"pos\").cast(str), pl.col(\"ref\"), pl.col(\"alt\"), pl.lit(\"hg19\")], separator=\"_\"\n", + " ).alias(\"variant_id\")\n", + ")\n", + "af_hg19 = af_hg19.with_columns(pl.col(\"transcript_id\").str.split(\".\").list.first().alias(\"tx_name\"))\n", + "# Join with final_results\n", + "final_results_pl = pl.from_pandas(final_results)\n", + "final_results = final_results_pl.join(\n", + " af_hg19.select([\"variant_id\", \"tx_name\", \"am_pathogenicity\", \"am_class\"]), on=[\"variant_id\", \"tx_name\"], how=\"left\"\n", + ").rename({\"am_pathogenicity\": \"AlphaMissense\"})\n", + "print(\n", + " f\"Variants with AlphaMissense scores: {final_results.filter(pl.col('AlphaMissense').is_not_null()).shape[0]} / {final_results.shape[0]}\"\n", + ")\n", + "final_results = final_results.to_pandas().dropna(subset=[\"AlphaMissense\"])\n", + 
"final_results.head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "ebde35a9", + "metadata": {}, + "source": [ + "- Add DDD/ASD control variants" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "aa312b44", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of benign variants in DDD/ASD dataset: 3590\n", + "\n", + "Concatenated dataframe shape: (4704, 19)\n", + "Columns: ['id', 'variant_id', 'chrom', 'pos', 'ref', 'alt', 'class', 'ref_codon', 'alt_codon', 'codon_position', 'var_rel_dist_in_cds', 'classification', 'ref_aa', 'alt_aa', 'ref_seq', 'alt_seq', 'tx_name', 'AlphaMissense', 'am_class']\n", + "\n", + "First few rows:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvariant_idchromposrefaltclassref_codonalt_codoncodon_positionvar_rel_dist_in_cdsclassificationref_aaalt_aaref_seqalt_seqtx_nameAlphaMissenseam_class
059chr10_101163334_A_G_hg19chr10101163334AGcontrol_ddd_asdGTCGCC283850misVAATGGCACCTCCGTCAGTCTTTGCCGAGGTTCCGCAGGCCCAGCCTG...ATGGCACCTCCGTCAGTCTTTGCCGAGGTTCCGCAGGCCCAGCCTG...ENST000003705080.3736ambiguous
173chr10_101371064_G_A_hg19chr10101371064GAcontrol_ddd_asdCGGTGG212636misRWATGGAGTTGGAGGGGCGGGGTGCTGGCGGTGTGGCGGGGGGGCCGG...ATGGAGTTGGAGGGGCGGGGTGCTGGCGGTGTGGCGGGGGGGCCGG...ENST000003704950.1444benign
\n", + "
" + ], + "text/plain": [ + " id variant_id chrom pos ref alt class \\\n", + "0 59 chr10_101163334_A_G_hg19 chr10 101163334 A G control_ddd_asd \n", + "1 73 chr10_101371064_G_A_hg19 chr10 101371064 G A control_ddd_asd \n", + "\n", + " ref_codon alt_codon codon_position var_rel_dist_in_cds classification \\\n", + "0 GTC GCC 283 850 mis \n", + "1 CGG TGG 212 636 mis \n", + "\n", + " ref_aa alt_aa ref_seq \\\n", + "0 V A ATGGCACCTCCGTCAGTCTTTGCCGAGGTTCCGCAGGCCCAGCCTG... \n", + "1 R W ATGGAGTTGGAGGGGCGGGGTGCTGGCGGTGTGGCGGGGGGGCCGG... \n", + "\n", + " alt_seq tx_name \\\n", + "0 ATGGCACCTCCGTCAGTCTTTGCCGAGGTTCCGCAGGCCCAGCCTG... ENST00000370508 \n", + "1 ATGGAGTTGGAGGGGCGGGGTGCTGGCGGTGTGGCGGGGGGGCCGG... ENST00000370495 \n", + "\n", + " AlphaMissense am_class \n", + "0 0.3736 ambiguous \n", + "1 0.1444 benign " + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load ctrls\n", + "missense_variants_ddd_asd = pd.read_csv(f\"{OUTPUT_DIR}/ddd_asd_zhouetal_processed_am.csv\")\n", + "missense_variants_ddd_asd_ctrl = missense_variants_ddd_asd[\n", + " missense_variants_ddd_asd[\"classification\"] == \"control\"\n", + "].copy()\n", + "missense_variants_ddd_asd_ctrl[\"classification\"] = \"mis\"\n", + "missense_variants_ddd_asd_ctrl[\"class\"] = \"control_ddd_asd\"\n", + "print(f\"Number of benign variants in DDD/ASD dataset: {missense_variants_ddd_asd_ctrl.shape[0]}\")\n", + "cols = [\n", + " \"id\",\n", + " \"variant_id\",\n", + " \"chrom\",\n", + " \"pos\",\n", + " \"ref\",\n", + " \"alt\",\n", + " \"class\",\n", + " \"ref_codon\",\n", + " \"alt_codon\",\n", + " \"codon_position\",\n", + " \"var_rel_dist_in_cds\",\n", + " \"classification\",\n", + " \"ref_aa\",\n", + " \"alt_aa\",\n", + " \"ref_seq\",\n", + " \"alt_seq\",\n", + " \"tx_name\",\n", + " \"AlphaMissense\",\n", + " \"am_class\",\n", + "]\n", + "# Concatenate\n", + "variants = pd.concat([missense_variants_ddd_asd_ctrl[cols], final_results[cols]], 
ignore_index=True)\n", + "print(f\"\\nConcatenated dataframe shape: {variants.shape}\")\n", + "print(f\"Columns: {variants.columns.tolist()}\")\n", + "print(\"\\nFirst few rows:\")\n", + "variants.head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "6a93d71e", + "metadata": {}, + "source": [ + "- QC of the variants:" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "5f4723e8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- 4. Quality Control and Display ---\n", + "A. Initial variant type and class counts:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count
classificationclass
miscontrol_ddd_asd3590
chd827
misDchd287
\n", + "
" + ], + "text/plain": [ + " count\n", + "classification class \n", + "mis control_ddd_asd 3590\n", + " chd 827\n", + "misD chd 287" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "B. Unique Variant Overlaps and Totals:\n", + "CHD-Control overlap (same variant_id): 0\n", + "Total unique CHD variants: 1114\n", + "Total unique Control variants: 0\n", + "\n", + "Position Overlaps (same chrom:pos, potentially different alleles):\n", + "CHD-Control position overlap: 0\n", + "CHD-Control_DDD_ASD position overlap: 1\n", + "Total unique CHD positions: 1113\n", + "Total unique Control positions: 0\n", + "Total unique Control_DDD_ASD positions: 3570\n", + "\n", + "C. Unique Variants and Total Rows by Class/Type:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variant_id
unique_variantstotal_rows
classclassification
chdmis827827
misD287287
control_ddd_asdmis35733590
\n", + "
" + ], + "text/plain": [ + " variant_id \n", + " unique_variants total_rows\n", + "class classification \n", + "chd mis 827 827\n", + " misD 287 287\n", + "control_ddd_asd mis 3573 3590" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "D. Final Missense-specific QC (matching original 'mis'/'misD' annotation):\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variant_id
unique_variantstotal_rows
classclassification
chdmis827827
misD287287
control_ddd_asdmis35733590
\n", + "
" + ], + "text/plain": [ + " variant_id \n", + " unique_variants total_rows\n", + "class classification \n", + "chd mis 827 827\n", + " misD 287 287\n", + "control_ddd_asd mis 3573 3590" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "E. Venn Diagram - Overlap of Missense Variants Between Classes:\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAbcAAAHDCAYAAACnJFQ8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACEUklEQVR4nO3dd3xT5f4H8M9JmtF0pOmedFCglFUsFNkg5XIVFVwMURCvOEC9WCfee0VwAOLAwVAEQdQLPxEXekEtw4WAIIrs0UIpbTrTtE2b+fz+CI1Nd9okJzn5vl+vvCAnJyff0+Sc73me8wyOMcZACCGECIiI7wAIIYQQZ6PkRgghRHAouRFCCBEcSm6EEEIEh5IbIYQQwaHkRgghRHAouRFCCBEcSm6EEEIEh5IbIYQQwaHk5gIbNmwAx3HIz8/nO5Q2LV++HCkpKRCLxcjIyHD69vfs2QOO47Bnzx6nb5u0jeM4PPvss3yHQQhvvDq5HTt2DHfccQfi4uIgk8kQGxuLGTNm4NixY3yH5vG++eYbPPHEExg+fDjee+89vPjii62ue9ddd4HjOAQHB6Ourq7Z62fOnAHHceA4Di+//LIrw/ZKr776KjiOw3fffdfqOmvXrgXHcfjiiy/cGFnXrFq1Chs2bHDa9houhho/QkNDcfXVV+PDDz/s9HY/+ugjrFixwmlxeqqSkhI89dRT6NevHwIDAyGXy5GamorZs2fjxx9/5Ds8t/PjO4DO2rZtG6ZPn47Q0FD84x//QHJyMvLz87Fu3Tps3boVmzdvxk033cR3mB5r165dEIlEWLduHaRSabvr+/n5QafT4csvv8SUKVPsXvvwww8hl8tRX19vt3zUqFGoq6vr0PaFbNq0aXj88cfx0UcfITs7u8V1PvroI4SFheHaa691ymfW1dXBz8+1h/eqVasQHh6Ou+66y6nbffjhhzF48GAAQHl5ObZs2YI77rgDGo0G8+bNc3h7H330Ef7880/Mnz/fqXF6kgMHDmDixImorq7GtGnTcP/990MmkyEvLw+fffYZNmzYgL1792LUqFF8h+o+zAudPXuWKRQKlpaWxkpKSuxeKy0tZWlpaSwgIICdO3fOrXHV1NQwxhh77733GACWl5fn1s93xOzZs1lAQECH1p01axYLCAhgf/vb39jkyZObvd6jRw92yy23MABs+fLlzg5VEMaNG8eUSiWrr69v9tqlS5eYSCRi999/f5c+w2w2s7q6ui5twxF9+vRho0ePdtr2du/ezQCwjz/+2G65Xq9ncXFxbNiwYZ3a7sSJE1liYqITIvRMFRUVLCYmhkVHR7MTJ040e91isbCPPvqIHThwoM3tNJy/hMIrqyWXL18OnU6Hd955BxEREXavhYeH4+2330ZtbS1eeuklAMDWrVvBcRz27t3bbFtvv/02OI7Dn3/+aVt28uRJ3HrrrQgNDYVcLsegQYOaVRc13Ffbu3cv5s6di8jISMTHx7ca8+eff46JEyciNjYWMpkM3bt3x3PPPQez2Wy33pgxY9C3b18cOnQIw4YNg7+/P5KTk7FmzZoO/W1MJhO
ee+45dO/eHTKZDElJSXj66aeh1+tt63Ach/feew+1tbW26p+OVC/dfvvt+N///geNRmNbdvDgQZw5cwa33357s/Vbuud25swZ3HLLLYiOjoZcLkd8fDymTZuGqqoq2zrffvstRowYgZCQEAQGBqJXr154+umn7bat1+uxcOFCpKamQiaTISEhAU888YTdfjbs64MPPojPPvsMffv2hUwmQ58+fbBjxw679aqrqzF//nwkJSVBJpMhMjIS48ePx+HDh+3W279/P/7+979DqVRCoVBg9OjR+Omnn9r9291xxx2oqqrCV1991ey1zZs3w2KxYMaMGQCAl19+GcOGDUNYWBj8/f2RmZmJrVu3Nntfw759+OGH6NOnD2QymW2/mt5zu3DhAubOnYtevXrB398fYWFhuO2225rdF274Xf/000/IyclBREQEAgICcNNNN6G0tNS2XlJSEo4dO4a9e/fafkNjxowBABiNRixatAg9evSAXC5HWFgYRowYgW+//bbdv1NLpFIpVCpViyXRDz74AJmZmfD390doaCimTZuGgoIC2+tjxozBV199hQsXLtjiTEpKAmMM4eHhyMnJsa1rsVgQEhICsVhs9xtftmwZ/Pz8UFNTY1vWkXMEAGg0GsyfPx8JCQmQyWRITU3FsmXLYLFYbOvk5+fbqvTfeecd27E7ePBgHDx4sN2/z5o1a1BUVIQVK1YgLS2t2escx2H69Om20jAAPPvss+A4DsePH8ftt98OlUqFESNGAOjYOaRhuy3d101KSrIrzTf8pr7//nvcd999CAsLQ3BwMGbOnInKykq79/7666+YMGECwsPDbee+u+++u92/QYv4zq6dERsby5KSktpcJykpicXHxzPGGNPpdCwwMJDNnTu32Xpjx45lffr0sT3/888/mVKpZOnp6WzZsmXsrbfeYqNGjWIcx7Ft27bZ1msonaWnp7PRo0ezN998ky1dutTutcYlt8mTJ7MpU6aw5cuXs9WrV7PbbruNAWCPPfaYXTyjR49msbGxLDIykj344IPsjTfeYCNGjGAA2Lp169r928yaNYsBYLfeeitbuXIlmzlzJgNgV+LatGkTGzlyJJPJZGzTpk1s06ZNbZZyG0puWq2WyeVyuzjmz5/P0tLSWF5eXrOSW8OV+O7duxlj1ivw5ORkFhsby55//nn27rvvskWLFrHBgwez/Px8299fKpWyQYMGsddff52tWbOGPfbYY2zUqFG27ZrNZva3v/2NKRQKNn/+fPb222+zBx98kPn5+bFJkybZxQ6ADRgwgMXExLDnnnuOrVixgqWkpDCFQsHKysps691+++1MKpWynJwc9u6777Jly5axG264gX3wwQe2dXJzc5lUKmVDhw5lr7zyCnvttddY//79mVQqZfv372/ze6mqqmJyuZzdcsstzV676qqrWGJiIrNYLIwxxuLj49ncuXPZW2+9xV599VWWlZXFALDt27c327fevXuziIgItmjRIrZy5Ur222+/2V5buHChbd2PP/6YDRgwgD3zzDPsnXfeYU8//TRTqVQsMTGR1dbW2tZr+O0OHDiQXXPNNezNN99kjz76KBOLxWzKlCm29T799FMWHx/P0tLSbL+hb775hjHG2NNPP804jmNz5sxha9euZa+88gqbPn267fhoTcPvZf369ay0tJSVlpayU6dOsYULF7b4+3/++ecZx3Fs6tSpbNWqVWzRokUsPDycJSUlscrKSsYYY9988w3LyMhg4eHhtjg//fRTxhhjN954I8vMzLRt77fffmMAmEgksvtbT5w4kQ0aNMj2vKPniNraWta/f38WFhbGnn76abZmzRo2c+ZMxnEc++c//2lbr+HYGThwIEtNTWXLli1jL730EgsPD2fx8fHMYDC0+XcbOnQo8/f3b3e9xhr+punp6WzSpEls1apVbOXKlYyxjp1DGGv+G2uQmJjIZs2aZXve8Jvq168fGzlyJHvjjTfYvHnzmEgkYqNGjbL
97tVqNVOpVKxnz55s+fLlbO3atexf//oX6927d4f3yy6+Tr2LRxqNhgFodhJr6sYbb2QAmFarZYwxNn36dBYZGclMJpNtnaKiIiYSidjixYtty8aNG8f69etnV31ksVjYsGHDWI8ePWzLGr6wESNG2G2z8WuNk5tOp2sW43333ccUCoXdZ40ePZoBYK+88optmV6vZxkZGSwyMrLNH/CRI0cYAHbPPffYLX/ssccYALZr1y7bsoaE1RGN17311lvZuHHjGGPWJBMdHc0WLVrUoeTWcPJoWu3U2GuvvcYAsNLS0lbX2bRpExOJROyHH36wW75mzRoGgP3000+2ZQCYVCplZ8+etS37/fffGQD25ptv2pYplUo2b968Vj/TYrGwHj16sAkTJtgORsas32tycjIbP358q+9tcNtttzG5XM6qqqpsy06ePMkAsAULFthtszGDwcD69u3LrrnmGrvlDSfiY8eONfuspieeln5/+/btYwDY+++/b1vW8NvNzs62289HHnmEicViptFobMtaq5YcMGAAmzhxYgt/gbY1/F6aPkQiEXvhhRfs1s3Pz2disbjZ8qNHjzI/Pz+75a1VSy5fvpyJxWLbOeKNN95giYmJLCsriz355JOMMetvPCQkhD3yyCO293X0HPHcc8+xgIAAdvr0abvPfeqpp5hYLGYXL15kjP2V3MLCwlhFRYVtvc8//5wBYF9++WWbfzeVSsUyMjKaLddqtbaLhNLSUrtqx4bkNn36dLv3OHIOcTS5ZWZm2p2/XnrpJQaAff7554wx6wUTAHbw4ME297ejvK5asrq6GgAQFBTU5noNr2u1WgDA1KlTUVJSYldFtnXrVlgsFkydOhUAUFFRgV27dmHKlCmorq5GWVkZysrKUF5ejgkTJuDMmTMoLCy0+5w5c+ZALBa3G7e/v7/dPpSVlWHkyJHQ6XQ4efKk3bp+fn647777bM+lUinuu+8+lJSU4NChQ61+xtdffw0AdlUtAPDoo48CQItVYo66/fbbsWfPHhQXF2PXrl0oLi5usUqyJUqlEgCwc+dO6HS6FtcJCQkBYK3GbVx109jHH3+M3r17Iy0tzfYdlZWV4ZprrgEA7N6922797OxsdO/e3fa8f//+CA4Oxvnz5+0+d//+/bh8+XKLn3nkyBFb9Wt5ebntM2trazFu3Dh8//33rcbb4I477kB9fT22bdtmW/bRRx8BgK1KErD/rVRWVqKqqgojR45sVkUKAKNHj0Z6enqbn9t0m0ajEeXl5UhNTUVISEiL27333nvBcZzt+ciRI2E2m3HhwoV2PyskJATHjh3DmTNn2l23Jc888wy+/fZbfPvtt9iyZQumT5+Of/3rX3j99ddt62zbtg0WiwVTpkyx+w1ER0ejR48ezX4DLWnYp59//hkA8MMPP2DkyJEYOXIkfvjhBwDAn3/+CY1Gg5EjRwJw7Bzx8ccfY+TIkVCpVHYxZmdnw2w24/vvv7eLZ+rUqVCpVHbxAbD7nbZEq9UiMDCw2fI777wTERERtseTTz7ZbJ3777/f7rkrzyH33nsvJBKJ7fkDDzwAPz8/22c2HPvbt2+H0Wjs9Oc08Lrk1pC0GpJca5omwYb7JFu2bLGts2XLFmRkZKBnz54AgLNnz4Ixhv/85z92P4qIiAgsXLgQgLW5bWPJyckdivvYsWO46aaboFQqERwcjIiICNxxxx0AYHe/CQBiY2MREBBgt6whxrb6zl24cAEikQipqal2y6OjoxESEtKhE1N7rrvuOgQFBWHLli348MMPMXjw4Gaf15rk5GTk5OTg3XffRXh4OCZMmICVK1fa7f/UqVMxfPhw3HPPPYiKisK0adPwf//3f3aJ48yZMzh27Fiz76jhb9T0O+rWrVuzWFQqlV19/0svvYQ///wTCQkJyMrKwrPPPmt3Umk4Uc+aNavZ57777rvQ6/XNvsemrr32WoSGhtoSGgD897//xYABA9CnTx/bsu3bt+Pqq6+GXC5
HaGgoIiIisHr16ha339HfX11dHZ555hnbvZ/w8HBERERAo9G0uN2mf7OGk27TeyQtWbx4MTQaDXr27Il+/frh8ccfxx9//NGhOAGgX79+yM7ORnZ2NqZMmYIPPvgA119/PZ566inbfb8zZ86AMYYePXo0+z5OnDjR7DfQkquuugoKhcKWyBqS26hRo/Drr7+ivr7e9lrD/ShHzhFnzpzBjh07mq3X0GK2vd9pR//mQUFBdvcDGyxevNh2kdCapr8fV55DevToYfc8MDAQMTExtnPa6NGjccstt2DRokUIDw/HpEmT8N577zW719dRXtcVQKlUIiYmpt2D5Y8//kBcXByCg4MBADKZDJMnT8ann36KVatWQa1W46effrLr39VwAn3ssccwYcKEFrfb9EtvfEXcGo1Gg9GjRyM4OBiLFy9G9+7dIZfLcfjwYTz55JPtXvE7qvEVt7PJZDLcfPPN2LhxI86fP+9wR+FXXnkFd911Fz7//HN88803ePjhh7FkyRL88ssviI+Ph7+/P77//nvs3r0bX331FXbs2IEtW7bgmmuuwTfffAOxWAyLxYJ+/frh1VdfbfEzEhIS7J63VrK21qxYTZkyBSNHjsSnn36Kb775BsuXL8eyZcuwbds2XHvttbbvaPny5a12eG/p6rkxiUSCKVOmYO3atVCr1bh48SLOnDlja/gEWE+wN954I0aNGoVVq1YhJiYGEokE7733nl1SbNCR3x8APPTQQ3jvvfcwf/58DB06FEqlEhzHYdq0aS3+/jryN2vNqFGjcO7cOdt3/O677+K1117DmjVrcM8993Qo3qbGjRuH7du325q8WywWcByH//3vfy3G2t53AVi/jyFDhuD777/H2bNnUVxcjJEjRyIqKgpGoxH79+/HDz/8gLS0NFvDNUfOERaLBePHj8cTTzzR4noNF2MNOvs3T0tLw++//w6j0WhXMurfv3+b7wNa//105RzStJFcR3Ech61bt+KXX37Bl19+iZ07d+Luu+/GK6+8gl9++aVD32ljXpfcAOD666/H2rVr8eOPP9quqBr74YcfkJ+fb1e1B1hLBRs3bkRubi5OnDgBxpitShIAUlJSAFh/9K31R+qMPXv2oLy8HNu2bbPrZ5KXl9fi+pcvX0Ztba1d6e306dMArC2RWpOYmAiLxYIzZ86gd+/etuVqtRoajQaJiYld3BOr22+/HevXr4dIJMK0adMcfn+/fv3Qr18//Pvf/8bPP/+M4cOHY82aNXj++ecBACKRCOPGjcO4cePw6quv4sUXX8S//vUv7N6921bF+Pvvv2PcuHFOTeQxMTGYO3cu5s6di5KSElx11VV44YUXcO2119qqNYODg7v025gxYwbWrFmDLVu2IC8vz9aSrcEnn3wCuVyOnTt3QiaT2Za/9957nd8xWKvgZ82ahVdeecW2rL6+3q5VoKPa+tuHhoZi9uzZmD17NmpqajBq1Cg8++yznU5uJpMJAGwllO7du4MxhuTk5GZJwpE4R44ciWXLluG7775DeHg40tLSwHEc+vTpgx9++AE//PADrr/+etv6jpwjunfvjpqaGqeeS1py/fXX45dffsGnn37arA+qoxw5h6hUqma/H4PBgKKioha3febMGYwdO9b2vKamBkVFRbjuuuvs1rv66qtx9dVX44UXXsBHH32EGTNmYPPmzQ7/dryuWhIAHn/8cfj7++O+++5DeXm53WsVFRW4//77oVAo8Pjjj9u9lp2djdDQUGzZsgVbtmxBVlaWXbE8MjISY8aMwdtvv93iF9S4KbQjGq7IGl+BGQwGrFq1qsX1TSYT3n77bbt13377bURERCAzM7PVz2n4kTQdjaGhhDNx4sROxd/U2LFj8dxzz+Gtt95CdHR0h9+n1WptJ6kG/fr1g0gkslU9VFRUNHtfQ0mpYZ0pU6agsLAQa9eubbZuXV0damtrOxwTYL3SbFo1FxkZidjYWNtnZmZmonv37nj55ZdbrALq6G9
j+PDhSEpKwgcffIAtW7Zg9OjRdl1IxGIxOI6zu/rNz8/HZ5995tA+NSUWi5uVAN58881OX2UDQEBAQIvJsekxGRgYiNTU1E5XLwHWqloAGDBgAADg5ptvhlgsxqJFi5rtF2PMLoaAgIBWq4xHjhwJvV6PFStWYMSIEbZEOHLkSGzatAmXL1+23fsCHDtHTJkyBfv27cPOnTubrafRaJodC531wAMPICoqCo888ojtIrixjpS2GzhyDunevXuz+4bvvPNOq7+pd955x+5e2urVq2EymWwDF1RWVjaLtemx7wivLLn16NEDGzduxIwZM9CvX79mI5SUlZXhv//9r10jAsB6tXXzzTdj8+bNqK2tbXGoqJUrV2LEiBHo168f5syZg5SUFKjVauzbtw+XLl3C77//7nC8w4YNg0qlwqxZs/Dwww+D4zhs2rSp1R9dbGwsli1bhvz8fPTs2RNbtmzBkSNH8M4779hVOzQ1YMAAzJo1C++8846tKvTAgQPYuHEjJk+ebHfV1BUikQj//ve/HX7frl278OCDD+K2225Dz549YTKZsGnTJojFYtxyyy0ArPcJvv/+e0ycOBGJiYkoKSnBqlWrEB8fbyul33nnnfi///s/3H///di9ezeGDx8Os9mMkydP4v/+7/+wc+dODBo0qMNxVVdXIz4+HrfeeisGDBiAwMBAfPfddzh48KCtpCMSifDuu+/i2muvRZ8+fTB79mzExcWhsLAQu3fvRnBwML788st2P4vjONx+++226vDFixfbvT5x4kS8+uqr+Pvf/47bb78dJSUlWLlyJVJTUx26b9XU9ddfj02bNkGpVCI9PR379u3Dd999h7CwsE5vMzMzE6tXr8bzzz+P1NRUREZG4pprrkF6ejrGjBmDzMxMhIaG4tdff8XWrVvx4IMPdmi7P/zwg220m4qKCnzxxRfYu3cvpk2bZuvH1b17dzz//PNYsGAB8vPzMXnyZAQFBSEvLw+ffvop7r33Xjz22GO2OLds2YKcnBwMHjwYgYGBuOGGGwAAQ4cOhZ+fH06dOoV7773XFsOoUaOwevVqALBLbkDHzxGPP/44vvjiC1x//fW46667kJmZidraWhw9ehRbt25Ffn4+wsPDO/33bxAaGopPP/0UN9xwAwYMGIBp06Zh8ODBkEgkKCgowMcffwyg5XvPTTlyDrnnnntw//3345ZbbsH48ePx+++/Y+fOna3uk8FgwLhx4zBlyhScOnUKq1atwogRI3DjjTcCADZu3IhVq1bhpptuQvfu3VFdXY21a9ciODi4WemuQ5zS5pInf/zxB5s+fTqLiYlhEomERUdHs+nTp7OjR4+2+p5vv/2WAWAcx7GCgoIW1zl37hybOXMmi46OZhKJhMXFxbHrr7+ebd261bZOQ/PWlpqtttQV4KeffmJXX3018/f3Z7GxseyJJ55gO3futGsqz5i1K0CfPn3Yr7/+yoYOHcrkcjlLTExkb731Vof+JkajkS1atIglJycziUTCEhIS2IIFC5qNjNHZrgCt6UhXgPPnz7O7776bde/encnlchYaGsrGjh3LvvvuO9t7cnNz2aRJk1hsbCyTSqUsNjaWTZ8+vVlzaoPBwJYtW8b69OnDZDIZU6lULDMzky1atMiuqT2AFpv4N26urNfr2eOPP84GDBjAgoKCWEBAABswYABbtWpVs/f99ttv7Oabb2ZhYWFMJpOxxMRENmXKFJabm9vu37HBsWPHGAAmk8ls/bEaW7duHevRoweTyWQsLS2Nvffee7am2421tm8NrzVupl1ZWclmz57NwsPDWWBgIJswYQI7efJkq822m/6um36XjDFWXFzMJk6cyIKCghgAW7eA559/nmVlZbGQkBDm7+/P0tLS2AsvvNBuP6yWugJIpdI23//JJ5+wESNGsICAABYQEMDS0tLYvHnz2KlTp2zr1NTUsNtvv52FhIQwAM26BQwePJgBsOureOnSJQaAJSQktBhrR84RjDF
WXV3NFixYwFJTU5lUKmXh4eFs2LBh7OWXX7btT0vHToOm32NbioqK2OOPP87S09OZv78/k8lkLCUlhc2cOZN9//33dus2/J5a6nLT0XOI2WxmTz75JAsPD2cKhYJNmDCBnT17ttXf1N69e9m9997LVCoVCwwMZDNmzGDl5eW29Q4fPsymT5/OunXrxmQyGYuMjGTXX389+/XXXzu0/01xjDlQZiUuN2bMGJSVldmNmEIIId5qw4YNmD17Ng4ePOhQjUpXeeU9N0IIIaQtlNwIIYQIDiU3QgghgkP33AghhAgOldwIIYQIDiU3QgghguMVnbgtFgsuX76MoKAgl46bSAghxHMxxlBdXY3Y2FiIRG2XzbwiuV2+fLnZYLiEEEJ8U0FBgd2wdS3xiuTWMG1NQUGBbZR/QgghvkWr1SIhIaHd+TwBL0luDVWRwcHBlNwIIcTHdeT2FDUoIYQQIjiU3AghhAgOJTdCCCGC4xX33DrCYrHAYDDwHQYhHSaVStttzkwI6RxBJDeDwYC8vDxYLBa+QyGkw0QiEZKTkyGVSvkOhRDB8frkxhhDUVERxGIxEhIS6EqYeIWGgQmKiorQrVs3GpyAECfz+uRmMpmg0+kQGxsLhULBdziEdFhERAQuX74Mk8kEiUTCdziECIrXF3PMZjMAUNUO8ToNv9mG3zAhxHm8Prk1oGod4m3oN0uI63Qqua1cuRJJSUmQy+UYMmQIDhw40Oq6Y8aMAcdxzR4TJ07sdNCEEEJIWxxOblu2bEFOTg4WLlyIw4cPY8CAAZgwYQJKSkpaXH/btm0oKiqyPf7880+IxWLcdtttXQ6eEEIIaYnDDUpeffVVzJkzB7NnzwYArFmzBl999RXWr1+Pp556qtn6oaGhds83b94MhULh8uT25Zcu3XwzN9zQufcVFxfjhRdewFdffYXCwkJERkYiIyMD8+fPx7hx45CUlIT58+dj/vz5du979tln8dlnn+HIkSO254sWLQIAiMVihISEID09HTfffDMeeOAByGSyLuwdIYR4F4dKbgaDAYcOHUJ2dvZfGxCJkJ2djX379nVoG+vWrcO0adMQEBDgWKQClJ+fj8zMTOzatQvLly/H0aNHsWPHDowdOxbz5s1zeHt9+vRBUVERLl68iN27d+O2227DkiVLMGzYMFRXV7tgDwghxDM5VHIrKyuD2WxGVFSU3fKoqCicPHmy3fcfOHAAf/75J9atW9fmenq9Hnq93vZcq9U6EqbXmDt3LjiOw4EDB+ySfZ8+fXD33Xc7vD0/Pz9ER0cDAGJjY9GvXz+MHz8eAwYMwLJly/D88887LXbiAiYToNcD9fWAxWJ9MGZ9NB6gQCQCOM76EIkAsRiQyQC53Pp/Qoh7+7mtW7cO/fr1Q1ZWVpvrLVmyxFbFJlQVFRXYsWMHXnjhhRZLsSEhIU75nLS0NFx77bXYtm0bJTe+NCSnhmTV8G9dnTWZ/fKL9f8mU9c/y8/PmuQakl3T/8vlgEJhTYyECJhDyS08PBxisRhqtdpuuVqttpUYWlNbW4vNmzdj8eLF7X7OggULkJOTY3veMEGdkJw9exaMMaSlpbW77pNPPol///vfdssMBgPS09M79FlpaWn45ptvOhUncRBjgNnc/NESk8n6mrMSW8M2a2qsj9aIRIBS+dcjJAQICqKERwTFoeQmlUqRmZmJ3NxcTJ48GYB1GKHc3Fw8+OCDbb73448/hl6vxx133NHu58hkMsE3gGCMdXjdxx9/HHfddZfdsjfeeAPff/99hz+L+lS5gCOJzJNYLEBlpfXRgBIeERiHqyVzcnIwa9YsDBo0CFlZWVixYgVqa2ttrSdnzpyJuLg4LFmyxO5969atw+TJkxEWFuacyL1cjx49wHFch+5VhoeHIzU11W5Z01aobTlx4gSSk5MdjpG0wGwGjEZrCclZpS1P0FrCCw0FoqKA6GhrdSYhXsLh5DZ16lSUlpbimWeeQXFxMTIyMrBjxw5bI5O
LFy82G7z41KlT+PHHH6lqrJHQ0FBMmDABK1euxMMPP9zsvptGo3HKfbeTJ09ix44dWLBgQZe35ZMaSmdGo/XhSzNPWCxAWZn1ceyYtSQXFWV9qFRUqiMerVMNSh588MFWqyH37NnTbFmvXr0cqobzFStXrsTw4cORlZWFxYsXo3///jCZTPj222+xevVqnDhxwqHtmUwmFBcXw2KxoLy8HHv27MHzzz+PjIwMPP744y7aCwFi7K/SmdFofU6A6mrr4+xZayOVyEhriS4iglppEo/j9bMCeLOUlBQcPnwYL7zwAh599FEUFRUhIiICmZmZWL16tcPbO3bsGGJiYiAWi6FUKpGeno4FCxZQJ+6OaEhoBoOwqhtdRa8HCgqsD5EICA8H4uOBmBjrc0J4xjEvKFJptVoolUpUVVUhODjY7rX6+nrk5eUhOTkZcrmcpwiJ1zKbrQnNYHB7Ca3eYEBeQQGSS0ogF0pClcmAhAQgKQnw9+c7GiIwbeWCpqjkRnwPY391mBZKUvEUer212vLcOWu1ZVKS9V9C3IySG/EdFstfpTRfahjCB8YAtdr6CAgAEhOBbt0AmpSVuAklNyJ8DaU0o5HvSHxTbS1w/Dhw8iQQF2ctzTlpBB5CWkPJjQiXyWQdp5GqHj2DxfJXI5SoKKB3b2v3AkJcgJIbER6z2ZrUqKTmudRqoKTEWpJLS6PGJ8TpKLkR4bBYrEnNYOA7EtIRjAGXLgGXL1urKnv0AKRSvqMiAkHJjXg/i8V6T42H5vzECSwW4Px54OJFoHt3ICXFOrsBIV1AvyDivRizJjW9npKaEJhMwKlTQH6+tRSXmEgdwkmn0S+HeCej0ToUVH09JTah0euBP/8Edu8GSkv5joZ4KUpuxCF79uwBx3HQaDT8BGCxADqdtXk59VUTNp3OOpHrH39Qi1fiMMFWS375+5du/bwbBtzQqfcVFxfjhRdewFdffYXCwkJERkYiIyMD8+fPx7hx45wS25gxY5CRkYEVK1Y4ZXu8MRqtE3tSUvMtFy5YW1YOGGAdpJmQDhBscvMG+fn5GD58OEJCQrB8+XL069cPRqMRO3fuxLx58zo015uzMMZgNpvh54k38qkVJKmrs5biEhOB9HRqcELaRdWSPJo7dy44jsOBAwdwyy23oGfPnujTpw9ycnLwyy+/ALDOjzdp0iQEBgYiODgYU6ZMgVqttm3j2WefRUZGBjZt2oSkpCQolUpMmzYN1dXVAIC77roLe/fuxeuvvw6O48BxHPLz823Vi//73/+QmZkJmUyGH3/8EXq9Hg8//DAiIyMhl8sxYsQIHDx4kJe/DwBraa2mhhIbsbpwAdizh+7FkXZRcuNJRUUFduzYgXnz5jWbqBQAQkJCYLFYMGnSJFRUVGDv3r349ttvcf78eUydOtVu3XPnzuGzzz7D9u3bsX37duzduxdLly4FALz++usYOnQo5syZg6KiIhQVFSEhIcH23qeeegpLly7FiRMn0L9/fzzxxBP45JNPsHHjRhw+fBipqamYMGECKioqXPsHaYrurZHWNJTi6F4caQOV7Xly9uxZMMaQlpbW6jq5ubk4evQo8vLybAnp/fffR58+fXDw4EEMHjwYAGCxWLBhwwYEXRnK6M4770Rubi5eeOEFKJVKSKVSKBQKREdHN/uMxYsXY/z48QCA2tparF69Ghs2bMC1114LAFi7di2+/fZbrFu3zn0TntK9NdIRDffiBg4EwsL4joZ4GCq58aQj0+idOHECCQkJdiWt9PR0hISE2M3SnZSUZEtsABATE4OSkpIOxTFo0CDb/8+dOwej0Yjhw4fblkkkEmRlZTk8K3in1ddTaY10XF0dsG8fkJfHdyTEw1By40mPHj3AcZxTGo1ImkwjwnEcLB1MDi1VifKCMWtSq6/nOxLibRiz9ov74w+6KCI2lNx4EhoaigkTJmDlypWora1t9rpGo0Hv3r1RUFCAgoIC2/Ljx49Do9EgPT29w58llUp
hNpvbXa979+6QSqX46aefbMuMRiMOHjzo0Oc5zGKxNhqhgY5JV1y4YC3FUeMjAkpuvFq5ciXMZjOysrLwySef4MyZMzhx4gTeeOMNDB06FNnZ2ejXrx9mzJiBw4cP48CBA5g5cyZGjx5tV53YnqSkJOzfvx/5+fkoKytrtVQXEBCABx54AI8//jh27NiB48ePY86cOdDpdPjHP/7hrN22ZzJZRxrpQPIlpF0VFcD33wNaLd+REJ5RcuNRSkoKDh8+jLFjx+LRRx9F3759MX78eOTm5mL16tXgOA6ff/45VCoVRo0ahezsbKSkpGDLli0Ofc5jjz0GsViM9PR0RERE4OLFi62uu3TpUtxyyy248847cdVVV+Hs2bPYuXMnVCpVV3e3OYPBWmKj4bOIM9XVAT/+CBQV8R0J4RHHOtKygWdarRZKpRJVVVUIDg62e62+vh55eXlITk6GXC7nKULiEMas99b0er4j4VW9wYC8ggIkl5RATk3aXaNnT6BXL76jIE7SVi5oikpuxL0sFmvDER9PbMRNTp8Gfv2Vqr19ECU34j4NiY1KKcSdioqsDU2owZJPoeRG3KOhRSRdQRM+VFZSS0ofQ8mNuF5DYqM+SIRPVVXAzz9TlbiPoORGXMtspsRGPEd1tTXB0WABgkfJjbiO2UxDaRHPU1NDCc4HUHIjrtHQeIQSG/FEtbV0D07gKLkR56N7bMQb1NRQK0oBo+RGnItKbMSbaLXWueEowQkOJTfikIYZvDUaTfMXG0b2p+b+xJtoNMCBA3RBJjDCnaz0yy/d+3k33NCptxUXF+OFF17AV199hcLCQkRGRiIjIwPz58/HuHHjnBLamDFjkJGRgRUrVjhle63S6ewS254ff8TYK38XjuMQFBiIlKQkjB8zBo/MnYuYRpOnPrt0KRYtWwYAEIvFCFEqkd6rF26+4QY8cPfdkMlkf+3P9ddj75WZC6RSKcLDwnBV//6YPWMGbu7E93Df/Pl4d9MmbF63DrdNntxkl3R4bvly/N9nn6GwqAhBgYFI79ULOfPmYdJ117kkHsKDigrg6FFgwAC+IyFOQiU3HuXn5yMzMxO7du3C8uXLcfToUezYsQNjx47FvHnz3BoLYwymrowcUl/fatXOqYMHcfnECRzctQtP/vOf+G7vXvQdNgxHjx2zW69PWhqKTp7ExaNHsfuLL3Db5MlY8tprGDZhAqqrq+3WnTNrFopOnsS5w4fxycaNSO/VC9P+8Q/cO3++Q2HrdDps3rYNTzz8MNZ/8EGz1+/PycG27dvx5rJlOHngAHZs3YpbJ01CeUWFS+IhPLp4kSY9FRBKbjyaO3cuOI7DgQMHcMstt6Bnz57o06cPcnJy8MsvvwAALl68iEmTJiEwMBDBwcGYMmUK1Gq1bRvPPvssMjIysGnTJiQlJUGpVGLatGm2ZHDXXXdh7969eP3118FxHDiOQ35+vq168X//+x8yMzMhk8nw448/Qq/X4+GHH0ZkZCTkcjlGjBiBgwcPtr0jRmObzaojIyIQHRWFnqmpmHbLLfhpxw5EhIfjgUcftVvPz88P0VFRiI2JQb8+ffDQvfdi7/bt+PPECSx7/XW7dRX+/oiOikJ8XByuHjwYyxYtwtuvvYa1Gzfiuz17OvwdfPz550hPS8NT8+fj+337UHDpkt3rX/zvf3g6JwfX/e1vSOrWDZkZGXjo3ntx9x13uCQewrNjx4CyMr6jIE5AyY0nFRUV2LFjB+bNm9fibNghISGwWCyYNGkSKioqsHfvXnz77bc4f/48pk6darfuuXPn8Nlnn2H79u3Yvn079u7di6VLlwIAXn/9dQwdOhRz5sxBUVERioqKkJCQYHvvU089haVLl+LEiRPo378/nnjiCXzyySfYuHEjDh8+jNTUVEyYMAEVTUoqNmaztTrSAf7+/rh/9mz8tH8/SkpL21w3rWdPXJudjW3bt7e73VnTp0MVEoJtDlRJr9u0CXfcdhuUSiWuzc7Ghv/
+1+716KgofP3tt81Kjh3RmXgIzxgDDh1y+DdNPA8lN56cPXsWjDGkpaW1uk5ubi6OHj2Kjz76CJmZmRgyZAjef/997N271640ZbFYsGHDBvTt2xcjR47EnXfeidzcXACAUqmEVCqFQqFAdHQ0oqOjIRaLbe9dvHgxxo8fj+7du0Mmk2H16tVYvnw5rr32WqSnp2Pt2rXw9/fHunXrmgfY0DKyE7MmpfXoAQDIb2NuucbrdmQ9kUiEnqmpHVoXAM6cO4dffv0VU2++GQBwx5QpeO/DD9F4Fqh3XnsNP+/fj7Du3TH4mmvwyNNP46crpWpnx0M8hMFgbWBCA3x7NUpuPOnINHonTpxAQkKCXUkrPT0dISEhOHHihG1ZUlISgoKCbM9jYmJQUlLSoTgaz+h97tw5GI1GDB8+3LZMIpEgKyvL7vOu7ID16raTLcwa9p/juA6t25H1HF13/QcfYMI11yA8LAwAcN348ajSarHr++9t64waPhznjxxB7mef4dYbb8Sxkycx8rrr8Nzy5U6Ph3iQ6mrgt9/4joJ0gXBbS3q4Hj16gOM4nDx5ssvbkkgkds85joOlg0mnpSrRDqmvB7pw0j5x+jQAIKlbtw6tm9yB9cxmM86cO4fBAwd2aN2NmzejWK2GX3i43fL1H3yAcaNH25ZJJBKMHDYMI4cNw5Pz5+P5l1/G4pdewpP//CekUqlT4nHEsYsX8exHH+HQuXMorqyEQiZDekICHr/5ZtyQlWVb764VK7Bx165m7+8VF4eTq1fbnj/70UdYtHlzq5/349KlGJ6eDgBYu3MnPtizBycvXYKmthaxoaEY068fFk6bhqSoKCfupQcoLgZOngTaqF0hnouSG09CQ0MxYcIErFy5Eg8//HCzJKPRaNC7d28UFBSgoKDAVno7fvw4NBoN0q+cbDpCKpXC3IG+Z927d4dUKsVPP/2ExMREAIDRaMTBgwcxv2mrP4MB8PfvcAyN1dXV4Z2NGzFq2DBENEosLTl5+jR25OZiwSOPtLvdjf/9Lyo1Gtxy443trvv1N9+guqYGv+3da1dN++eJE5j94IPQVFUhRKls8b3pvXrBZDKhvr6+zeTmSDyOuFBSguq6Osy65hrEhoZCp9fjk59/xo3PP4+3587FvX//u21dmUSCdx980O79yia/tZuHDkVqTEyzz3l60ybU1Ndj8JUqZAD47fx5JEdF4casLKgCA5GnVmPtN99g+8GD+P311xF7pRQsGGfOAMHBQGws35EQB1Fy49HKlSsxfPhwZGVlYfHixejfvz9MJhO+/fZbrF69GsePH0e/fv0wY8YMrFixAiaTCXPnzsXo0aPtqhPbk5SUhP379yM/Px+BgYEIDQ1tcb2AgAA88MADePzxxxEaGopu3brhpZdegk6nwz/+8Q/rSp3ooF1SWor6+npU19Tg0JEjeOmNN1BWXo5t779vt57JZEKxWg2LxYLyigrs+eknPP/yy8jo1w+PP/SQ3bq6ujoUq9UwmUy4dPkyPt2+Ha+tXo0H7r4bY0eObDemdR98gInjx2NAv352y9PT0vDIv/6FD//v/zBvzhyMuf56TL/lFgwaOBBhoaE4fvIknn7uOYwdOdJumvuuxuOI6wYNwnVNvv8HJ05EZk4OXv38c7vk5icW446xY9vcXv/kZPRPTrZbVlBaikvl5bhn/HhIG9UMrHrggWbvn3z11RiUk4P3d+/GU7fe2pld8mxHjgBKJdDZWg7CC0puPEpJScHhw4fxwgsv4NFHH0VRUREiIiKQmZmJ1atXg+M4fP7553jooYcwatQoiEQi/P3vf8ebb77p0Oc89thjmDVrFtLT01FXV4e8NvryLF26FBaLBXfeeSeqq6sxaNAg7Ny5EyqVynqfrRNzYfUaPBgcxyEwMBApiYn429ixyJk3D9FNqrGOnTyJmLQ0iMViKIODkd6rFxY88kizTtwAsHbjRqzduBFSqRRhoaHIHDAAW9avx03XX99uPOqSEnz
1zTf4aO3aZq+JRCLcNHEi1n3wAebNmYMJ11yDjf/9L55+7jno6uoQGx2N6ydMwDNPPOG0eJxBLBYjITwcB8+cafaa2WxGrV6PYIWiw9v77/ffgzGGGWPGtLtuUmQkAEBTW9vh7XsVs9ma4IYN61JVPHEvjnWkZQPPtFotlEolqqqq7K6WAaC+vh55eXlITk6GXC7nKUIfUV9P04Q4Ub3BgLyCAiSXlEDeiZZ5tfX1qDMYUFVbiy8OHMDj772HqSNH4sMr/QfvWrEC7+/eDX+pFDq9HqrAQEwfNQrLZs1CYDtVygMefhiVNTW4sG5diw1iyrVamC0WXCwtxeItW/DlgQP4ZtEijHfy/UWP0qcPkJLCdxQ+ra1c0BSV3EjHmM2U2DzMo+vX4+0dOwBYS5w3X3013rrvPtvrMSoVnrj5ZlzVvTssFgt2HD6MVV9/jd/z8rDnxRfh1+heY2PHLl7EH/n5eOLmm1tt6Rk3ezb0V0akCQsKwhv33ivsxAZYG5dERVH1pJeg5Eba19Dsv4lrb70VP7TS5+vpRx7B001GIHGnF195BS++9lqLr428+mr8b+tWN0fkfPNvvBG3DhuGyxUV+L8ff4TZYoGh0RBoS2bNslt/2qhR6Bkbi3998AG2/vQTpo0a1eJ2P7wyosqMRi1Gm/rfwoWoNxpxoqAAH+zZg1pfuPCh6kmvQtWSpH2tVEcWXr6MulZOaqEqFUJVKldH1qqKykpUVFa2+Jq/XI44D2j91tVqyab+9swz0NTWYv/LL7da4qrT6xE4dSpmjxuHd5s00gGs/fKS58xBkL8/jnbw3u65oiL0feghLL/rLjzopnuMvKLqSd5QtSRxnjaqIz0hQbSG7+TKh1uHDcN9q1bhdGEhesXHt7iOv0yGsKAgVLQynNhPJ07gQkkJlsyc2eHP7R4Tg4EpKfhw717fSG5UPekVOjVCycqVK5GUlAS5XI4hQ4bgwIEDba6v0Wgwb948xMTEQCaToWfPnvj66687FTBxo1aqI4lnqjMYAABVbXxn1TodyrRaRLTSh+/DKwNq395GlWSLn63Xt/m5gtJQPen5lV4+zeHktmXLFuTk5GDhwoU4fPgwBgwYgAkTJrQ63JPBYMD48eORn5+PrVu34tSpU1i7di3i4uK6HHxjXlC76n30epp41IVsv1kHf7slLUwUazSZbC0j0xMSUG8woLqFZPPcli1gjOHvV13V4jY+/uknjOjdG90iIpq9bjKbUVlT02z5gdOncfTCBQxKTXVoP7xaRQVNj+PhHK6WfPXVVzFnzhzMnj0bALBmzRp89dVXWL9+PZ566qlm669fvx4VFRX4+eefbcNEJSUldS3qRiQSCTiOQ2lpKSIiImgcP2cxm62DIhOXYIyhVKsFZzZD4uD4nPetWgWtTodRffogLiwMxZWV+HDvXpy8dAmv3H03Av39ka9WY+D8+Zg+ahTSrlRR7vztN3z966/4+1VXYdKQIc22u/O331BeXd1q37aaujok3H03po4YgT7duiFALsfR/Hy8l5sLZUAA/tNktgrBo+pJj+ZQgxKDwQCFQoGtW7dicqMZi2fNmgWNRoPPP/+82Xuuu+46hIaGQqFQ4PPPP0dERARuv/12PPnkk3bDHjWm1+uhb9RZWKvVIiEhodWbiDU1Nbh06RKV3pyJSm0ux5nNiK+oQGArk7y2ZvP332Pdt9/i6IULKK+uRpC/PzK7d8dD11+PG68kLU1NDR565x38cuoULldUwGyxIDUmBjNGj8ZjN90EiV/z69rpy5fjk337ULxxI0IbDcTdwGA04okNG7D76FHkl5SgzmBAbGgosgcMwL+nTBHe2JIdERMDODBaEOkaRxqUOJTcLl++jLi4OPz8888YOnSobfkTTzyBvXv3Yv/+/c3ek5aWhvz8fMyYMQNz587F2bNnMXfuXDz88MNYuHBhi5/z7LPPYtGiRc2Wt7VDZrMZRgdPEqQVFRXA77/zHYWwMQaJxQIxXZB
5v5EjgZAQvqPwCR7VWtJisSAyMhLvvPMOxGIxMjMzUVhYiOXLl7ea3BYsWICcnBzb84aSW1vEYnGrJUHioLNnaS4rQjrqxAmg0cU+8QwOJbfw8HCIxWKo1Wq75Wq1GtHR0S2+JyYmBhKJxC7x9O7dG8XFxTAYDC2Oqi6TyZqNJUjc5PJloKqK7ygI8R5lZUBpKdBCIxzCH4daS0qlUmRmZtpmeQasJbPc3Fy7asrGhg8fjrNnz9rNL3b69GnExMS0OV0I4QFj1pvkhBDHNJ3Ml/DO4a4AOTk5WLt2LTZu3IgTJ07ggQceQG1tra315MyZM7FgwQLb+g888AAqKirwz3/+E6dPn8ZXX32FF198EfPmzXPeXhDnuHiRWkgS0hlVVUBhId9RkEYcvuc2depUlJaW4plnnkFxcTEyMjKwY8cORF1pKXXx4kWIRH/lzISEBOzcuROPPPII+vfvj7i4OPzzn//Ek08+6by9IF1nNgNXZscmhHTCqVPW1pOiTo2NQZzM68eWJE5y9ixVrRDSVf36AU7sx0vsOZIL6BKDAEajNbkRQrrm9GnqH+ohKLkRa2KjPoKEdJ1eD5w/z3cUBJTciMkE5OfzHQUhwpGXBzg4pBpxPkpuvu7SJeqwTYgz6fVAURHfUfg8Sm6+7sIFviMgRHioNoR3lNx8WUUFoNXyHQUhwkPHFu8oufkymo+KENeh0huvKLn5Kr0eKC7mOwpChKuwkO5n84iSm6+6eJFadBHiSiYTUFDAdxQ+i5KbL2KMGpIQ4g50nPGGkpsvUquBujq+oyBE+KqrgfJyvqPwSZTcfBHd6CbEfeh44wUlN1+j01knViSEuEdxMWAw8B2Fz6Hk5muohSQh7mWxWG8FELei5OZr6CAjxP3ootLtKLn5EqORbm4TwofSUup642aU3HxJSYm1GwAhxL3MZqCsjO8ofAolN19CVSOE8IeOP7ei5OYrLBZqJUkIn+h+t1tRcvMV5eU02zYhfKqvBzQavqPwGZTcfAVdNRLCPzoO3YaSm6+gg4oQ/tF9N7eh5OYLtFrryCSEEH5ptTSuq5tQcvMFJSV8R0AIaUDHo1tQcvMFdBObEM9Bx6NbUHLzBVVVfEdACGlAyc0tKLkJndFI99sI8SQ1NTQUlxtQchM6ukokxLNYLNaGJcSlKLkJHVVJEuJ56KLT5Si5CR0dRIR4HrrodDlKbkJHBxEhnocuOl2OkpuQUWMSQjwTNSpxOUpuQkZXh4R4JmpU4nKU3ISMqiQJ8Vx08elSlNyEjJIbIZ6Ljk+XouQmZHS/jRDPRQMouxQlNyGrr+c7AkJIa+j4dClKbkLFGKDX8x0FIaQ1lNxcipKbUBkM1gRHCPFMRiN1B3AhSm5CRaU2Qjwfld5chpKbUNFBQ4jno+PUZSi5CRUdNIR4PqphcRlKbkJFBw0hno8uQl2GkptQ0UFDiOej49RlKLkJFR00hHg+qmFxGUpuQkUHDSGejy5CXYaSm1AZDHxHQAhpDx2nLkPJTaiocyghno8GWnCZTiW3lStXIikpCXK5HEOGDMGBAwdaXXfDhg3gOM7uIZfLOx0w6SA6aAjxfHQR6jIOJ7ctW7YgJycHCxcuxOHDhzFgwABMmDABJSUlrb4nODgYRUVFtseFCxe6FDTpAEpuhHg+Ok5dxuHk9uqrr2LOnDmYPXs20tPTsWbNGigUCqxfv77V93Ach+joaNsjKiqqS0GTDqArQkI8HyU3l3EouRkMBhw6dAjZ2dl/bUAkQnZ2Nvbt29fq+2pqapCYmIiEhARMmjQJx44da/Nz9Ho9tFqt3YM4iA4aQjwfHacu4+fIymVlZTCbzc1KXlFRUTh58mSL7+nVqxfWr1+P/v37o6qqCi+//DKGDRuGY8eOIT4+vsX3LFmyBIsWLXIkNOLlGLMOkm40AgYjYDRc+b/hr+Um01/rNn5wnPUBACLRX//6+QFSKSCRWB9N/y8W87OvvsRkNsFoNtoeBpMBBrMRpiv
/N1lMsDALGGNgsJ7oG/7P4cp9+iv/AoCfyA8SPykkYj9IxVJIxVL4+UkgFUus/xf72db1CpTcXMah5NYZQ4cOxdChQ23Phw0bht69e+Ptt9/Gc8891+J7FixYgJycHNtzrVaLhIQEV4cqLB56gOv11gnCa2ut/9qSlwmAm49zkeivZCeXAwEB1odC8VeSJO0zW8zQGXSo1eugM9Si3lhvS2YW5tzqcT30gKG2jTU4SMR+kIglkPrJoJD6I0AWiACpAlI/qVNjcQr6obmMQ8ktPDwcYrEYarXabrlarUZ0dHSHtiGRSDBw4ECcPXu21XVkMhlkMpkjoZGmPCC5NU5kDY+G0pcnsFisMer1QE0NUFZ25QUOUPhbkxwlPHuNE1mtoQY6vQ51xnq4/cqkVcyWWHUGHTS6StsrErEEAbIAKKQKz0l4HnCcCpVDyU0qlSIzMxO5ubmYPHkyAMBisSA3NxcPPvhgh7ZhNptx9OhRXHfddQ4HSxzg5jMxY0B1NaDVemYicwizJmWdruWEFxQEKJXWqk2h0xv10NRpUKP3xETmGKPZCI1OA41OY1vWkPACpAEI9g9GoCzQvdWalNxcxuFqyZycHMyaNQuDBg1CVlYWVqxYgdraWsyePRsAMHPmTMTFxWHJkiUAgMWLF+Pqq69GamoqNBoNli9fjgsXLuCee+5x7p4Qe244aMxmQKP562E2u/wj+dNCwlMoAJUKCAmxlu6EokZfA02tNQnojDq+w3GpxgmvUFMIP5EfQhQhCPEPgVKhhFjk4huzVB3gMg4nt6lTp6K0tBTPPPMMiouLkZGRgR07dtgamVy8eBGiRl9YZWUl5syZg+LiYqhUKmRmZuLnn39Genq68/aCNOeig0avByo1gKbSWlLz5fvhDcmusNBaigsJsSa7oCDvOmdZLBZU1Wuhqa2Epk4Do9nId0i8MVlMKKspQ1lNGThwCPYPhkqhQogixDVVmFRycxmOMc8/PWm1WiiVSlRVVSE4OJjvcLzD998DVVVO2VRNzV+lM52wL+SdQiwGgoP/KtX5ubzZluOMZiM0tRpU1mmgratyesMPIVJIFVApVFApVFDIFM7ZaHg40KjBHWmbI7nAAw874hRyeZeSm9kMlJYBpSVAXZ0T4/IBZjNQWWl9cBwQGgpERQGBgXxHBmjrtCipLkFlbaWt6T3pGJ1BB51Bh0JNIRRSBaKCohAaGNq1qksaitBlKLkJVScPGp0OUKuBigqB30NzE8aA8nLrQ6GwJrnQUPf2sTObzSirLUOJtgR1RrpScQadQYe88jxcrLyI8MBwRAZFwl/q7/iGqFW4y1ByEyoHDhqLxVrKUKutVZDENXQ6IC8PuHgRCI8AoiJde+GuM9RBrS1GRU0FzIyuVFzBbDFDrVVDrVUjWB6MyKBIqAJUHW9xSSU3l6HkJlQdOGj0eqC01Pow+m4bArczmwF1sfURHAxERlrvzzmjbYHFYkGlrhJqrRo1erpScSdtvRbaei0kFRJEBkUiIiii/UYolNxchpKbULVx0NTVAZcuWVs90m0Xfmm11odEAsTEApERnWtpabFYoNaqUawt9unWjp7AaDaiUFOIQs1lhAWEIk4VB7mkleORqiVdhpKbULWQ3PR6a7P1snJQUvMwRiNw8YK1NBcXB4SFdawkxxhDWXUZLmkuUVLzOAzlteWoqK1ARFAE4kLiIPGT2K9CJTeXoeQmVI2uCI1G4HIRUKL27X5p3kCvB86fB4qLgfh4a1eC1lTUVOCS5hLqjfVui484joGhpLoEZTVliA6ORrQyGn7iK6deSm4uQ8lNqGQymC0c1MUMRUXU8tHb6HTA6dPWDuEJCfbdCLR1WhRUFKC2zQGEiaexMAsuV11GSXUJYpQxiAqNg4impnAZSm4CZLEAFy9yKD8uhVmn5zsc0gXV1cDx40CICgiNqEVZXQG09TS/oTczWUwoqCzAZZMWEeX9kBCa4F3T9HgJSm4CU1QEnDhhHbg4FHJ
I4DvJzWAy4p29H+J/R/egur4GqZFJuG/MDAxJGch3aF1itOhx4tIl1OSVIyjIOqiFJ4564goGkwnvfL0X/zt4FNV19UiNjcR9143BkLQUvkPrsjoxw++Xfsf5svPoHdMbUcFR7b+JdJgXjYBH2qLXAwcPAr/+ak1sAGCWdaJTqRdb/MUKfLT/c0zoOxqP/O0eiEQiPLJ5MY5cPM53aJ2m0ZfggvZPVBvKwZi1ZWV+vvVfX7D4wy/w0e79mDCoLx65+W8QcSI88vZmHDl3ke/Quswss3YTqK6vxoG8Azh84TCMJmoU5CyU3ASgsBDYvdvaCKExk0LJT0A8OFZ4Gt8e/wFzx87Ew9mzcdNVf8fKO55HjDICb+3awHd4DjNa9LhUfRIlunxYmnTAtlis33VhoRdPK9QBxy4U4tvDxzH3hrF4eFI2bhp2FVY+eAdiQpV464tdfIfXZaZA+/EpCzWF2H1qN4qrilt5B3EEJTcv1lBaO3y45U7YRh9KbrtO/gQxJ8LkqybYlsn8pLghYzyOXjoJdVUpj9E5pqG0pjO1XTyrrRV2KW7XkZMQizhMHnaVbZlM4ocbrs7A0fxLUFc6Z2BwvhiDms+TpDfpcTD/IJXinICSm5dqrbTWmCkwxG3x8O108XkkhMUhsMlo7X1ie1pfV+fxEZZD2iqttUbIpbjTl4qREBGGQLl9R+c+3WKtrxeq+QjLKRjHwRjY+swCVIrrOh+5LS0cej3wxx9tJ7UGFokMZokcYh/oB1VWU4nwQFWz5WFXlpVWV7g7JIdo9CUoqyvocFJrqqEUFxlpHdJLCMq0NQgPbj6VQpjSuqy0qtrdITmN2V/W7ujZDaW4uJA49Ivr17wDOGkTJTcvUlgIHD3q2DiQpgAlxBrhJze90QCJuPnBL7sytp/e5JmtRo0WPdS1ee1WQXZEQymuuto6+4C3t6jUG42Q+DVPALIrO6Y3em9RtaUqydYUagpRVlOG/vH9Ea2MdmFUwkLVkl7AYgGOHGn93lpbjAEhrgjJ48gk0haHn9KbDNbX/TxvDL9ao6ZD99Yc3m4tcOGC908sK5NIYDQ1L8nqr9S/yiTem71NgR1PbsBfpbijl47CC+aX9giU3DxcfT3w889AQUHn3u8rjUrCA1Uoq6lstrz8yrKIoFB3h9SmirrLuFxzptPVkO0xm60lfY3GJZt3i/DgQJRpm89sUF5lXRahDHJ3SE7jSMmtsfzyfPxy/hcYrly0kdZRcvNgGg3www/WudY6yxTgG8mtR1QKCsoLUaO3L64cKzwFAOgZlcxHWM1YmAVFtedQVn/J5TNhMwaUlFjn6bNYXPpRLtEjPgoFpeWoqbevUj52oRAA0DPOOzs9t9eYpD1lNWX44cwPqK733nuO7kDJzUMVFgI//WQtuXWFRSqHubXpNgTkmt7DYGYWfHZ4p22ZwWTEl7/nok9cT0QpI3iMzspkMeBSzQlUG8rd+rlVVVdaU3rZ+KLXDOgNs4Xhs58P25YZTCZ8uf939EmMQ5TKOy/cOtKYpD06gw4/nvmRWlO2wXsrrQWKMeDkSeDsWedt0xcalfSN64VxvYdj1e73UanTIF4Vg6//2IWiqhL8+/qH+A4PdaYaFNWcgYnx03eprg4ouAjExnrPFGJ9k+IwLqM3Vn25G5XVOsRHqPD1gT9QVFGFf0+/nu/wOq2zVZJNmSwmHMw/iLToNPSI6uGUbQoJx7zg7qRWq4VSqURVVRWChdLOuQUmk7XRiNrJ3XcCCk8j8Er1nJDpTQa8vedD7PhzD6rrapAalYT7Rs/A1d2vav/NLqQ1lEGty/OIhgAcB0RHW2cb8AZ6owlvf70HO379E9W6OqTGRuG+60bj6t7d+Q6t07Sp3VAX59xWj7EhschIyIBYJOxZBhzJBZTcPERtLXDgAFDT/P55l0lqKhF6/Efnb5i0iTGGsvoCVNZ7XtVRWJj1QdyvbHA/mBXOH/dV6a/E4KTB8JcKd0xZR3IB3XPzABU
V1oYjrkhsAGAMVMEi8ZK6KIGwMDMu157xyMQGAOXl1hkkvLGhiTcz+ctcktgAoKquCj+c+QFVOu8elsxZKLnxrKwM+OUXx/uvOUof4p0ty7yRhZlxueYMao0avkNpU3U1JTh304c1H0XHqds36bHv/D5U1nahibVAUHLjUUkJsH+/e2bJpuTmHhaLGZdqTjm9Y7ar1NYClynBuY0+LMTln2E0G/HL+V9QUevZQ865GiU3nhQXW0f0d9dJRR8cASbwm818M1tMuFR7EvUmF9Uvu4iu1tpVgBKca1kkfjC6qeO5yWLCL+d/QWm198yG4WyU3HhQVGSdVNStJxOxGIbgcDd+oG8xW0y4VHMS9aZavkPplLo64NIlSnCupA9VWpuruonZYsaBvAMo0Za47TM9CSU3N1Orrc39+WijqlfRoKuu0FBi05u9ezDH+nrgEpXgXMbV99taYmEW/HrhV5RVl7n9s/lGyc2NSkt5KLE1QvfdnM9iMaOw9jT0Ju9ObA3q64DLlynBORvjOBh4GlHFbDHjQP4Bn7sHR8nNTcrL3XuPrSUWicxnZglwBwuzJjZvu8fWHp2OGpk4myEkCKyF6XvcxWwxY//5/dDoNLzF4G6U3NygstJ9rSLbQ1WTzmFhFlyuOYM6kzAHr9XVWhs9ef4QD96BjyrJphoamWjrvKMlb1dRcnOxujpric0TEhsA1IdQcnOGUt0Fr2nu31k1NdZ+mKTr3NEFoCOMZiMO5B2A3uiZk/c6EyU3FzKbrYlN70G/I7MiCMZA/q8ivVllvRpVBt9oYl1ZCWiFncNdTq8KhkXuOSME1Rnr8OuFX2EReL0zJTcXOnLEOt2Ip9FFJvEdgtfSGbUoq7vIdxhupVYDdcKeVMKl6mI9ryFXRW0FjhYe5TsMl6Lk5iJnzlhbnXmi+tBYWPykfIfhdYxmPS7XnnH5JKOehjGg6LJ11griGLNM6jFVkk1drLiIvNI8vsNwGUpuLlBcbJ2TzWOJRKgLT+A7Cq/S0OTfwjzk5qmbmUzURaAz6mIi3dpx21HHLh8TbB84Sm5OVl0N/PYb31G0ry4qCcyDDzpPU6w7D4O5ju8weFVfD6h9c7CLTmEch7oY/meAbwsDw68XfoVOL4x+mo1RcnMig8E6J5s3VN+YZQoYgj37wPMUZXWXUGOkUdYBoFprnaKJtE8fEQqLVMJ3GO0ymo04kH8AJrMXnLgcQMnNSRgDDh2ydoD1FtSwpH1aQzkq6j305ilPysutswmQtuliI/kOocOq66vx20UvqHJyACU3Jzl+3Pv6BBlCImGWKfgOw2PpTTqodcK94d5ZjFkH/zYY+I7Ec5kCFG6bAcBZirXFOFV8iu8wnIaSmxOUlQHnz/MdRSdwHHQRiXxH4ZEszIJi3XkwRi0oWmKxAMVqGsGkNd5UamvsjPqMYCY6peTWRSaTtT+bt6qL6AbG0c+gqYr6y14/yr+r1dcBGg3fUXgeJhajPjKM7zA6hYHhSMERQXTwprNaFx0/bh1iy1sxiRT14fF8h+FR6k21qKwv4jsMr1BWRtWTTeliIngdJLmravQ1OKX2/upJSm5dUFYGXLjAdxRdVxPXi0pvV1iYBWpdns911O4sxqh6sjGLnxi13WL5DqPLzpWc8/rqSTqjdZK3V0c2ZpHKoYtK5jsMj0DVkY6j6sm/6OJjwCR+fIfRZUKonqTk1kneXh3ZVG1sD1jEnt8nx5X0Jh0q9VQd2RlUPQlYJBLUxnveOJKd5e3Vk51KbitXrkRSUhLkcjmGDBmCAwcOdOh9mzdvBsdxmDx5cmc+1mMIpTqyMeYngS6mO99h8Oav1pFUv9YZVD0J1CTGAmLvvdfWEm+unnQ4uW3ZsgU5OTlYuHAhDh8+jAEDBmDChAkoKWl7XJ78/Hw89thjGDlyZKeD9QRCqo5sqjYqBWaJnO8weEHVkV3ny9WTJn8Z6ry0+X9bvLl60uHk9uqrr2LOnDmYPXs20tPTsWbNGigUCqx
fv77V95jNZsyYMQOLFi1CSkpKlwLmm9CqI+2IxaiN68l3FG5H1ZHO46vVk7WJ8R49QHJXeGv1pEPJzWAw4NChQ8jOzv5rAyIRsrOzsW/fvlbft3jxYkRGRuIf//hH5yP1AJWVwquObKouohtMsgC+w3ArdV0eVUc6CWNAO5U4gmMMVKA+yjv7tXXUuZJzqK6v5jsMhziU3MrKymA2mxEVZX/TNCoqCsXFxS2+58cff8S6deuwdu3aDn+OXq+HVqu1e3iCEyf4jsANOA418Wl8R+E21YYK1JtooERn0ul8a+zJmmThTx/FwHCiyLtOgC5tLVldXY0777wTa9euRXh4eIfft2TJEiiVStsjIYH/H09JiXXAWF+gD4uFUaHkOwyXY4yhvP4S32EIkreNs9pZhpBgGEKFf6wAgFqrRkWt90wJ4VByCw8Ph1gshlqttluuVqsRHR3dbP1z584hPz8fN9xwA/z8/ODn54f3338fX3zxBfz8/HDu3LkWP2fBggWoqqqyPQoKChwJ0yV8otTWSHW3PnyH4HJaQxkM5nq+wxAkvR7QelctlsMYx6E6hf8Lb3fyptKbQ8lNKpUiMzMTubm5tmUWiwW5ubkYOnRos/XT0tJw9OhRHDlyxPa48cYbMXbsWBw5cqTVEplMJkNwcLDdg0+FhYCH1Iy6jTE4TNBT4liYGeV1VGpzpfJyYXcN0CXEwBTkW/enK2oroNaq21/RAzjclT4nJwezZs3CoEGDkJWVhRUrVqC2thazZ88GAMycORNxcXFYsmQJ5HI5+vbta/f+kJAQAGi23FNZLMDJk3xHwY+ahN6QVZVALMBZejX1apiYke8wBM1oAKqqgCuHvKCYFP7Wfm0+6GTRSUQGRYLz8NahDie3qVOnorS0FM888wyKi4uRkZGBHTt22BqZXLx4ESKRcAY+uXDBuyYgdSYm9oM2eQBUJ1tvCeuNzBYTKqjpv1tUVADBwYCATglgHIeqXsnC2ikHaOu1KKwsRHyoZw+4zjEvaAOt1WqhVCpRVVXl1ipKsxnIzbXeP/BlQflHoSjJ5zsMpymtK6BR/90oPBwIDeU7Cuep7RaLmmTPPrG7mkKqwNheY91ekHEkF/jmpUcHnTtHiQ2wVk8KZcZuk8UAjd477hkIRUWF9UJRCHy5OrIxnUGHC+We3emXklsrDAZrciN/VU8KQXldIc2u7WYWizXBeTtfr45s6kzJGZjMJr7DaBV9S604f946jiSxMgSHe33rSaNFD63BRzpgeRiNBjB5eelNFx8NU3Ag32F4DL1Jj/zyfL7DaBUltxZYLMIfZqszvL16skpfSpOQ8oQxQFvFdxSdZ1L4oyYpju8wPE5+Wb7HDl1Hya0Fly/75uCv7fHm6kkLs0CrL+U7DJ9WVeWd/d6oOrJ1dcY6lFR75mCi9G21ID+f7wg8lyE4HDWx3jdzQI2xkvq18cxo9M5uNTUpCVQd2Yb8sny+Q2gRJbcmtFrr6P+kdbXxvVCviuE7DIdQC0nPUKnhOwLH1EWFQxfffGhB8peS6hLoPHCgB0puTeTl8R2Bd6hKGQiTgt9h0TpKb9Kh3lTDdxgEQJ3OWoLzBsbgQGh7JvEdhlfwxIYllNwaMRqt40iSDhCLoekxGBY/Kd+RtItKbZ6DMUDjBQ1LzFIJNH160H22DrpYcdHjZuumb66RggLhdDZ1B7NMAU3qIDDOc39GFosZ1UYBdLISEG2VtUWyp2IiETR9esAilfAditcwmo0o1HhWycDhsSWFjJr/O84YHIbqxL4Izv+D71BaVGUog4UJ74rFaDbhk/178dOpo6jV1yMhLBK3Xj0G/RJS+A6tXWYzUFNjHXPSE2l7JlMDkk7IL8tHQqjnTAHkuZfcblZWZj3giOPqIhM9toN3lcEzmyl31Tu5X2DH7/sxrGdf3DHybxCJRHhl+2acunyR79A6RKPhO4KW1SbEoD4qjO8wvJKmTgONTsN3GDaU3K6g5v9dU53YF4Ygzzo
p6ExaGMx1fIfhdOfUhfjlzHHcdvVYTB+ejWv6XIUFk+5AWKASm/ft4ju8Dqmvtz48iT40xOcHRO4qT+oWQMkN1mG21NTmoGs4DprUQR41gkm1vpzvEFziwLmTEHEcrulzlW2Z1M8Po9MzcLb4EsqrvaDFBoBqD6opMSn8UdW7O+Dhc5R5uqKqIo9pWELJDUBpqWff4PYWTCJFZa+rYZbI+Q4FAFBr1PAdgktcKC1GdEgY/KUyu+XdI62j1V8o844rtVoPSW5muQyV/XuB+Yn5DsXrmSwmlNd6xkUlJTcAxcV8RyAcZnkAKtOGwiKRtb+yC9WZagQ7IolGV4MQRfMGDyEB1mWa2mp3h9QpBgP/w9yZZVJU9k+DReb5XVq8hVrrGRdXPp/cGANKhNnmgDdm/0BU9rqa1z5wQi21AYDRZISfuHkpQyK2Nn42ePA0JE3x2YjLLJWgckAazP78XogJDSU3D1FZyf/VoxCZFMHWBCfmp6+QkJObxE8CUwsdMo1XkppU7D09fGpr+flci8TvSmLzjCp0IdEZdNDWafkOg5IbVUm6jilAyUuCM1r00Js9b6w7ZwlRBEKja17k0Vy5iRUSEOTukDqtvt79AydYJBJU9k+DWeHv3g/2IZ5QevP55EatJF3LFBhivQfnxipKIZfaACAxPArFmnLUGfR2y8+pC22vewvG3Ft6M0slqMhIgynQc1r1ClFxFf+lBp9ObrW11HHbHUwBSlSkDXNbI5Mag7CndRjcvTcsjGHXscO2ZUazCd+f/B3do+IQFqTkMTrHuSu5mWVSVGb0phKbG2jqNNAb9e2v6ELeUznvAlQl6T5mRRAq0oZBdXIfxEbX9d61WMyoM3tHa8HOSo2OQ1b33vj4l93Q1ukQpVThx5N/oKy6Cvdccz3f4TmsttbaFceVYxSb/GXWVpFyajziLmqtGt3CuvH2+T5dcqPk5l5m/0BUpg936VQ5NSaNx05770z3ZU/ChP5Z+OnUUXzww06YLRbkTJyKtNhEvkNzmMUC1LlwIBljUAAqB/SmxOZmxVp+T7A+W3IzGmlSUj6YZQpU9B6O4PO/QV7p/B+/0O+3NZD6+WH68GxMH57NdyhOUVsLBAQ4f7t1UWHQ9kymqWt4UFZTBrPFDLGIn87xPvuNV1RYb2YT92NiP1T1GIya2J5O3zZNSuqdnF1yYxyH6pQEaNO6U2LjidliRlUdf0PB+ey3XuUdw+8JWm18L+t8cE66sjNbTDBa+L2JTTrHYHDeEHgWPzE0fXtAlxDjnA2STuNzlgCfTW6eOuWGr9GHxqAifQTM0q63YKs389QjmHQZY4DeCdclJn8ZKgamwxAa0vWNkS6jkhsPqOTmOUyKYJT3GQVDYGiXtqM3UXLzZl1NbvpQJSoG9qGm/h6ESm5u5olzSfk6JpGiMm0odBGdb+1Xb6Hk5s26ktxq46Kh6dsTTOKzbeQ8Uq2+Fiaexjr1yeRGpTYPJRKhOrk/tEn9O3UfTm8S7pBbvqAzF5xMLEZVWgpqUrvRXGweiIFBW8/POJM+eZlDyc2z1UUmwhAcjuC83yGt7tjcUNSYxPs1NCrpaONGvSoY2p7J1H/Nw2l0GoQGdO2WQ2f4ZMmNGpN4PrM8AJW9h0HbrW+HSnHUmMT7dbRRCROLoe2ZDA2NOOIV+GpUQiU34tHqopNhCIlstxRHjUmEQa8H/NtoD0KlNe/DV6MSnyu56fXUmMTbdKQUR41JhKG1khuV1rwXX41KfK7kRlWS3qutUhw1JhGGli48qbTm3Roalbj7vpvPldxoihvv1lIpzsLM1JhEIAyGv/5v8aPSmlBU17t/pg6fK7lRlaQw1EUnQx8Wi4DC0xAXn+Y7HOIkjAEmxkEfH4XaxBgwiXtncSeuUe/Caa5aQ8mNeC2LRIbqpH7QhYZAe6IAwZqOdRsgnolxHLSqMFweEAeZikpqQqI3ub9mhZIb8Xp1UjGKErujIiIaEcUFCKjmp9Mo6bwapQql0XE
wyBVQigBKbcJCJTc3cMbgrMSzGM3WA0evCMCllDQoqrUIL74Efx3dYPV0usAglEYnoD4g0Las8X03IgyU3NzA10pudXU1+PTT5Th1aj/OnDmAmppK/POf72HcuLvs1jt9+gByczfg9On9yM//A2azCV980fKEd19/vRp//LELp0/vR1lZAa65Zhbmz9/QbixvvTUH33zzLgYNmohnntnuhL2zatqYRBcUjItB6QjUVCCi+BKkeh/70r2AXq5AaUw8aoNDmr1mNLo/Hk9gNBjx4aoPseerPajR1iCpRxJmzJuBgUMH8h1al/FRLelTrSVNJsBs5jsK99Jqy7B582JcunQCyckDWl3v11+/xrffvguO4xAdndLmNrdtW4ajR3ehW7c+EIs7dn105syvyM3dAKlU7lD8HWGwtJy8akJCkderHy536446RWCL6xD30gUG4XJid+T37NNiYgN8N7mteGYFPv/gc4y+djTuefweiEQiLH5oMY7/dpzv0LpMb9SDuXl2aJ8quflaqQ0AQkNjsHFjEVSqaJw58ysefXRwi+tde+0DuOWWJyGT+WPNmgdRWNh6C8QXX9yLiIhu4DgOU6a0nzQYY1i79mGMHTsTf/yR2+l9aY3R3MZVIcehWhWGalUYZLpaqMpLEKQph8hZM2OSdllEYmhVYagMj4RBrmh3fV+sljx99DR+2PEDZj8yGzfNugkAcM0N1+DBWx/Ehtc24KX3X+I5wq5hYDCYDJBJ3Hc31adKbr6Y3CQSGVSq6HbXU6miIJN1bB6syMhEcA6MwL579yZcuPAn7rzzhQ6/xxHGVkpuTekVAShOSMa53hkoie0Gg8z5pUjyF71cAXVcIs6lZ0Adn9ShxAYARn5mSOHVT9/9BJFYhAm3TLAtk8qkGD95PE7+cRKlxaU8Rucc7r7v5lMlN2pM4n46XTU2bnwSt932dIeSbGe0Vi3ZGoufHyojolEZEQ1FtRYh5WoEajXg3FxtIkSM41CtDIUmPBJ1AUGd2obRB0tu50+eR1xiHBSB9hcAPfv2BADkncpDRHQEH6E5Tb2pHkoo3fZ5PpXcfLHkxrctWxZDKvXHpEmPuGT7FmaB2dL5mzS6oGDogoLhZzQg5EqVpZSughyml/tDGxKGqtAImLvY8doXqyUryyqhClc1W96wrKK0wt0hOZ3e6N7jyqeSG52z3Kuw8DS+/PJ1PPbYfyFxUV17QzeArjJJpCiLjkdZdDyk9XUI0GoQpK2EXFdLJboWMI5DXUAQaoJDUBOsglHmvO+XMWvjLz8fOjsZ9AZIWrgokMqkAAB9vfefvKha0oV8raUk39au/SfS0oZh2LBbXPYZFji/YYhB7g+D3B+VkTEQm4wIqK5CYFUlAqq1EFl890dkFotRGxSCmuAQ1AYpYXFh9vG19j5SmRTGFpqJGvTWYqxMAGNrWph7v9RONShZuXIlkpKSIJfLMWTIEBw4cKDVdbdt24ZBgwYhJCQEAQEByMjIwKZNmzodcFfQBbj7/P77Lhw+vAM33PBPqNX5tofZbILBUAe1Oh86XddHEmEuPmDMfhJoVeG4nNQDZ/sMxKXkntCERcIokbr0cz2FUSpDZXgUClLScC59IIoSu6NaFebSxAb43rGqClehsqyy2fKGZaER7p/J2tncndwc/oVu2bIFOTk5WLNmDYYMGYIVK1ZgwoQJOHXqFCIjI5utHxoain/9619IS0uDVCrF9u3bMXv2bERGRmLChAktfILr+NrVIJ/Kyi4CAJYsubnZa+XlhZgzJxn/+MdrmDRpfpc+h8F9Z0EmEqE2OMTWP8vPoIdcVwt5nQ7yuhrI63QQm7y3qZ/JTwK9vwL1/gGoV1gfJh9J4nxL6ZWCo78eha5GZ9eo5NTRUwCA5F7JfIXmNB7fz+3VV1/FnDlzMHv2bADAmjVr8NVXX2H9+vV46qmnmq0/ZswYu+f//Oc/sXHjRvz4449uT26+djXIp/79r8HTT3/abPnKlfciIiI
RU6b8C4mJ/br8Oe5Mbk2ZpDLUSGWoCfnrqtpbEp6nJzJfO1aHjR+GT9//FDs/2Wnr52Y0GJH7eS569uvp9S0lAfcfqw4lN4PBgEOHDmHBggW2ZSKRCNnZ2di3b1+772eMYdeuXTh16hSWLVvW6np6vR76Rq0/tFrnDITrawdMg+3b30JtrQYVFZcBAAcOfImysksAgOuvfwgBAUqUlFzA7t3W6uKzZ38FAGzZ8jwAa7+2sWPvtG3vwIEvkZf3OwDAZDIiP/8P27pZWTciObk/IiK6ISKiW7NY3n13PkJConD11ZOds3Me9qW2lvBk9XXwMxohNhkhMRrgZzJCbDTC78r/ndlohXEcTH4SmCRSmCQSmP0kMF75v0kihUHu71GJrCUe9rW6XK9+vTB8/HC8/+b70FRoEJMQg11f7kJJUQkeevYhvsNzCo+uliwrK4PZbEZUVJTd8qioKJw8ebLV91VVVSEuLg56vR5isRirVq3C+PHjW11/yZIlWLRokSOhdYivHTANPvvsZZSUXLA937dvG/bt2wYAGDPmDgQEKKFW5+HDD/9j976G5337jrZLbj///Al27dpoe37+/G84f/43AEB4eDySk/u7bF+a4rPk1lEmqQwmadsNAsQmoy35+RkN4BgDxyzgGADGwIEBDAAHMHAAx4FxAONEYCLRlWRmTWRmP++fA80Xj9VHnn8EH660H1vyP2/8B30z+/IdmlO4u1qSYw584uXLlxEXF4eff/4ZQ4cOtS1/4oknsHfvXuzfv7/F91ksFpw/fx41NTXIzc3Fc889h88++6xZlWWDlkpuCQkJqKqqQnBwcEfDbebwYaCwsNNvJx6oxlCJ4xU/8h0GcbJ+/QD/jg2YQ7xEYlgi+sd37cJXq9VCqVR2KBc4VHILDw+HWCyGWq22W65WqxEd3froEyKRCKmpqQCAjIwMnDhxAkuWLGk1uclkMsic2G+mgQMjRhFvQV+qINHXKjwizr2jPTr0aVKpFJmZmcjN/WvwW4vFgtzcXLuSXHssFotdycxdRD41kqZv4EBnQSGi5CY87j5WHW4tmZOTg1mzZmHQoEHIysrCihUrUFtba2s9OXPmTMTFxWHJkiUArPfPBg0ahO7du0Ov1+Prr7/Gpk2bsHr1aufuSQfQASM8lNwI8Q6ODLbuDA4nt6lTp6K0tBTPPPMMiouLkZGRgR07dtgamVy8eBGiRkWk2tpazJ07F5cuXYK/vz/S0tLwwQcfYOrUqc7biw6i5CY8nJurOoh70LEqPO6ulnSoQQlfHLmJ2Jbjx4Fz55wYGOGd3lyH30u/4zsM4mRXXeVbY0v6gl5RvdAzumeXtuFILvCpy14XtFEhPJOI6EsVGo6jxCZEcol750/0qeQmp7kpBUfEiSAWeX+/LvIXqWf3Lyed5M5ZuAFKbkQApCL6YoXEwwdPIZ0k96OSm8tQtaQwSSi5CYqEqiQFiaolXYhKbsIkEdNVi5BQtaTwcOAg9XPvF+tTyc3PDxCL+Y6COBtVSwpLCxNSEy8nk8jc3s/Np5IbQKU3IaIWk8JCyU14ZH7uP0YpuRGvJxHTlyokVC0pPO6+3wb4YHKjRiXCQ9WSwkLJTXgoubkBldyEh5KbsAhgOjrShLu7AQCdGFvS21FyEx4/H2wtaTIasfeLD3H0lz2o19UgMi4JYybPQEr6QL5D6xKOo64AQuTuDtyAD5bcgoL4joA4m5gTQyZW8B2GW32xYQX2f/s5+g4Zjb9NvQcikQib31iMi2eO8x1al9AEpcIUJHf/idfnkptSyXcExBUUEt/5YgvzTuP4wR8w9uaZyL51Nq4a9Xfc8ejzUIZFYNcnG/gOr0sCAviOgDgbBw5Kf/cfnz6X3GQyqpoUogC/EL5DcJuTh34CJxLhqpETbMv8JFJkjBiPS+dPoqqilMfoukbhWwVwnxAoD4RY5P4Oxj6X3AAgJITvCIizBfhQya244DzCouIg87fPBLFJ1ulE1AV5fITlFAG
BfEdAnI2PUhvgo8mNqiaFJ0ASwncIblNTVYlAparZ8oZl1ZoKd4fkFBwHKOiem+BQcnMjSm7C4yeS+EyjEqPBAHEL7eX9rgynbzLq3R2SU/j7AyKfPCMJW4gihJfP9cmfElVLCpOvNCqRSKUwm4zNlpuMBgCAHw/Nrp2BGpMID1+NSQAfTW7UqESYfKVRSaBShZqqymbLG5YFhYS6OySnoMYkwsNXYxLAR5MbQKU3IfKVRiVRCSkoVxdCX6ezW16Yd+rK68l8hNVl1JhEePgqtQE+nNzovpvw+Eqjkt5XDQOzWHD4h522ZSajEb//nIu45J5QhkbwGF3nUGMSYeIzufnsQDeU3ISnoVGJ3qxrf2UvFpfSC70zh2P3tveh02qgiozBH/t2oaqsBNfPfIjv8DqFGpMIE1+NSQAfTm6hodarRcb4joQ4U5A0rFl1nRBNuvsR7PncOrZkna4GUfFJmPrQf5DYsy/foXUKDYsnPGKRmEpufJBIrAmuvJzvSIgzhciiUFZXwHcYLucnkSL71tnIvnU236E4Bd0DF56IwAjeGpMAPnzPDQCioviOgDibUhoBjvPpn7XXEYup5CZEUcH8nmB9+iwQHc13BMTZxCI/BEnC+A6DOECppPttQkTJjUcBAUAgNT8WHJWcrlq8iar5SGLEy6kUKl7mcGvMp5MbQKU3IVLJ6Ev1FhxHLZeFiO9SG0DJje67CZBULPeZobi8XWAg4OezzdqEK1rJ/wWmzyc3lQqQSvmOgjhbiIyuWrwBVUkKj0Kq4GXm7aZ8PrlxHJXehIiqJr0DdQEQHk+okgQouQGg5CZEARIlJCIaHduTyeU0gLkQRQd7xoUlJTcAkZHUFFmIqGrSs1GVpPBIxBKEBXpGVxw6pcPaiZRaTQpPuH883yGQNoR5xjmQOFGMMgYcx/EdBgBKbjaJiXxHQJwtSBoKfz/+b2yT5gKDaP42IUoKS+I7BBtKbleEh1OHbiGKVCTxHQJpQVQk3xEQZ1MpVFAqPKcLDiW3RpKS+I6AOFu4PB5ijjpSeRI/P7rfJkSeVGoDKLnZSUigDqVCIxb5IYzuvXmUiAhqwCU0UrEUsSGxfIdhh35ijfj5AXFxfEdBnI2qJj0Hx1lbJxNh6RbWDSIPu2LxrGg8AFVNCo/CLwiBklC+wyCwjiMp43c8XeJkHDgkhnpeizxKbk0EB1snMSXCEkWlN49ApTbhiQiKgELmeU1fKbm1gEpvwqOSx0AioiIDn2QyGm5LiJLCk/gOoUWU3FoQE0NVJ0Ij4kSI8O/Gdxg+jUptwqOQKhAZ5JlfLCW3FohEQDc6DwpOhCIRHEc/eT6IREB4BN9REGdLCkvymBFJmqIjvRXduwMSCd9REGeSif0RLk/gOwyfFBUFSKibjaDI/GQeWyUJUHJrlURiTXBEWOICe0LEifkOw6eIxdaqfiIsPaN6Qizy3GOJklsbUlJoSg6hkYrliFak8B2GT4mJpcERhCZAGoBuoZ5974aSWxvEYqBHD76jIM4WHdAdYhHVObuDRErzJQpRr+heHtdpu6lORbdy5UokJSVBLpdjyJAhOHDgQKvrrl27FiNHjoRKpYJKpUJ2dnab63uaxEQgIIDvKIgz+YkkiA2gqxZ3iI8DxJ59DiQOCpYHI07l+UM5Ofyz27JlC3JycrBw4UIcPnwYAwYMwIQJE1BSUtLi+nv27MH06dOxe/du7Nu3DwkJCfjb3/6GwsLCLgfvDhwH9OrFdxTE2aIUyZCK/fkOQ9DkcutsG0RYesf05juEDuEYY8yRNwwZMgSDBw/GW2+9BQCwWCxISEjAQw89hKeeeqrd95vNZqhUKrz11luYOXNmhz5Tq9VCqVSiqqoKwcHBjoTrNN9/D1RV8fLRxEVKdReRp/2d7zAEKzWVRvsRmrCAMAxLHcbb5zuSCxwquRkMBhw6dAjZ2dl/bUAkQnZ2Nvbt29ehbeh0OhiNRoS28avX6/XQarV2D7719o6LFeKAcP8
EyP1oEj9XCAigxCZE3lJqAxxMbmVlZTCbzYhqcoc4KioKxcXFHdrGk08+idjYWLsE2dSSJUugVCptj4QE/vsmRURQFYvQcByHhEDvOVi9iQccssTJooOjoQrwnon43Hqrd+nSpdi8eTM+/fRTyNtoY79gwQJUVVXZHgUFBW6MsnVUehMelTwagRLvOWC9QbDSOgA5EQ4OHNJi0vgOwyEOJbfw8HCIxWKo1Wq75Wq1GtHR0W2+9+WXX8bSpUvxzTffoH///m2uK5PJEBwcbPfwBCEhNKiyECUF96dhuZxEJAKSPG/2E9JFqZGpCJIH8R2GQxw6oqVSKTIzM5Gbm2tbZrFYkJubi6FDh7b6vpdeegnPPfccduzYgUGDBnU+Wg+Qng4oPG92B9IFCkkwdQ1wkoQEGvhAaILkQegZ1ZPvMBzm8OVqTk4O1q5di40bN+LEiRN44IEHUFtbi9mzZwMAZs6ciQULFtjWX7ZsGf7zn/9g/fr1SEpKQnFxMYqLi1FTU+O8vXAjsRjIyOA7CuJssQE9oJAo+Q7DqwUFUYdtoeHAISMhw+M7bLfE4UFxpk6ditLSUjzzzDMoLi5GRkYGduzYYWtkcvHiRbs/xOrVq2EwGHDrrbfabWfhwoV49tlnuxY9T8LCrNWT+fl8R0KcheM4pARn4FjFD2DMwnc4XkckApKT+Y6COFtqZCpCFCF8h9EpDvdz44Mn9HNrymwG9uwBdDq+IyHOVFhzGoU1p/gOw+skJlKpTWiC5EEY1WOUR5XaXNbPjfyFqieFiaonHUfVkcLjzdWRDbw3cg/QUD1JhKOhepJaT3YMVUcKkzdXRzagI7iLqPWk8FDryY6j1pHC462tI5ui5NZFVD0pTFQ92T6qjhQeDhwGJgz06urIBt6/Bx4gLIxm7RYajuOQohxIs3a3Qiy2TuZLhKVnVE8oFcK4qKPk5iS9e1vHnyTCofALQopyIN9heByOs474L5PxHQlxphhlDHpGe391ZANKbk7CcUBmJk1sKjSh8hjEBgrngHeGhARAKYyLe3JFsDwYA7sJ60KOkpsTSSRAVhbg53DXeOLJ4gN7QSWP4TsMjxAeDrQzjCzxMlKxFFnJWRCLhFUFT8nNyQIDrSU4juM7EuJMKcqBUPh5xgACfAkMoq4vQiPiRBiUNAj+UuHNSk/JzQUiI4E075odgrRDzInRQ5UFP5GU71B4IZUCPVKt/dqIcPSN64uwwDC+w3AJ+qm6SGoqEBfHdxTEmWRif6SGDPK5Dt4iEdCjh7XanQhHUlgSEsOEOz+Rbx2lbpaRYZ0DjghHsDQMiUF9+Q7DrZKTqaGU0IQFhKFvnLB/x5TcXEgkAgYPphEchCZSkYhIRRLfYbhFbKy1HycRDoVUgUFJg8AJvGEAJTcXk8utCY5aUApLYlBfKGWRfIfhUqGhQHw831EQZ5KIJchKzoLUT/j3jim5uUFICDBkiHVUByIMHMehR8hgBEvD+Q7FJVQqGnVHaCRiCYamDEWQPIjvUNyCkpubhIZa+8BRazPhEHEi9FBlIVASyncoTqVUWhObwGutfIqfyA9DkocIZmitjqBTrRuFh1urKCnBCYeYE6OXaggCJSq+Q3GK4GAgtQf9RoVELBJjSMoQqAKE8RvtKPoJu1lkpLWTN508hEMs8kNP1RAESEL4DqVLgoKAHj0BMf02BUMsEiMrKQuhAcKqXegI+hnzIDqaSnBC4yeSoJfqaq8twQUHAz17UWITkoaqyPAgYd4Xbg/9lHkSGWm9B0eNTISjIcEFSb2r7bxSCfSkEpugSMQSXJ1ytWBHH+kI+jnzKCLC2oqSugkIR0MVZbDUO+Y/Uqmso49QLYJwNCQ2X7vH1hT9pHkWFgZcfbV17D4iDGJOjJ6qLKhknj18fliYdZg4SmzCIZfIMaz7MIQoQvgOhXf0s/YAKhUwcqT1vgcRBms3gcGIDejBdyjNcJy1czY19xeWEP8QjOwxEsH+dCIBKLl5DIUCGD6
c5soSmvigNHRXXgUR5xk3V8ViazVkbCzfkRBnilfFY3jqcMglNNZfA0puHsTPz9qKsidN/CwoYf5x6B06HBIRvycemRxIT6fBvIWEA4feMb0xsNtAiKh+2Q79NTxQr17AoEHUklJIAiRK9A0bxVtXgeBgoE864C+8OSl9VsM4kamRqXyH4pEouXmomBhgxAg6GQmJRCxDWugwhPsnuPVzo6KsF0zUKlc4AqQBGJE6ApHBwh68uysouXmw4GBrQ5NQ3xtcQLBEnAgpygx0C+oDDq5tzcFx1rnYEhOp4YiQRARGYGSPkQiUB/Idikej5ObhZDJg6FDrCYoIR3RACnqqhkAscs301hIJkJZm7UtJhCMlPAVDUoZA4kfToreHkpsXEImA/v2t9+FkMr6jIc6ilEWgX9gYp88Lp1IBfftax4okwiCXyDEkeQj6xPUR/CSjzkK18F4kJsba8fboUeDyZb6jIc4gFcvRSzUEpXUFuFh9DGaLsdPb8vMDEpOAMKrGFpQEVQL6xPah0pqDKLl5GanUOqtAbCzwxx+AwcB3RMQZIvwToJRGIE/7O6r0JQ6/X6UCkpKs1ZFEGOQSOQbED6BGI51Eyc1LUSlOeDpTiqPSmjBRaa3rKLl5MSrFCVNHS3FUWhMeuUSO/vH9ERUcxXcoXo+SmwBQKU542irFUWlNmKi05lyU3ASioRQXFwecOAHU1PAdEXGGhlJcQc0JVNQXIjyCIT6OSmtCEiwPRnpsOiKCqN+GM1FyE5joaOuIFAUFwKlTQH093xGRrpKK5RjZeyDiklJxUXsCaq2a75CIEyikCvSK6oU4VRw173cBSm4CxHFAt27WaU3y8oAzZwBj51uYEx5FRFg7Y1sHOw5CdFgWKmorcKLoBCpqK3iOjnSGzE+GHpE9kBiWSIMduxAlNwETiaxzdnXrBpw7B5w/D5jNfEdFOkKpBHr3bnmEkdCAUAxPHQ61Vo2TRSehrde6P0DiMD+RH7pHdEdKRAr8xHTqdTX6C/uAhqGYkpKA06eBixcBxviOirQkIMD6XXVkvrWo4ChEBkWisLIQp9SnoDPoXB8gcZiIEyEpLAk9onpA6iflOxyfQcnNh8jl1mG8une33o+7fJmSnKdQKIDUVGsp25HbLxzHIT40HrEhsbhQfgFnS8+i3kg3Wj2BiBMhLiQOvaJ7wV9K03u4G8eY55/etFotlEolqqqqEBxMU6g7S329tRR34QI1POEDx1mrHZOSgMhI54zczxhDcVUx8svzUVZT1vUNEocppAokhiWiW2g3Kqk5mSO5gEpuPkwut8763aMHUFwM5OcDZXQ+dDmpFEhIsCY1hcK52+Y4DjEhMYgJiUFNfQ0ulF9AQWUBjGZqUeRqkUGRSApPQmRQJLV+9ACU3Ag4ztoRPCbG2j8uPx+4dIlaWDpbw4gisbHWxj6uFigPRJ+4PkiLSUNhZSHyy/NRVVfl+g/2IVKxFAmhCUgMS0SALIDvcEgjVC1JWmQ2A4WF1q4EWmqM12lisbVjfVKStQUk3yprK5Ffno/LmsuwMAvf4XitEP8QJIUnIS4kjprzuxFVS5IuE4utjRu6dbMmt+JiQK0GNBq+I/N8Eon1HlpUlPXh50FHmSpABVWACn1j+0KtVUNdrUaJtgQmi4nv0DwaBw4hihBEBUchWhmNIDlNlufpPOiwI54qONj66NnT2vBErbYmu7IywEIX/wCs984aRocJC3NO4xBXkvhJEB8aj/jQeFgsFpTXlqO4qhhqrRp1xjq+w/MIYpEYEYERiAqOQlRwFGQSminYm3Qqua1cuRLLly9HcXExBgwYgDfffBNZWVktrnvs2DE888wzOHToEC5cuIDXXnsN8+fP70rMhEdyOZCYaH2YzUBpqTXRlZQAej3f0bkPx1lHDWlIaN4867VIJEJEUAQigiLQD/2grdPaEp2mTsN3eG4ll8gRGRSJaGU0IgIjqMrRizmc3LZs2YKcnBysWbMGQ4YMwYoVKzB
hwgScOnUKkZHNJ9XT6XRISUnBbbfdhkceecQpQRPPIBZbT+7R0db+cpWV1tJcVZW1+lJI3QtEImvpVam0NgyJjARkAr2QD/YPRrB/MHpG90S9sR4l2hJU6ipRVVeF6vpqQd2r85f4I0QRAqW/EhFBEQhRhPAdEnEShxuUDBkyBIMHD8Zbb70FALBYLEhISMBDDz2Ep556qs33JiUlYf78+Q6X3KhBiXfS6/9KdN6U8BonspAQ679BQe5p4ejpLBYLtPVaVNVVQaPTeFXCa5zIGv6lfmjexWUNSgwGAw4dOoQFCxbYlolEImRnZ2Pfvn2di5YIlkxmLeE0LtA3TnharTXZ1ddbl7v7/p1EYq1mlcut98wokbVPJBIhRBGCEEUIEsMSATRPeDqDDvXGeuhNerf3rxNxIsglcsj8ZPCX+iNYHkyJzEc5lNzKyspgNpsRFWU/S2xUVBROnjzptKD0ej30jW7gaKktumC0lPAaGAx/JbrGSa/hX6PRWv3JmDURNvyf4+wfIpH1X5nsr+TV8P/GyyiBOUdLCa+B2WKG3qhHvanemvCa/F9v0oOBwWKxgIGBMWb7l+M4iDgROHDgOOtDIpbYkpdcIofcTw6Z5K//00SfpIFHtpZcsmQJFi1axHcYxM2kUuuDCIdYJIZCpoBC5uShWAhph0PXruHh4RCLxVCr7SdLVKvViI6OdlpQCxYsQFVVle1RUFDgtG0TQggRPoeSm1QqRWZmJnJzc23LLBYLcnNzMXToUKcFJZPJEBwcbPcghBBCOsrhasmcnBzMmjULgwYNQlZWFlasWIHa2lrMnj0bADBz5kzExcVhyZIlAKyNUI4fP277f2FhIY4cOYLAwECkpqY6cVcIIYQQK4eT29SpU1FaWopnnnkGxcXFyMjIwI4dO2yNTC5evGjX8fHy5csYOHCg7fnLL7+Ml19+GaNHj8aePXu6vgeEEEJIEzRwMiGEEK/gSC6gxtCEEEIEh5IbIYQQwaHkRgghRHAouRFCCBEcSm6EEEIEh5IbIYQQwaHkRgghRHAouRFCCBEcSm6EEEIExyOnvGmqYRAVmteNEEJ8V0MO6MjAWl6R3KqrqwEACQkJPEdCCCGEb9XV1VAqlW2u4xVjS1osFly+fBlBQUHgOK5T29BqtUhISEBBQYHgx6ekfRUeX9lPgPZVqJyxr4wxVFdXIzY21m6A/pZ4RclNJBIhPj7eKdvypfnhaF+Fx1f2E6B9Faqu7mt7JbYG1KCEEEKI4FByI4QQIjg+k9xkMhkWLlwImUzGdyguR/sqPL6ynwDtq1C5e1+9okEJIYQQ4gifKbkRQgjxHZTcCCGECA4lN0IIIYJDyY0QQojgCCq5rVy5EklJSZDL5RgyZAgOHDjQ6rrHjh3DLbfcgqSkJHAchxUrVrgvUCdwZF/Xrl2LkSNHQqVSQaVSITs7u831PY0j+7pt2zYMGjQIISEhCAgIQEZGBjZt2uTGaDvPkf1sbPPmzeA4DpMnT3ZtgE7kyL5u2LABHMfZPeRyuRuj7RpHv1eNRoN58+YhJiYGMpkMPXv2xNdff+2maLvGkX0dM2ZMs++V4zhMnDjROcEwgdi8eTOTSqVs/fr17NixY2zOnDksJCSEqdXqFtc/cOAAe+yxx9h///tfFh0dzV577TX3BtwFju7r7bffzlauXMl+++03duLECXbXXXcxpVLJLl265ObIHefovu7evZtt27aNHT9+nJ09e5atWLGCicVitmPHDjdH7hhH97NBXl4ei4uLYyNHjmSTJk1yT7Bd5Oi+vvfeeyw4OJgVFRXZHsXFxW6OunMc3Ve9Xs8GDRrErrvuOvbjjz+yvLw8tmfPHnbkyBE3R+44R/e1vLzc7jv9888/mVgsZu+9955T4hFMcsvKymLz5s2zPTebzSw2NpYtWbKk3fcmJiZ6VXLryr4yxpjJZGJBQUFs48aNrgrRabq6r4wxNnDgQPbvf//bFeE5TWf202QysWHDhrF3332
XzZo1y2uSm6P7+t577zGlUumm6JzL0X1dvXo1S0lJYQaDwV0hOk1Xj9XXXnuNBQUFsZqaGqfEI4hqSYPBgEOHDiE7O9u2TCQSITs7G/v27eMxMudzxr7qdDoYjUaEhoa6Kkyn6Oq+MsaQm5uLU6dOYdSoUa4MtUs6u5+LFy9GZGQk/vGPf7gjTKfo7L7W1NQgMTERCQkJmDRpEo4dO+aOcLukM/v6xRdfYOjQoZg3bx6ioqLQt29fvPjiizCbze4Ku1OccV5at24dpk2bhoCAAKfEJIjkVlZWBrPZjKioKLvlUVFRKC4u5ikq13DGvj755JOIjY21+yF6os7ua1VVFQIDAyGVSjFx4kS8+eabGD9+vKvD7bTO7OePP/6IdevWYe3ate4I0Wk6s6+9evXC+vXr8fnnn+ODDz6AxWLBsGHDcOnSJXeE3Gmd2dfz589j69atMJvN+Prrr/Gf//wHr7zyCp5//nl3hNxpXT0vHThwAH/++Sfuuecep8XkFbMCEOdZunQpNm/ejD179njVTXlHBAUF4ciRI6ipqUFubi5ycnKQkpKCMWPG8B2aU1RXV+POO+/E2rVrER4eznc4Ljd06FAMHTrU9nzYsGHo3bs33n77bTz33HM8RuZ8FosFkZGReOeddyAWi5GZmYnCwkIsX74cCxcu5Ds8l1m3bh369euHrKwsp21TEMktPDwcYrEYarXabrlarUZ0dDRPUblGV/b15ZdfxtKlS/Hdd9+hf//+rgzTKTq7ryKRCKmpqQCAjIwMnDhxAkuWLPHY5Obofp47dw75+fm44YYbbMssFgsAwM/PD6dOnUL37t1dG3QnOeNYlUgkGDhwIM6ePeuKEJ2mM/saExMDiUQCsVhsW9a7d28UFxfDYDBAKpW6NObO6sr3Wltbi82bN2Px4sVOjUkQ1ZJSqRSZmZnIzc21LbNYLMjNzbW74hOCzu7rSy+9hOeeew47duzAoEGD3BFqlznre7VYLNDr9a4I0Skc3c+0tDQcPXoUR44csT1uvPFGjB07FkeOHPHoGeud8Z2azWYcPXoUMTExrgrTKTqzr8OHD8fZs2dtFysAcPr0acTExHhsYgO69r1+/PHH0Ov1uOOOO5wblFOapXiAzZs3M5lMxjZs2MCOHz/O7r33XhYSEmJrMnznnXeyp556yra+Xq9nv/32G/vtt99YTEwMe+yxx9hvv/3Gzpw5w9cudJij+7p06VImlUrZ1q1b7ZreVldX87ULHebovr744ovsm2++YefOnWPHjx9nL7/8MvPz82Nr167laxc6xNH9bMqbWks6uq+LFi1iO3fuZOfOnWOHDh1i06ZNY3K5nB07doyvXegwR/f14sWLLCgoiD344IPs1KlTbPv27SwyMpI9//zzfO1Ch3X2NzxixAg2depUp8cjmOTGGGNvvvkm69atG5NKpSwrK4v98ssvttdGjx7NZs2aZXuel5fHADR7jB492v2Bd4Ij+5qYmNjivi5cuND9gXeCI/v6r3/9i6WmpjK5XM5UKhUbOnQo27x5Mw9RO86R/WzKm5IbY47t6/z5823rRkVFseuuu44dPnyYh6g7x9Hv9eeff2ZDhgxhMpmMpaSksBdeeIGZTCY3R905ju7ryZMnGQD2zTffOD0WmvKGEEKI4AjinhshhBDSGCU3QgghgkPJjRBCiOBQciOEECI4lNwIIYQIDiU3QgghgkPJjRBCiOBQciOEECI4lNwIIYQIDiU3QgghgkPJjRBCiOBQciOEECI4/w8wlqjj5Cg6rwAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Total unique missense variants in each group:\n", + "CHD: 1114\n", + "Control: 0\n", + "Control_DDD_ASD: 3573\n" + ] + } + ], + "source": [ + "def display_qc(all_results: pd.DataFrame):\n", + " \"\"\"\n", + " Perform and display final QC checks.\n", + " \"\"\"\n", + " print(\"\\n--- 4. Quality Control and Display ---\")\n", + "\n", + " # A. Initial Variant Type Counts\n", + " print(\"A. Initial variant type and class counts:\")\n", + " display(all_results[[\"classification\", \"class\"]].value_counts().to_frame(\"count\"))\n", + "\n", + " # B. Variant Overlaps\n", + " chd_variants = set(all_results[all_results[\"class\"] == \"chd\"][\"variant_id\"])\n", + " control_variants = set(all_results[all_results[\"class\"] == \"control\"][\"variant_id\"])\n", + "\n", + " chd_control_overlap = chd_variants & control_variants\n", + "\n", + " print(\"\\nB. Unique Variant Overlaps and Totals:\")\n", + " print(f\"CHD-Control overlap (same variant_id): {len(chd_control_overlap)}\")\n", + " print(f\"Total unique CHD variants: {len(chd_variants)}\")\n", + " print(f\"Total unique Control variants: {len(control_variants)}\")\n", + "\n", + " # Check for position overlaps (chrom_pos)\n", + " all_results[\"chrom_pos\"] = all_results[\"chrom\"].astype(str) + \":\" + all_results[\"pos\"].astype(str)\n", + " chd_positions = set(all_results[all_results[\"class\"] == \"chd\"][\"chrom_pos\"])\n", + " control_positions = set(all_results[all_results[\"class\"] == \"control\"][\"chrom_pos\"])\n", + " control_ddd_asd_positions = set(all_results[all_results[\"class\"] == \"control_ddd_asd\"][\"chrom_pos\"])\n", + " position_overlap = chd_positions & control_positions\n", + " position_overlap_ddd_asd = chd_positions & control_ddd_asd_positions\n", + " print(\"\\nPosition Overlaps (same chrom:pos, potentially different alleles):\")\n", + " print(f\"CHD-Control 
position overlap: {len(position_overlap)}\")\n", + " print(f\"CHD-Control_DDD_ASD position overlap: {len(position_overlap_ddd_asd)}\")\n", + " print(f\"Total unique CHD positions: {len(chd_positions)}\")\n", + " print(f\"Total unique Control positions: {len(control_positions)}\")\n", + " print(f\"Total unique Control_DDD_ASD positions: {len(control_ddd_asd_positions)}\")\n", + "\n", + " if len(position_overlap) > 0:\n", + " print(f\" Note: {len(position_overlap)} positions have variants in both CHD and Control groups\")\n", + "\n", + " # C. Unique and Total Rows by Class and Variant Type\n", + " print(\"\\nC. Unique Variants and Total Rows by Class/Type:\")\n", + " variant_counts = (\n", + " all_results.groupby([\"class\", \"classification\"])\n", + " .agg({\"variant_id\": [\"nunique\", \"count\"]})\n", + " .rename(columns={\"nunique\": \"unique_variants\", \"count\": \"total_rows\"})\n", + " )\n", + " display(variant_counts)\n", + "\n", + " # D. Final Missense-specific QC\n", + " print(\"\\nD. Final Missense-specific QC (matching original 'mis'/'misD' annotation):\")\n", + " final_qc_counts = (\n", + " all_results.groupby([\"class\", \"classification\"])\n", + " .agg({\"variant_id\": [\"nunique\", \"count\"]})\n", + " .rename(columns={\"nunique\": \"unique_variants\", \"count\": \"total_rows\"})\n", + " )\n", + " display(final_qc_counts)\n", + "\n", + " # E. Venn Diagram for Missense Variants Overlap\n", + " print(\"\\nE. 
Venn Diagram - Overlap of Missense Variants Between Classes:\")\n", + " # Get sets of variant IDs for each group (missense only)\n", + " chd_varids = set(all_results[(all_results[\"class\"] == \"chd\")][\"variant_id\"].unique())\n", + " control_varids = set(all_results[(all_results[\"class\"] == \"control\")][\"variant_id\"].unique())\n", + " control_ddd_asd_varids = set(all_results[(all_results[\"class\"] == \"control_ddd_asd\")][\"variant_id\"].unique())\n", + " # Create Venn diagram using matplotlib circles\n", + " plt.figure(figsize=(5, 5))\n", + " # Create three circles\n", + " circle1 = plt.Circle((0.3, 0.3), 0.2, alpha=0.3, fc=\"blue\", label=\"CHD\")\n", + " circle2 = plt.Circle((0.5, 0.3), 0.2, alpha=0.3, fc=\"darkgreen\", label=\"Control\")\n", + " circle3 = plt.Circle((0.4, 0.5), 0.2, alpha=0.3, fc=\"red\", label=\"Control_DDD_ASD\")\n", + " plt.gca().add_patch(circle1)\n", + " plt.gca().add_patch(circle2)\n", + " plt.gca().add_patch(circle3)\n", + " # Add counts in appropriate locations\n", + " # Unique to each set\n", + " plt.text(0.2, 0.3, str(len(chd_varids - control_varids - control_ddd_asd_varids)), fontsize=12)\n", + " plt.text(0.6, 0.3, str(len(control_varids - chd_varids - control_ddd_asd_varids)), fontsize=12)\n", + " plt.text(0.4, 0.6, str(len(control_ddd_asd_varids - chd_varids - control_varids)), fontsize=12)\n", + " # Pairwise overlaps\n", + " plt.text(0.4, 0.25, str(len(chd_varids & control_varids - control_ddd_asd_varids)), fontsize=12) # CHD & Control\n", + " plt.text(\n", + " 0.3, 0.45, str(len(chd_varids & control_ddd_asd_varids - control_varids)), fontsize=12\n", + " ) # CHD & Control_DDD_ASD\n", + " plt.text(\n", + " 0.5, 0.45, str(len(control_varids & control_ddd_asd_varids - chd_varids)), fontsize=12\n", + " ) # Control & Control_DDD_ASD\n", + " # Three-way overlap\n", + " plt.text(0.4, 0.35, str(len(chd_varids & control_varids & control_ddd_asd_varids)), fontsize=12)\n", + "\n", + " plt.xlim(0, 1)\n", + " plt.ylim(0, 1)\n", + " 
plt.title(\"Overlap of Missense Variants Between Groups\")\n", + " plt.legend()\n", + " plt.axis(\"equal\")\n", + " plt.show()\n", + " print(\"\\nTotal unique missense variants in each group:\")\n", + " print(\"CHD:\", len(chd_varids))\n", + " print(\"Control:\", len(control_varids))\n", + " print(\"Control_DDD_ASD:\", len(control_ddd_asd_varids))\n", + "\n", + " return all_results # Return final processed table\n", + "\n", + "\n", + "# 4. run the filter and display qc\n", + "_ = display_qc(variants)" + ] + }, + { + "cell_type": "markdown", + "id": "ebea927d", + "metadata": {}, + "source": [ + "- Save to disk" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5da097da", + "metadata": {}, + "outputs": [], + "source": [ + "# Keep only the controls from the DDD/ASD dataset and CHD cases:\n", + "mask = variants[\"class\"].isin([\"control_ddd_asd\", \"chd\"])\n", + "variants = variants[mask].copy()\n", + "# rename control_ddd_asd to control\n", + "variants.loc[variants[\"class\"] == \"control_ddd_asd\", \"class\"] = \"control\"\n", + "variants.to_csv(f\"{OUTPUT_DIR}/chd_dnm_filtered_canonical_transcripts_ddd_asd_ctrls_am_scores.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "39195fb4", + "metadata": {}, + "source": [ + "\n", + "- Add features: pLI, phlop, cds length, cds_frac" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "404f5d52", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variants with pLI scores: 4680 / 4704\n", + "\n", + "Adding PhyloP scores from bigWig file...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 4704/4704 [00:00<00:00, 27221.87it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variants with PhyloP scores: 4702 / 4704\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Variants with pLI scores: 31606 / 31738\n", + "\n", + "Adding PhyloP scores from bigWig file...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 31738/31738 [00:00<00:00, 40671.22it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variants with PhyloP scores: 31738 / 31738\n" + ] + } + ], + "source": [ + "def process_variant_features(variants, pli_file_path, phylop_bw_path):\n", + " \"\"\"\n", + " Add pLI, PhyloP conservation scores, and CDS features to variant data.\n", + " \"\"\"\n", + "\n", + " # Read gnomAD pLI metrics by transcript (not by gene)\n", + " df_pli = pl.from_pandas(pd.read_csv(pli_file_path, sep=\"\\t\"))\n", + " # Select relevant columns and process\n", + " df_pli = df_pli.select([\"transcript\", \"pLI\"])\n", + " df_pli = df_pli.with_columns(pl.col(\"transcript\").str.split(\".\").list.first().alias(\"transcript\"))\n", + "\n", + " # Add pLI scores by transcript to variants\n", + " variants_pl = pl.from_pandas(variants)\n", + " variants_pl = variants_pl.with_columns(pl.col(\"tx_name\").str.split(\".\").list.first().alias(\"tx_name_clean\"))\n", + " variants_pl = variants_pl.join(df_pli, left_on=\"tx_name_clean\", right_on=\"transcript\", how=\"left\")\n", + "\n", + " # Create pLI bins (multiply by 10 and cast to int)\n", + " variants_pl = variants_pl.with_columns((pl.col(\"pLI\") * 10).cast(pl.Int32).alias(\"pLI_bin\"))\n", + " print(\n", + " f\"Variants with pLI scores: {variants_pl.filter(pl.col('pLI').is_not_null()).shape[0]} / {variants_pl.shape[0]}\"\n", + " )\n", + "\n", + " # Add PhyloP conservation scores\n", + " print(\"\\nAdding PhyloP scores from bigWig file...\")\n", + " bw = pyBigWig.open(phylop_bw_path)\n", + " phylop = []\n", + " for row in tqdm(variants_pl.rows(named=True), total=variants_pl.shape[0]):\n", + " try:\n", + " phylop_score = bw.values(row[\"chrom\"], row[\"pos\"] - 1, row[\"pos\"])[0]\n", + " 
phylop.append(phylop_score if phylop_score is not None else -1000)\n", + " except:\n", + " phylop.append(-1000)\n", + " bw.close()\n", + "\n", + " # Add phylop column and create bins\n", + " variants_pl = variants_pl.with_columns(pl.Series(values=phylop, name=\"phylop\").fill_nan(-1000))\n", + " variants_pl = variants_pl.with_columns(pl.col(\"phylop\").round().cast(pl.Int32).alias(\"phylop_bin\"))\n", + " print(\n", + " f\"Variants with PhyloP scores: {variants_pl.filter(pl.col('phylop') != -1000).shape[0]} / {variants_pl.shape[0]}\"\n", + " )\n", + "\n", + " # Add cds length and cds offset fraction\n", + " variants_pl = variants_pl.with_columns(pl.col(\"ref_seq\").str.len_chars().alias(\"cds_length\"))\n", + " variants_pl = variants_pl.with_columns(\n", + " (pl.col(\"var_rel_dist_in_cds\") / pl.col(\"cds_length\")).alias(\"cds_offset_frac\")\n", + " )\n", + " variants_pl = variants_pl.with_columns(\n", + " (pl.col(\"cds_offset_frac\") * 10).cast(pl.Int32).alias(\"cds_offset_frac_bin\")\n", + " )\n", + "\n", + " # Convert back to pandas for further analysis\n", + " return variants_pl.to_pandas()\n", + "\n", + "\n", + "phylop_bw_path = f\"{DATA_DIR}/reference/hg19.100way.phyloP100way.bw\"\n", + "pli_file_path = f\"{DATA_DIR}/reference/gnomad.v2.1.1.lof_metrics.by_transcript.txt\"\n", + "\n", + "chd_variants_with_ddd_controls = process_variant_features(variants, pli_file_path, phylop_bw_path)\n", + "ddd_asd_variants = process_variant_features(\n", + " missense_variants_ddd_asd[missense_variants_ddd_asd[\"classification\"] != \"control\"], pli_file_path, phylop_bw_path\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "322a72a1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvariant_idchromposrefaltclassref_codonalt_codoncodon_position...am_classchrom_postx_name_cleanpLIpLI_binphylopphylop_bincds_lengthcds_offset_fraccds_offset_frac_bin
059chr10_101163334_A_G_hg19chr10101163334AGcontrolGTCGCC283...ambiguouschr10:101163334ENST000003705080.0011570.05.229512420.6843806
173chr10_101371064_G_A_hg19chr10101371064GAcontrolCGGTGG212...benignchr10:101371064ENST000003704950.9295409.03.016310950.5808225
\n", + "

2 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " id variant_id chrom pos ref alt class ref_codon \\\n", + "0 59 chr10_101163334_A_G_hg19 chr10 101163334 A G control GTC \n", + "1 73 chr10_101371064_G_A_hg19 chr10 101371064 G A control CGG \n", + "\n", + " alt_codon codon_position ... am_class chrom_pos tx_name_clean \\\n", + "0 GCC 283 ... ambiguous chr10:101163334 ENST00000370508 \n", + "1 TGG 212 ... benign chr10:101371064 ENST00000370495 \n", + "\n", + " pLI pLI_bin phylop phylop_bin cds_length cds_offset_frac \\\n", + "0 0.001157 0.0 5.229 5 1242 0.684380 \n", + "1 0.929540 9.0 3.016 3 1095 0.580822 \n", + "\n", + " cds_offset_frac_bin \n", + "0 6 \n", + "1 5 \n", + "\n", + "[2 rows x 28 columns]" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chd_variants_with_ddd_controls.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "f88aa812", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvariant_idtx_namevariant_typechromposrefaltcdsStartcdsEnd...AlphaMissenseam_classtx_name_cleanpLIpLI_binphylopphylop_bincds_lengthcds_offset_fraccds_offset_frac_bin
00chr10_100011447_C_T_hg19ENST00000260702missensechr10100011447CT100008677100022776...0.1004benignENST000002607029.191200e-200.04.771522710.8643778
11chr10_100017561_C_G_hg19ENST00000260702missensechr10100017561CG100008677100022776...0.1315benignENST000002607029.191200e-200.04.867522710.4865704
\n", + "

2 rows × 39 columns

\n", + "
" + ], + "text/plain": [ + " id variant_id tx_name variant_type chrom \\\n", + "0 0 chr10_100011447_C_T_hg19 ENST00000260702 missense chr10 \n", + "1 1 chr10_100017561_C_G_hg19 ENST00000260702 missense chr10 \n", + "\n", + " pos ref alt cdsStart cdsEnd ... AlphaMissense am_class \\\n", + "0 100011447 C T 100008677 100022776 ... 0.1004 benign \n", + "1 100017561 C G 100008677 100022776 ... 0.1315 benign \n", + "\n", + " tx_name_clean pLI pLI_bin phylop phylop_bin cds_length \\\n", + "0 ENST00000260702 9.191200e-20 0.0 4.771 5 2271 \n", + "1 ENST00000260702 9.191200e-20 0.0 4.867 5 2271 \n", + "\n", + " cds_offset_frac cds_offset_frac_bin \n", + "0 0.864377 8 \n", + "1 0.486570 4 \n", + "\n", + "[2 rows x 39 columns]" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ddd_asd_variants.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "ba87e543", + "metadata": {}, + "outputs": [], + "source": [ + "# Save to disk\n", + "chd_variants_with_ddd_controls.to_csv(\n", + " f\"{OUTPUT_DIR}/chd_dnm_filtered_canonical_transcripts_ddd_asd_ctrls_am_scores_cds_features.csv\", index=False\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a6b57d57", + "metadata": {}, + "source": [ + "# 6. 
COSMIC synonymous analyses data\n", + "\n", + "This will include the processing of COSMIC variants and gnomAD common variants" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "7d257dc0", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import polars as pl\n", + "import pyfaidx\n", + "import seaborn as sns\n", "\n", - " codon_position = pos_cds0 // 3\n", - " ref_codon = seq[codon_position * 3 : (codon_position + 1) * 3]\n", - " remainder = pos_cds0 % 3\n", - " alt_nuc = list(ref_codon)\n", - " alt_nuc[remainder] = alt_cds\n", - " alt_codon = \"\".join(alt_nuc)\n", - " item = {\n", - " \"chrom\": row[\"chrom\"],\n", - " \"pos\": row[\"pos\"],\n", - " \"ref\": row[\"ref\"],\n", - " \"alt\": row[\"alt\"],\n", - " \"var_rel_dist_in_cds\": pos_cds0,\n", - " \"codon_position\": codon_position,\n", - " \"ref_codon\": ref_codon,\n", - " \"alt_codon\": alt_codon,\n", - " \"tx\": row[\"tx\"],\n", - " \"label\": row[\"ClinicalSignificance\"],\n", - " \"in_splice_junction\": row[\"in_splice_junction\"],\n", - " \"ref_seq\": seq,\n", - " \"alt_seq\": seq[:pos_cds0] + alt_cds + seq[pos_cds0 + 1 :],\n", - " }\n", - " result.append(item)\n", "\n", + "# Output file paths\n", + "COSMIC_OUTPUT_FILE = f\"{OUTPUT_DIR}/cosmic_mutantcensus_gencode_v47_canonical.csv\"\n", + "GNOMAD_OUTPUT_FILE = f\"{OUTPUT_DIR}/gnomad_af0.01_canonical_genes.csv\"\n", "\n", - "result_df = pl.from_dicts(result).with_row_index(\"id\")\n", - "frame = result_df.to_pandas()\n", - "(frame[\"ref_seq\"].apply(lambda x: len(x) == 0)).sum()" + "cosmic_dir = f\"{DATA_DIR}/cosmic/cosmic_raw\"\n", + "cosmic_files = {\n", + " \"cosmic_samples\": f\"{cosmic_dir}/Cosmic_Sample_v102_GRCh38.tsv.gz\",\n", + " \"cosmic_mutant_census\": f\"{cosmic_dir}/Cosmic_MutantCensus_v102_GRCh38.tsv.gz\",\n", + " \"hg38\": f\"{DATA_DIR}/reference/hg38/hg38.fa\",\n", + "}\n", + "\n", + "import os\n", + 
"\n", + "\n", + "gnomad_dir = f\"{DATA_DIR}/gnomad\"\n", + "gnomad_files = {\n", + " \"gnomad_exomes\": f\"{gnomad_dir}/gnomad.exomes.v4.1\",\n", + " \"gnomad_genomes\": f\"{gnomad_dir}/gnomad.genomes.v4.1\",\n", + "}\n", + "gencode_v47_file = f\"{DATA_DIR}/reference/gencode.v47.basic.annotation.processed.filtered.tsv\"\n", + "hg38 = {}\n", + "with pyfaidx.Fasta(cosmic_files[\"hg38\"]) as f:\n", + " for k in f.keys():\n", + " hg38[k] = f[k][:].seq\n", + "\n", + "\n", + "valid_chroms = [\"chr\" + str(x) for x in range(1, 23)]" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "f7ecb7ae", + "execution_count": 78, + "id": "6f9e7839", + "metadata": {}, + "outputs": [], + "source": [ + "def map_variants_to_genes_by_exons_efficient(\n", + " genes_df, variants_df, variant_columns=[\"pos\", \"ref\", \"alt\", \"af\", \"ac\", \"an\"]\n", + "):\n", + " \"\"\"\n", + " Efficient mapping using sorted variants and binary search for range queries.\n", + " \"\"\"\n", + " import bisect\n", + "\n", + " gene_variant_mapping = {}\n", + "\n", + " print(f\"Processing {len(genes_df)} genes and {len(variants_df)} variants...\")\n", + "\n", + " # Pre-sort variants by chromosome and position for efficient range queries\n", + " chrom_sorted_variants = {}\n", + " unique_chroms = variants_df.select(\"chrom\").unique().to_series().to_list()\n", + "\n", + " for chrom in unique_chroms:\n", + " chrom_variants = variants_df.filter(pl.col(\"chrom\") == chrom).sort(\"pos\")\n", + " if len(chrom_variants) > 0:\n", + " # Extract positions and variant data separately for efficient binary search\n", + " positions = chrom_variants.select(\"pos\").to_series().to_numpy() - 1\n", + " variant_data = chrom_variants.select(variant_columns).to_dicts()\n", + " chrom_sorted_variants[chrom] = (positions, variant_data)\n", + "\n", + " # Process genes\n", + " for gene_row in genes_df.iter_rows(named=True):\n", + " id = gene_row[\"id\"]\n", + " chrom = gene_row[\"chrom\"]\n", + " strand = 
gene_row[\"strand\"]\n", + " cds_seq = gene_row[\"cds_sequence\"]\n", + "\n", + " gene_variant_mapping[id] = {\"variants\": []}\n", + "\n", + " if chrom not in chrom_sorted_variants:\n", + " continue\n", + "\n", + " positions, variant_data = chrom_sorted_variants[chrom]\n", + "\n", + " # Parse exon coordinates\n", + " exon_starts = gene_row[\"exonStarts\"]\n", + " exon_ends = gene_row[\"exonEnds\"]\n", + " cds_start = gene_row[\"cdsStart\"]\n", + " cds_end = gene_row[\"cdsEnd\"]\n", + "\n", + " if exon_starts and exon_ends:\n", + " if isinstance(exon_starts, str):\n", + " starts = [int(x.strip()) for x in exon_starts.split(\",\") if x.strip()]\n", + " else:\n", + " starts = [int(exon_starts)]\n", + "\n", + " if strand == \"-\":\n", + " assert starts[0] == cds_start, f\"{cds_start}, {starts[0]}, {gene_row['id']}, {gene_row['gene_id']}\"\n", + " starts[0] = cds_start\n", + "\n", + " if isinstance(exon_ends, str):\n", + " ends = [int(x.strip()) for x in exon_ends.split(\",\") if x.strip()]\n", + " else:\n", + " ends = [int(exon_ends)]\n", + " if strand == \"+\":\n", + " assert ends[-1] == cds_end, f\"{cds_end}, {ends[-1]}, {gene_row['id']}, {gene_row['gene_id']}\"\n", + " ends[-1] = cds_end\n", + " else:\n", + " raise ValueError(f\"No exon coordinates found for gene {id}\")\n", + " assert starts[0] == cds_start and ends[-1] == cds_end\n", + " # Use binary search to find variants in each exon range\n", + " cum_left = 0\n", + " for start, end in zip(starts, ends):\n", + " # Find range of variants within [start, end] using binary search\n", + " left_idx = bisect.bisect_left(positions, start)\n", + " right_idx = bisect.bisect_left(positions, end)\n", + "\n", + " # Extract variants in this range\n", + " for i in range(left_idx, right_idx):\n", + " variant = variant_data[i].copy()\n", + " variant[\"chrom\"] = chrom\n", + " variant[\"exon_start\"] = start\n", + " variant[\"exon_end\"] = end\n", + " dist_left = cum_left + variant[\"pos\"] - 1 - start\n", + " dist_in_cds = 
dist_left if strand == \"+\" else len(cds_seq) - dist_left - 1\n", + " variant[\"dist_left\"] = dist_left\n", + " variant[\"dist_in_cds\"] = dist_in_cds\n", + " codon_start = dist_in_cds // 3 * 3\n", + " variant[\"ref_codon\"] = cds_seq[codon_start : codon_start + 3]\n", + " alt_codon = []\n", + " for j in range(3):\n", + " if j + codon_start != dist_in_cds:\n", + " alt_codon.append(variant[\"ref_codon\"][j])\n", + " else:\n", + " if strand == \"+\":\n", + " assert variant[\"ref\"].upper() == cds_seq[j + codon_start].upper()\n", + " alt_codon.append(variant[\"alt\"].upper())\n", + " else:\n", + " assert variant[\"ref\"].upper() == reverse_complement_dna(cds_seq[j + codon_start]).upper()\n", + " alt_codon.append(reverse_complement_dna(variant[\"alt\"].upper()))\n", + " variant[\"alt_codon\"] = \"\".join(alt_codon)\n", + " gene_variant_mapping[id][\"variants\"].append(variant)\n", + " cum_left += end - start\n", + "\n", + " return gene_variant_mapping\n", + "\n", + "\n", + "def get_alt_seq(row):\n", + " ref_seq, ref_codon, alt_codon, codon_pos = (\n", + " row[\"ref_seq\"],\n", + " row[\"ref_codon\"],\n", + " row[\"alt_codon\"],\n", + " row[\"codon_position\"],\n", + " )\n", + " assert codon_pos >= 0 and codon_pos < len(ref_seq) / 3\n", + " assert ref_seq[codon_pos * 3 : (codon_pos + 1) * 3] == ref_codon\n", + " alt_seq = ref_seq[: codon_pos * 3] + alt_codon + ref_seq[(codon_pos + 1) * 3 :]\n", + " return alt_seq\n", + "\n", + "\n", + "def convert_gene_variant_mapping_to_df(gene_variant_mapping, genes, extra_cols=[]):\n", + " # Flatten gene_variant_mapping into a list of variant dicts, each with gene id\n", + " variant_rows = []\n", + " for row_id, info in gene_variant_mapping.items():\n", + " for variant in info[\"variants\"]:\n", + " row = variant.copy()\n", + " row[\"row_id\"] = row_id\n", + " variant_rows.append(row)\n", + "\n", + " gene_variant_df = pd.DataFrame(variant_rows)\n", + " gene_variant_df[\"codon_pos\"] = gene_variant_df[\"dist_in_cds\"] // 3\n", + 
"\n", + " # Compute ref_aa and alt_aa columns\n", + " gene_variant_df[\"ref_aa\"] = gene_variant_df[\"ref_codon\"].apply(lambda c: codon_to_aa(c) if pd.notnull(c) else None)\n", + " gene_variant_df[\"alt_aa\"] = gene_variant_df[\"alt_codon\"].apply(lambda c: codon_to_aa(c) if pd.notnull(c) else None)\n", + "\n", + " # Compute is_synonymous column\n", + " gene_variant_df[\"is_synonymous\"] = gene_variant_df.apply(\n", + " lambda row: (\n", + " row[\"ref_aa\"] == row[\"alt_aa\"]\n", + " if pd.notnull(row[\"ref_aa\"]) and pd.notnull(row[\"alt_aa\"]) and (row[\"ref_aa\"] != \"*\")\n", + " else False\n", + " ),\n", + " axis=1,\n", + " )\n", + "\n", + " gene_variant_df = pl.from_pandas(gene_variant_df)\n", + "\n", + " temp = gene_variant_df.with_columns(pl.col(\"row_id\").cast(pl.Int64)).join(\n", + " genes.select([\"id\", \"gene_name\", \"name\", \"gene_id\", \"cds_sequence\", \"strand\"]), left_on=\"row_id\", right_on=\"id\"\n", + " )\n", + " cols_to_select = [\n", + " \"chrom\",\n", + " \"pos\",\n", + " \"ref\",\n", + " \"alt\",\n", + " \"ref_codon\",\n", + " \"alt_codon\",\n", + " \"gene_name\",\n", + " \"gene_id\",\n", + " \"cds_sequence\",\n", + " \"strand\",\n", + " \"codon_pos\",\n", + " \"dist_in_cds\",\n", + " ]\n", + " if extra_cols:\n", + " cols_to_select += [x for x in extra_cols if x not in [\"chrom\", \"pos\", \"ref\", \"alt\"]]\n", + " temp = temp.select(cols_to_select)\n", + " result = (\n", + " temp.sort(\"chrom\", \"pos\")\n", + " .with_row_index(\"id\")\n", + " .rename({\"cds_sequence\": \"ref_seq\", \"codon_pos\": \"codon_position\", \"dist_in_cds\": \"var_rel_dist_in_cds\"})\n", + " )\n", + "\n", + " result = result.with_columns(\n", + " pl.struct(pl.col(\"ref_seq\"), pl.col(\"ref_codon\"), pl.col(\"alt_codon\"), pl.col(\"codon_position\"))\n", + " .map_elements(get_alt_seq, return_dtype=pl.Utf8)\n", + " .alias(\"alt_seq\")\n", + " )\n", + "\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "1426ad03", 
"metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Adding additional features (pLI, PhyloP, codon frequencies)...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|███████████████████████████████████████████████████████████| 129384/129384 [00:12<00:00, 10168.75it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset with additional features: 129384 variants\n" - ] - }, + "data": { + "text/plain": [ + "19310" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "genes = pl.read_csv(gencode_v47_file, separator=\"\\t\")\n", + "genes.head()\n", + "genes = (\n", + " genes.filter(pl.col(\"is_canonical\"))\n", + " .filter(pl.col(\"length_divisible_by_3\"))\n", + " .filter(pl.col(\"has_start_codon\"))\n", + " .filter(pl.col(\"has_stop_codon\"))\n", + ")\n", + "genes = genes.with_row_index(\"id\")\n", + "genes.head()\n", + "genes[\"is_canonical\"].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "f9aefcc3", + "metadata": {}, + "outputs": [ { "data": { - "text/html": [ - "
\n", - "shape: (2, 27)
idchromposrefaltvar_rel_dist_in_cdscodon_positionref_codonalt_codontxlabelin_splice_junctionref_seqalt_seqref_aaalt_aaref_codon_freqalt_codon_freqcodon_freq_ratiogene_nameplipli_binphylopphylop_bincds_lengthcds_offset_fraccds_offset_frac_bin
u32stri64strstri64i64strstrstrstrboolstrstrstrstrf64f64f64strf64i32f64i32u32f64i32
0"chr1"45015006"G""A"941313"GAG""GAA""NM_000374.5""Likely pathogenic"true"ATGGAAGCGAATGGGTTGGGACCTCAGGGT…"ATGGAAGCGAATGGGTTGGGACCTCAGGGT…"E""E"4.6414453e73.7827281e70.20458"UROD"0.007.998811040.8523558
1"chr10"124400865"G""A"1133377"AAC""AAT""NM_000274.4""Benign"false"ATGTTTTCCAAACTAGCACATTTGCAGAGG…"ATGTTTTCCAAACTAGCACATTTGCAGAGG…"N""N"2.0900468e72.0353876e70.0265"OAT"0.00-2.351-213200.8583338
" - ], "text/plain": [ - "shape: (2, 27)\n", - "┌─────┬───────┬───────────┬─────┬───┬────────────┬────────────┬─────────────────┬──────────────────┐\n", - "│ id ┆ chrom ┆ pos ┆ ref ┆ … ┆ phylop_bin ┆ cds_length ┆ cds_offset_frac ┆ cds_offset_frac_ │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ bin │\n", - "│ u32 ┆ str ┆ i64 ┆ str ┆ ┆ i32 ┆ u32 ┆ f64 ┆ --- │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ i32 │\n", - "╞═════╪═══════╪═══════════╪═════╪═══╪════════════╪════════════╪═════════════════╪══════════════════╡\n", - "│ 0 ┆ chr1 ┆ 45015006 ┆ G ┆ … ┆ 8 ┆ 1104 ┆ 0.852355 ┆ 8 │\n", - "│ 1 ┆ chr10 ┆ 124400865 ┆ G ┆ … ┆ -2 ┆ 1320 ┆ 0.858333 ┆ 8 │\n", - "└─────┴───────┴───────────┴─────┴───┴────────────┴────────────┴─────────────────┴──────────────────┘" + "shape: (19_310, 22)\n", + "┌───────┬─────────────┬────────────┬───────┬───┬────────────┬────────────┬────────────┬────────────┐\n", + "│ id ┆ gene_id ┆ name ┆ chrom ┆ … ┆ has_stop_c ┆ length_div ┆ has_intern ┆ cds_length │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ odon ┆ isible_by_ ┆ al_stop_co ┆ --- │\n", + "│ u64 ┆ str ┆ str ┆ str ┆ ┆ --- ┆ 3 ┆ dons ┆ i64 │\n", + "│ ┆ ┆ ┆ ┆ ┆ bool ┆ --- ┆ --- ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ bool ┆ bool ┆ │\n", + "╞═══════╪═════════════╪════════════╪═══════╪═══╪════════════╪════════════╪════════════╪════════════╡\n", + "│ 0 ┆ ENSG0000018 ┆ ENST000006 ┆ chr1 ┆ … ┆ true ┆ true ┆ false ┆ 981 │\n", + "│ ┆ 6092.7 ┆ 41515.2 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 1 ┆ ENSG0000028 ┆ ENST000003 ┆ chr1 ┆ … ┆ true ┆ true ┆ false ┆ 939 │\n", + "│ ┆ 4662.2 ┆ 32831.5 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2 ┆ ENSG0000018 ┆ ENST000006 ┆ chr1 ┆ … ┆ true ┆ true ┆ false ┆ 2535 │\n", + "│ ┆ 7634.13 ┆ 16016.5 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 3 ┆ ENSG0000018 ┆ ENST000003 ┆ chr1 ┆ … ┆ true ┆ true ┆ false ┆ 2250 │\n", + "│ ┆ 8976.11 ┆ 27044.7 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 4 ┆ ENSG0000018 ┆ ENST000003 ┆ chr1 ┆ … ┆ true ┆ true ┆ false ┆ 1929 │\n", + "│ ┆ 7961.15 ┆ 38591.8 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 19305 ┆ ENSG0000018 ┆ ENST000003 ┆ chrX ┆ 
… ┆ true ┆ true ┆ false ┆ 1266 │\n", + "│ ┆ 5973.12 ┆ 34398.8 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 19306 ┆ ENSG0000016 ┆ ENST000006 ┆ chrX ┆ … ┆ true ┆ true ┆ false ┆ 867 │\n", + "│ ┆ 8939.13 ┆ 95325.1 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 19307 ┆ ENSG0000012 ┆ ENST000002 ┆ chrX ┆ … ┆ true ┆ true ┆ false ┆ 663 │\n", + "│ ┆ 4333.16 ┆ 86448.12 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 19308 ┆ ENSG0000012 ┆ ENST000002 ┆ chrX ┆ … ┆ true ┆ true ┆ false ┆ 1566 │\n", + "│ ┆ 4334.19 ┆ 44174.11 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 19309 ┆ ENSG0000018 ┆ ENST000003 ┆ chrX ┆ … ┆ true ┆ true ┆ false ┆ 1437 │\n", + "│ ┆ 2484.15 ┆ 59512.8 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└───────┴─────────────┴────────────┴───────┴───┴────────────┴────────────┴────────────┴────────────┘" ] }, - "execution_count": 18, + "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "print(\"Adding additional features (pLI, PhyloP, codon frequencies)...\")\n", - "dset = process_dset(result_df, refseq, remove_non_pli=False)\n", - "print(f\"Dataset with additional features: {dset.shape[0]} variants\")\n", - "dset.head(2)" + "genes" + ] + }, + { + "cell_type": "markdown", + "id": "69b84700", + "metadata": {}, + "source": [ + "### COSMIC" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "a6fe62c7", + "execution_count": 86, + "id": "bd6453ba", "metadata": {}, "outputs": [ { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAnYAAAHDCAYAAACpu1eiAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjUsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvWftoOwAAAAlwSFlzAAAPYQAAD2EBqD+naQAAU7FJREFUeJzt3XlcFWX///E3IAdwQUAFxFBRy31J3DBTSwSVNMtcysrMNAsrs6xsUUS7NS233LJSW/ROzbRSM0ktS8ktzTUzb72tu8A7FXAFhOv3h78zt0fABVBwvq/n48FDz8w1M9fnzJwz7zMzZ46bMcYIAAAANzz3ou4AAAAACgfBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBzqZWrlypRo0aydvbW25ubkpJSSnqLiEXVatW1SOPPFLU3cA1dKWvxbi4OLm5uenvv/++vh0sJIW9LS9cuFABAQE6efJkoc3Tbtzc3BQXF1fU3bistm3bql69ekXdjRtKr1691KNHj3xNWyjBbsaMGerevbsqV64sNze3PF/c69atU5cuXRQaGipvb28FBwerQ4cOWr9+fa7tN2zYoFatWqlkyZIKDg7W008/neNFvnnzZg0aNEh169ZVqVKlVLlyZfXo0UO//vprjvm5ubnl+de+ffsCPw9Xa8OGDYqLiyv00HX06FH16NFDPj4+mjZtmj766COVKlUq17YnT57UiBEj1KFDBwUEBMjNzU1z587N0S47O1tz58611l+pUqVUr149jR49WmfPns3RPjk5WX379lVgYKB8fHzUuHFjLVq06LJ9b9++vdzc3DRo0KAc4/Jad2PHjnVpt2TJEkVHRyskJEReXl666aabdN9992nXrl2XXb4dzZ8/X5MmTSrqbhRrxeG1iP/JysrSiBEj9NRTT6l06dJF2pc///xTcXFx2r59+zVf1vTp03N9/70RXM/nKb+udj8mSe+//75q164tb29v3XzzzXr77bcvu5xL7cek8/vHxx9/XJUqVZK3t7eqVq2qfv36ubR58cUXtXjxYv38889XXWeJq54iF2+88YZOnDihZs2a6a+//sqz3a+//ip3d3cNHDhQwcHBOn78uD7++GO1bt1ay5cvV4cOHay227dvV7t27VS7dm1NmDBBf/zxh958803t379fX331lcuy169fr+7du6tBgwZKSkrS1KlT1bhxY/34448unxI++uijHH3asmWLJk+erKioqMJ4Kq7Khg0bNHLkSD3yyCPy8/MrtPlu3rxZJ06c0KhRoxQZGXnJtn///bfi4+NVuXJlNWzYUN9++22u7U6fPq2+ffuqRYsWGjhwoAIDA5WYmKgRI0Zo9erVWrNmjdzc3CRJaWlpatWqlZKTk/XMM88oODhYCxcuVI8ePTRv3jw98MADuS7js88+U2Ji4iX72759ez388MMuw2699VaXxzt37pS/v7+eeeYZlS9fXklJSZo9e7aaNWumxMRENWzY8JLLuJ727dsnd/dre+B8/vz52rVrlwYPHnxNl3MjKw6vRfzPl19+qX379mnAgAFF3RX9+eefGjlypKpWrapGjRpd02VNnz5d5cuXvyGP4l/P5ym/rmY/JknvvPOOBg4cqG7dumnIkCH6/vvv9fTTT+v06dN68cUXc13G5fZjv//+u2677TZJ0sCBA1WpUiX9+eef2rRpk0u7W2+9VU2aNNFbb72lDz/88OoKNYXg0KFDJjs72xhjTKlSpUyfPn2ueNpTp06ZoKAgEx0d7TK8Y8eOpmLFiiY1NdUa9u677xpJ5uuvv7aGrV+/3qSnp7tM++uvvxovLy/Tu3fvyy6/X79+xs3Nzfz+++9X3OfCMn78eCPJHDx4sFDn+8EHHxhJZvPmzZdte/bsWfPXX38ZY4zZvHmzkWTmzJmTo116erpZv359juEjR440kkxCQoI1bNy4cUaSWb16tTUsKyvLNG3a1AQ
HB+dYX8YYc+bMGVO1alUTHx9vJJnY2NgcbfIafiWSkpJMiRIlzOOPP56v6QtTdna2OX369HVbXkxMjKlSpcp1W96lZGZm5rr+i1pxeC2OGDHCSDL//e9/C7UP10uVKlWu6r3/Urp06WJatWpVKPMqqEu9Lxa2unXrmjZt2lxxe0lmxIgR16w/V+NSz1ObNm1M3bp1r3+nLnI1+7HTp0+bcuXKmZiYGJe2vXv3NqVKlTLHjh3LMZ8r2Y917NjRhIWFmb///vuy/X3zzTdNqVKlzIkTJ66kPEuhBLsLXW2wM8aYevXqmebNm1uPU1NTTYkSJczQoUNd2qWnp5vSpUubfv36XXaejRs3No0bN75km7Nnzxo/Pz/Ttm3bK+rnyZMnzZAhQ8xNN91kHA6HueWWW8z48eOtUGuMMQcPHsxz477wReh8E7/473I7loULF5rGjRsbb29vU65cOdO7d2/zxx9/WOPbtGmTY55Xuj7y8wa2Y8cOI8lMmTLFGta5c2dToUKFHG2dO89Vq1blGDdy5EhTuXJlc/r06csGu9OnT5szZ85ccR+NOR+mfH19Tc+ePS/ZLiYmxoSFheU6rkWLFiY8PNx6PHv2bHPHHXeYChUqGIfDYWrXrm2mT5+eY7oqVaqYmJgYs3LlShMeHm68vLzMxIkTrXEXrp+jR4+a5557ztSrV8+UKlXKlClTxnTo0MFs377dZZ5r1641ksyCBQvM6NGjTaVKlYyXl5e58847zf79+612uW0PF4a8KVOmmDp16hgfHx/j5+dnwsPDzbx58y75HKWnp5vXXnvNNG7c2Pj6+pqSJUuaVq1amTVr1ri0c74Wxo8fbyZOnGiqVatm3N3dzbZt24wxxuzdu9d069bN+Pv7Gy8vLxMeHm4+//zzSy7byY6vRWc/9u7da7p3727KlCljAgICzNNPP51je8/MzDTx8fGmWrVqxuFwmCpVqphhw4aZs2fP5lnnhS7e7ubMmWMkmR9++ME8++yzpnz58qZkyZKma9eu5siRIy7TZmdnm1GjRplKlSoZHx8f07ZtW7Nr164c88zIyDBxcXGmRo0axsvLywQEBJjbbrst19f/hc6cOWMcDoeJi4vLdfxHH31kmjZtam2zt99+u8uHfWOMmTZtmqlTp45xOBymYsWK5sknnzTHjx93aeMMG7t37zZt27Y1Pj4+JiQkxLzxxhtWG+fr7OK/C7epH3/80URHRxtfX1/j4+NjWrdubX744Qdr/J49e4y3t7d56KGHXJb//fffG3d3d/PCCy8YY86vk4uXc7mQl9v6/eOPP0zfvn1NYGCgcTgcpk6dOub99993aXOl7x9OU6dONWFhYcbb29s0bdrUrFu3zrRp08bq3+Wepyt5rotSbvux5cuXG0lm+fLlLm03bNhgJJmPPvoox3wutx/bu3evkWTtJ86cOWMyMjLy7NfPP/9sJJnPPvvsquoplFOxVystLU0ZGRn6+++/9eGHH2rXrl16+eWXrfE7d+7UuXPn1KRJE5fpHA6HGjVqpG3btl1y/sYYJScnq27dupdst2LFCqWkpKh3796X7bMxRl26dNHatWvVr18/NWrUSF9//bWGDh2q//znP5o4ceJl53Ghe++9V7/++qv++c9/auLEiSpfvrwkqUKFCnlOM3fuXPXt21dNmzbVmDFjlJycrMmTJ2v9+vXatm2b/Pz89Morr6hmzZqaNWuW4uPjFRYWpurVq19V365GUlKSJFn9l6T09HT5+PjkaFuyZElJ0tatW12uaTx8+LDGjh2r2bNn5zrdhebOnavp06fLGKPatWvr1VdfzfPUbkpKijIzM5WUlKRJkyYpLS1N7dq1u+T8e/bsqYcfflibN29W06ZNreH//ve/9eOPP2r8+PHWsBkzZqhu3brq0qWLSpQooS+//FJPPvmksrOzFRsb6zLfffv26f7779fjjz+u/v37q2bNmrku/1//+peWLl2q7t27KywsTMnJyXrnnXfUpk0
b7dmzRyEhIS7tx44dK3d3dz3//PNKTU3VuHHj1Lt3b23cuFGS9Morryg1NVV//PGHtY06r1l699139fTTT+u+++7TM888o7Nnz2rHjh3auHFjns+pdP71+9577+n+++9X//79deLECb3//vuKjo7Wpk2bcpyGmTNnjs6ePasBAwbIy8tLAQEB2r17t2677TZVqlRJL730kkqVKqWFCxeqa9euWrx4se655548l2/312KPHj1UtWpVjRkzRj/++KOmTJmi48ePu5yOeeyxx/TBBx/ovvvu03PPPaeNGzdqzJgx2rt3r5YsWXJV9V/oqaeekr+/v0aMGKFDhw5p0qRJGjRokBYsWGC1GT58uEaPHq1OnTqpU6dO+umnnxQVFaWMjAyXecXFxWnMmDF67LHH1KxZM6WlpWnLli366aefLnlN89atW5WRkaHGjRvnGDdy5EjFxcWpZcuWio+Pl8Ph0MaNG7VmzRrrcpq4uDiNHDlSkZGReuKJJ7Rv3z7NmDFDmzdv1vr16+Xp6WnN7/jx4+rQoYPuvfde9ejRQ59++qlefPFF1a9fXx07dlTt2rUVHx+v4cOHa8CAAbr99tslSS1btpQkrVmzRh07dlR4eLhGjBghd3d3zZkzR3feeae+//57NWvWTLVr19aoUaM0dOhQ3XffferSpYtOnTqlRx55RLVq1VJ8fLwkadKkSdY1ha+88ookKSgo6KrWX3Jyslq0aGFd31WhQgV99dVX6tevn9LS0nJcjnG59w/p/PvcoEGDdPvtt+vZZ5/VoUOH1LVrV/n7++umm26SpMs+T1fyXF9KamqqMjMzL1u/t7d3vq7JzG0/5swZF+eQ8PBwubu7a9u2bXrwwQet4VeyH/vmm28knV+v7dq105o1a+Th4aH27dtrxowZqlq1qkv7OnXqyMfHR+vXr7/ke2IOVxUDr8CVHLGLjo62Er3D4TCPP/64yyfSRYsWGUlm3bp1Oabt3r27CQ4OvuT8P/roIyMpx6eUi3Xr1s14eXnl+CSXm6VLlxpJZvTo0S7D77vvPuPm5mZ+++03Y8yVHyUw5upO/2RkZJjAwEBTr149l+dq2bJlRpIZPny4Ncz56ftKTv9cKD9H7CIjI42vr6/Lc/jUU08Zd3d3c+jQIZe2vXr1MpLMoEGDXIbfd999pmXLltZj5XHErmXLlmbSpEnm888/NzNmzDD16tVz+fRzsZo1a1rbWenSpc2rr75qsrKyLllPamqq8fLyMs8995zL8HHjxhk3Nzfz73//2xqW2+nU6OhoU61aNZdhzk/iK1euzNH+4qMcZ8+ezdHHgwcPGi8vLxMfH28Nc35Crl27tsupzcmTJxtJZufOndawvE7F3n333fk6PXLu3Lkcp1OPHz9ugoKCzKOPPurSb0nG19c3x1Gfdu3amfr167scYcrOzjYtW7Y0N9988yWXb9fXovOIXZcuXVyGP/nkk0aS+fnnn40xxmzfvt1IMo899phLu+eff95IcjlyenGdTnkdsYuMjHQ56vnss88aDw8Pk5KSYowx5siRI8bhcJiYmBiXdi+//HKOI5INGzbMcRrrSrz33ns5tmFjjNm/f79xd3c399xzT47XiLMvzv5FRUW5tJk6daqRZGbPnm0Ncx5R/fDDD61h6enpJjg42HTr1s0altf7YnZ2trn55ptNdHS0y3Nx+vRpExYWZtq3b28Ny8rKMq1atTJBQUHm77//NrGxsaZEiRI5touCnort16+fqVixYo7TfL169TJly5a13rOu9P0jPT3dlCtXzjRt2tRkZmZa7ebOnZvjiOLlTsVeyXOdl9yOfuf2l99LAXLbj8XGxhoPD49c21eoUMH06tXLZdiV7MeefvppI8mUK1fOdOjQwSxYsMCMHz/elC5d2lSvXt2cOnUqx7JuueUW07Fjx6uqp0hudzJ27FitWrVK77//vlq0aKGMjAydO3fOGn/mzBlJkpeXV45pvb29rfG5+eWXXxQbG6uIiAj
16dMnz3ZpaWlavny5OnXqdEUXS69YsUIeHh56+umnXYY/99xzMsa4fKHjWtiyZYuOHDmiJ598Ut7e3tbwmJgY1apVS8uXL7+my8/NP/7xD33zzTcaO3asy3P42GOPycPDQz169NCGDRt04MABjRkzxjqScOH6W7t2rRYvXnxF39pcv369nnnmGXXp0kUDBw7U1q1bVa9ePb388su5bhNz5szRypUrNX36dNWuXVtnzpxRVlbWJZfh6+urjh07auHChTr/2jxvwYIFatGihSpXrmwNu/BTWWpqqv7++2+1adNG//rXv5Samuoy37CwMEVHR1+2Ri8vL+vLFFlZWTp69KhKly6tmjVr6qeffsrRvm/fvnI4HNZj5yflf/3rX5ddlp+fn/744w9t3rz5sm0v5OHhYS0zOztbx44ds46w59bHbt26uRz9OnbsmNasWaMePXroxIkT+vvvv/X333/r6NGjio6O1v79+/Wf//wnz+Xb/bV48dHep556StL5ui/8d8iQIS7tnnvuOUkq0PIHDBjgcvH47bffrqysLP373/+WdP6IQ0ZGhp566imXdrl9McfPz0+7d+/W/v37r6oPR48elST5+/u7DF+6dKmys7M1fPjwHF84cvbF2b/Bgwe7tOnfv798fX1zPDelS5d2OericDjUrFmzK3r9bN++Xfv379cDDzygo0ePWtvxqVOn1K5dO61bt07Z2dmSJHd3d82dO1cnT55Ux44dNX36dA0bNizH0aCCMMZo8eLF6ty5s4wxVn/+/vtvRUdHKzU1Ncfr83LvH1u2bNHRo0fVv39/lSjxvxN8vXv3zrF+Lqcgz/Vbb72lhISEy/698MILV9UnKe/92JkzZ1yemwtdnEOudD/mvKtHcHCwli9frh49euj555/Xu+++qwMHDmj+/Pk5pvH397/qWyAVyanYC0/VPPjgg2rcuLEeeeQRffrpp5L+t8NMT0/PMe3Zs2fzPMyZlJSkmJgYlS1bVp9++qk8PDzy7MPixYt19uzZKzoNK50/FRcSEqIyZcq4DK9du7Y1/lpyzj+3U3i1atXSDz/8cE2Xf7EFCxbo1VdfVb9+/fTEE0+4jGvQoIHmz5+vgQMHWt/+CQ4O1qRJk/TEE09Yh8rPnTunp59+Wg899JDLac8r5XA4NGjQICvktWrVymV8RESE9f9evXpZ6+rNN9+85Hx79uyppUuXKjExUS1bttSBAwe0devWHC/a9evXa8SIEUpMTNTp06ddxqWmpqps2bLW47CwsCuqKTs7W5MnT9b06dN18OBBlyBarly5HO0vDJrS/3aGx48fv+yyXnzxRX3zzTdq1qyZatSooaioKD3wwAPWOruUDz74QG+99ZZ++eUXl1MkudV58bDffvtNxhi99tpreu2113Kd/5EjR1SpUqVcx9n9tXjzzTe7PK5evbrc3d116NAha/nu7u6qUaOGS7vg4GD5+fkVqP7LbU/OeV/cxwoVKuTY0cfHx+vuu+/WLbfconr16qlDhw566KGH1KBBgyvqy4UfrCTpwIEDcnd3V506dfKcJq9143A4VK1atRzPzU033eQSUKXzNe/YseOy/XMG1ksdQEhNTbWel+rVqysuLk5Dhw5VvXr18tz28+u///2vUlJSNGvWLM2aNSvXNkeOHHF5fKXr++JtrUSJEjlOG15OQZ7r8PDwq1rWlbrUfszHxyfH5QVOF+aQq9mPOafp0aOHyweP7t2766GHHtKGDRv02GOPuUxjjMnxvF1OkQS7CzkcDnXp0kVjx47VmTNn5OPjo4oVK0pSrrdO+euvv3JcZySdfwF17NhRKSkp+v7773Ntc6F58+apbNmyuuuuuwqnkP8vrxVwuSNFN5KEhAQ9/PDDiomJ0cyZM3Nt47yW5Oeff1ZWVpYaN25s3UrllltukSR9+OGH2rdvn9555x1rp+V04sQJHTp0SIGBgda1ebkJDQ2VdP4o0KX4+/vrzjvv1Lx58y4b7Dp37qySJUt
q4cKFatmypRYuXCh3d3d1797danPgwAG1a9dOtWrV0oQJExQaGiqHw6EVK1Zo4sSJ1id1p8tdO+j0j3/8Q6+99poeffRRjRo1SgEBAXJ3d9fgwYNzzFNSnh9eLt4p5qZ27drat2+fli1bppUrV2rx4sWaPn26hg8frpEjR+Y53ccff6xHHnlEXbt21dChQxUYGCgPDw+NGTNGBw4cyNH+4tqddTz//PN5HsW8eEeSH3Z5LeZVx9W+2V8or+egINvTxVq3bq0DBw7o888/16pVq/Tee+9p4sSJmjlzZo6d14WcH2COHz9uXcN1rRSkXud2PH78+Dxv73Hx9V6rVq2SdP7WIEePHlVwcPBV9PbK+vPggw/mGTYvDtWFub4vpyDLOnbsWJ4h60I+Pj4uH6gv5XL7sYoVKyorK0tHjhxRYGCgNTwjI0NHjx61MsbV7Mec01x87aSHh4fKlSuX6wfy48eP5/ggdTlFHuyk84c8jTE6ceKEfHx8VK9ePZUoUUJbtmxxufNyRkaGtm/fnuNuzGfPnlXnzp3166+/6ptvvrnkJzrpfDhcu3atHnnkkVxP9+amSpUq+uabb3TixAmXIwW//PKLNV763yeei290mtun6Kt5Y3bOf9++fbrzzjtdxu3bt88af61t3LhR99xzj5o0aaKFCxe6HJ6/mMPhcPkE47xw1Hk/r8OHDyszMzPXI0QffvihPvzwQy1ZskRdu3bNcxnOw/iXutDd6cyZMzlOkeamVKlSuuuuu7Ro0SJNmDBBCxYs0O233+7yYeHLL79Uenq6vvjiC5dPvWvXrr3s/C/l008/1R133KH333/fZXhKSorLhb1X41LbWalSpdSzZ0/17NlTGRkZuvfee/X6669r2LBhLqcZL+5jtWrV9Nlnn7nMe8SIEVfUn2rVqkmSPD0983VvN7u/Fvfv3+9ylPO3335Tdna2dYSkSpUqys7O1v79+62jlNL5C+dTUlJclu/v75+j/oyMjEveb/RSnPPev3+/tR6l80eLctspBQQEqG/fvurbt69Onjyp1q1bKy4u7pLBrlatWpKkgwcPqn79+tbw6tWrKzs7W3v27MkzSF24bi7sX0ZGhg4ePJiv7S2vbcP5RRhfX98rmu/MmTOVkJCg119/XWPGjNHjjz+uzz///IqWdSUqVKigMmXKKCsrq9Dumeh8Pn/77Tfdcccd1vBz587p0KFDLkGxIH2/nHvvvVfffffdZdv16dPnim7wfCX7Mec2tmXLFnXq1MkavmXLFmVnZ1vjr2Y/5jzyePGlJs4vk168Hzt37px+//13denS5bI1Xei6XmN38WFg6fyb7uLFixUaGmql4rJlyyoyMlIff/yxTpw4YbX96KOPdPLkSZcjJ1lZWerZs6cSExO1aNEil9Nvefnkk0+UnZ19xadhJalTp07KysrS1KlTXYZPnDhRbm5u1rd6fH19Vb58ea1bt86l3fTp03PM03kH+iu5232TJk0UGBiomTNnupyi/uqrr7R3717FxMRccS355VxO1apVtWzZsis+CiWd3xHMnDlTd911l3XErlevXlqyZEmOP+n8871kyRI1b95c0vkdx8VOnDihSZMmqXz58i6H6nPbzg4dOqTVq1df8TUtPXv21J9//qn33ntPP//8s3r27Oky3vnp88JPm6mpqZozZ84VzT8vHh4eOT7BLlq06JLXnF1OqVKlcg20zmuZnBwOh+rUqSNjzCW/gZZb7Rs3brzszaWdAgMD1bZtW73zzju5Bozc1vWF7P5anDZtmstj553unXU5dzIXXxowYcIESXJZfvXq1XPUP2vWrHwftYyMjJSnp6fefvttl/Wf27VFF29fpUuXVo0aNXK9xOZC4eHhcjgc2rJli8vwrl27yt3dXfHx8TmOXjv7EhkZKYfDoSlTprj07/3331dqamq+1k1e20Z4eLiqV6+uN998M9efPbtwOz548KCGDh2qbt266eWXX9abb76pL774IseNZ0u
VKpXvXz/x8PBQt27dtHjx4lx/Zedyr6vcNGnSROXKldO7777rch38vHnzcgT5q3kNXa3CvMbuSvdjd955pwICAjRjxgyX4TNmzFDJkiWtbelq9mNt27ZVYGCg5s2b5/JLF3PnzlVWVlaOb4vv2bNHZ8+edfl28ZUolCN2X375pfWzF5mZmdqxY4dGjx4tSerSpYuV6jt27KibbrpJzZs3V2BgoA4fPqw5c+bozz//dPk6vSS9/vrratmypdq0aaMBAwbojz/+0FtvvaWoqCiXX6h47rnn9MUXX6hz5846duyYPv74Y5f5XHixptO8efMUEhKitm3bXnGNnTt31h133KFXXnlFhw4dUsOGDbVq1Sp9/vnnGjx4sMttDB577DGNHTtWjz32mJo0aaJ169bl+hNnzjDyyiuvqFevXvL09FTnzp1z/ckhT09PvfHGG+rbt6/atGmj+++/37rFQtWqVfXss89ecS0Xmzp1qlJSUvTnn39KOr8+//jjD0nnL9wuW7asTpw4oejoaB0/flxDhw7NcRFy9erVXUJ1nTp1rJ+ZO3jwoGbMmKGAgACXQ961atWyPp1fLCwszOVI3bRp07R06VJ17txZlStX1l9//aXZs2fr8OHD+uijj1wucq1fv77atWunRo0ayd/fX/v379f777+vzMzMHD8/lpdOnTqpTJkyev755603zAtFRUXJ4XCoc+fOevzxx3Xy5Em9++67CgwMzPfREEm66667FB8fr759+6ply5bauXOn5s2b53L04WqFh4drwYIFGjJkiJo2barSpUurc+fOioqKUnBwsG677TYFBQVp7969mjp1qmJiYnJcv3ZxHz/77DPdc889iomJ0cGDBzVz5kzVqVPnin/Xc9q0aWrVqpXq16+v/v37q1q1akpOTlZiYqL++OOPS/6Mjp1fi9L5ENClSxd16NBBiYmJ+vjjj/XAAw9Yv5jSsGFD9enTR7NmzVJKSoratGmjTZs26YMPPlDXrl1djqw89thj1p3z27dvr59//llff/11vo/+VqhQQc8//7zGjBmju+66S506ddK2bdv01Vdf5ZhnnTp11LZtW4WHhysgIEBbtmzRp59+mufPLDl5e3srKipK33zzjXUrEOn86flXXnlFo0aN0u233657771XXl5e2rx5s0JCQjRmzBhVqFBBw4YN08iRI9WhQwd16dJF+/bt0/Tp09W0adNc9weXU716dfn5+WnmzJkqU6aMSpUqpebNmyssLEzvvfeeOnbsqLp166pv376qVKmS/vOf/2jt2rXy9fXVl19+KWOMHn30Ufn4+Fgh4fHHH9fixYv1zDPPKDIy0jobEB4erhkzZmj06NGqUaOGAgMDcxwVvpSxY8dq7dq1at68ufr37686dero2LFj+umnn/TNN99c9pKVizkcDsXFxempp57SnXfeqR49eujQoUOaO3euqlev7nKU7lLPU0EV1jV2V7Mf8/Hx0ahRoxQbG6vu3bsrOjpa33//vT7++GO9/vrrCggIkHR1+zEvLy+NHz9effr0UevWrfXQQw/p8OHDmjx5srVNXyghIUElS5a8+p88varv0OahT58+eX79+MKvPk+dOtW0atXKlC9f3pQoUcJUqFDBdO7cOdfbmhhz/gaOLVu2NN7e3qZChQomNjbWpKWlubS53NegL/bLL78YSWbIkCFXXeeJEyfMs88+a0JCQoynp6e5+eabc9wU1ZjzX3fv16+fKVu2rClTpozp0aOHOXLkSK63HnDe6NPd3f2KbrewYMECc+utt1o3/Lz4pqjGXP3tTnK7Mabzz9kf560j8vq7+GvmvXr1MqGhocbhcJiQkBAzcOBAk5ycfEX9US5fE1+1apVp3769CQ4ONp6ensbPz89ERUW5/LqF04gRI0yTJk2Mv7+/KVGihAkJCTG9evUyO3bsuKLlO/Xu3du6BURuvvjiC9OgQQPj7e1tqlatat544w0ze/bsHOvReYPi3OR2u5Pnnnv
OVKxY0fj4+JjbbrvNJCYmutwM1Jj/3a5g0aJFLvPL7RYfJ0+eNA888IDx8/Mz0v9uUPzOO++Y1q1bm3LlyhkvLy9TvXp1M3ToUJdfe8lNdna2+cc//mGqVKlivLy8zK233mqWLVtm+vTp43JblQtvUJybAwcOmIcffthap5UqVTJ33XWX+fTTTy+5fGPs+Vp03u5kz5495r777jNlypQx/v7+ZtCgQbneoHjkyJEmLCzMeHp6mtDQ0FxvUJyVlWVefPFF64bD0dHR5rfffsvzdicX99O5na1du9ZlniNHjrS20bxuUDx69GjTrFkz4+fnZ3x8fEytWrXM66+/fskbsjp99tlnxs3NzRw+fDjHuNmzZ1vPu7+/v2nTpo3LLwYYc35fU6tWLePp6WmCgoLME088kecNii928XZsjDGff/65qVOnjilRokSO19e2bdvMvffea72OqlSpYnr06GG9NzlvIbJ48WKXeR4+fNj4+vqaTp06WcOSkpJMTEyMKVOmTI7bieQmt+04OTnZxMbGmtDQUOPp6WmCg4NNu3btzKxZs6w2V/P+Ycz5G5k7X+/NmjUz69evN+Hh4aZDhw5X9DxdzXN9LV3tfswYY2bNmmVq1qxpHA6HqV69upk4cWKO95nc5LYfc/rnP/9pGjZsaLy8vExQUJAZNGhQjmxjjDHNmzc3Dz744FXX6fb/OwAAQLGQlZWlOnXqqEePHho1alRRdwcXyc7OVoUKFXTvvffq3XffLeru2NL27dvVuHFj/fTTT1f927tFch87AADy4uHhofj4eE2bNu2KT+/j2jh79myO634//PBDHTt27KouZ8LVGTt2rO67776rDnWSxBE7AACQq2+//VbPPvusunfvrnLlyumnn37S+++/r9q1a2vr1q153sQXRadY3O4EAAAUP1WrVlVoaKimTJmiY8eOKSAgQA8//LDGjh1LqCumOGIHAABgE1xjBwAAYBMEOwAAAJvgGjuby87O1p9//qkyZcpc0598AQAUb+b//3RnSEiIy4/Qw14Idjb3559/KjQ0tKi7AQAoJn7//XfddNNNRd0NXCMEO5tz/jTU77//Ll9f3yLuTf5kZmZq1apVioqKkqenZ1F3p8DsVI+dapGop7ijnoJJS0tTaGjoJX8yEDc+gp3NOU+/+vr63tDBrmTJkvL19bXNm7ld6rFTLRL1FHfUUzi4LMfeOMkOAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsokRRdwA3jqovLS+S5Xp5GI1rJtWL+1rpWW6SpENjY4qkLwAAFGccsQMAALAJgl0u1q1bp86dOyskJERubm5aunSpy3hjjIYPH66KFSvKx8dHkZGR2r9/v0ubY8eOqXfv3vL19ZWfn5/69eunkydPurTZsWOHbr/9dnl7eys0NFTjxo3L0ZdFixapVq1a8vb2Vv369bVixYpCrxcAANgDwS4Xp06dUsOGDTVt2rRcx48bN05TpkzRzJkztXHjRpUqVUrR0dE6e/as1aZ3797avXu3EhIStGzZMq1bt04DBgywxqelpSkqKkpVqlTR1q1bNX78eMXFxWnWrFlWmw0bNuj+++9Xv379tG3bNnXt2lVdu3bVrl27rl3xAADghsU1drno2LGjOnbsmOs4Y4wmTZqkV199VXfffbck6cMPP1RQUJCWLl2qXr16ae/evVq5cqU2b96sJk2aSJLefvttderUSW+++aZCQkI0b948ZWRkaPbs2XI4HKpbt662b9+uCRMmWAFw8uTJ6tChg4YOHSpJGjVqlBISEjR16lTNnDnzOjwTAADgRkKwu0oHDx5UUlKSIiMjrWFly5ZV8+bNlZiYqF69eikxMVF+fn5WqJOkyMhIubu7a+PGjbrnnnuUmJio1q1by+FwWG2
io6P1xhtv6Pjx4/L391diYqKGDBnisvzo6Ogcp4YvlJ6ervT0dOtxWlqaJCkzM1OZmZkFqt3LwxRo+nwv1924/CupwLUUJWffb+QanOxUi0Q9xR31FM7yYG8Eu6uUlJQkSQoKCnIZHhQUZI1LSkpSYGCgy/gSJUooICDApU1YWFiOeTjH+fv7Kykp6ZLLyc2YMWM0cuTIHMNXrVqlkiVLXkmJeRrXrECTF9ioJtnW/+1wrWFCQkJRd6HQ2KkWiXqKO+rJn9OnT1+X5aBoEexsZtiwYS5H+dLS0hQaGqqoqCj5+voWaN714r4uaPfyxcvdaFSTbL22xV3p2edvd7IrLrpI+lIYMjMzlZCQoPbt28vT07Oou1MgdqpFop7ijnoKxnkGB/ZGsLtKwcHBkqTk5GRVrFjRGp6cnKxGjRpZbY4cOeIy3blz53Ts2DFr+uDgYCUnJ7u0cT6+XBvn+Nx4eXnJy8srx3BPT88Cv3E47yFXVNKz3aw+2OFNvTDWSXFhp1ok6inuqCf/y4H98a3YqxQWFqbg4GCtXr3aGpaWlqaNGzcqIiJCkhQREaGUlBRt3brVarNmzRplZ2erefPmVpt169a5XPOQkJCgmjVryt/f32pz4XKcbZzLAQAAuBDBLhcnT57U9u3btX37dknnvzCxfft2HT58WG5ubho8eLBGjx6tL774Qjt37tTDDz+skJAQde3aVZJUu3ZtdejQQf3799emTZu0fv16DRo0SL169VJISIgk6YEHHpDD4VC/fv20e/duLViwQJMnT3Y5jfrMM89o5cqVeuutt/TLL78oLi5OW7Zs0aBBg673UwIAAG4AnIrNxZYtW3THHXdYj51hq0+fPpo7d65eeOEFnTp1SgMGDFBKSopatWqllStXytvb25pm3rx5GjRokNq1ayd3d3d169ZNU6ZMscaXLVtWq1atUmxsrMLDw1W+fHkNHz7c5V53LVu21Pz58/Xqq6/q5Zdf1s0336ylS5eqXr161+FZAAAANxqCXS7atm0rY/K+tYebm5vi4+MVHx+fZ5uAgADNnz//kstp0KCBvv/++0u26d69u7p3737pDgMAAIhTsQAAALZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwS7fMrKytJrr72msLAw+fj4qHr16ho1apSMMVYbY4yGDx+uihUrysfHR5GRkdq/f7/LfI4dO6bevXvL19dXfn5+6tevn06ePOnSZseOHbr99tvl7e2t0NBQjRs37rrUCAAAbiwEu3x64403NGPGDE2dOlV79+7VG2+8oXHjxuntt9+22owbN05TpkzRzJkztXHjRpUqVUrR0dE6e/as1aZ3797avXu3EhIStGzZMq1bt04DBgywxqelpSkqKkpVqlTR1q1bNX78eMXFxWnWrFnXtV4AAFD8lSjqDtyoNmzYoLvvvlsxMTGSpKpVq+qf//ynNm3aJOn80bpJkybp1Vdf1d133y1J+vDDDxUUFKSlS5eqV69e2rt3r1auXKnNmzerSZM
mkqS3335bnTp10ptvvqmQkBDNmzdPGRkZmj17thwOh+rWravt27drwoQJLgEQAACAYJdPLVu21KxZs/Trr7/qlltu0c8//6wffvhBEyZMkCQdPHhQSUlJioyMtKYpW7asmjdvrsTERPXq1UuJiYny8/OzQp0kRUZGyt3dXRs3btQ999yjxMREtW7dWg6Hw2oTHR2tN954Q8ePH5e/v79Lv9LT05Wenm49TktLkyRlZmYqMzOzQDV7eZjLN7oGvNyNy7+SClxLUXL2/UauwclOtUjUU9xRT+EsD/ZGsMunl156SWlpaapVq5Y8PDyUlZWl119/Xb1795YkJSUlSZKCgoJcpgsKCrLGJSUlKTAw0GV8iRIlFBAQ4NImLCwsxzyc4y4OdmPGjNHIkSNz9HfVqlUqWbJkfsuVJI1rVqDJC2xUk2zr/ytWrCjCnhSOhISEou5CobFTLRL1FHfUkz+nT5++LstB0SLY5dPChQs1b948zZ8/3zo9OnjwYIWEhKhPnz5F1q9hw4ZpyJAh1uO0tDSFhoYqKipKvr6+BZp3vbivC9q9fPFyNxrVJFuvbXFXerabJGlXXHSR9KUwZGZmKiEhQe3bt5enp2dRd6dA7FSLRD3FHfUUjPMMDuyNYJdPQ4cO1UsvvaRevXpJkurXr69///vfGjNmjPr06aPg4GBJUnJysipWrGhNl5ycrEaNGkmSgoODdeTIEZf5njt3TseOHbOmDw4OVnJysksb52Nnmwt5eXnJy8srx3BPT88Cv3GkZ7kVaPqCSs92s/pghzf1wlgnxYWdapGop7ijnvwvB/bHt2Lz6fTp03J3d336PDw8lJ19/nRhWFiYgoODtXr1amt8WlqaNm7cqIiICElSRESEUlJStHXrVqvNmjVrlJ2drebNm1tt1q1b53JtREJCgmrWrJnjNCwAAPi/jWCXT507d9brr7+u5cuX69ChQ1qyZIkmTJige+65R5Lk5uamwYMHa/To0friiy+0c+dOPfzwwwoJCVHXrl0lSbVr11aHDh3Uv39/bdq0SevXr9egQYPUq1cvhYSESJIeeOABORwO9evXT7t379aCBQs0efJkl9OtAAAAEqdi8+3tt9/Wa6+9pieffFJHjhxRSEiIHn/8cQ0fPtxq88ILL+jUqVMaMGCAUlJS1KpVK61cuVLe3t5Wm3nz5mnQoEFq166d3N3d1a1bN02ZMsUaX7ZsWa1atUqxsbEKDw9X+fLlNXz4cG51AgAAciDY5VOZMmU0adIkTZo0Kc82bm5uio+PV3x8fJ5tAgICNH/+/Esuq0GDBvr+++/z21UAAPB/BKdiAQAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYAAAA2QbADAACwCYIdAACATRDsAAAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYAAAA2QbADAACwCYIdAACATRDsAAAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYAAAA2QbADAACwCYIdAACATRDsAAAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYAAAA2QbADAACwCYIdAACATRDsAAAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYAAAA2QbADAACwCYIdAACATRDsAAAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYAAAA2QbADAACwCYIdAACATRDsAAAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYAAAA2QbADAACwCYIdAACATRDsAAAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYF8J///EcPPvigypUrJx8fH9WvX19btmyxxhtjNHz4cFWsWFE+Pj6KjIzU/v37XeZx7Ngx9e7dW76+vvLz81O/fv108uRJlzY7duzQ7bffLm9vb4WGhmrcuHHXpT4AAHBjIdjl0/Hjx3XbbbfJ09NTX331lfb
s2aO33npL/v7+Vptx48ZpypQpmjlzpjZu3KhSpUopOjpaZ8+etdr07t1bu3fvVkJCgpYtW6Z169ZpwIAB1vi0tDRFRUWpSpUq2rp1q8aPH6+4uDjNmjXrutYLAACKvxJF3YEb1RtvvKHQ0FDNmTPHGhYWFmb93xijSZMm6dVXX9Xdd98tSfrwww8VFBSkpUuXqlevXtq7d69WrlypzZs3q0mTJpKkt99+W506ddKbb76pkJAQzZs3TxkZGZo9e7YcDofq1q2r7du3a8KECS4BEAAAgGCXT1988YWio6PVvXt3fffdd6pUqZKefPJJ9e/fX5J08OBBJSUlKTIy0pqmbNmyat68uRITE9WrVy8lJibKz8/PCnWSFBkZKXd3d23cuFH33HOPEhMT1bp1azkcDqtNdHS03njjDR0/ftzlCKEkpaenKz093XqclpYmScrMzFRmZmaBavbyMAWaPt/LdTcu/0oqcC1Fydn3G7kGJzvVIlFPcUc9hbM82BvBLp/+9a9/acaMGRoyZIhefvllbd68WU8//bQcDof69OmjpKQkSVJQUJDLdEFBQda4pKQkBQYGuowvUaKEAgICXNpceCTwwnkmJSXlCHZjxozRyJEjc/R31apVKlmyZAEqlsY1K9DkBTaqSbb1/xUrVhRhTwpHQkJCUXeh0NipFol6ijvqyZ/Tp09fl+WgaBHs8ik7O1tNmjTRP/7xD0nSrbfeql27dmnmzJnq06dPkfVr2LBhGjJkiPU4LS1NoaGhioqKkq+vb4HmXS/u64J2L1+83I1GNcnWa1vclZ7tJknaFRddJH0pDJmZmUpISFD79u3l6elZ1N0pEDvVIlFPcUc9BeM8gwN7I9jlU8WKFVWnTh2XYbVr19bixYslScHBwZKk5ORkVaxY0WqTnJysRo0aWW2OHDniMo9z587p2LFj1vTBwcFKTk52aeN87GxzIS8vL3l5eeUY7unpWeA3jvQstwJNX1Dp2W5WH+zwpl4Y66S4sFMtEvUUd9ST/+XA/vhWbD7ddttt2rdvn8uwX3/9VVWqVJF0/osUwcHBWr16tTU+LS1NGzduVEREhCQpIiJCKSkp2rp1q9VmzZo1ys7OVvPmza0269atc7k2IiEhQTVr1sxxGhYAAPzfRrDLp2effVY//vij/vGPf+i3337T/PnzNWvWLMXGxkqS3NzcNHjwYI0ePVpffPGFdu7cqYcfflghISHq2rWrpPNH+Dp06KD+/ftr06ZNWr9+vQYNGqRevXopJCREkvTAAw/I4XCoX79+2r17txYsWKDJkye7nG4FAACQOBWbb02bNtWSJUs0bNgwxcfHKywsTJMmTVLv3r2tNi+88IJOnTqlAQMGKCUlRa1atdLKlSvl7e1ttZk3b54GDRqkdu3ayd3dXd26ddOUKVOs8WXLltWqVasUGxur8PBwlS9fXsOHD+dWJwAAIAeCXQHcdddduuuuu/Ic7+bmpvj4eMXHx+fZJiAgQPPnz7/kcho0aKDvv/8+3/0EAAD/N3AqFgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2AT
BDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmBXCMaOHSs3NzcNHjzYGnb27FnFxsaqXLlyKl26tLp166bk5GSX6Q4fPqyYmBiVLFlSgYGBGjp0qM6dO+fS5ttvv1Xjxo3l5eWlGjVqaO7cudehIgAAcCMi2BXQ5s2b9c4776hBgwYuw5999ll9+eWXWrRokb777jv9+eefuvfee63xWVlZiomJUUZGhjZs2KAPPvhAc+fO1fDhw602Bw8eVExMjO644w5t375dgwcP1mOPPaavv/76utUHAABuHAS7Ajh58qR69+6td999V/7+/tbw1NRUvf/++5owYYLuvPNOhYeHa86cOdqwYYN+/PFHSdKqVau0Z88effzxx2rUqJE6duyoUaNGadq0acrIyJAkzZw5U2FhYXrrrbdUu3ZtDRo0SPfdd58mTpxYJPUCAIDirURRd+BGFhsbq5iYGEVGRmr06NHW8K1btyozM1ORkZHWsFq1aqly5cpKTExUixYtlJiYqPr16ysoKMhqEx0drSeeeEK7d+/WrbfeqsTERJd5ONtceMr3Yunp6UpPT7cep6WlSZIyMzOVmZlZoHq9PEyBps/3ct2Ny7+SClxLUXL2/UauwclOtUjUU9xRT+EsD/ZGsMunTz75RD/99JM2b96cY1xSUpIcDof8/PxchgcFBSkpKclqc2Goc453jrtUm7S0NJ05c0Y+Pj45lj1mzBiNHDkyx/BVq1apZMmSV15gLsY1K9DkBTaqSbb1/xUrVhRhTwpHQkJCUXeh0NipFol6ijvqyZ/Tp09fl+WgaBHs8uH333/XM888o4SEBHl7exd1d1wMGzZMQ4YMsR6npaUpNDRUUVFR8vX1LdC868UVzbV9Xu5Go5pk67Ut7krPdpMk7YqLLpK+FIbMzEwlJCSoffv28vT0LOruFIidapGop7ijnoJxnsGBvRHs8mHr1q06cuSIGjdubA3LysrSunXrNHXqVH399dfKyMhQSkqKy1G75ORkBQcHS5KCg4O1adMml/k6vzV7YZuLv0mbnJwsX1/fXI/WSZKXl5e8vLxyDPf09CzwG0d6lluBpi+o9Gw3qw92eFMvjHVSXNipFol6ijvqyf9yYH98eSIf2rVrp507d2r79u3WX5MmTdS7d2/r/56enlq9erU1zb59+3T48GFFRERIkiIiIrRz504dOXLEapOQkCBfX1/VqVPHanPhPJxtnPMAAAC4EEfs8qFMmTKqV6+ey7BSpUqpXLly1vB+/fppyJAhCggIkK+vr5566ilFRESoRYsWkqSoqCjVqVNHDz30kMaNG6ekpCS9+uqrio2NtY64DRw4UFOnTtULL7ygRx99VGvWrNHChQu1fPny61swAAC4IRDsrpGJEyfK3d1d3bp1U3p6uqKjozV9+nRrvIeHh5YtW6YnnnhCERERKlWqlPr06aP4+HirTVhYmJYvX65nn31WkydP1k033aT33ntP0dE37vVlAADg2iHYFZJvv/3W5bG3t7emTZumadOm5TlNlSpVLvvtzrZt22rbtm2F0UUAAGBzXGMHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgA
AwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsMunMWPGqGnTpipTpowCAwPVtWtX7du3z6XN2bNnFRsbq3Llyql06dLq1q2bkpOTXdocPnxYMTExKlmypAIDAzV06FCdO3fOpc23336rxo0by8vLSzVq1NDcuXOvdXkAAOAGRLDLp++++06xsbH68ccflZCQoMzMTEVFRenUqVNWm2effVZffvmlFi1apO+++05//vmn7r33Xmt8VlaWYmJilJGRoQ0bNuiDDz7Q3LlzNXz4cKvNwYMHFRMTozvuuEPbt2/X4MGD9dhjj+nrr7++rvUCAIDir0RRd+BGtXLlSpfHc+fOVWBgoLZu3arWrVsrNTVV77//vubPn68777xTkjRnzhzVrl1bP/74o1q0aKFVq1Zpz549+uabbxQUFKRGjRpp1KhRevHFFxUXFyeHw6GZM2cqLCxMb731liSpdu3a+uGHHzRx4kRFR0df97oBAEDxRbArJKmpqZKkgIAASdLWrVuVmZmpyMhIq02tWrVUuXJlJSYmqkWLFkpMTFT9+vUVFBRktYmOjtYTTzyh3bt369Zbb1ViYqLLPJxtBg8enGs/0tPTlZ6ebj1OS0uTJGVmZiozM7NANXp5mAJNn+/luhuXfyUVuJai5Oz7jVyDk51qkainuKOewlke7I1gVwiys7M1ePBg3XbbbapXr54kKSkpSQ6HQ35+fi5tg4KClJSUZLW5MNQ5xzvHXapNWlqazpw5Ix8fH5dxY8aM0ciRI3P0cdWqVSpZsmT+i5Q0rlmBJi+wUU2yrf+vWLGiCHtSOBISEoq6C4XGTrVI1FPcUU/+nD59+rosB0WLYFcIYmNjtWvXLv3www9F3RUNGzZMQ4YMsR6npaUpNDRUUVFR8vX1LdC868UVzXV9Xu5Go5pk67Ut7krPdpMk7Yq7cU9DZ2ZmKiEhQe3bt5enp2dRd6dA7FSLRD3FHfUUjPMMDuyNYFdAgwYN0rJly7Ru3TrddNNN1vDg4GBlZGQoJSXF5ahdcnKygoODrTabNm1ymZ/zW7MXtrn4m7TJycny9fXNcbROkry8vOTl5ZVjuKenZ4HfONKz3Ao0fUGlZ7tZfbDDm3phrJPiwk61SNRT3FFP/pcD++NbsflkjNGgQYO0ZMkSrVmzRmFhYS7jw8PD5enpqdWrV1vD9u3bp8OHDysiIkKSFBERoZ07d+rIkSNWm4SEBPn6+qpOnTpWmwvn4WzjnAcAAIATR+zyKTY2VvPnz9fnn3+uMmXKWNfElS1bVj4+Pipbtqz69eunIUOGKCAgQL6+vnrqqacUERGhFi1aSJKioqJUp04dPfTQQxo3bpySkpL06quvKjY21jrqNnDgQE2dOlUvvPCCHn30Ua1Zs0YLFy7U8uXLi6x2AABQPHHELp9mzJih1NRUtW3bVhUrVrT+FixYYLWZOHGi7rrrLnXr1k2tW7dWcHCwPvvsM2u8h4eHli1bJg8PD0VEROjBBx/Uww8/rPj4eKtNWFiYli9froSEBDVs2FBvvfWW3nvvPW51AgAAcuCIXT4Zc/lbf3h7e2vatGmaNm1anm2qVKly2W94tm3bVtu2bbvqPgIAgP9bOGIHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAAB
gEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyhR1B0A8qPqS8uLuguWQ2NjiroLAABI4ogdAACAbXDEDgBQbFzqaLyXh9G4ZlK9uK+VnuV2zfvC0XjciAh2AP5PuJLT99c7OFxrdqsHwOUR7ABcM8XpWkjgal3r7fdqgjdHD3GluMYOAADAJjhiB9jM9TxKxqk+ACheOGIHAABgEwQ7AAAAmyDYAQAA2ATX2AEFdLXXtHFdGgDgWuGI3Q1i2rRpqlq1qry9vdW8eXNt2rSpqLsEAACKGYLdDWDBggUaMmSIRowYoZ9++kkNGzZUdHS0jhw5UtRdAwAAxQjB7gYwYcIE9e/fX3379lWdOnU0c+ZMlSxZUrNnzy7qrgEAgGKEa+yKuYyMDG3dulXDhg2zhrm7uysyMlKJiYk52qenpys9Pd16nJqaKkk6duyYMjMzC9SXEudOFWj6fC832+j06WyVyHRXVvaNf02aneqxUy0S9RR3/5frOXr0aIGXd+LECUmSMabA80LxRbAr5v7++29lZWUpKCjIZXhQUJB++eWXHO3HjBmjkSNH5hgeFhZ2zfp4PTxQ1B0oZHaqx061SNRT3P1fraf8W4W3zBMnTqhs2bKFN0MUKwQ7mxk2bJiGDBliPc7OztaxY8dUrlw5ubndmJ9w09LSFBoaqt9//12+vr5F3Z0Cs1M9dqpFop7ijnoKxhijEydOKCQk5JovC0WHYFfMlS9fXh4eHkpOTnYZnpycrODg4Bztvby85OXl5TLMz8/vWnbxuvH19bXFm7mTneqxUy0S9RR31JN/HKmzP748Ucw5HA6Fh4dr9erV1rDs7GytXr1aERERRdgzAABQ3HDE7gYwZMgQ9enTR02aNFGzZs00adIknTp1Sn379i3qrgEAgGKEYHcD6Nmzp/773/9q+PDhSkpKUqNGjbRy5cocX6iwKy8vL40YMSLHKeYblZ3qsVMtEvUUd9QDXJ6b4XvPAAAAtsA1dgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmCHay4uLk5ubm4uf7Vq1bLGnz17VrGxsSpXrpxKly6tbt265bgh8+HDhxUTE6OSJUsqMDBQQ4cO1blz51zafPvtt2rcuLG8vLxUo0YNzZ07t1D6v27dOnXu3FkhISFyc3PT0qVLXcYbYzR8+HBVrFhRPj4+ioyM1P79+13aHDt2TL1795avr6/8/PzUr18/nTx50qXNjh07dPvtt8vb21uhoaEaN25cjr4sWrRItWrVkre3t+rXr68VK1YUej2PPPJIjvXVoUOHYlnPmDFj1LRpU5UpU0aBgYHq2rWr9u3b59Lmem5f06ZNU9WqVeXt7a3mzZtr06ZNhV5P27Ztc6yfgQMHFst6ZsyYoQYNGlg34I2IiNBXX31ljb+R1s2V1HMjrRvYmAGusREjRpi6deuav/76y/r773//a40fOHCgCQ0NNatXrzZbtmwxLVq0MC1btrTGnzt3ztSrV89ERkaabdu2mRUrVpjy5cubYcOGWW3+9a9/mZIlS5ohQ4aYPXv2mLffftt4eHiYlStXFrj/K1asMK+88or57LPPjCSzZMkSl/Fjx441ZcuWNUuXLjU///yz6dKliwkLCzNnzpyx2nTo0ME0bNjQ/Pjjj+b77783NWrUMPfff781PjU11QQFBZnevXubXbt2mX/+85/Gx8fHvPPOO1ab9evXGw8PDzNu3DizZ88e8+qrrxpPT0+zc+fOQq2nT58+pkOHDi7r69ixYy5tiks90dHRZs6cOWbXrl1m+/btplOnTqZy5crm5MmTVpvrtX198sknxuFwmNmzZ5vdu3eb/v37Gz8/P5OcnFyo9bRp08b079/fZf2kpqYWy3q++OILs3z5cvPrr7+affv2mZdfftl4enqaXbt2GWNurHVzJfXcSOsG9kWwwzU3YsQI07Bhw1zHpaSkGE9PT7No0SJ
r2N69e40kk5iYaIw5H0Tc3d1NUlKS1WbGjBnG19fXpKenG2OMeeGFF0zdunVd5t2zZ08THR1dqLVcHISys7NNcHCwGT9+vEtNXl5e5p///Kcxxpg9e/YYSWbz5s1Wm6+++sq4ubmZ//znP8YYY6ZPn278/f2teowx5sUXXzQ1a9a0Hvfo0cPExMS49Kd58+bm8ccfL7R6jDkf7O6+++48pynO9Rw5csRIMt99950x5vpuX82aNTOxsbHW46ysLBMSEmLGjBlTaPUYcz48PPPMM3lOU5zrMcYYf39/8957793w6+bieoy58dcN7IFTsbgu9u/fr5CQEFWrVk29e/fW4cOHJUlbt25VZmamIiMjrba1atVS5cqVlZiYKElKTExU/fr1XW7IHB0drbS0NO3evdtqc+E8nG2c87hWDh48qKSkJJdlly1bVs2bN3fpv5+fn5o0aWK1iYyMlLu7uzZu3Gi1ad26tRwOh0v/9+3bp+PHj1ttrleN3377rQIDA1WzZk098cQTOnr0qDWuONeTmpoqSQoICJB0/bavjIwMbd261aWNu7u7IiMjC7Uep3nz5ql8+fKqV6+ehg0bptOnT1vjims9WVlZ+uSTT3Tq1ClFRETc8Ovm4nqcbsR1A3vhlydwzTVv3lxz585VzZo19ddff2nkyJG6/fbbtWvXLiUlJcnhcMjPz89lmqCgICUlJUmSkpKScvzKhvPx5dqkpaXpzJkz8vHxuSa1OZef27Iv7FtgYKDL+BIlSiggIMClTVhYWI55OMf5+/vnWaNzHoWlQ4cOuvfeexUWFqYDBw7o5ZdfVseOHZWYmCgPD49iW092drYGDx6s2267TfXq1bOWdT22r+PHjysrKyvXNr/88kuh1SNJDzzwgKpUqaKQkBDt2LFDL774ovbt26fPPvusWNazc+dORURE6OzZsypdurSWLFmiOnXqaPv27TfkusmrHunGWzewJ4IdrrmOHTta/2/QoIGaN2+uKlWqaOHChdcscCH/evXqZf2/fv36atCggapXr65vv/1W7dq1K8KeXVpsbKx27dqlH374oai7UijyqmfAgAHW/+vXr6+KFSuqXbt2OnDggKpXr369u3lZNWvW1Pbt25WamqpPP/1Uffr00XfffVfU3cq3vOqpU6fODbduYE+cisV15+fnp1tuuUW//fabgoODlZGRoZSUFJc2ycnJCg4OliQFBwfn+Kac8/Hl2vj6+l7T8Ohcfm7LvrBvR44ccRl/7tw5HTt2rFBqdI6/VqpVq6by5cvrt99+s/pR3OoZNGiQli1bprVr1+qmm26yhl+v7at8+fLy8PC45vXkpnnz5pLksn6KUz0Oh0M1atRQeHi4xowZo4YNG2ry5Mk37LrJq57cFPd1A3si2OG6O3nypA4cOKCKFSsqPDxcnp6eWr16tTV+3759Onz4sHXdSkREhHbu3OkSJhISEuTr62udAomIiHCZh7PNhde+XAthYWEKDg52WXZaWpo2btzo0v+UlBRt3brVarNmzRplZ2dbb/wRERFat26dMjMzXfpfs2ZN+fv7W22KosY//vhDR48eVcWKFYtdPcYYDRo0SEuWLNGaNWtynP69XtuXw+FQeHi4S5vs7GytXr26UOvJzfbt2yXJZf0Ul3pyk52drfT09Btu3VyuntzcaOsGNlHU396A/T333HPm22+/NQcPHjTr1683kZGRpnz58ubIkSPGmPO3PKhcubJZs2aN2bJli4mIiDARERHW9M5bBERFRZnt27eblStXmgoVKuR6i4ChQ4eavXv3mmnTphXa7U5OnDhhtm3bZrZt22YkmQkTJpht27aZf//738aY87c78fPzM59//rnZsWOHufvuu3O93cmtt95qNm7caH744Qdz8803u9weJCUlxQQFBZmHHnrI7Nq1y3zyySemZMmSOW4PUqJECfPmm2+avXv3mhEjRuTrdieXqufEiRPm+eefN4mJiebgwYPmm2++MY0bNzY333y
zOXv2bLGr54knnjBly5Y13377rcstJk6fPm21uV7b1yeffGK8vLzM3LlzzZ49e8yAAQOMn5+fyzcgC1rPb7/9ZuLj482WLVvMwYMHzeeff26qVatmWrduXSzreemll8x3331nDh48aHbs2GFeeukl4+bmZlatWnXDrZvL1XOjrRvYF8EO11zPnj1NxYoVjcPhMJUqVTI9e/Y0v/32mzX+zJkz5sknnzT+/v6mZMmS5p577jF//fWXyzwOHTpkOnbsaHx8fEz58uXNc889ZzIzM13arF271jRq1Mg4HA5TrVo1M2fOnELp/9q1a42kHH99+vQxxpy/5clrr71mgoKCjJeXl2nXrp3Zt2+fyzyOHj1q7r//flO6dGnj6+tr+vbta06cOOHS5ueffzatWrUyXl5eplKlSmbs2LE5+rJw4UJzyy23GIfDYerWrWuWL19eqPWcPn3aREVFmQoVKhhPT09TpUoV079//xw7jOJST251SHJZ99dz+3r77bdN5cqVjcPhMM2aNTM//vhjodZz+PBh07p1axMQEGC8vLxMjRo1zNChQ13ulVac6nn00UdNlSpVjMPhMBUqVDDt2rWzQp0xN9a6uVw9N9q6gX25GWPM9Ts+CAAAgGuFa+wAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2MT/AxfuxmKOSwz/AAAAAElFTkSuQmCC", "text/plain": [ - "
" + "49970" ] }, + "execution_count": 86, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ - "context_to_check = 2046\n", - "checks = check_mutation_positions(result_df.to_pandas(), context_to_check)\n", - "checks[checks[\"out_of_bounds\"]].codon_position.hist(figsize=(5, 5))\n", - "plt.title(\n", - " f\" {checks['out_of_bounds'].sum()} out of {len(checks)} variants are out of bounds (context length = {context_to_check})\"\n", + "# Keep only WGS and WXS samples\n", + "cosmic_samples = pl.read_csv(cosmic_files[\"cosmic_samples\"], separator=\"\\t\")\n", + "cosmic_wxs_samples = (\n", + " cosmic_samples.filter((pl.col(\"WHOLE_GENOME_SCREEN\") == \"y\") | (pl.col(\"WHOLE_EXOME_SCREEN\") == \"y\"))[\n", + " \"COSMIC_SAMPLE_ID\"\n", + " ]\n", + " .unique()\n", + " .to_list()\n", ")\n", - "plt.show()" + "len(cosmic_wxs_samples)" ] }, { "cell_type": "code", - "execution_count": 20, - "id": "6b85ad06", + "execution_count": 87, + "id": "625222f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 2004598\n", + "2 953793\n", + "3 923499\n", + "4 923499\n" + ] + } + ], + "source": [ + "columns = [\n", + " \"MUTATION_ID\",\n", + " \"GENE_SYMBOL\",\n", + " \"TRANSCRIPT_ACCESSION\",\n", + " \"MUTATION_CDS\",\n", + " \"MUTATION_AA\",\n", + " \"MUTATION_DESCRIPTION\",\n", + " \"CHROMOSOME\",\n", + " \"GENOME_START\",\n", + " \"GENOME_STOP\",\n", + " \"STRAND\",\n", + " \"HGVSP\",\n", + " \"HGVSC\",\n", + " \"HGVSG\",\n", + " \"GENOMIC_WT_ALLELE\",\n", + " \"GENOMIC_MUT_ALLELE\",\n", + "]\n", + "\n", + "\n", + "# %%\n", + "data = pl.read_csv(cosmic_files[\"cosmic_mutant_census\"], infer_schema_length=100000, separator=\"\\t\")\n", + "data = data.with_columns(\n", + " pl.col(\"CHROMOSOME\").map_elements(\n", + " lambda x: str(int(float(x))) if x not in [\"X\", \"Y\"] else x, return_dtype=pl.String\n", + " )\n", + ")\n", + "data = 
data.with_columns(pl.col(\"GENOME_START\").cast(pl.Int64))\n", + "data = data.with_columns(pl.col(\"GENOME_STOP\").cast(pl.Int64))\n", + "print(0, data.height)\n", + "data = data.filter(pl.col(\"COSMIC_SAMPLE_ID\").is_in(cosmic_wxs_samples))\n", + "\n", + "## Each row is a variant in a sample, so need to group by variant across samples.\n", + "num_samples = data.group_by(\"HGVSC\").len(\"num_samples\")\n", + "somatic = data.filter(pl.col(\"MUTATION_SOMATIC_STATUS\") == \"Confirmed somatic variant\")[\"HGVSC\"].unique().to_list()\n", + "data_grouped = data.group_by(columns).first().select(columns)\n", + "data_grouped = data_grouped.with_columns(pl.col(\"HGVSC\").is_in(somatic).alias(\"somatic\"))\n", + "data_grouped = data_grouped.join(num_samples, on=\"HGVSC\", how=\"left\")\n", + "data_grouped = data_grouped.with_columns(\n", + " pl.col(\"GENOMIC_WT_ALLELE\").alias(\"ref\"), pl.col(\"GENOMIC_MUT_ALLELE\").alias(\"alt\")\n", + ")\n", + "data_grouped = data_grouped.filter((pl.col(\"ref\").str.len_chars() == 1) & (pl.col(\"alt\").str.len_chars() == 1))\n", + "data_grouped = data_grouped.filter(pl.col(\"ref\") != pl.col(\"alt\"))\n", + "print(2, data_grouped.height)\n", + "data_grouped = data_grouped.with_columns(\n", + " (\"chr\" + pl.col(\"CHROMOSOME\")).alias(\"chrom\"), (pl.col(\"GENOME_START\")).cast(pl.Int64).alias(\"pos\")\n", + ")\n", + "data_grouped = data_grouped.filter(pl.col(\"chrom\").is_in(valid_chroms))\n", + "print(3, data_grouped.height)\n", + "data_grouped = data_grouped.unique()\n", + "print(4, data_grouped.height)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "ac6c0ad1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing 19310 genes and 923499 variants...\n" + ] + } + ], + "source": [ + "cols = [x for x in data_grouped.columns if x != \"chrom\"]\n", + "gene_variant_mapping = map_variants_to_genes_by_exons_efficient(genes, data_grouped, variant_columns=cols)\n", + 
"result = convert_gene_variant_mapping_to_df(gene_variant_mapping, genes, cols)\n", + "\n", + "result = result.filter(pl.col(\"GENE_SYMBOL\") == pl.col(\"gene_name\"))\n", + "result = result.filter(pl.col(\"ref_seq\").str.len_chars() % 3 == 0)\n", + "result.write_csv(COSMIC_OUTPUT_FILE)" + ] + }, + { + "cell_type": "markdown", + "id": "7c30abbc", + "metadata": {}, + "source": [ + "### gnomAD common variants" + ] + }, + { + "cell_type": "markdown", + "id": "378a80d1", + "metadata": {}, + "source": [ + "#### The gnomad data need to be downloaded from https://gnomad.broadinstitute.org/\n", + "To convert the vcf files to tsv files, run the following command with `bcftools`.\n", + "```\n", + "bcftools query -f '\\''%CHROM\\t%POS\\t%REF\\t%ALT\\t%AF\\t%AC\\t%AN\\n'\\'' \\\n", + " -i '\\''TYPE=\"snp\" & FILTER=\"PASS\"'\\'' \\\n", + " \"gnomad..v4.1.sites..vcf.bgz\" | \\\n", + " gzip > \".tsv.gz\"\n", + "```\n", + "\n", + "The `` and `` need to be replaced by the actual names." + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "67283569", "metadata": {}, "outputs": [], "source": [ - "# Save processed results, dset, and refseq tables\n", - "dset.write_csv(f\"{OUTPUT_DIR}/clinvar_synom.csv\")" + "# get all variants from gnomAD, including exome and genome\n", + "exome_variants = []\n", + "for chrom in [f\"chr{i}\" for i in range(1, 23)] + [\"chrX\", \"chrY\"]:\n", + " variants = pl.read_csv(\n", + " os.path.join(gnomad_files[\"gnomad_exomes\"], f\"{chrom}.tsv.gz\"), separator=\"\\t\", has_header=False\n", + " )\n", + " exome_variants.append(variants)\n", + "exome_variants = pl.concat(exome_variants)\n", + "exome_variants.columns = [\"chrom\", \"pos\", \"ref\", \"alt\", \"af\", \"ac\", \"an\"]\n", + "exome_variants = exome_variants.filter(pl.col(\"an\") > 100000)\n", + "\n", + "genome_variants = []\n", + "for chrom in [f\"chr{i}\" for i in range(1, 23)] + [\"chrX\", \"chrY\"]:\n", + " variants = pl.read_csv(\n", + " 
os.path.join(gnomad_files[\"gnomad_genomes\"], f\"{chrom}.tsv.gz\"), separator=\"\\t\", has_header=False\n", + " )\n", + " genome_variants.append(variants)\n", + "genome_variants = pl.concat(genome_variants)\n", + "genome_variants.columns = [\"chrom\", \"pos\", \"ref\", \"alt\", \"af\", \"ac\", \"an\"]\n", + "genome_variants = genome_variants.filter(pl.col(\"an\") > 25000)\n", + "\n", + "all_variants = (\n", + " pl.concat([exome_variants, genome_variants])\n", + " .sort(\"af\", descending=True)\n", + " .unique(subset=[\"chrom\", \"pos\", \"ref\", \"alt\"], keep=\"first\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "b47f7a09", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing 19310 genes and 12910000 variants...\n" + ] + } + ], + "source": [ + "common_variants = all_variants.filter(pl.col(\"af\") > 0.01).sort([\"chrom\", \"pos\"])\n", + "cols = [\"pos\", \"ref\", \"alt\", \"af\", \"ac\", \"an\"]\n", + "gene_variant_mapping = map_variants_to_genes_by_exons_efficient(genes, common_variants, variant_columns=cols)\n", + "\n", + "result = convert_gene_variant_mapping_to_df(gene_variant_mapping, genes, cols)\n", + "result = result.filter(pl.col(\"ref_seq\").str.len_chars() % 3 == 0)\n", + "result.write_csv(GNOMAD_OUTPUT_FILE)" ] }, { "cell_type": "code", "execution_count": null, - "id": "8e555940-85c7-430c-8d41-e8fae2de5df6", + "id": "5a1d4872", "metadata": {}, "outputs": [], "source": [] diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/000-Annotation-File-Processing.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/000-Annotation-File-Processing.ipynb new file mode 100644 index 0000000000..70216b8057 --- /dev/null +++ b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/000-Annotation-File-Processing.ipynb @@ -0,0 +1,626 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 38, + "id": "92fb3d66", + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "GENCODE version: v47\n", + "Reference directory: /data/for_paper/data/reference\n" + ] + } + ], + "source": [ + "# Annotation File Processing\n", + "# This notebook processes GENCODE GTF annotation files to extract protein-coding\n", + "# transcript information and exports it in a tabular format suitable for downstream analysis.\n", + "\n", + "import polars as pl\n", + "\n", + "\n", + "# =============================================================================\n", + "# Configuration\n", + "# =============================================================================\n", + "REFERENCE_DIR = \"/data/for_paper/data/reference\"\n", + "GENCODE_VERSION = \"v47\"\n", + "\n", + "# Input: GENCODE GTF files (downloaded from https://www.gencodegenes.org/)\n", + "GTF_FILES = {\n", + " \"hg38\": f\"{REFERENCE_DIR}/gencode.{GENCODE_VERSION}.basic.annotation.gtf.gz\",\n", + " \"hg19\": f\"{REFERENCE_DIR}/gencode.{GENCODE_VERSION}lift37.basic.annotation.gtf.gz\",\n", + "}\n", + "\n", + "# Output: Processed annotation TSV files\n", + "OUTPUT_FILES = {\n", + " \"hg38\": f\"{REFERENCE_DIR}/gencode.{GENCODE_VERSION}.basic.annotation.processed.tsv\",\n", + " \"hg19\": f\"{REFERENCE_DIR}/gencode.{GENCODE_VERSION}lift37.basic.annotation.processed.tsv\",\n", + "}\n", + "\n", + "print(f\"GENCODE version: {GENCODE_VERSION}\")\n", + "print(f\"Reference directory: {REFERENCE_DIR}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "562e1104", + "metadata": {}, + "outputs": [], + "source": [ + "# Helper functions to validate and adjust CDS (coding sequence) boundaries\n", + "# These functions ensure that the exon coordinates properly include the stop codon (3 bp)\n", + "\n", + "\n", + "def check_start_alignment(row):\n", + " \"\"\"\n", + " Validates that CDS start aligns with exon start and adjusts for stop codon on minus strand.\n", + " For minus strand genes, the stop codon is at the 3' end (lowest genomic 
position), so we subtract 3 bp.\n", + " \"\"\"\n", + " cds_start = row[\"cds_start\"]\n", + " exon_starts = list(map(int, row[\"exon_starts\"].strip(\",\").split(\",\")))\n", + " if row[\"strand\"] == \"-\":\n", + " # Extend first exon by 3 bp to include stop codon (on minus strand)\n", + " exon_starts[0] -= 3\n", + " assert cds_start == exon_starts[0], f\"{cds_start} != {exon_starts[0]} {row['transcript_id']}\"\n", + "\n", + " exon_starts = \",\".join(map(str, exon_starts)) + \",\"\n", + " return exon_starts\n", + "\n", + "\n", + "def check_end_alignment(row):\n", + " \"\"\"\n", + " Validates that CDS end aligns with exon end and adjusts for stop codon on plus strand.\n", + " For plus strand genes, the stop codon is at the 3' end (highest genomic position), so we add 3 bp.\n", + " \"\"\"\n", + " cds_end = row[\"cds_end\"]\n", + " exon_ends = list(map(int, row[\"exon_ends\"].strip(\",\").split(\",\")))\n", + " if row[\"strand\"] == \"+\":\n", + " # Extend last exon by 3 bp to include stop codon (on plus strand)\n", + " exon_ends[-1] += 3\n", + " assert cds_end == exon_ends[-1], f\"{cds_end} != {exon_ends[-1]} {row['transcript_id']}\"\n", + "\n", + " exon_ends = \",\".join(map(str, exon_ends)) + \",\"\n", + " return exon_ends" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "af9b630f", + "metadata": {}, + "outputs": [], + "source": [ + "def process_gtf_file(gtf_file, output_file):\n", + " \"\"\"\n", + " Process a GENCODE GTF file and extract protein-coding transcript annotations.\n", + "\n", + " This function:\n", + " 1. Parses the GTF file and extracts relevant attributes\n", + " 2. Filters for protein-coding genes only\n", + " 3. Aggregates exon/CDS coordinates per transcript\n", + " 4. Adjusts coordinates to include stop codons\n", + " 5. 
Outputs a tab-separated file with transcript annotations\n", + "\n", + " Args:\n", + " gtf_file: Path to input GENCODE GTF file (can be gzipped)\n", + " output_file: Path for output TSV file\n", + " \"\"\"\n", + " # Read GTF file (standard 9-column format)\n", + " gtf = pl.read_csv(gtf_file, comment_prefix=\"#\", separator=\"\\t\", has_header=False)\n", + " gtf.columns = [\"chrom\", \"source\", \"feature\", \"start\", \"end\", \"score\", \"strand\", \"frame\", \"attribute\"]\n", + "\n", + " # Parse the attribute column (column 9) to extract key-value pairs\n", + " # GTF attributes are semicolon-separated with format: key \"value\"\n", + " gtf = gtf.with_columns(\n", + " [\n", + " pl.col(\"attribute\").str.extract('gene_id \"(.*?)\"', 1).alias(\"gene_id\"),\n", + " pl.col(\"attribute\").str.extract('transcript_id \"(.*?)\"', 1).alias(\"transcript_id\"),\n", + " pl.col(\"attribute\").str.extract('gene_name \"(.*?)\"', 1).alias(\"gene_name\"),\n", + " pl.col(\"attribute\").str.extract('gene_type \"(.*?)\"', 1).alias(\"gene_type\"),\n", + " pl.col(\"attribute\").str.extract('transcript_type \"(.*?)\"', 1).alias(\"transcript_type\"),\n", + " pl.col(\"attribute\").str.extract(\"exon_number (.*?);\", 1).alias(\"exon_number\"),\n", + " ]\n", + " )\n", + "\n", + " # Flag canonical transcripts (Ensembl canonical and MANE Select)\n", + " gtf = gtf.with_columns(pl.col(\"attribute\").str.contains(\"Ensembl_canonical\").alias(\"is_canonical\"))\n", + " gtf = gtf.with_columns(pl.col(\"attribute\").str.contains(\"MANE_Select\").alias(\"is_mane_select\"))\n", + "\n", + " # Filter to protein-coding genes only, exclude gene-level features\n", + " protein_coding_gtf = gtf.filter((pl.col(\"gene_type\") == \"protein_coding\") & (pl.col(\"feature\") != \"gene\"))\n", + " # Filter to protein-coding transcripts only\n", + " protein_coding_gtf = protein_coding_gtf.filter(pl.col(\"transcript_type\") == \"protein_coding\")\n", + "\n", + " # Convert from 1-based (GTF) to 0-based coordinates 
(BED-like format)\n", + " protein_coding_gtf = protein_coding_gtf.with_columns(pl.col(\"start\") - 1)\n", + "\n", + " # Aggregate CDS exon coordinates per transcript\n", + " # Creates comma-separated lists of exon start/end positions (sorted by genomic position)\n", + " exon_starts = (\n", + " protein_coding_gtf.filter(pl.col(\"feature\") == \"CDS\")\n", + " .group_by(\"transcript_id\")\n", + " .agg(\n", + " (pl.col(\"start\").sort().cast(str).str.join(\",\") + \",\").alias(\"exon_starts\"),\n", + " (pl.col(\"end\").sort().cast(str).str.join(\",\") + \",\").alias(\"exon_ends\"),\n", + " pl.col(\"exon_number\").max().alias(\"exon_count\"),\n", + " )\n", + " )\n", + "\n", + " # Calculate CDS boundaries with stop codon adjustment\n", + " # GENCODE GTF excludes stop codon from CDS, but we want to include it\n", + " # For + strand: stop codon is after the last CDS position (add 3 to max end)\n", + " # For - strand: stop codon is before the first CDS position (subtract 3 from min start)\n", + " # Note: Using min()/max() instead of first()/last() to avoid dependency on row order\n", + " cds_starts = (\n", + " protein_coding_gtf.filter(pl.col(\"feature\") == \"CDS\")\n", + " .group_by(\"transcript_id\")\n", + " .agg(\n", + " pl.when(pl.col(\"strand\").first() == \"-\")\n", + " .then(pl.col(\"start\").min() - 3) # Include stop codon at 3' end (lowest genomic position)\n", + " .otherwise(pl.col(\"start\").min()) # 5' end, no adjustment needed\n", + " .alias(\"cds_start\"),\n", + " pl.when(pl.col(\"strand\").first() == \"-\")\n", + " .then(pl.col(\"end\").max()) # 5' end (highest genomic position), no adjustment\n", + " .otherwise(pl.col(\"end\").max() + 3) # Include stop codon at 3' end\n", + " .alias(\"cds_end\"),\n", + " )\n", + " )\n", + "\n", + " # Get transcript-level metadata (gene info, coordinates, canonical status)\n", + " tx_starts = (\n", + " protein_coding_gtf.filter(pl.col(\"feature\") == \"transcript\")\n", + " .group_by(\"transcript_id\")\n", + " .agg(\n", + " 
pl.col(\"gene_id\").first().alias(\"gene_id\"),\n", + " pl.col(\"gene_name\").first().alias(\"gene_name\"),\n", + " pl.col(\"chrom\").first().alias(\"chrom\"),\n", + " pl.col(\"strand\").first().alias(\"strand\"),\n", + " pl.col(\"start\").min().alias(\"tx_start\"),\n", + " pl.col(\"end\").max().alias(\"tx_end\"),\n", + " pl.col(\"transcript_type\").first().alias(\"transcript_type\"),\n", + " pl.col(\"is_canonical\").first().alias(\"is_canonical\"),\n", + " pl.col(\"is_mane_select\").first().alias(\"is_mane_select\"),\n", + " )\n", + " )\n", + "\n", + " # Join all transcript information together\n", + " joined_df = tx_starts.join(cds_starts, on=\"transcript_id\", how=\"inner\").join(\n", + " exon_starts, on=\"transcript_id\", how=\"inner\"\n", + " )\n", + "\n", + " # Validate and adjust exon coordinates to include stop codon\n", + " joined_df = joined_df.with_columns(\n", + " pl.struct([\"cds_start\", \"exon_starts\", \"strand\", \"transcript_id\"])\n", + " .map_elements(check_start_alignment, return_dtype=pl.Utf8)\n", + " .alias(\"exon_starts\"),\n", + " pl.struct([\"cds_end\", \"exon_ends\", \"strand\", \"transcript_id\"])\n", + " .map_elements(check_end_alignment, return_dtype=pl.Utf8)\n", + " .alias(\"exon_ends\"),\n", + " )\n", + "\n", + " # Sort by chromosome and position, then select and rename columns for output\n", + " joined_df = joined_df.sort([\"chrom\", \"tx_start\"])\n", + " joined_df = joined_df.select(\n", + " [\n", + " \"gene_id\",\n", + " \"transcript_id\",\n", + " \"chrom\",\n", + " \"strand\",\n", + " \"tx_start\",\n", + " \"tx_end\",\n", + " \"cds_start\",\n", + " \"cds_end\",\n", + " \"exon_count\",\n", + " \"exon_starts\",\n", + " \"exon_ends\",\n", + " \"gene_name\",\n", + " \"transcript_type\",\n", + " \"is_canonical\",\n", + " \"is_mane_select\",\n", + " ]\n", + " ).rename(\n", + " {\n", + " \"transcript_id\": \"name\", # Transcript ID becomes the 'name' field\n", + " \"tx_start\": \"txStart\", # Transcript start position\n", + " 
\"tx_end\": \"txEnd\", # Transcript end position\n", + " \"cds_start\": \"cdsStart\", # CDS start (including stop codon adjustment)\n", + " \"cds_end\": \"cdsEnd\", # CDS end (including stop codon adjustment)\n", + " \"exon_starts\": \"exonStarts\", # Comma-separated exon start positions\n", + " \"exon_ends\": \"exonEnds\", # Comma-separated exon end positions\n", + " }\n", + " )\n", + "\n", + " # Write output as tab-separated file\n", + " joined_df.write_csv(output_file, separator=\"\\t\")\n", + "\n", + " return joined_df" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "62147898", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing hg38: /data/for_paper/data/reference/gencode.v47.basic.annotation.gtf.gz\n", + "Output saved to: /data/for_paper/data/reference/gencode.v47.basic.annotation.processed.tsv\n", + "\n", + "Processing hg19: /data/for_paper/data/reference/gencode.v47lift37.basic.annotation.gtf.gz\n", + "Output saved to: /data/for_paper/data/reference/gencode.v47lift37.basic.annotation.processed.tsv\n", + "\n" + ] + } + ], + "source": [ + "# Process GENCODE annotation files for both GRCh38 (hg38) and GRCh37 (hg19) assemblies\n", + "\n", + "for assembly in [\"hg38\", \"hg19\"]:\n", + " gtf_file = GTF_FILES[assembly]\n", + " output_file = OUTPUT_FILES[assembly]\n", + "\n", + " print(f\"Processing {assembly}: {gtf_file}\")\n", + " _ = process_gtf_file(gtf_file, output_file)\n", + " print(f\"Output saved to: {output_file}\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "10eb865d", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Part 1: Quality Control of CDS and Extract CDS Sequence\n", + "\n", + "This section extracts CDS sequences from the reference genome and validates them for downstream variant analysis.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "1c02b365", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Reference genome: /data/for_paper/data/reference/hg38/hg38.fa\n", + "Annotation file: /data/for_paper/data/reference/gencode.v47.basic.annotation.processed.tsv\n", + "Output file: /data/for_paper/data/reference/gencode.v47.basic.annotation.processed.filtered.tsv\n" + ] + } + ], + "source": [ + "# =============================================================================\n", + "# Additional Configuration for CDS Extraction\n", + "# =============================================================================\n", + "\n", + "# Reference genome (GRCh38/hg38)\n", + "REFERENCE_GENOME = f\"{REFERENCE_DIR}/hg38/hg38.fa\"\n", + "\n", + "# Input: Processed annotation from above\n", + "ANNOTATION_FILE = OUTPUT_FILES[\"hg38\"]\n", + "\n", + "# Output: Filtered transcripts with CDS sequences\n", + "FILTERED_TRANSCRIPTS_FILE = f\"{REFERENCE_DIR}/gencode.{GENCODE_VERSION}.basic.annotation.processed.filtered.tsv\"\n", + "\n", + "# Valid chromosomes for analysis\n", + "VALID_CHROMS = [f\"chr{i}\" for i in range(1, 23)] + [\"chrX\"]\n", + "\n", + "# DNA complement mapping for reverse complement operations\n", + "COMPLEMENT = {\"A\": \"T\", \"T\": \"A\", \"G\": \"C\", \"C\": \"G\", \"N\": \"N\"}\n", + "\n", + "print(f\"Reference genome: {REFERENCE_GENOME}\")\n", + "print(f\"Annotation file: {ANNOTATION_FILE}\")\n", + "print(f\"Output file: {FILTERED_TRANSCRIPTS_FILE}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "486819ee", + "metadata": {}, + "outputs": [], + "source": [ + "# =============================================================================\n", + "# CDS Extraction and Quality Control Functions\n", + "# =============================================================================\n", + "\n", + "\n", + "def extract_cds_sequence(row, fasta):\n", + " \"\"\"\n", + " Extract the coding sequence (CDS) for a transcript from the reference genome.\n", + "\n", + " This function:\n", + " 1. 
Iterates through exons and extracts only the CDS-overlapping portions\n", + " 2. Concatenates exon sequences in genomic order\n", + " 3. Reverse complements for minus strand genes\n", + "\n", + " Args:\n", + " row: DataFrame row containing transcript annotation (chrom, strand, cdsStart, cdsEnd, exonStarts, exonEnds)\n", + " fasta: Dictionary mapping chromosome names to their sequences\n", + "\n", + " Returns:\n", + " str: The complete CDS sequence in 5' to 3' orientation (transcript strand)\n", + " \"\"\"\n", + " chrom = row[\"chrom\"]\n", + " strand = row[\"strand\"]\n", + " cds_start = row[\"cdsStart\"]\n", + " cds_end = row[\"cdsEnd\"]\n", + "\n", + " # Parse comma-separated exon coordinates from annotation file\n", + " exon_starts = [int(x) for x in row[\"exonStarts\"].rstrip(\",\").split(\",\")]\n", + " exon_ends = [int(x) for x in row[\"exonEnds\"].rstrip(\",\").split(\",\")]\n", + "\n", + " # Ensure exon boundaries encompass the full CDS (handles edge cases)\n", + " if exon_starts[0] > cds_start:\n", + " exon_starts[0] = cds_start\n", + " if exon_ends[-1] < cds_end:\n", + " exon_ends[-1] = cds_end\n", + "\n", + " # Extract CDS sequence by iterating through exons\n", + " cds_sequence = \"\"\n", + "\n", + " for start, end in zip(exon_starts, exon_ends):\n", + " # Find overlap between this exon and the CDS region\n", + " overlap_start = max(start, cds_start)\n", + " overlap_end = min(end, cds_end)\n", + "\n", + " if overlap_start < overlap_end:\n", + " # Extract sequence from this exon segment (0-based coordinates)\n", + " seq = str(fasta[chrom][overlap_start:overlap_end]).upper()\n", + " cds_sequence += seq\n", + "\n", + " # For minus strand genes, reverse complement to get 5' to 3' orientation\n", + " if strand == \"-\":\n", + " cds_sequence = \"\".join(COMPLEMENT[base] for base in cds_sequence[::-1])\n", + "\n", + " return cds_sequence\n", + "\n", + "\n", + "def check_cds_quality(sequence):\n", + " \"\"\"\n", + " Validate CDS sequence quality for downstream 
variant analysis.\n", + "\n", + " Quality criteria checked:\n", + " 1. Starts with ATG (methionine start codon)\n", + " 2. Ends with a stop codon (TAA, TAG, or TGA)\n", + " 3. Length is divisible by 3 (complete codons)\n", + " 4. No premature stop codons within the coding region\n", + "\n", + " Args:\n", + " sequence: CDS nucleotide sequence string\n", + "\n", + " Returns:\n", + " dict: Quality metrics including boolean flags and sequence length\n", + " \"\"\"\n", + " if not sequence or len(sequence) < 3:\n", + " return {\n", + " \"has_start_codon\": False,\n", + " \"has_stop_codon\": False,\n", + " \"length_divisible_by_3\": False,\n", + " \"has_internal_stop_codons\": False,\n", + " \"length\": len(sequence) if sequence else 0,\n", + " }\n", + "\n", + " # Check for canonical start codon (ATG = Methionine)\n", + " has_start_codon = sequence[:3] == \"ATG\"\n", + "\n", + " # Check for stop codon at the end\n", + " has_stop_codon = sequence[-3:] in [\"TAA\", \"TAG\", \"TGA\"]\n", + "\n", + " # CDS should be in-frame (length divisible by 3)\n", + " length_divisible_by_3 = len(sequence) % 3 == 0\n", + "\n", + " # Check for internal stop codons (premature termination)\n", + " # These indicate potential annotation errors or pseudogenes\n", + " has_internal_stop_codons = False\n", + " if len(sequence) >= 6: # Need at least 2 codons to check for internal stops\n", + " # Check all codons except the last one (which should be a stop)\n", + " for i in range(0, len(sequence) - 3, 3):\n", + " codon = sequence[i : i + 3]\n", + " if codon in [\"TAA\", \"TAG\", \"TGA\"]:\n", + " has_internal_stop_codons = True\n", + " break\n", + "\n", + " return {\n", + " \"has_start_codon\": has_start_codon,\n", + " \"has_stop_codon\": has_stop_codon,\n", + " \"length_divisible_by_3\": length_divisible_by_3,\n", + " \"has_internal_stop_codons\": has_internal_stop_codons,\n", + " \"length\": len(sequence),\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "7a083e5e", 
+ "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 23 chromosomes from /data/for_paper/data/reference/hg38/hg38.fa\n" + ] + } + ], + "source": [ + "# =============================================================================\n", + "# Load Reference Genome (GRCh38/hg38)\n", + "# =============================================================================\n", + "# Pre-load chromosome sequences into memory for faster access during variant generation.\n", + "# Only loading the chromosomes listed in VALID_CHROMS (1-22 and X; chrY is not included) - excluding patches and alternate contigs.\n", + "\n", + "import pyfaidx\n", + "\n", + "\n", + "fasta = {}\n", + "\n", + "with pyfaidx.Fasta(REFERENCE_GENOME) as f:\n", + " for chrom in VALID_CHROMS:\n", + " fasta[chrom] = f[chrom][:].seq\n", + "print(f\"Loaded {len(fasta)} chromosomes from {REFERENCE_GENOME}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "9f350338", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 64,488 transcripts from /data/for_paper/data/reference/gencode.v47.basic.annotation.processed.tsv\n", + "After deduplicating by genomic structure: 51,650 transcripts\n", + "Extracting CDS sequences...\n", + "Running quality checks...\n", + "\n", + "============================================================\n", + "CDS Quality Summary (before filtering):\n", + "============================================================\n", + "Total transcripts: 51,650\n", + "Has start codon (ATG): 51,419 (99.6%)\n", + "Has stop codon (TAA/TAG/TGA): 51,341 (99.4%)\n", + "Length divisible by 3: 51,573 (99.9%)\n", + "Has internal stop codons: 113 (0.2%)\n", + "All quality criteria met: 51,061\n", + "\n", + "After quality filtering: 51,061 transcripts\n", + "After filtering to canonical transcripts: 19,407 transcripts\n", + "After canonical filter + CDS deduplication: 19,310 unique transcripts\n", + " (Removed 31,751 transcripts)\n", +
"Saved 19,310 unique transcripts to /data/for_paper/data/reference/gencode.v47.basic.annotation.processed.filtered.tsv\n" + ] + } + ], + "source": [ + "# =============================================================================\n", + "# Load Annotations, Extract CDS Sequences, and Apply Quality Filters\n", + "# =============================================================================\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# Step 1: Load processed GENCODE annotation\n", + "# -----------------------------------------------------------------------------\n", + "# Input: TSV file from the processing above containing\n", + "# transcript coordinates, exon boundaries, and canonical status flags\n", + "ann = pl.read_csv(ANNOTATION_FILE, separator=\"\\t\")\n", + "ann = ann.filter(pl.col(\"chrom\").is_in(VALID_CHROMS))\n", + "print(f\"Loaded {len(ann):,} transcripts from {ANNOTATION_FILE}\")\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# Step 2: Deduplicate transcripts with identical genomic structure\n", + "# -----------------------------------------------------------------------------\n", + "# Multiple transcript IDs can map to the same CDS coordinates (e.g., RefSeq vs Ensembl)\n", + "# Keep MANE Select > Ensembl Canonical when duplicates exist\n", + "ann = ann.sort([\"is_mane_select\", \"is_canonical\"], descending=True)\n", + "ann = ann.unique(subset=[\"chrom\", \"strand\", \"cdsStart\", \"cdsEnd\", \"exonStarts\", \"exonEnds\"])\n", + "print(f\"After deduplicating by genomic structure: {len(ann):,} transcripts\")\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# Step 3: Extract CDS sequences from reference genome\n", + "# -----------------------------------------------------------------------------\n", + "print(\"Extracting CDS sequences...\")\n", + "sequences = [extract_cds_sequence(row, 
fasta) for row in ann.iter_rows(named=True)]\n", + "ann = ann.with_columns(pl.Series(\"cds_sequence\", sequences))\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# Step 4: Quality control - validate CDS sequences\n", + "# -----------------------------------------------------------------------------\n", + "print(\"Running quality checks...\")\n", + "quality_checks = [check_cds_quality(row[\"cds_sequence\"]) for row in ann.iter_rows(named=True)]\n", + "\n", + "ann = ann.with_columns(\n", + " [\n", + " pl.Series(\"has_start_codon\", [q[\"has_start_codon\"] for q in quality_checks]),\n", + " pl.Series(\"has_stop_codon\", [q[\"has_stop_codon\"] for q in quality_checks]),\n", + " pl.Series(\"length_divisible_by_3\", [q[\"length_divisible_by_3\"] for q in quality_checks]),\n", + " pl.Series(\"has_internal_stop_codons\", [q[\"has_internal_stop_codons\"] for q in quality_checks]),\n", + " pl.Series(\"cds_length\", [q[\"length\"] for q in quality_checks]),\n", + " ]\n", + ")\n", + "\n", + "# Print quality summary\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"CDS Quality Summary (before filtering):\")\n", + "print(\"=\" * 60)\n", + "print(f\"Total transcripts: {len(ann):,}\")\n", + "print(f\"Has start codon (ATG): {ann['has_start_codon'].sum():,} ({ann['has_start_codon'].mean() * 100:.1f}%)\")\n", + "print(f\"Has stop codon (TAA/TAG/TGA): {ann['has_stop_codon'].sum():,} ({ann['has_stop_codon'].mean() * 100:.1f}%)\")\n", + "print(\n", + " f\"Length divisible by 3: {ann['length_divisible_by_3'].sum():,} ({ann['length_divisible_by_3'].mean() * 100:.1f}%)\"\n", + ")\n", + "print(\n", + " f\"Has internal stop codons: {ann['has_internal_stop_codons'].sum():,} ({ann['has_internal_stop_codons'].mean() * 100:.1f}%)\"\n", + ")\n", + "print(\n", + " f\"All quality criteria met: {(ann['has_start_codon'] & ann['has_stop_codon'] & ann['length_divisible_by_3'] & ~ann['has_internal_stop_codons']).sum():,}\"\n", + ")\n", + "\n", + "# 
-----------------------------------------------------------------------------\n", + "# Step 5: Apply quality filters\n", + "# -----------------------------------------------------------------------------\n", + "# Keep only transcripts that pass all quality checks\n", + "ann = ann.filter(\n", + " pl.col(\"has_start_codon\")\n", + " & pl.col(\"has_stop_codon\")\n", + " & pl.col(\"length_divisible_by_3\")\n", + " & ~pl.col(\"has_internal_stop_codons\")\n", + ")\n", + "print(f\"\\nAfter quality filtering: {len(ann):,} transcripts\")\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# Step 6: Filter to canonical transcripts and deduplicate by CDS sequence\n", + "# -----------------------------------------------------------------------------\n", + "# Keep only MANE Select or Ensembl Canonical transcripts\n", + "# Then deduplicate by CDS sequence (different transcripts can encode identical proteins)\n", + "initial_count = len(ann)\n", + "ann = ann.filter(pl.col(\"is_mane_select\") | pl.col(\"is_canonical\"))\n", + "print(f\"After filtering to canonical transcripts: {len(ann):,} transcripts\")\n", + "\n", + "ann = ann.sort([\"is_mane_select\", \"is_canonical\"], descending=True)\n", + "ann = ann.unique(subset=[\"cds_sequence\"], keep=\"first\")\n", + "ann = ann.sort([\"chrom\", \"txStart\"])\n", + "\n", + "print(f\"After canonical filter + CDS deduplication: {len(ann):,} unique transcripts\")\n", + "print(f\" (Removed {initial_count - len(ann):,} transcripts)\")\n", + "ann.write_csv(FILTERED_TRANSCRIPTS_FILE, separator=\"\\t\")\n", + "print(f\"Saved {len(ann):,} unique transcripts to {FILTERED_TRANSCRIPTS_FILE}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": 
"python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/4-EnCodon-Downstream-Task-riboNN.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/4-EnCodon-Downstream-Task-riboNN.ipynb index 4f163050e9..fc993a2b59 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/4-EnCodon-Downstream-Task-riboNN.ipynb +++ b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/4-EnCodon-Downstream-Task-riboNN.ipynb @@ -108,6 +108,12 @@ ")\n", "download_checkpoint(\n", " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-1B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-1B-v1\"\n", + ")\n", + "download_checkpoint(\n", + " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-5B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-5B-v1\"\n", + ")\n", + "download_checkpoint(\n", + " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\"\n", ")" ] }, @@ -123,6 +129,8 @@ " \"/data/checkpoints/NV-CodonFM-Encodon-TE-80M-v1\",\n", " \"/data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1\",\n", " \"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-1B-v1\",\n", + " \"/data/checkpoints/NV-CodonFM-Encodon-TE-5B-v1\",\n", + " \"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\",\n", "]\n", "\n", "checkpoint_path = checkpoint_paths[0]\n", diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/5-EnCodon-Downstream-Task-mRFP-expression.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/5-EnCodon-Downstream-Task-mRFP-expression.ipynb index fccb64161e..aaeafff7a0 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/5-EnCodon-Downstream-Task-mRFP-expression.ipynb +++ b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/5-EnCodon-Downstream-Task-mRFP-expression.ipynb @@ -158,6 +158,12 @@ ")\n", "download_checkpoint(\n", " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-1B-v1\", 
local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-1B-v1\"\n", + ")\n", + "download_checkpoint(\n", + " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-5B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-5B-v1\"\n", + ")\n", + "download_checkpoint(\n", + " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\"\n", ")" ] }, @@ -173,6 +179,8 @@ " \"/data/checkpoints/NV-CodonFM-Encodon-TE-80M-v1\",\n", " \"/data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1\",\n", " \"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-1B-v1\",\n", + " \"/data/checkpoints/NV-CodonFM-Encodon-TE-5B-v1\",\n", + " \"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\",\n", "]\n", "\n", "checkpoint_path = checkpoint_paths[0]\n", diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/6-EnCodon-Downstream-Task-mRNA-stability.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/6-EnCodon-Downstream-Task-mRNA-stability.ipynb index 58aa842fe4..0741b246e6 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/6-EnCodon-Downstream-Task-mRNA-stability.ipynb +++ b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/6-EnCodon-Downstream-Task-mRNA-stability.ipynb @@ -113,6 +113,12 @@ ")\n", "download_checkpoint(\n", " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-1B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-1B-v1\"\n", + ")\n", + "download_checkpoint(\n", + " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-5B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-5B-v1\"\n", + ")\n", + "download_checkpoint(\n", + " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\"\n", ")" ] }, @@ -128,6 +134,8 @@ " \"/data/checkpoints/NV-CodonFM-Encodon-TE-80M-v1\",\n", " \"/data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1\",\n", " \"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-1B-v1\",\n", + " \"/data/checkpoints/NV-CodonFM-Encodon-TE-5B-v1\",\n", + " 
\"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\",\n", "]\n", "\n", "checkpoint_path = checkpoint_paths[0]\n", diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/requirements.txt b/bionemo-recipes/recipes/codonfm_ptl_te/requirements.txt index 9e28afa182..2bdfb2ed09 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/requirements.txt +++ b/bionemo-recipes/recipes/codonfm_ptl_te/requirements.txt @@ -158,6 +158,7 @@ numpy==1.26.4 nvidia-dali-cuda130==1.51.2 nvidia-resiliency-ext==0.4.1 omegaconf==2.3.0 +openpyxl==3.1.5 opt_einsum==3.4.0 optree==0.17.0 optuna==2.10.1 diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/src/config.py b/bionemo-recipes/recipes/codonfm_ptl_te/src/config.py index 63f6464e51..21354fd815 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/src/config.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/src/config.py @@ -133,8 +133,8 @@ def get_callbacks_config(args: Any) -> Dict[str, fdl.Config]: ), "model_summary": fdl.Config(ModelSummary, max_depth=-1), "lr_monitor": fdl.Config(LearningRateMonitor, logging_interval="step", log_weight_decay=True), - "grad_norm_callback": fdl.Config(GradientNormLogger, log_every_n_steps=100), - "timer_callback": fdl.Config(StepTimingCallback, log_every_n_steps=100, mode="train"), + "grad_norm_callback": fdl.Config(GradientNormLogger, log_every_n_steps=args.log_every_n_steps), + "timer_callback": fdl.Config(StepTimingCallback, log_every_n_steps=args.log_every_n_steps, mode="train"), } if args.mode == "eval": callbacks["pred_writer"] = fdl.Config( @@ -251,6 +251,12 @@ def get_logger_config(args: Any) -> fdl.Config: "num_attention_heads": 16, "num_hidden_layers": 18, }, + "encodon_5b": { + "hidden_size": 4096, + "intermediate_size": 16384, + "num_attention_heads": 32, + "num_hidden_layers": 24, + }, "encodon_10b": { "hidden_size": 5120, "intermediate_size": 20480, diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/src/runner.py b/bionemo-recipes/recipes/codonfm_ptl_te/src/runner.py index 6cb842e561..bef7727b7e 
100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/src/runner.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/src/runner.py @@ -113,6 +113,7 @@ def get_parser(): # noqa: D103 "encodon_80m", "encodon_600m", "encodon_1b", + "encodon_5b", "encodon_10b", ], ) @@ -150,6 +151,7 @@ def get_parser(): # noqa: D103 default=None, help="For evaluation, the directory to write predictions to.", ) + parser.add_argument("--task_type", type=str, default=None, help="For evaluation, the task type to run.") # Finetune specific parser.add_argument(