From db33a6a76f451786418c8ed313b93df6b5696f13 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Fri, 13 Feb 2026 15:43:37 +0000 Subject: [PATCH 01/13] add codonfm 5b arch params Signed-off-by: Bruno Alvisio --- .../codonfm_ckpt_te_conversion.py | 43 ++++++++++++++++++- .../data_scripts/check_codon_frequency.py | 2 +- .../4-EnCodon-Downstream-Task-riboNN.ipynb | 8 ++++ ...odon-Downstream-Task-mRFP-expression.ipynb | 8 ++++ ...Codon-Downstream-Task-mRNA-stability.ipynb | 8 ++++ .../recipes/codonfm_ptl_te/src/config.py | 6 +++ .../recipes/codonfm_ptl_te/src/runner.py | 1 + 7 files changed, 73 insertions(+), 3 deletions(-) diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/codonfm_ckpt_te_conversion.py b/bionemo-recipes/recipes/codonfm_ptl_te/codonfm_ckpt_te_conversion.py index dc811a87fc..6bf5c501ba 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/codonfm_ckpt_te_conversion.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/codonfm_ckpt_te_conversion.py @@ -26,14 +26,39 @@ import argparse import logging +import os import torch +from safetensors.torch import save_file as safetensors_save_file from src.utils.load_checkpoint import load_checkpoint logger = logging.getLogger(__name__) +ALLOWED_HYPERPARAMETER_KEYS = ( + "vocab_size", + "hidden_size", + "num_hidden_layers", + "num_attention_heads", + "intermediate_size", + "hidden_act", + "hidden_dropout_prob", + "attention_probs_dropout_prob", + "initializer_range", + "layer_norm_eps", + "pad_token_id", + "position_embedding_type", + "classifier_dropout", + "rotary_theta", + "ignore_index", + "loss_type", + "lora", + "lora_alpha", + "lora_r", + "lora_dropout", +) + # PYTorch -> TE keymap PYTORCH_TO_TE_KEYMAP = { "model.layers.*.pre_attn_layer_norm.weight": "model.layers.*.self_attention.layernorm_qkv.layer_norm_weight", @@ -300,6 +325,11 @@ def convert_state_dict(src: dict, keymap: dict): return dst_state_dict +def filter_hyper_parameters(hyper_parameters: dict) -> dict: + """Keep only conversion-compatible 
hyperparameter keys.""" + return {key: value for key, value in hyper_parameters.items() if key in ALLOWED_HYPERPARAMETER_KEYS} + + def main(): """Main function.""" logging.basicConfig(level=logging.INFO) @@ -325,6 +355,7 @@ def main(): # Load source checkpoint (automatically detects format) logger.info(f"Loading checkpoint from {args.src}") src_checkpoint = load_checkpoint(args.src, map_location="cpu") + src_checkpoint["hyper_parameters"] = filter_hyper_parameters(src_checkpoint["hyper_parameters"]) # Perform conversion based on direction if args.direction == "pytorch2te": @@ -341,11 +372,19 @@ def main(): dst_state_dict = split_qkv(converted_state_dict, src_checkpoint["hyper_parameters"]) # Prepare final checkpoint - dst_checkpoint = {"state_dict": dst_state_dict, "hyper_parameters": src_checkpoint["hyper_parameters"]} + dst_checkpoint = { + "state_dict": dst_state_dict, + "hyper_parameters": src_checkpoint["hyper_parameters"], + } # Save the converted checkpoint in pickled format torch.save(dst_checkpoint, args.dst) - logger.info(f"Successfully converted checkpoint from {args.src} to {args.dst}") + logger.info(f"Successfully converted checkpoint saved to {args.dst}") + + # Save the state_dict in safetensors format alongside the .ckpt file + safetensors_path = os.path.splitext(args.dst)[0] + ".safetensors" + safetensors_save_file(dst_state_dict, safetensors_path) + logger.info(f"Successfully saved safetensors checkpoint to {safetensors_path}") if __name__ == "__main__": diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py index 525a426c31..5c130a4f96 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py @@ -23,7 +23,7 @@ from tqdm import tqdm -sys.path.append("/workspace/codon_fm") +sys.path.append("/workspace/codonfm") from 
src.tokenizer import Tokenizer diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/4-EnCodon-Downstream-Task-riboNN.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/4-EnCodon-Downstream-Task-riboNN.ipynb index 4f163050e9..fc993a2b59 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/4-EnCodon-Downstream-Task-riboNN.ipynb +++ b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/4-EnCodon-Downstream-Task-riboNN.ipynb @@ -108,6 +108,12 @@ ")\n", "download_checkpoint(\n", " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-1B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-1B-v1\"\n", + ")\n", + "download_checkpoint(\n", + " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-5B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-5B-v1\"\n", + ")\n", + "download_checkpoint(\n", + " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\"\n", ")" ] }, @@ -123,6 +129,8 @@ " \"/data/checkpoints/NV-CodonFM-Encodon-TE-80M-v1\",\n", " \"/data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1\",\n", " \"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-1B-v1\",\n", + " \"/data/checkpoints/NV-CodonFM-Encodon-TE-5B-v1\",\n", + " \"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\",\n", "]\n", "\n", "checkpoint_path = checkpoint_paths[0]\n", diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/5-EnCodon-Downstream-Task-mRFP-expression.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/5-EnCodon-Downstream-Task-mRFP-expression.ipynb index fccb64161e..aaeafff7a0 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/5-EnCodon-Downstream-Task-mRFP-expression.ipynb +++ b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/5-EnCodon-Downstream-Task-mRFP-expression.ipynb @@ -158,6 +158,12 @@ ")\n", "download_checkpoint(\n", " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-1B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-1B-v1\"\n", + ")\n", + "download_checkpoint(\n", + " 
repo_id=\"nvidia/NV-CodonFM-Encodon-TE-5B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-5B-v1\"\n", + ")\n", + "download_checkpoint(\n", + " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\"\n", ")" ] }, @@ -173,6 +179,8 @@ " \"/data/checkpoints/NV-CodonFM-Encodon-TE-80M-v1\",\n", " \"/data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1\",\n", " \"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-1B-v1\",\n", + " \"/data/checkpoints/NV-CodonFM-Encodon-TE-5B-v1\",\n", + " \"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\",\n", "]\n", "\n", "checkpoint_path = checkpoint_paths[0]\n", diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/6-EnCodon-Downstream-Task-mRNA-stability.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/6-EnCodon-Downstream-Task-mRNA-stability.ipynb index 58aa842fe4..0741b246e6 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/6-EnCodon-Downstream-Task-mRNA-stability.ipynb +++ b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/6-EnCodon-Downstream-Task-mRNA-stability.ipynb @@ -113,6 +113,12 @@ ")\n", "download_checkpoint(\n", " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-1B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-1B-v1\"\n", + ")\n", + "download_checkpoint(\n", + " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-5B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-5B-v1\"\n", + ")\n", + "download_checkpoint(\n", + " repo_id=\"nvidia/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\", local_dir=\"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\"\n", ")" ] }, @@ -128,6 +134,8 @@ " \"/data/checkpoints/NV-CodonFM-Encodon-TE-80M-v1\",\n", " \"/data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1\",\n", " \"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-1B-v1\",\n", + " \"/data/checkpoints/NV-CodonFM-Encodon-TE-5B-v1\",\n", + " \"/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-5B-v1\",\n", "]\n", "\n", "checkpoint_path = checkpoint_paths[0]\n", diff --git 
a/bionemo-recipes/recipes/codonfm_ptl_te/src/config.py b/bionemo-recipes/recipes/codonfm_ptl_te/src/config.py index 63f6464e51..effa1a455c 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/src/config.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/src/config.py @@ -251,6 +251,12 @@ def get_logger_config(args: Any) -> fdl.Config: "num_attention_heads": 16, "num_hidden_layers": 18, }, + "encodon_5b": { + "hidden_size": 4096, + "intermediate_size": 16384, + "num_attention_heads": 32, + "num_hidden_layers": 24, + }, "encodon_10b": { "hidden_size": 5120, "intermediate_size": 20480, diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/src/runner.py b/bionemo-recipes/recipes/codonfm_ptl_te/src/runner.py index 6cb842e561..16a5eecba0 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/src/runner.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/src/runner.py @@ -113,6 +113,7 @@ def get_parser(): # noqa: D103 "encodon_80m", "encodon_600m", "encodon_1b", + "encodon_5b", "encodon_10b", ], ) From 73d5b555eac9dbdffc6c71d3b093d903828e83a5 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 24 Feb 2026 02:23:25 +0000 Subject: [PATCH 02/13] add script to download data --- .../notebooks/download_ucsc_tables.py | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 bionemo-recipes/recipes/codonfm_ptl_te/notebooks/download_ucsc_tables.py diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/download_ucsc_tables.py b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/download_ucsc_tables.py new file mode 100644 index 0000000000..9e191b9035 --- /dev/null +++ b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/download_ucsc_tables.py @@ -0,0 +1,149 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Download TSV files from the UCSC Table Browser via POST requests. + +Usage: + python download_ucsc_tables.py # download all four tables + python download_ucsc_tables.py --table ncbiRefSeq # download a single table + python download_ucsc_tables.py --output-dir /data/ncbi # custom output directory +""" + +import argparse +import os + +import requests + + +UCSC_URL = "https://genome.ucsc.edu/cgi-bin/hgTables" + +TABLES = { + "wgEncodeGencodeCompV32": { + "hgsid": "3727160771_KywqrMbVutzoVUyr47py53TcxZMg", # pragma: allowlist secret + "clade": "mammal", + "org": "Human", + "db": "hg38", + "hgta_group": "allTables", + "hgta_track": "hg38", + "hgta_table": "wgEncodeGencodeCompV32", + "hgta_regionType": "genome", + "position": "chr7:155,799,529-155,812,871", + "hgta_outSep": "tab", + "hgta_doTopSubmit": "Get output", + "filename": "ucsc_gencodev32_hg38.tsv", + }, + "ncbiRefSeq": { + "hgsid": "3727549177_A4TjXykIK1JRVnpjZ0HKtMVnKWw0", # pragma: allowlist secret + "clade": "mammal", + "org": "Human", + "db": "hg38", + "hgta_group": "allTables", + "hgta_track": "hg38", + "hgta_table": "ncbiRefSeq", + "hgta_regionType": "genome", + "position": "chr7:155,799,529-155,812,871", + "hgta_outSep": "tab", + "hgta_doTopSubmit": "Get output", + "subdir": "clinvar_syn", + "filename": "ucsc_refseq_hg38.tsv", + }, + "ncbiRefSeqHistorical": { + "hgsid": "3727803393_8Oali1duOyVJT7DtAateRwtkg7Y0", # pragma: allowlist secret + "clade": "mammal", + "org": "Human", + "db": "hg38", + "hgta_group": "allTables", + "hgta_track": "hg38", + "hgta_table": "ncbiRefSeqHistorical", + 
"hgta_regionType": "genome", + "position": "chr7:155,799,529-155,812,871", + "hgta_outSep": "tab", + "hgta_doTopSubmit": "Get output", + "subdir": "clinvar_syn", + "filename": "ucsc_refseq_hist_hg38.tsv", + }, + "pliByGene": { + "hgsid": "3727823409_x06fwXO5XFeWrbFjKlSQTfU3I6F3", # pragma: allowlist secret + "clade": "mammal", + "org": "Human", + "db": "hg38", + "hgta_group": "varRep", + "hgta_track": "gnomadPLI", + "hgta_table": "pliByGene", + "hgta_regionType": "genome", + "position": "chr7:155,799,529-155,812,871", + "hgta_outSep": "tab", + "hgta_doTopSubmit": "Get output", + "filename": "ucsc_pliByGene_hg38.tsv", + }, +} + + +def download_table(table_name: str, output_dir: str, api_key: str) -> str: + """POST to the UCSC Table Browser and save the result as a TSV.""" + cfg = TABLES[table_name] + cfg["apiKey"] = api_key + dest_dir = os.path.join(output_dir, cfg.get("subdir", "")) if cfg.get("subdir") else output_dir + os.makedirs(dest_dir, exist_ok=True) + dest = os.path.join(dest_dir, cfg["filename"]) + + if os.path.exists(dest): + print(f" [skip] {dest}") + return dest + + print(f" Downloading {table_name} → {dest} ...") + + resp = requests.post(UCSC_URL, timeout=300, data=cfg) + resp.raise_for_status() + + if "" in resp.text: + error_start = resp.text.index("") + error_end = ( + resp.text.index("") if "" in resp.text else error_start + 500 + ) + raise RuntimeError(f"UCSC returned an error:\n{resp.text[error_start:error_end]}") + + lines = resp.text.splitlines(keepends=True) + while lines: + tail = lines[-1].strip() + if not tail or tail.startswith("---") or "cookie" in tail.lower(): + lines.pop() + else: + break + + with open(dest, "w") as f: + f.writelines(lines) + + print(f" [done] {dest} ({len(lines):,} lines)") + return dest + + +def main(): + """Download UCSC Table Browser tables as TSV.""" + parser = argparse.ArgumentParser(description="Download UCSC Table Browser tables as TSV") + parser.add_argument("--table", choices=list(TABLES.keys()), help="Single 
table to download (default: all)") + parser.add_argument("--output-dir", default=".", help="Base output directory (default: cwd)") + parser.add_argument("--api-key", required=True, help="API key for UCSC Table Browser") + args = parser.parse_args() + + tables = [args.table] if args.table else list(TABLES.keys()) + + for t in tables: + print(f"=== {t} ===") + download_table(t, args.output_dir, args.api_key) + + +if __name__ == "__main__": + main() From ac2895d00c8b0e1028e3c3a2481529da3b4b0c00 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 24 Feb 2026 02:55:53 +0000 Subject: [PATCH 03/13] updated data download --- .../00-Mutation-Datasets-Preprocessing.ipynb | 4018 ++++++++++++++--- .../recipes/codonfm_ptl_te/requirements.txt | 1 + 2 files changed, 3504 insertions(+), 515 deletions(-) diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb index f01734c922..a22bede07c 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb +++ b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb @@ -7,16 +7,41 @@ "source": [ "# Mutation Datasets Preprocessing\n", "\n", - "This notebook preprocesses four mutation variant datasets for downstream analysis. For each dataset, we extract coding sequence (CDS) context from reference genome annotations, annotate variants with transcript information, codon changes, and amino acid translations, and save the processed data in a standardized format.\n", + "This notebook preprocesses five mutation variant datasets for downstream analysis. 
For each dataset, we extract coding sequence (CDS) context from reference genome annotations, annotate variants with transcript information, codon changes, and amino acid translations, and save the processed data in a standardized format.\n", + "\n", + "---\n", + "## 📋 Table of Contents\n", + "\n", + "1. **[DDD / ASD Dataset](#1-ddd-asd-dataset)** - Developmental disorder and autism spectrum disorder variants (hg19)\n", + "2. **[ClinVar AlphaMissense](#2-clinvar-alphamissense-dataset)** - ClinVar missense variants with AlphaMissense scores (hg38)\n", + "3. **[Cancer Hotspot](#3-cancer-hotspot)** - Cancer hotspot mutations with AlphaMissense annotations (hg38)\n", + "4. **[ClinVar Synonymous](#4-clinvar-synonymous)** - ClinVar synonymous variants with conservation features (hg38)\n", + "5. **[CHD Missense Dataset](#5-chd-missense-dataset)** - Congenital heart disease rare mutations with DDD/ASD controls (hg19)\n", + "6. **[COSMIC Synonymous](#6-cosmic-synonymous-analyses-data)** - COSMIC synonymous analyses data (hg38)\n", + " - **[COSMIC](#cosmic)** - COSMIC mutant census variants\n", + " - **[gnomAD Common Variants](#gnomad-common-variants)** - gnomAD common variants for comparison\n", + "\n", + "---\n", "\n", "## Required Pre-processing Steps\n", "\n", - "Before generation the mutation sequences for zero-shot benchmarks, ensure that the following files are downloaded/processed and saved at `/data/ncbi`\n", + "Before generating the mutation sequences for zero-shot benchmarks, ensure that the following files are downloaded/processed.\n", + "\n", + "### 1. Open-source Data Download\n", "\n", - "#### 1. Open-source Data Download\n", + "There are two ways to obtain the data used by this notebook:\n", "\n", + "a. 
**Manual:**\n", + " - Use the links provided above to download each file individually.\n", + " - Use the [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) to export the required tables as TSV.\n", + " - Save them into the corresponding subdirectories under `DATA_DIR` (matching the filenames in the directory structure section above).\n", "\n", - "##### Reference Files\n", + "b. **Automatic (recommended):**\n", + " - Create a UCSC account: [hgLogin](https://genome.ucsc.edu/cgi-bin/hgLogin)\n", + " - Generate an API key: [hgHubConnect](https://genome.ucsc.edu/cgi-bin/hgHubConnect) → click **\"generate key\"**\n", + " - Paste the key into `UCSC_API_KEY` in the download cell below, then run the cell.\n", + "\n", + "#### 1.a. Manual Download - Reference Files\n", "| File | Origin |\n", "|----------------|-------- |\n", "| `hg19.fa` | [Download](https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz) |\n", @@ -40,89 +65,334 @@ "| `ddd_other` | [Zhou et al. 2022](https://www.nature.com/articles/s41588-022-01148-2) | [Download](https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01148-2/MediaObjects/41588_2022_1148_MOESM7_ESM.xlsx) | Supplementary Table 7 |\n", "| `AlphaMissense ClinVar` | [Cheng et al. 2023](https://www.science.org/doi/10.1126/science.adg7492) | [Download](https://www.science.org/doi/suppl/10.1126/science.adg7492/suppl_file/science.adg7492_data_s1_to_s9.zip) | Data S5 |\n", "| `AlphaMissense CancerHotspot` | [Cheng et al. 2023](https://www.science.org/doi/10.1126/science.adg7492) | [Download](https://www.science.org/doi/suppl/10.1126/science.adg7492/suppl_file/science.adg7492_data_s1_to_s9.zip) | Data S6 |\n", + "| `chd_rare_mutation.csv` | [Jin et al. 2017](https://pmc.ncbi.nlm.nih.gov/articles/PMC5675000/) | [Download](https://pmc.ncbi.nlm.nih.gov/articles/instance/5675000/bin/NIHMS906719-supplement-supp_datasets.xlsx) | Table S9 |\n", + "| `chd_mutation_ctrl.csv` | [Jin et al. 
2017](https://pmc.ncbi.nlm.nih.gov/articles/PMC5675000/) | [Download](https://pmc.ncbi.nlm.nih.gov/articles/instance/5675000/bin/NIHMS906719-supplement-supp_datasets.xlsx) | Table S10 |\n", + "| `Cosmic_Sample_v102_GRCh38.tsv.gz` | [COSMIC](https://cancer.sanger.ac.uk/cosmic) | [Download](https://cancer.sanger.ac.uk/cosmic/download) | Requires registration |\n", + "| `Cosmic_MutantCensus_v102_GRCh38.tsv.gz` | [COSMIC](https://cancer.sanger.ac.uk/cosmic) | [Download](https://cancer.sanger.ac.uk/cosmic/download) | Requires registration |\n", + "| `gnomad.exomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [Download](https://gnomad.broadinstitute.org/downloads#v4) | Per-chromosome TSV files |\n", + "| `gnomad.genomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [Download](https://gnomad.broadinstitute.org/downloads#v4) | Per-chromosome TSV files |\n", "\n", "##### ClinVar Synonymous Matching Features\n", "\n", "| File | Source | URL |\n", "|------|--------|-----|\n", "| `hg38.phyloP447way.bw` | UCSC Genome Browser | [Download](https://hgdownload.soe.ucsc.edu/goldenPath/hg38/phyloP447way/hg38.phyloP447way.bw) |\n", + "| `hg19.100way.phyloP100way.bw` | UCSC Genome Browser | [Download](https://hgdownload.soe.ucsc.edu/goldenPath/hg19/phyloP100way/hg19.100way.phyloP100way.bw) |\n", "| `ucsc_pliByGene_hg38.tsv` | UCSC Genome Browser → Table Browser | [Download](https://genome.ucsc.edu/cgi-bin/hgTables) (table: `pliByGene`) |\n", - "| `variant_summary.txt.gz` | NCBI ClinVar (FTP) | [Download](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz) |\n", + "| `gnomad.v2.1.1.lof_metrics.by_transcript.txt` | gnomAD | [Download](https://gnomad.broadinstitute.org/downloads) |\n", + "| `variant_summary.txt.gz` | NCBI ClinVar (FTP) | [Download](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz) |\n" + ] + }, + { + "cell_type": "markdown", + "id": "0ffb5238", + "metadata": 
{}, + "source": [ + "### 1.b. Automatic Download\n", + "\n", + "If you choose **Automatic**:\n", + " 1. Set the `DATA_DIR` where the files should be saved.\n", + " 2. Set the `UCSC_API_KEY` to download the tables from the UCSC table browser.\n", + " 3. Run the next cell to download the required datasets into `DATA_DIR`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6d796a4", + "metadata": {}, + "outputs": [], + "source": [ + "import gzip\n", + "import os\n", + "import shutil\n", + "import urllib.request\n", "\n", - "#### 2. Data Scripts\n", + "import pandas as pd\n", + "import requests\n", "\n", - "Before running this notebook, ensure the following preprocessing scripts have been executed:\n", "\n", - "| File | Purpose | How to Generate |\n", - "|------|---------|-----------------| \n", - "| `codon_counts_nopathogen.json` | Codon counts by taxonomic group (used for codon frequency features) | Run `python data_scripts/check_codon_frequency.py` after completing NCBI preprocessing in `data_scripts/data_curation/`. Place or symlink the produced file at `/data/ncbi/codon_counts_nopathogen.json`. |\n", - "| `gencode.v47lift37.basic.annotation.processed.tsv` | Processed GTF annotation with CDS coordinates | Run `python data_scripts/process_gtf.py` on the downloaded GENCODE GTF file `gencode.v47lift37.basic.annotation.gtf`. 
|\n", + "# ── Set data directory ───────────────────────────────────────\n", + "DATA_DIR = \"/data/ncbi\" # <-- change this to your preferred data root\n", + "OUTPUT_DIR = \"/data/for_paper/mutation_datasets\" # output directory where all processed datasets will be saved\n", + "UCSC_API_KEY = \"\" # <-- set your UCSC API key for Table Browser downloads\n", + "# ─────────────────────────────────────────────────────────────\n", + "\n", + "# Create output directory\n", + "os.makedirs(OUTPUT_DIR, exist_ok=True)\n", + "\n", + "for subdir in [\n", + " \"reference/hg19\",\n", + " \"reference/hg38\",\n", + " \"alphamissense_data\",\n", + " \"ddd_asd_zhouetal\",\n", + " \"clinvar_syn\",\n", + "]:\n", + " os.makedirs(os.path.join(DATA_DIR, subdir), exist_ok=True)\n", + "\n", + "\n", + "def download_file(url, dest, decompress_gz=False):\n", + " \"\"\"Download *url* → *dest*, optionally gunzipping in place. Skips if target already exists.\"\"\"\n", + " final = dest[:-3] if decompress_gz and dest.endswith(\".gz\") else dest\n", + " if os.path.exists(final):\n", + " print(f\" [skip] {os.path.relpath(final, DATA_DIR)}\")\n", + " return\n", + " print(f\" Downloading → {os.path.relpath(final, DATA_DIR)} ...\")\n", + " urllib.request.urlretrieve(url, dest)\n", + " if decompress_gz and dest.endswith(\".gz\"):\n", + " with gzip.open(dest, \"rb\") as f_in, open(final, \"wb\") as f_out:\n", + " shutil.copyfileobj(f_in, f_out)\n", + " os.remove(dest)\n", + "\n", + "\n", + "# ── 1. 
Reference genomes ────────────────────────────────────\n", + "print(\"Reference genomes\")\n", + "download_file(\n", + " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz\",\n", + " os.path.join(DATA_DIR, \"reference/hg19/hg19.fa.gz\"),\n", + " decompress_gz=True,\n", + ")\n", + "download_file(\n", + " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz\",\n", + " os.path.join(DATA_DIR, \"reference/hg38/hg38.fa.gz\"),\n", + " decompress_gz=True,\n", + ")\n", + "\n", + "# ── 2. GENCODE annotation (GTF) ─────────────────────────────\n", + "print(\"GENCODE annotation\")\n", + "download_file(\n", + " \"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/GRCh37_mapping/gencode.v47lift37.basic.annotation.gtf.gz\",\n", + " os.path.join(DATA_DIR, \"gencode.v47lift37.basic.annotation.gtf.gz\"),\n", + " decompress_gz=True,\n", + ")\n", + "\n", + "# ── 3. DDD / ASD variant files (Zhou et al. 2022, xlsx → csv)\n", + "print(\"DDD / ASD variant files\")\n", + "xlsx_sources = {\n", + " \"asd_discov\": \"https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01148-2/MediaObjects/41588_2022_1148_MOESM5_ESM.xlsx\",\n", + " \"asd_rep\": \"https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01148-2/MediaObjects/41588_2022_1148_MOESM6_ESM.xlsx\",\n", + " \"ddd_other\": \"https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01148-2/MediaObjects/41588_2022_1148_MOESM7_ESM.xlsx\",\n", + "}\n", + "\n", + "for name, url in xlsx_sources.items():\n", + " csv_path = os.path.join(DATA_DIR, \"ddd_asd_zhouetal\", f\"{name}.csv\")\n", + " if os.path.exists(csv_path):\n", + " print(f\" [skip] ddd_asd_zhouetal/{name}.csv\")\n", + " continue\n", + " xlsx_path = csv_path.replace(\".csv\", \".xlsx\")\n", + " download_file(url, xlsx_path)\n", + " print(f\" Converting {name}.xlsx → csv ...\")\n", + " pd.read_excel(xlsx_path).to_csv(csv_path, index=False)\n", + "\n", + "# ── 4. 
ClinVar variant summary ──────────────────────────────\n", + "print(\"ClinVar variant summary\")\n", + "download_file(\n", + " \"https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz\",\n", + " os.path.join(DATA_DIR, \"clinvar_syn/variant_summary.txt.gz\"),\n", + ")\n", + "\n", + "# ── 5. phyloP conservation scores ───────────────────────────\n", + "print(\"phyloP447way conservation scores\")\n", + "download_file(\n", + " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/phyloP447way/hg38.phyloP447way.bw\",\n", + " os.path.join(DATA_DIR, \"hg38.phyloP447way.bw\"),\n", + ")\n", + "\n", + "# ── 6. UCSC Table Browser downloads ─────────────────────────\n", + "UCSC_URL = \"https://genome.ucsc.edu/cgi-bin/hgTables\"\n", + "UCSC_TABLES = {\n", + " \"wgEncodeGencodeCompV32\": {\n", + " \"filename\": \"ucsc_gencodev32_hg38.tsv\",\n", + " \"subdir\": \"\",\n", + " \"form\": {\n", + " \"hgsid\": \"3727160771_KywqrMbVutzoVUyr47py53TcxZMg\", # pragma: allowlist secret\n", + " \"clade\": \"mammal\",\n", + " \"org\": \"Human\",\n", + " \"db\": \"hg38\",\n", + " \"hgta_group\": \"allTables\",\n", + " \"hgta_track\": \"hg38\",\n", + " \"hgta_table\": \"wgEncodeGencodeCompV32\",\n", + " \"hgta_regionType\": \"genome\",\n", + " \"position\": \"chr7:155,799,529-155,812,871\",\n", + " \"hgta_outSep\": \"tab\",\n", + " \"hgta_doTopSubmit\": \"Get output\",\n", + " },\n", + " },\n", + " \"ncbiRefSeq\": {\n", + " \"filename\": \"ucsc_refseq_hg38.tsv\",\n", + " \"subdir\": \"clinvar_syn\",\n", + " \"form\": {\n", + " \"hgsid\": \"3727549177_A4TjXykIK1JRVnpjZ0HKtMVnKWw0\", # pragma: allowlist secret\n", + " \"clade\": \"mammal\",\n", + " \"org\": \"Human\",\n", + " \"db\": \"hg38\",\n", + " \"hgta_group\": \"allTables\",\n", + " \"hgta_track\": \"hg38\",\n", + " \"hgta_table\": \"ncbiRefSeq\",\n", + " \"hgta_regionType\": \"genome\",\n", + " \"position\": \"chr7:155,799,529-155,812,871\",\n", + " \"hgta_outSep\": \"tab\",\n", + " \"hgta_doTopSubmit\": \"Get 
output\",\n", + " },\n", + " },\n", + " \"ncbiRefSeqHistorical\": {\n", + " \"filename\": \"ucsc_refseq_hist_hg38.tsv\",\n", + " \"subdir\": \"clinvar_syn\",\n", + " \"form\": {\n", + " \"hgsid\": \"3727803393_8Oali1duOyVJT7DtAateRwtkg7Y0\", # pragma: allowlist secret\n", + " \"clade\": \"mammal\",\n", + " \"org\": \"Human\",\n", + " \"db\": \"hg38\",\n", + " \"hgta_group\": \"allTables\",\n", + " \"hgta_track\": \"hg38\",\n", + " \"hgta_table\": \"ncbiRefSeqHistorical\",\n", + " \"hgta_regionType\": \"genome\",\n", + " \"position\": \"chr7:155,799,529-155,812,871\",\n", + " \"hgta_outSep\": \"tab\",\n", + " \"hgta_doTopSubmit\": \"Get output\",\n", + " },\n", + " },\n", + " \"pliByGene\": {\n", + " \"filename\": \"ucsc_pliByGene_hg38.tsv\",\n", + " \"subdir\": \"\",\n", + " \"form\": {\n", + " \"hgsid\": \"3727823409_x06fwXO5XFeWrbFjKlSQTfU3I6F3\", # pragma: allowlist secret\n", + " \"clade\": \"mammal\",\n", + " \"org\": \"Human\",\n", + " \"db\": \"hg38\",\n", + " \"hgta_group\": \"varRep\",\n", + " \"hgta_track\": \"gnomadPLI\",\n", + " \"hgta_table\": \"pliByGene\",\n", + " \"hgta_regionType\": \"genome\",\n", + " \"position\": \"chr7:155,799,529-155,812,871\",\n", + " \"hgta_outSep\": \"tab\",\n", + " \"hgta_doTopSubmit\": \"Get output\",\n", + " },\n", + " },\n", + "}\n", + "\n", + "print(\"UCSC Table Browser downloads\")\n", + "if not UCSC_API_KEY:\n", + " print(\" UCSC_API_KEY is not set — skipping automatic download.\")\n", + " print(\" Download these tables manually from https://genome.ucsc.edu/cgi-bin/hgTables:\")\n", + " for tbl_name, tbl_cfg in UCSC_TABLES.items():\n", + " dest_dir = os.path.join(DATA_DIR, tbl_cfg[\"subdir\"]) if tbl_cfg[\"subdir\"] else DATA_DIR\n", + " dest = os.path.join(dest_dir, tbl_cfg[\"filename\"])\n", + " status = \"found\" if os.path.exists(dest) else \"MISSING\"\n", + " rel = os.path.join(tbl_cfg[\"subdir\"], tbl_cfg[\"filename\"]) if tbl_cfg[\"subdir\"] else tbl_cfg[\"filename\"]\n", + " print(f\" [{status}] {rel} (table: 
{tbl_name})\")\n", + "else:\n", + " for tbl_name, tbl_cfg in UCSC_TABLES.items():\n", + " dest_dir = os.path.join(DATA_DIR, tbl_cfg[\"subdir\"]) if tbl_cfg[\"subdir\"] else DATA_DIR\n", + " os.makedirs(dest_dir, exist_ok=True)\n", + " dest = os.path.join(dest_dir, tbl_cfg[\"filename\"])\n", + "\n", + " if os.path.exists(dest):\n", + " print(f\" [skip] {os.path.relpath(dest, DATA_DIR)}\")\n", + " continue\n", "\n", + " print(f\" Downloading {tbl_name} → {os.path.relpath(dest, DATA_DIR)} ...\")\n", + " form = {**tbl_cfg[\"form\"], \"apiKey\": UCSC_API_KEY}\n", + " resp = requests.post(UCSC_URL, data=form, timeout=300)\n", + " resp.raise_for_status()\n", "\n", - "## Table of Contents\n", + " if \"\" in resp.text:\n", + " raise RuntimeError(f\"UCSC returned an error for {tbl_name}. Re-run the cell to retry.\")\n", "\n", - "| Section | Dataset | Description | Required Data Files |\n", - "|---------|---------|-------------|---------------------|\n", - "| **1** | [DDD / ASD Dataset](#1-ddd-asd-dataset) | Developmental disorder and autism spectrum disorder variants | `ddd_asd_zhouetal/asd_discov.csv`
`ddd_asd_zhouetal/asd_rep.csv`
`ddd_asd_zhouetal/ddd_other.csv`
`gencode.v47lift37.basic.annotation.processed.tsv`
`alphamissense_data/AlphaMissense_hg19.tsv.gz`
`reference/hg19/hg19.fa` |\n", - "| **2** | [ClinVar AlphaMissense](#2-clinvar-alphamissense-dataset) | ClinVar missense variants with AlphaMissense pathogenicity predictions | `alphamissense_data/alphamissense_clinvar.csv`
`ucsc_gencodev32_hg38.tsv`
`hg38/hg38.fa` |\n", - "| **3** | [Cancer Hotspot](#3-cancer-hotspot) | Cancer hotspot mutations with AlphaMissense scores | `alphamissense_data/alphamissense_cancer_hotspot.csv`
`ucsc_gencodev32_hg38.tsv`
`reference/hg38/hg38.fa` |\n", - "| **4** | [ClinVar Synonymous](#4-clinvar-synonymous) | Extract synonymous variants from ClinVar (benign and pathogenic labels) with optional additional features | `clinvar_syn/variant_summary.txt.gz`
`clinvar_syn/ucsc_refseq_hg38.tsv`
`clinvar_syn/ucsc_refseq_hist_hg38.tsv`
`reference/hg38/hg38.fa` |\n", + " lines = resp.text.splitlines(keepends=True)\n", + " while lines:\n", + " tail = lines[-1].strip()\n", + " if not tail or tail.startswith(\"---\") or \"cookie\" in tail.lower():\n", + " lines.pop()\n", + " else:\n", + " break\n", "\n", - "---\n" + " with open(dest, \"w\") as f:\n", + " f.writelines(lines)\n", + " print(f\" [done] {os.path.relpath(dest, DATA_DIR)} ({len(lines):,} lines)\")\n", + "\n", + "print(\"\\nDone.\")" ] }, { "cell_type": "markdown", - "id": "8d094b99", + "id": "59e18758", "metadata": {}, "source": [ - "# Imports and Paths setup" + "### 2. Download AlphaMissense Data\n", + "\n", + "The **AlphaMissense** data can only be downloaded manually due to the website's bot protection. [Download the zip file](https://www.science.org/doi/suppl/10.1126/science.adg7492/suppl_file/science.adg7492_data_s1_to_s9.zip) into `DATA_DIR/alphamissense_data` and run the next cell:" ] }, { "cell_type": "code", "execution_count": null, - "id": "8dfcfbe1", + "id": "40b6713c", "metadata": {}, "outputs": [], "source": [ - "# Uncomment to install PyBigWig\n", - "!pip install pyBigWig" + "import zipfile\n", + "\n", + "\n", + "print(\"AlphaMissense data\")\n", + "\n", + "am_data_dir = os.path.join(DATA_DIR, \"alphamissense_data\")\n", + "am_zip_path = os.path.join(am_data_dir, \"science.adg7492_data_s1_to_s9.zip\")\n", + "am_clinvar_path = os.path.join(am_data_dir, \"alphamissense_clinvar.csv\")\n", + "am_hotspot_path = os.path.join(am_data_dir, \"alphamissense_cancer_hotspot.csv\")\n", + "\n", + "if not os.path.exists(am_zip_path):\n", + " raise FileNotFoundError(\n", + " f\"Required file not found: {am_zip_path}\\n\"\n", + " \"Please manually download science.adg7492_data_s1_to_s9.zip into DATA_DIR/alphamissense_data/.\"\n", + " )\n", + "\n", + "with zipfile.ZipFile(am_zip_path, \"r\") as zf:\n", + " print(f\" Extracting zip → {zf.namelist()}\")\n", + " zf.extractall(am_data_dir)\n", + "\n", + "rename_map = {\n", + " 
\"science.adg7492_data_s5.csv\": am_clinvar_path,\n", + " \"science.adg7492_data_s6.csv\": am_hotspot_path,\n", + "}\n", + "\n", + "for src_name, dst_path in rename_map.items():\n", + " src_path = os.path.join(am_data_dir, src_name)\n", + " if os.path.exists(src_path):\n", + " os.replace(src_path, dst_path)\n", + " print(f\" Renamed {src_name} -> {os.path.basename(dst_path)}\")\n", + " elif os.path.exists(dst_path):\n", + " print(f\" [skip] {os.path.basename(dst_path)} already present\")\n", + " else:\n", + " raise FileNotFoundError(f\"Expected file not found after extraction: {src_path}\")" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "5dae6998", + "cell_type": "markdown", + "id": "5dd4178e", "metadata": {}, - "outputs": [], "source": [ - "import ast\n", - "import json\n", - "import os\n", - "import warnings\n", + "### 3. Data Scripts\n", "\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import polars as pl\n", - "import pyBigWig\n", - "import pyfaidx\n", - "import seaborn as sns\n", - "from Bio.Seq import Seq\n", - "from matplotlib.ticker import LogLocator\n", - "from tqdm import tqdm\n", + "Before running this notebook, ensure the following preprocessing scripts have been executed:\n", "\n", + "| File | Purpose | How to Generate |\n", + "|------|---------|-----------------| \n", + "| `codon_counts_nopathogen.json` | Codon counts by taxonomic group (used for codon frequency features) | Run `python data_scripts/check_codon_frequency.py` after completing NCBI preprocessing in `data_scripts/data_curation/`. Place or symlink the produced file at `/data/ncbi/codon_counts_nopathogen.json`. |\n", + "| `gencode.v47lift37.basic.annotation.processed.tsv` | Processed GTF annotation with CDS coordinates | Run `000-Annotation-File-Processing.ipynb` on the downloaded GENCODE GTF file `gencode.v47lift37.basic.annotation.gtf`. 
|\n", + "| `gencode.v47.basic.annotation.processed.filtered.tsv` | Filtered transcripts with CDS sequences (hg38) | Run `000-Annotation-File-Processing.ipynb` Part 1 on the GENCODE v47 GTF file. |\n", "\n", - "warnings.filterwarnings(\"ignore\")" + "---" ] }, { "cell_type": "markdown", - "id": "01a546f6", + "id": "21c15660", "metadata": {}, "source": [ - "Before setting the `DATA_DIR` path, ensure the following directory structure (containing the files from the required pre-processing steo) is in place:\n", + "### 4. Downloaded Data Integrity Check\n", + "\n", + "Run the following cell to ensure that the `DATA_DIR` path structure (containing the files from the required pre-processing step) is in place:\n", "\n", "```\n", "📁 DATA_DIR/\n", @@ -135,10 +405,15 @@ "│ ├── asd_rep.csv\n", "│ └── ddd_other.csv\n", "├── 📁 clinvar_syn/\n", - "│ ├── variant_summary.txt.gz\n", - "│ ├── ucsc_refseq_hg38.tsv\n", - "│ └── ucsc_refseq_hist_hg38.tsv\n", + "│ └── variant_summary.txt.gz\n", "├── 📁 reference/\n", + "│ ├── 📄 gencode.v47lift37.basic.annotation.processed.tsv\n", + "│ ├── 📄 gencode.v47.basic.annotation.processed.filtered.tsv\n", + "│ ├── 📄 ucsc_gencodev32_hg38.tsv\n", + "│ ├── 📄 ucsc_pliByGene_hg38.tsv\n", + "│ ├── 📄 hg38.phyloP447way.bw\n", + "│ ├── ucsc_refseq_hg38.tsv\n", + "│ ├── ucsc_refseq_hist_hg38.tsv\n", "│ ├── hg19/\n", "│ │ ├── hg19.fa\n", "│ │ └── hg19.fa.fai\n", @@ -146,24 +421,101 @@ "│ ├── hg38.fa\n", "│ └── hg38.fa.fai\n", "├── 📄 codon_counts_nopathogen.json\n", - "├── 📄 gencode.v47lift37.basic.annotation.processed.tsv\n", - "├── 📄 ucsc_gencodev32_hg38.tsv\n", - "├── 📄 ucsc_pliByGene_hg38.tsv\n", - "└── 📄 hg38.phyloP447way.bw\n", - "```\n" + "├── 📁 cosmic/\n", + "│ └── 📁 cosmic_raw/\n", + "│ ├── Cosmic_Sample_v102_GRCh38.tsv.gz\n", + "│ └── Cosmic_MutantCensus_v102_GRCh38.tsv.gz\n", + "└── 📁 gnomad/\n", + " ├── 📁 gnomad.exomes.v4.1/\n", + " │ └── {chrom}.tsv.gz (chr1-22, chrX, chrY)\n", + " └── 📁 gnomad.genomes.v4.1/\n", + " └── {chrom}.tsv.gz (chr1-22, 
chrX, chrY)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be4365d5", + "metadata": {}, + "outputs": [], + "source": [ + "expected_files = [\n", + " \"alphamissense_data/AlphaMissense_hg19.tsv.gz\",\n", + " \"alphamissense_data/alphamissense_cancer_hotspot.csv\",\n", + " \"alphamissense_data/alphamissense_clinvar.csv\",\n", + " \"ddd_asd_zhouetal/asd_discov.csv\",\n", + " \"ddd_asd_zhouetal/asd_rep.csv\",\n", + " \"ddd_asd_zhouetal/ddd_other.csv\",\n", + " \"clinvar_syn/variant_summary.txt.gz\",\n", + " \"clinvar_syn/ucsc_refseq_hg38.tsv\",\n", + " \"clinvar_syn/ucsc_refseq_hist_hg38.tsv\",\n", + " \"reference/hg19/hg19.fa\",\n", + " \"reference/hg19/hg19.fa.fai\",\n", + " \"reference/hg38/hg38.fa\",\n", + " \"reference/hg38/hg38.fa.fai\",\n", + " \"codon_counts_nopathogen.json\",\n", + " \"gencode.v47lift37.basic.annotation.processed.tsv\",\n", + " \"ucsc_gencodev32_hg38.tsv\",\n", + " \"ucsc_pliByGene_hg38.tsv\",\n", + " \"hg38.phyloP447way.bw\",\n", + "]\n", + "\n", + "missing = [f for f in expected_files if not os.path.exists(os.path.join(DATA_DIR, f))]\n", + "if missing:\n", + " print(f\"{len(missing)} file(s) missing from {DATA_DIR}:\")\n", + " for f in missing:\n", + " print(f\" ✗ {f}\")\n", + " raise FileNotFoundError(f\"{len(missing)} required file(s) missing — see list above.\")\n", + "else:\n", + " print(f\"All {len(expected_files)} required files found in {DATA_DIR}.\")" + ] + }, + { + "cell_type": "markdown", + "id": "8d094b99", + "metadata": {}, + "source": [ + "# Imports and Paths setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "8dfcfbe1", + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment to install PyBigWig\n", + "# !pip install pyBigWig" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "48d31ff4", + "execution_count": 3, + "id": "5dae6998", "metadata": {}, "outputs": [], "source": [ - "DATA_DIR = \"/data/ncbi/\" # set this to the path of your data directory\n", 
+ "import ast\n", + "import json\n", + "import os\n", + "import warnings\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import polars as pl\n", + "import pyBigWig\n", + "import pyfaidx\n", + "import seaborn as sns\n", + "from Bio.Data import CodonTable\n", + "from Bio.Seq import Seq\n", + "from matplotlib.ticker import LogLocator\n", + "from tqdm import tqdm\n", "\n", - "OUTPUT_DIR = \"/data/processed/mutation_datasets_latest\" # output directory where all processed datasets will be saved\n", - "os.makedirs(OUTPUT_DIR, exist_ok=True)" + "\n", + "warnings.filterwarnings(\"ignore\")" ] }, { @@ -176,79 +528,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 53, "id": "42a905da", "metadata": {}, "outputs": [], "source": [ - "dna_code = {\n", - " \"ATA\": \"I\",\n", - " \"ATC\": \"I\",\n", - " \"ATT\": \"I\",\n", - " \"ATG\": \"M\",\n", - " \"ACA\": \"T\",\n", - " \"ACC\": \"T\",\n", - " \"ACG\": \"T\",\n", - " \"ACT\": \"T\",\n", - " \"AAC\": \"N\",\n", - " \"AAT\": \"N\",\n", - " \"AAA\": \"K\",\n", - " \"AAG\": \"K\",\n", - " \"AGC\": \"S\",\n", - " \"AGT\": \"S\",\n", - " \"AGA\": \"R\",\n", - " \"AGG\": \"R\",\n", - " \"CTA\": \"L\",\n", - " \"CTC\": \"L\",\n", - " \"CTG\": \"L\",\n", - " \"CTT\": \"L\",\n", - " \"CCA\": \"P\",\n", - " \"CCC\": \"P\",\n", - " \"CCG\": \"P\",\n", - " \"CCT\": \"P\",\n", - " \"CAC\": \"H\",\n", - " \"CAT\": \"H\",\n", - " \"CAA\": \"Q\",\n", - " \"CAG\": \"Q\",\n", - " \"CGA\": \"R\",\n", - " \"CGC\": \"R\",\n", - " \"CGG\": \"R\",\n", - " \"CGT\": \"R\",\n", - " \"GTA\": \"V\",\n", - " \"GTC\": \"V\",\n", - " \"GTG\": \"V\",\n", - " \"GTT\": \"V\",\n", - " \"GCA\": \"A\",\n", - " \"GCC\": \"A\",\n", - " \"GCG\": \"A\",\n", - " \"GCT\": \"A\",\n", - " \"GAC\": \"D\",\n", - " \"GAT\": \"D\",\n", - " \"GAA\": \"E\",\n", - " \"GAG\": \"E\",\n", - " \"GGA\": \"G\",\n", - " \"GGC\": \"G\",\n", - " \"GGG\": \"G\",\n", - " \"GGT\": \"G\",\n", - " \"TCA\": 
\"S\",\n", - " \"TCC\": \"S\",\n", - " \"TCG\": \"S\",\n", - " \"TCT\": \"S\",\n", - " \"TTC\": \"F\",\n", - " \"TTT\": \"F\",\n", - " \"TTA\": \"L\",\n", - " \"TTG\": \"L\",\n", - " \"TAC\": \"Y\",\n", - " \"TAT\": \"Y\",\n", - " \"TAA\": \"*\",\n", - " \"TAG\": \"*\",\n", - " \"TGC\": \"C\",\n", - " \"TGT\": \"C\",\n", - " \"TGA\": \"*\",\n", - " \"TGG\": \"W\",\n", - "}\n", - "\n", - "\n", "def translate(seq):\n", " \"\"\"\n", " Translate an RNA sequence into a protein sequence.\n", @@ -258,11 +542,30 @@ " for i in range(0, len(seq) - 2, 3):\n", " codon = seq[i : i + 3]\n", " # Look up the codon in the genetic code dictionary.\n", - " amino_acid = dna_code.get(codon, \"?\")\n", - " protein += amino_acid\n", + " amino_acid = codon_to_aa(codon)\n", + " protein += amino_acid if amino_acid is not None else \"?\"\n", " return protein\n", "\n", "\n", + "def codon_to_aa(codon):\n", + " \"\"\"\n", + " Translate a single codon to its corresponding amino acid using BioPython's CodonTable.\n", + "\n", + " Parameters:\n", + " codon (str): A 3-nucleotide DNA codon.\n", + "\n", + " Returns:\n", + " str or None: The single-letter amino acid code, '*' for stop codons, or None if invalid.\n", + " \"\"\"\n", + " standard_table = CodonTable.unambiguous_dna_by_name[\"Standard\"]\n", + " codon = codon.upper().replace(\"U\", \"T\")\n", + " if len(codon) != 3 or any(base not in \"ATGC\" for base in codon):\n", + " return None\n", + " if codon in standard_table.stop_codons:\n", + " return \"*\"\n", + " return standard_table.forward_table.get(codon, None)\n", + "\n", + "\n", "def reverse_complement_dna(seq):\n", " \"\"\"\n", " Return the reverse complement of a DNA sequence.\n", @@ -277,7 +580,7 @@ " KeyError: If the sequence contains lowercase letters or invalid characters.\n", " \"\"\"\n", " complement = {\"A\": \"T\", \"T\": \"A\", \"G\": \"C\", \"C\": \"G\", \"N\": \"N\"}\n", - " return \"\".join(complement[base] for base in seq[::-1])\n", + " return \"\".join(complement[base] for 
base in seq[::-1].upper())\n", "\n", "\n", "def process_gtf(gtf_path, fasta_path):\n", @@ -349,10 +652,18 @@ " gtf[\"cds_length\"] = lengths\n", " gtf[\"cds\"] = seqs # sequence is strand-aware (always gene 5'->3')\n", "\n", - " gtf_s = gtf[\n", - " [\"name\", \"chrom\", \"strand\", \"cdsStart\", \"cdsEnd\", \"cds_starts\", \"cds_ends\", \"cds_length\", \"cds\"]\n", - " ].copy()\n", - " gtf_s[\"name\"] = gtf_s[\"name\"].str.split(\".\").str[0]\n", + " # remove version numbers from identifiers\n", + " gtf[\"name\"] = gtf[\"name\"].str.split(\".\").str[0]\n", + "\n", + " # Build output columns - gene_id and gene_name are optional\n", + " output_cols = [\"name\", \"chrom\", \"strand\", \"cdsStart\", \"cdsEnd\", \"cds_starts\", \"cds_ends\", \"cds_length\", \"cds\"]\n", + " if \"gene_name\" in gtf.columns:\n", + " output_cols.append(\"gene_name\")\n", + " if \"gene_id\" in gtf.columns:\n", + " gtf[\"gene_id\"] = gtf[\"gene_id\"].str.split(\".\").str[0]\n", + " output_cols.append(\"gene_id\")\n", + "\n", + " gtf_s = gtf[output_cols].copy()\n", " # Sort transcripts by chromosome, start, and end coordinates, they're in the forward direction and 0-based.\n", " gtf_s = gtf_s.sort_values(by=[\"chrom\", \"cdsStart\", \"cdsEnd\"]).reset_index(drop=True).copy()\n", "\n", @@ -586,12 +897,6 @@ " return df.apply(_check, axis=1)\n", "\n", "\n", - "def get_reverse_complement(seq):\n", - " \"\"\"Get reverse complement of a sequence\"\"\"\n", - " complement = {\"A\": \"T\", \"T\": \"A\", \"G\": \"C\", \"C\": \"G\", \"N\": \"N\"}\n", - " return \"\".join(complement[base] for base in seq[::-1].upper())\n", - "\n", - "\n", "def extract_cds_sequence(row, fasta):\n", " \"\"\"Extract CDS sequence for a transcript based on exon coordinates and CDS boundaries.\"\"\"\n", " chrom = row[\"chrom\"]\n", @@ -618,7 +923,7 @@ "\n", " # Reverse complement if on negative strand\n", " if strand == \"-\":\n", - " cds_sequence = get_reverse_complement(cds_sequence)\n", + " cds_sequence = 
reverse_complement_dna(cds_sequence)\n", "\n", " return cds_sequence\n", "\n", @@ -663,7 +968,7 @@ " dset = dset.with_columns(\n", " pl.col(\"tx\").map_elements(lambda x: tx_to_name[x], return_dtype=pl.String).alias(\"gene_name\")\n", " )\n", - " pli = pl.read_csv(f\"{DATA_DIR}/ucsc_pliByGene_hg38.tsv\", separator=\"\\t\")\n", + " pli = pl.read_csv(f\"{DATA_DIR}/reference/ucsc_pliByGene_hg38.tsv\", separator=\"\\t\")\n", " gene_to_pli = {row[\"geneName\"]: row[\"_pli\"] for row in pli.rows(named=True)}\n", " dset = dset.with_columns(\n", " pl.col(\"gene_name\").map_elements(lambda x: gene_to_pli.get(x, -1000), return_dtype=pl.Float64).alias(\"pli\")\n", @@ -673,7 +978,7 @@ " dset = dset.filter(pl.col(\"pli\") != -1000)\n", " dset = dset.with_columns((pl.col(\"pli\") * 10).cast(pl.Int32).alias(\"pli_bin\"))\n", "\n", - " bw = pyBigWig.open(f\"{DATA_DIR}/hg38.phyloP447way.bw\")\n", + " bw = pyBigWig.open(f\"{DATA_DIR}/reference/hg38.phyloP447way.bw\")\n", " phylop = []\n", " for row in tqdm(dset.rows(named=True)):\n", " phylop.append(bw.values(row[\"chrom\"], row[\"pos\"] - 1, row[\"pos\"])[0])\n", @@ -698,7 +1003,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "f4bd8e89", "metadata": {}, "outputs": [ @@ -967,7 +1272,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "b7a9dd69", "metadata": {}, "outputs": [ @@ -977,7 +1282,7 @@ "2933" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -989,7 +1294,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "9a2bd343", "metadata": {}, "outputs": [ @@ -1027,7 +1332,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "185bfa35", "metadata": {}, "outputs": [ @@ -1092,7 +1397,7 @@ "1 1:874817:C:T chr1 874817 C T Affected asd" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1118,7 
+1423,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "510f1010", "metadata": {}, "outputs": [ @@ -1126,14 +1431,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing transcripts: 100%|██████████| 65158/65158 [00:14<00:00, 4392.38it/s]\n" + "Processing transcripts: 100%|██████████| 64779/64779 [00:09<00:00, 6511.43it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Processed 65158 GTF CDS sequences\n" + "Processed 64779 GTF CDS sequences\n" ] }, { @@ -1213,7 +1518,8 @@ "assembly = \"hg19\"\n", "all_results = []\n", "gtf_s, fasta = process_gtf(\n", - " f\"{DATA_DIR}/gencode.v47lift37.basic.annotation.processed.tsv\", f\"{DATA_DIR}/reference/{assembly}/hg19.fa\"\n", + " f\"{DATA_DIR}/reference/gencode.v47lift37.basic.annotation.processed.tsv\",\n", + " f\"{DATA_DIR}/reference/{assembly}/hg19.fa\",\n", ")\n", "print(f\"Processed {gtf_s.shape[0]} GTF CDS sequences\")\n", "display(gtf_s[[\"name\", \"chrom\", \"strand\", \"cdsStart\", \"cdsEnd\", \"cds_starts\", \"cds_ends\", \"cds_length\"]].head(2))" @@ -1319,7 +1625,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing chromosomes: 100%|██████████| 23/23 [00:03<00:00, 7.11it/s]\n" + "Processing chromosomes: 0%| | 0/23 [00:00" ] @@ -1993,7 +2306,7 @@ "outputs": [ { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAB8YAAAGGCAYAAAAJj+sGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABn6ElEQVR4nO3deVhU5f//8dcAgsrmDu6YpokpmJK55JIWmlluaaYFWGY1brmUVm64W5lmU35aXDIt01yz3HArM8UFLVFzTcstNxBMEDi/P/o5XydQGZlxEJ6P65rr4tznnvu85pwZpnxz38dkGIYhAAAAAAAAAAAAAADyKDdXBwAAAAAAAAAAAAAAwJkojAMAAAAAAAAAAAAA8jQK4wAAAAAAAAAAAACAPI3COAAAAAAAAAAAAAAgT6MwDgAAAAAAAAAAAADI0yiMAwAAAAAAAAAAAADyNArjAAAAAAAAAAAAAIA8jcI4AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAwF1mxIgRMplMd+RYTZs2VdOmTa3b69evl8lk0oIFC+7I8SMjIxUUFHRHjnW7kpKS9OKLLyowMFAmk0n9+vVzdSSXO3r0qEwmk2bOnOnqKPnSfz+3AAAAAAAK4wAAAADgUjNnzpTJZLI+ChYsqDJlyig8PFwffPCBLl265JDjnDhxQiNGjFBcXJxDxnOk3JwtO8aOHauZM2fqlVde0ezZs/Xcc8/dtO/ixYvvXLi70OXLlzVixAitX7/e1VFwE99//71GjBjh6hgAAAAAkG0mwzAMV4cAAAAAgPxq5syZioqKUnR0tCpVqqSrV6/q1KlTWr9+vVavXq0KFSpo6dKlqlWrlvU5aWlpSktLU8GCBbN9nG3btiksLEwzZsxQZGRktp+XmpoqSfL09JT074zxZs2aaf78+erYsWO2x7ndbFevXlVGRoa8vLwccixneOihh+Th4aGffvrpln19fHzUsWPHPD+T2jAMpaSkqECBAnJ3d7fruWfPnlXJkiU1fPhwCq+36b+fW2fo1auXLBaL+GclAAAAAHcLD1cHAAAAAABIrVq1Ut26da3bQ4YM0dq1a/XEE0/oySef1N69e1WoUCFJkoeHhzw8nPu/c5cvX1bhwoWdWljLjgIFCrj0+Nlx5swZBQcHO3zc5ORkeXt7O3xcZ0pLS1NGRoY8PT3t+sONvMAwDF25csX6OXWF3PK5BQAAAIDciKXUAQAAACCXeuSRRzR06FD98ccf+vLLL63tWd1jfPXq1WrUqJGKFCkiHx8fVatWTW+++aakf2d5h4WFSZKioqKsy7Zfm7XctGlT3X///dq+fbsaN26swoULW597o3sVp6en680331RgYKC8vb315JNP6vjx4zZ9goKCspydfv2Yt8qW1T3Gk5OTNWDAAJUvX15eXl6qVq2a3n333UwzV00mk3r16qXFixfr/vvvl5eXl2rUqKEVK1ZkfcL/48yZM3rhhRcUEBCgggULKiQkRLNmzbLuv3a/9SNHjmj58uXW7EePHs1yPJPJpOTkZM2aNcva99r5uXZN4+Pj9eyzz6po0aJq1KiRJGn37t2KjIzUPffco4IFCyowMFDdu3fXuXPnbMa/NsbBgwcVGRmpIkWKyN/fX1FRUbp8+bJN35u9X665cuWKRowYoapVq6pgwYIqXbq02rdvr0OHDkn6v/uIv/vuu5o8ebIqV64sLy8vxcfHZ3mP8cjISPn4+Ojw4cMKDw+Xt7e3ypQpo+joaOu1O3r0qEqWLClJGjlypPU8XZs5furUKUVFRalcuXLy8vJS6dKl9dRTT93wnNtz7GsyMjI0efJk1ahRQwULFlRAQIB69uypCxcu2PQLCgrSE088oZUrV6pu3boqVKiQ/ve//2V5/F69esnHxyfTdZCkLl26KDAwUOnp6ZKkJUuWqHXr1ipTpoy8vLxUuXJljRo1yrr/Gns+t6mpqRo2bJjq1Kkjf39/eXt76+GHH9a
6detsxrz+mn7yySfWaxoWFqbY2Fib82mxWCTJ5lYQAAAAAJCbMWMcAAAAAHKx5557Tm+++aZWrVqlHj16ZNlnz549euKJJ1SrVi1FR0fLy8tLBw8e1KZNmyRJ1atXV3R0tIYNG6aXXnpJDz/8sCSpQYMG1jHOnTunVq1a6ZlnnlG3bt0UEBBw01xjxoyRyWTSG2+8oTNnzmjy5Mlq0aKF4uLi7Joxm51s1zMMQ08++aTWrVunF154QaGhoVq5cqUGDRqkv/76S++//75N/59++kkLFy7Uq6++Kl9fX33wwQfq0KGDjh07puLFi98w1z///KOmTZvq4MGD6tWrlypVqqT58+crMjJSFy9eVN++fVW9enXNnj1br732msqVK6cBAwZIkrWw+1+zZ8/Wiy++qAcffFAvvfSSJKly5co2fZ5++mnde++9Gjt2rLVgu3r1ah0+fFhRUVEKDAzUnj179Mknn2jPnj365ZdfMhUkO3XqpEqVKmncuHHasWOHPvvsM5UqVUoTJkyQdOv3i/TvHz488cQTiomJ0TPPPKO+ffvq0qVLWr16tX777Teb3DNmzNCVK1f00ksvycvLS8WKFVNGRkaW5yA9PV0tW7bUQw89pIkTJ2rFihUaPny40tLSFB0drZIlS+rjjz/WK6+8onbt2ql9+/aSZL2VQIcOHbRnzx717t1bQUFBOnPmjFavXq1jx45l+gMKe499Tc+ePa23OOjTp4+OHDmiDz/8UDt37tSmTZtsVjHYv3+/unTpop49e6pHjx6qVq1alsfu3LmzLBaLli9frqefftrafvnyZS1btkyRkZHWJednzpwpHx8f9e/fXz4+Plq7dq2GDRumxMREvfPOOzbjZvdzm5iYqM8++0xdunRRjx49dOnSJX3++ecKDw/X1q1bFRoaatN/7ty5unTpknr27CmTyaSJEyeqffv2Onz4sAoUKKCePXvqxIkTWr16tWbPnn3T8w4AAAAAuYYBAAAAAHCZGTNmGJKM2NjYG/bx9/c3ateubd0ePny4cf3/zr3//vuGJOPvv/++4RixsbGGJGPGjBmZ9jVp0sSQZEybNi3LfU2aNLFur1u3zpBklC1b1khMTLS2f/PNN4YkY8qUKda2ihUrGhEREbcc82bZIiIijIoVK1q3Fy9ebEgyRo8ebdOvY8eOhslkMg4ePGhtk2R4enratO3atcuQZEydOjXTsa43efJkQ5Lx5ZdfWttSU1ON+vXrGz4+PjavvWLFikbr1q1vOt413t7eWZ6Ta9e0S5cumfZdvnw5U9tXX31lSDI2btyYaYzu3bvb9G3Xrp1RvHhx63Z23i/Tp083JBmTJk3KtC8jI8MwDMM4cuSIIcnw8/Mzzpw5Y9Pn2r7rr2lERIQhyejdu7fNWK1btzY8PT2tef7++29DkjF8+HCbMS9cuGBIMt55550b5r6R7B77xx9/NCQZc+bMsXn+ihUrMrVXrFjRkGSsWLHilsfPyMgwypYta3To0MGm/drn5vrrmNX17tmzp1G4cGHjypUr1jZ7PrdpaWlGSkqKTZ8LFy4YAQEBNu+Xa9etePHixvnz563tS5YsMSQZy5Yts7aZzWaDf1YCAAAAcDdhKXUAAAAAyOV8fHx06dKlG+4vUqSIpH+XYL7RTN1b8fLyUlRUVLb7P//88/L19bVud+zYUaVLl9b3339/W8fPru+//17u7u7q06ePTfuAAQNkGIZ++OEHm/YWLVrYzG6uVauW/Pz8dPjw4VseJzAwUF26dLG2FShQQH369FFSUpI2bNjggFeT2csvv5yp7foZ+FeuXNHZs2f10EMPSZJ27NhxyzEefvhhnTt3TomJiZKy93759ttvVaJECfXu3TvTvv/OUO/QocMNZ8lnpVevXjZj9erVS6mpqVqzZs1Nn1eoUCF5enpq/fr1mZY1d9Sx58+fL39/fz366KM6e/as9VGnTh35+PhkWnq8UqVKCg8Pv+VxTSaTnn76aX3//fdKSkq
yts+bN09ly5a1Lpt/7XVec+nSJZ09e1YPP/ywLl++rH379tmMm93Prbu7u/W+4xkZGTp//rzS0tJUt27dLN9DnTt3VtGiRa3b11ZyuNXnBgAAAAByMwrjAAAAAJDLJSUl2RSh/6tz585q2LChXnzxRQUEBOiZZ57RN998Y1eRvGzZstbCWXbce++9Ntsmk0lVqlS55b2ec+qPP/5QmTJlMp2P6tWrW/dfr0KFCpnGKFq06C0Lq3/88YfuvfdeubnZ/m/zjY7jKJUqVcrUdv78efXt21cBAQEqVKiQSpYsae2XkJCQqf9/X/O1Aue115yd98uhQ4dUrVo1eXjc+g5sWWW+ETc3N91zzz02bVWrVpWkW753vLy8NGHCBP3www8KCAhQ48aNNXHiRJ06dcphxz5w4IASEhJUqlQplSxZ0uaRlJSkM2fO2DzfntfeuXNn/fPPP1q6dKmkfz/X33//vZ5++mmbPzbYs2eP2rVrJ39/f/n5+alkyZLq1q2bpMzX257P7axZs1SrVi0VLFhQxYsXV8mSJbV8+fLbeg8BAAAAwN2Ie4wDAAAAQC72559/KiEhQVWqVLlhn0KFCmnjxo1at26dli9frhUrVmjevHl65JFHtGrVKuu9i2/GnvuCZ9d/ZxZfk56enq1MjnCj4xj///7duU1W16FTp076+eefNWjQIIWGhsrHx0cZGRlq2bJlln/8cKvX7Ij3y60yO0u/fv3Upk0bLV68WCtXrtTQoUM1btw4rV27VrVr187x+BkZGSpVqpTmzJmT5f7/zoy357U/9NBDCgoK0jfffKNnn31Wy5Yt0z///KPOnTtb+1y8eFFNmjSRn5+foqOjVblyZRUsWFA7duzQG2+8kel6Z/f4X375pSIjI9W2bVsNGjRIpUqVkru7u8aNG6dDhw5l6n+3fW4AAAAAIDsojAMAAABALjZ79mxJuuVyzW5ubmrevLmaN2+uSZMmaezYsXrrrbe0bt06tWjR4oZF6tt14MABm23DMHTw4EHVqlXL2la0aFFdvHgx03P/+OMPm5m79mSrWLGi1qxZo0uXLtnMGr+2xHTFihWzPdatjrN7925lZGTYzBrP6XHsvQ4XLlxQTEyMRo4cqWHDhlnb/3v+7XWr90vlypW1ZcsWXb16VQUKFMjRsa6XkZGhw4cPW2dqS9Lvv/8uSQoKCpJ063NUuXJlDRgwQAMGDNCBAwcUGhqq9957T19++WWOj125cmWtWbNGDRs2dErBv1OnTpoyZYoSExM1b948BQUFWZfFl6T169fr3LlzWrhwoRo3bmxtP3LkSI6Ou2DBAt1zzz1auHChzfkdPnz4bY/p6N8pAAAAAOBsLKUOAAAAALnU2rVrNWrUKFWqVEldu3a9Yb/z589nagsNDZUkpaSkSJK8vb0lKctC9e344osvbO57vmDBAp08eVKtWrWytlWuXFm//PKLUlNTrW3fffedjh8/bjOWPdkef/xxpaen68MPP7Rpf//992UymWyOnxOPP/64Tp06pXnz5lnb0tLSNHXqVPn4+KhJkya3Na63t7dd1+DazN3/ztSdPHnybR1fyt77pUOHDjp79mym85xVFntdP6ZhGPrwww9VoEABNW/eXJJUuHBhSZnfD5cvX9aVK1ds2ipXrixfX19r7pweu1OnTkpPT9eoUaMyPTctLS3Hn5/OnTsrJSVFs2bN0ooVK9SpUyeb/Vld79TUVH300Uc5Om5W427ZskWbN2++7TEd/TsFAAAAAJyNGeMAAAAAkAv88MMP2rdvn9LS0nT69GmtXbtWq1evVsWKFbV06VIVLFjwhs+Njo7Wxo0b1bp1a1WsWFFnzpzRRx99pHLlyqlRo0aS/i0gFilSRNOmTZOvr6+8vb1Vr149u+6RfL1ixYqpUaNGioqK0unTpzV58mRVqVJFPXr0sPZ58cUXtWDBArVs2VKdOnXSoUOH9OWXX6py5co2Y9mTrU2bNmrWrJneeustHT16VCEhIVq
1apWWLFmifv36ZRr7dr300kv63//+p8jISG3fvl1BQUFasGCBNm3apMmTJ9/0nu83U6dOHa1Zs0aTJk1SmTJlVKlSJdWrV++G/f38/Kz30r569arKli2rVatW5WgGcXbeL88//7y++OIL9e/fX1u3btXDDz+s5ORkrVmzRq+++qqeeuqp2zp2wYIFtWLFCkVERKhevXr64YcftHz5cr355pvWZcoLFSqk4OBgzZs3T1WrVlWxYsV0//33Ky0tTc2bN1enTp0UHBwsDw8PLVq0SKdPn9YzzzzjkGM3adJEPXv21Lhx4xQXF6fHHntMBQoU0IEDBzR//nxNmTJFHTt2vK3XLkkPPPCAqlSporfeekspKSk2y6hLUoMGDVS0aFFFRESoT58+MplMmj17do7/GOGJJ57QwoUL1a5dO7Vu3VpHjhzRtGnTFBwcrKSkpNsas06dOpKkPn36KDw8XO7u7tm6DgAAAADgKhTGAQAAACAXuLZMtqenp4oVK6aaNWtq8uTJioqKumUR9sknn9TRo0c1ffp0nT17ViVKlFCTJk00cuRI+fv7S5IKFCigWbNmaciQIXr55ZeVlpamGTNm3HZh/M0339Tu3bs1btw4Xbp0Sc2bN9dHH31kne0r/bv8+3vvvadJkyapX79+qlu3rr777jsNGDDAZix7srm5uWnp0qUaNmyY5s2bpxkzZigoKEjvvPNOpnFzolChQlq/fr0GDx6sWbNmKTExUdWqVdOMGTMUGRl52+NOmjRJL730kt5++239888/1iLtzcydO1e9e/eWxWKRYRh67LHH9MMPP6hMmTK3lSE77xd3d3d9//33GjNmjObOnatvv/1WxYsXV6NGjVSzZs3bOu61cVesWKFXXnlFgwYNkq+vr4YPH26zTLwkffbZZ+rdu7dee+01paamavjw4erdu7e6dOmimJgYzZ49Wx4eHrrvvvv0zTffqEOHDg479rRp01SnTh3973//05tvvikPDw8FBQWpW7duatiw4W2/9ms6d+6sMWPGqEqVKnrggQds9hUvXtz6GXn77bdVtGhRdevWTc2bN7/l7RRuJjIyUqdOndL//vc/rVy5UsHBwfryyy81f/58rV+//rbGbN++vXr37q2vv/5aX375pQzDoDAOAAAAIFczGTn9s2MAAAAAAIBbiIyM1IIFC257hvLdemwAAAAAQO7APcYBAAAAAAAAAAAAAHkahXEAAAAAAAAAAAAAQJ5GYRwAAAAAAAAAAAAAkKdxj3EAAAAAAAAAAAAAQJ7GjHEAAAAAAAAAAAAAQJ5GYRzAHdO7d2+VL19efn5+Klu2rPr166fU1FRJUtOmTeXl5SUfHx/r48SJEzccKzExUc8++6z8/PwUEBCgUaNG2ezfvn27GjVqJD8/P91zzz364osvrPtSUlLUtGlTlSpVSn5+frrvvvv0ySefOOdFAwAAAAAAAAAAwOXyfWHcMAwlJiaKFeUB53v11Ve1b98+JSYmateuXdq1a5cmTpxo3T9hwgQlJSVZH2XKlLnhWL1799b58+d17Ngx/fjjj/r000+txe+LFy/q8ccfV7du3XThwgV99dVX6t27t3766SdJkoeHh6ZOnaoTJ04oMTFRCxcu1NChQ/Xjjz869wQAAAAAAAAAAADAJfJ9YfzSpUvy9/fXpUuXXB0FyPOqV68ub29vSf/+UYqbm5sOHDhg9ziXL1/W119/rdGjR6tIkSKqWrWqevfurc8//1yS9PPPP8vLy0svv/yy3N3dVa9ePbVv316fffaZJMnd3V01a9aUh4eHJMlkMslkMungwYMOeqUAAAAAAAAAAADITfJ9YRzAnTV+/Hj5+PioVKlS2rVrl3r37m3dN3r0aBUrVky1a9e2Wfr8v/bv36/U1FSFhoZa20JDQ7V7925JUkZGRqZVIDIyMqz7r3niiSdUsGBBBQcHKyAgQO3atXPAKwQAAAAAAAAAAEBuQ2EcwB01ePBgJSUlKT4+Xi+//LI
CAwMlSePGjdOhQ4d0+vRpjR8/Xr1799aiRYuyHCMpKUne3t7WGd+SVKRIEevKD/Xr11dycrI+/PBDXb16VZs2bdKiRYuUmJhoM853332n5ORkrV+/Xh06dFChQoWc9KoBAAAAAAAAAADgShTGAbhE9erVFRISosjISEn/FrP9/f1VoEABhYeHq2fPnpo3b16Wz/Xx8dHly5eVlpZmbUtISJCvr68kqXjx4lq2bJnmzp2rwMBADR48WFFRUSpevHimsdzd3dWkSROdPn1a77zzjuNfKAAAAAAAAAAAAFyOwjgAl7l69eoN7zHu5nbjX0/VqlVTgQIFtGvXLmtbXFycatasad1u2LChfv75Z507d04//vijTp06pSZNmtxWFgAAAAAAAAAAANzd8m1h3GKxKDg4WGFhYa6OAuQLSUlJmjFjhi5evCjDMPTrr79q9OjRCg8P18WLF/X999/r8uXLSk9PV0xMjKZNm6YOHTpYnx8ZGWmdXV64cGF17txZQ4cOVUJCgg4cOKCpU6fqxRdftPbfuXOnUlJS9M8//+jTTz/V+vXr1a9fP0n/FtFXr16tf/75R2lpaVq+fLnmzJmj8PDwO3lKAAAAAAAAAAAAcIeYDMMwXB3ClRITE+Xv76+EhAT5+fm5Og6QZyUnJ6tt27basWOHUlJSVKpUKXXo0EEjR45UcnKynnjiCe3du1eSFBQUpH79+ql79+7W5z/yyCPq0qWLevToIenfz27Pnj313XffqVChQurVq5eGDRtm7R8VFaVFixYpLS1NDRo00Pvvv68aNWpIkrZt26ZXXnlF+/fvl8lkUlBQkF599VX17NnzDp4RAAAAAAAAAAAA3CkUximMA7leSkqKatWqpd9++00FChRwdRwAAAAAAAAAAADcZTxcHQAAbsXLy0v79+93dQwAAAAAAAAAAADcpfLtPcYBAAAAAAAAAAAAAPkDhXEAAAAAAAAAAAAAQJ6WbwvjFotFwcHBCgsLc3UUAAAAAAAAAAAAAIATmQzDMFwdwpUSExPl7++vhIQE+fn5uToOAAAAAAAAAAAAAMDB8u2McQAAAAAAAAAAAABA/uDh6gDIuY4R0a6OAAC4wxbMGubqCAAAAAAAAAAA3DWYMQ4AAAAAAAAAAAAAyNMojAMAAAAAAAAAAAAA8jQK4wAAAAAAAAAAAACAPC3fFsYtFouCg4MVFhbm6igAAAAAAAAAAAAAACfKt4Vxs9ms+Ph4xcbGujoKAAAAAAAAAAAAAMCJ8m1hHAAAAAAAAAAAAACQP1AYBwAAAAAAAAAAAADkaRTGAQAAAAAAAAAAAAB5GoVxAAAAAAAAAAAAAECeRmEcAAAAAAAAAAAAAJCnURgHAAAAAAAAAAAAAORpFMYBAAAAAAAAAAAAAHkahXEAAAAAAAAAAAAAQJ5GYRwAAAAAAAAAAAAAkKfl28K4xWJRcHCwwsLCXB0FAAAAAAAAAAAAAOBE+bYwbjabFR8fr9jYWFdHAQAAAAAAAAAAAAA4Ub4tjAMAAAAAAAAAAAAA8gcK4wAAAAAAAAAAAACAPI3COAAAAAAAAAAAAAAgT6MwDgAAAAAAAAAAAADI0yiMAwAAAAAAAAAAAADyNArjAAAAAAAAAAAAAIA8jcI4AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPI0CuMAAAAAAAAAAAAAgDwtzxTGL1++rIoVK2rgwIGujgIAAAAAAAAAAAAAyEXyTGF8zJgxeuihh1wdAwAAAAAAAAAAAACQy+SJwviBAwe0b98+tWrVytVRAAAAAAAAAAAAAAC5jMsL4xs3blSbNm1UpkwZmUwmLV68OFMfi8WioKAgFSxYUPXq1dPWrVtt9g8cOFDjxo27Q4kBAAAAAAAAAAAAAHcTlxfGk5OTFRISIovFkuX+efPmqX///ho+fLh27NihkJAQhYeH68yZM5KkJUuWqGrVqqpateqdjA0AAAAAAAAAAAA
AuEt4uDpAq1atbroE+qRJk9SjRw9FRUVJkqZNm6bly5dr+vTpGjx4sH755Rd9/fXXmj9/vpKSknT16lX5+flp2LBhWY6XkpKilJQU63ZiYqJjXxAAAAAAAAAAAAAAIFdx+Yzxm0lNTdX27dvVokULa5ubm5tatGihzZs3S5LGjRun48eP6+jRo3r33XfVo0ePGxbFr/X39/e3PsqXL+/01wEAAAAAAAAAAAAAcJ1cXRg/e/as0tPTFRAQYNMeEBCgU6dO3daYQ4YMUUJCgvVx/PhxR0QFAAAAAAAAAAAAAORSLl9K3ZEiIyNv2cfLy0teXl7ODwMAAAAAAAAAAAAAyBVy9YzxEiVKyN3dXadPn7ZpP336tAIDA3M0tsViUXBwsMLCwnI0DgAAAAAAAAAAAAAgd8vVhXFPT0/VqVNHMTEx1raMjAzFxMSofv36ORrbbDYrPj5esbGxOY0JAAAAAAAAAAAAAMjFXL6UelJSkg4ePGjdPnLkiOLi4lSsWDFVqFBB/fv3V0REhOrWrasHH3xQkydPVnJysqKiolyYGgAAAAAAAAAAAABwt3B5YXzbtm1q1qyZdbt///6SpIiICM2cOVOdO3fW33//rWHDhunUqVMKDQ3VihUrFBAQkKPjWiwWWSwWpaen52gcAAAAAAAAAAAAAEDuZjIMw3B1CFdKTEyUv7+/EhIS5Ofn5+o4t6VjRLSrIwAA7rAFs4a5OgIAAAAAAAAAAHeNXH2PcQAAAAAAAAAAAAAAcorCOAAAAAAAAAAAAAAgT8u3hXGLxaLg4GCFhYW5OgoAAAAAAAAAAAAAwInybWHcbDYrPj5esbGxro4CAAAAAAAAAAAAAHCifFsYBwAAAAAAAAAAAADkDxTGAQAAAAAAAAAAAAB5mt2F8R07dujXX3+1bi9ZskRt27bVm2++qdTUVIeGcybuMQ4AAAAAAAAAAAAA+YPdhfGePXvq999/lyQdPnxYzzzzjAoXLqz58+fr9ddfd3hAZ+Ee4wAAAAAAAAAAAACQP9hdGP/9998VGhoqSZo/f74aN26suXPnaubMmfr2228dnQ8AAAAAAAAAAAAAgByxuzBuGIYyMjIkSWvWrNHjjz8uSSpfvrzOnj3r2HQAAAAAAAAAAAAAAOSQ3YXxunXravTo0Zo9e7Y2bNig1q1bS5KOHDmigIAAhwcEAAAAAAAAAAAAACAn7C6Mv//++9qxY4d69eqlt956S1WqVJEkLViwQA0aNHB4QGexWCwKDg5WWFiYq6MAAAAAAAAAAAAAAJzIZBiG4YiBrly5Ig8PD3l4eDhiuDsmMTFR/v7+SkhIkJ+fn6vj3JaOEdGujgAAuMMWzBrm6ggAAAAAAAAAANw17J4xfs899+jcuXOZ2q9cuaKqVas6JBQAAAAAAAAAAAAAAI5id2H86NGjSk9Pz9SekpKiP//80yGhAAAAAAAAAAAAAABwlGyve7506VLrzytXrpS/v791Oz09XTExMapUqZJj0wEAAAAAAAAAAAAAkEPZLoy3bdtWkmQymRQREWGzr0CBAgoKCtJ7773n0HAAAAAAAAAAAAAAAORUtgvjGRkZkqRKlSopNjZWJUqUcFqoO8FischisWS5LDwAAAAAAAAAAAAAIO+w+x7jR44cueuL4pJkNpsVHx+v2NhYV0cBAAAAAAAAAAAAADhRtmeMXy8mJkYxMTE6c+aMdSb5NdOnT3dIMAAAAAAAAAAAAAAAHMHuwvjIkSMVHR2tunXrqnTp0jKZTM7IBQAAAAAAAAAAAACAQ9hdGJ82bZpmzpyp5557zhl5AAAAAAAAAAAAAABwKLvvMZ6amqoGDRo4IwsAAAAAAAAAAAAAAA5nd2H8xRdf1Ny5c52RBQAAAAAAAAAAAAAAh7N7KfUrV67ok08+0Zo1a1SrVi0VKFDAZv+kSZMcFs6ZLBaLLBaL0tPTXR0FAAAAAAAAAAAAAOBEdhfGd+/erdDQUEnSb7/9ZrPPZDI5JNSdYDa
bZTablZiYKH9/f1fHAQAAAAAAAAAAAAA4id2F8XXr1jkjBwAAAAAAAAAAAAAATmH3PcYBAAAAAAAAAAAAALib2D1jXJK2bdumb775RseOHVNqaqrNvoULFzokGAAAAAAAAAAAAAAAjmD3jPGvv/5aDRo00N69e7Vo0SJdvXpVe/bs0dq1a7lXNwAAAAAAAAAAAAAg17G7MD527Fi9//77WrZsmTw9PTVlyhTt27dPnTp1UoUKFZyREQAAAAAAAAAAAACA22Z3YfzQoUNq3bq1JMnT01PJyckymUx67bXX9Mknnzg8IAAAAAAAAAAAAAAAOWF3Ybxo0aK6dOmSJKls2bL67bffJEkXL17U5cuXHZsOAAAAAAAAAAAAAIAc8rD3CY0bN9bq1atVs2ZNPf300+rbt6/Wrl2r1atXq3nz5s7ICAAAAAAAAAAAAADAbbO7MP7hhx/qypUrkqS33npLBQoU0M8//6wOHTro7bffdnhAAAAAAAAAAAAAAABywu7CeLFixaw/u7m5afDgwQ4NdKdYLBZZLBalp6e7OgoAAAAAAAAAAAAAwImyVRhPTEyUn5+f9eebudYvtzObzTKbzUpMTJS/v7+r4wAAAAAAAAAAAAAAnCRbhfGiRYvq5MmTKlWqlIoUKSKTyZSpj2EYMplMzMAGAAAAAAAAAAAAAOQq2SqMr1271rqE+rp165waCAAAAAAAAAAAAAAAR8pWYbxJkyaSpLS0NG3YsEHdu3dXuXLlnBoMAAAAAAAAAAAAAABHcLOns4eHh9555x2lpaU5Kw8AAAAAAAAAAAAAAA5lV2Fckh555BFt2LDBGVkAAAAAAAAAAAAAAHC4bC2lfr1WrVpp8ODB+vXXX1WnTh15e3vb7H/yyScdFg4AAAAAAAAAAAAAgJyyuzD+6quvSpImTZqUaZ/JZFJ6enrOUwEAAAAAAAAAAAAA4CB2F8YzMjKckQMAAAAAAAAAAAAAAKew+x7jAAAAAAAAAAAAAADcTeyeMS5JycnJ2rBhg44dO6bU1FSbfX369HFIMAAAAAAAAAAAAAAAHMHuwvjOnTv1+OOP6/Lly0pOTlaxYsV09uxZFS5cWKVKlbrjhfGLFy+qRYsWSktLU1pamvr27asePXrc0QwAAAAAAAAAAAAAgNzL7qXUX3vtNbVp00YXLlxQoUKF9Msvv+iPP/5QnTp19O677zoj4035+vpq48aNiouL05YtWzR27FidO3fujucAAAAAAAAAAAAAAOROdhfG4+LiNGDAALm5ucnd3V0pKSkqX768Jk6cqDfffNMZGW/K3d1dhQsXliSlpKTIMAwZhnHHcwAAAAAAAAAAAAAAcie7C+MFChSQm9u/TytVqpSOHTsmSfL399fx48ftDrBx40a1adNGZcqUkclk0uLFizP1sVgsCgoKUsGCBVWvXj1t3brVZv/FixcVEhKicuXKadCgQSpRooTdOQAAAAAAAAAAAAAAeZPdhfHatWsrNjZWktSkSRMNGzZMc+bMUb9+/XT//ffbHSA5OVkhISGyWCxZ7p83b5769++v4cOHa8eOHQoJCVF4eLjOnDlj7VOkSBHt2rVLR44c0dy5c3X69Gm7cwAAAAAAAAAAAAAA8ia7C+Njx45V6dKlJUljxoxR0aJF9corr+jvv//WJ598YneAVq1aafTo0WrXrl2W+ydNmqQePXooKipKwcHBmjZtmgoXLqzp06dn6hsQEKCQkBD9+OOPNzxeSkqKEhMTbR4AAAAAAAAAAAAAgLzL7sJ43bp11axZM0n/LqW+YsUKJSYmavv27QoJCXFouNTUVG3fvl0tWrSwtrm5ualFixbavHmzJOn06dO6dOmSJCkhIUEbN25UtWrVbjjmuHHj5O/vb32UL1/eoZkBAAAAAAAAAAAAALmL3YXx0aNH68iRI87IksnZs2eVnp6ugIAAm/aAgACdOnVKkvTHH3/o4YcfVkhIiB5++GH17t1bNWvWvOGYQ4YMUUJCgvV
xO/dFBwAAAAAAAAAAAADcPTzsfcL8+fM1fPhw1atXT926dVOnTp1UokQJZ2TLlgcffFBxcXHZ7u/l5SUvLy/nBQIAAAAAAAAAAAAA5Cp2zxjftWuXdu/eraZNm+rdd99VmTJl1Lp1a82dO1eXL192aLgSJUrI3d1dp0+ftmk/ffq0AgMDczS2xWJRcHCwwsLCcjQOAAAAAAAAAAAAACB3s7swLkk1atTQ2LFjdfjwYa1bt05BQUHq169fjovV/+Xp6ak6deooJibG2paRkaGYmBjVr18/R2ObzWbFx8crNjY2pzEBAAAAAAAAAAAAALmY3Uup/5e3t7cKFSokT09PXbp0ye7nJyUl6eDBg9btI0eOKC4uTsWKFVOFChXUv39/RUREqG7dunrwwQc1efJkJScnKyoqKqfRAQAAAAAAAAAAAAD5wG0Vxo8cOaK5c+dq7ty52r9/v5o0aaKRI0eqY8eOdo+1bds2NWvWzLrdv39/SVJERIRmzpypzp076++//9awYcN06tQphYaGasWKFQoICLid6FYWi0UWi0Xp6ek5GgcAAAAAAAAAAAAAkLuZDMMw7HnCQw89pNjYWNWqVUtdu3ZVly5dVLZsWWflc7rExET5+/srISFBfn5+ro5zWzpGRLs6AgDgDlswa5irIwAAAAAAAAAAcNewe8Z48+bNNX36dAUHBzsjDwAAAAAAAAAAAAAADmV3YXzMmDHOyAEAAAAAAAAAAAAAgFO4uTqAq1gsFgUHByssLMzVUQAAAAAAAAAAAAAATpRvC+Nms1nx8fGKjY11dRQAAAAAAAAAAAAAgBPl28I4AAAAAAAAAAAAACB/oDAOAAAAAAAAAAAAAMjTbqsw/uOPP6pbt26qX7++/vrrL0nS7Nmz9dNPPzk0nDNxj3EAAAAAAAAAAAAAyB/sLox/++23Cg8PV6FChbRz506lpKRIkhISEjR27FiHB3QW7jEOAAAAAAAAAAAAAPmD3YXx0aNHa9q0afr0009VoEABa3vDhg21Y8cOh4YDAAAAAAAAAAAAACCn7C6M79+/X40bN87U7u/vr4sXLzoiEwAAAAAAAAAAAAAADmN3YTwwMFAHDx7M1P7TTz/pnnvucUgoAAAAAAAAAAAAAAAcxe7CeI8ePdS3b19t2bJFJpNJJ06c0Jw5czRw4EC98sorzsjoFBaLRcHBwQoLC3N1FAAAAAAAAAAAAACAE3nY+4TBgwcrIyNDzZs31+XLl9W4cWN5eXlp4MCB6t27tzMyOoXZbJbZbFZiYqL8/f1dHQcAAAAAAAAAAAAA4CR2F8ZNJpPeeustDRo0SAcPHlRSUpKCg4Pl4+PjjHwAAAAAAAAAAAAAAOSI3YXxazw9PRUcHOzILAAAAAAAAAAAAAAAOJzdhfFmzZrJZDLdcP/atWtzFAgAAAAAAAAAAAAAAEeyuzAeGhpqs3316lXFxcXpt99+U0REhKNyOZ3FYpHFYlF6erqrowAAAAAAAAAAAAAAnMjuwvj777+fZfuIESOUlJSU40B3itlsltlsVmJiovz9/V0dBwAAAAAAAAAAAADgJG6OGqhbt26aPn26o4YDAAAAAAAAAAAAAMAhHFYY37x5swoWLOio4QAAAAAAAAAAAAAAcAi7l1Jv3769zbZhGDp58qS2bdumoUOHOiwYAAAAAAAAAAAAAACOYHdh/L/343Zzc1O1atUUHR2txx57zGHBAAAAAAAAAAAAAABwBLsL4zNmzHBGDgAAAAAAAAAAAAAAnMJh9xgHAAAAAAAAAAAAACA3snvGeNGiRWUymbLV9/z583YHulMsFossFovS09NdHQUAAAAAAAAAAAAA4ER2F8aHDh2q0aNHKzw8XPXr15ckbd68WStXrtTQoUNVrFgxh4d0BrPZLLPZrMTExEz3TQcAAAAAAAAAAAAA5B12F8Y3bdqk6Oho9erVy9rWp08fffjhh1qzZo0WL17syHwAAAAAAAAAAAAAAOSI3fcYX7lypVq
2bJmpvWXLllqzZo1DQgEAAAAAAAAAAAAA4Ch2F8aLFy+uJUuWZGpfsmSJihcv7pBQAAAAAAAAAAAAAAA4it1LqY8cOVIvvvii1q9fr3r16kmStmzZohUrVujTTz91eEAAAAAAAAAAAAAAAHLC7sJ4ZGSkqlevrg8++EALFy6UJFWvXl0//fSTtVAOAAAAAAAAAAAAAEBuYXdhXJLq1aunOXPmODoLAAAAAAAAAAAAAAAOl63CeGJiovz8/Kw/38y1fgAAAAAAAAAAAAAA5AbZKowXLVpUJ0+eVKlSpVSkSBGZTKZMfQzDkMlkUnp6usNDAgAAAAAAAAAAAABwu7JVGF+7dq2KFSsmSVq3bp1TAwEAAAAAAAAAAAAA4EjZKow3adIky5/vZhaLRRaLhRnuAAAAAAAAAAAAAJDHud3Oky5evKhVq1bpyy+/1BdffGHzuFuYzWbFx8crNjbW1VEAAAAAl/nwww9Vt25deXl5qW3btjb7EhMT9eyzz8rPz08BAQEaNWqUdd+ZM2fUtWtXlStXTn5+fqpdu7aWLl16h9MDAAAAAAAA2ZOtGePXW7Zsmbp27aqkpCT5+fnZ3G/cZDLp+eefd2hAAAAAAM5TpkwZvf3221qzZo3+/PNPm329e/fW+fPndezYMZ05c0YtWrRQxYoV9fzzzyspKUm1a9fWhAkTVKZMGS1fvlzPPPOMYmNjFRwc7KJXAwAAAAAAAGTN7hnjAwYMUPfu3ZWUlKSLFy/qwoUL1sf58+edkREAAACAk7Rv315t27ZViRIlbNovX76sr7/+WqNHj1aRIkVUtWpV9e7dW59//rkk6Z577tHAgQNVrlw5ubm5qU2bNqpWrZp++eUXV7wMAAAAAAAA4KbsLoz/9ddf6tOnjwoXLuyMPAAAAABygf379ys1NVWhoaHWttDQUO3evTvL/mfOnNHevXtVq1atO5QQAAAAAAAAyD67C+Ph4eHatm2bM7IAAAAAyCWSkpLk7e0tD4//u/tSkSJFdOnSpUx9U1NT9cwzz6hTp06qW7funYwJAAAAAAAAZIvd9xhv3bq1Bg0apPj4eNWsWVMFChSw2f/kk086LBwAAAAA1/Dx8dHly5eVlpZmLY4nJCTI19fXpl9qaqo6duyowoUL69NPP3VFVAAAAAAAAOCW7C6M9+jRQ5IUHR2daZ/JZFJ6enrOUwEAAABwqWrVqqlAgQLatWuX6tSpI0mKi4tTzZo1rX1SU1P19NNPKzU1VUuWLJGnp6er4gIAAAAAAAA3ZfdS6hkZGTd8UBQHAAAA7i5paWm6cuWK0tLSlJGRoStXrig1NVWFCxdW586dNXToUCUkJOjAgQOaOnWqXnzxRUnS1atX1alTJyUnJ2vx4sXy8vJy8SsBAAAAAAAAbszuwjgAAACAvGP06NEqVKiQxowZo2XLlqlQoUJ67LHHJEkffvih/P39Va5cOTVs2FAvvPCCnn/+eUnSzz//rCVLlmjTpk0qUaKEfHx85OPjo7Fjx7ry5QAAAAAAAABZMhmGYdjzhKyWUL/esGHDchToTktMTJS/v78SEhLk5+fn6ji3pWPEza8JACDvWTDr7vq+BQAAAAAAAADAley+x/iiRYtstq9evaojR47Iw8NDlStXvusK4wAAAAAAAAAAAACAvM3uwvjOnTsztSUmJioyMlLt2rVzSCh7HD9+XM8995zOnDkjDw8PDR06VE8//fQdzwEAAAAAAAAAAAAAyJ0cco9xPz8/jRw5UkOHDnXEcHbx8PDQ5MmTFR8fr1WrVqlfv35KTk6+4zkAAAAAAAAAAAAAALmT3TPGbyQhIUEJCQmOGi7bSpcurdKlS0uSAgMDVaJECZ0/f17e3t53PAsAAAAAAAAAAAAAIPexuzD+wQcf2GwbhqGTJ09q9uzZatWqld0BNm7cqHfeeUfbt2/XyZMntWjRIrVt29amj8Vi0TvvvKNTp04pJCREU6dO1YMPPphprO3btys9PV3ly5e3Owc
AAAAAAAAAAAAAIG+yuzD+/vvv22y7ubmpZMmSioiI0JAhQ+wOkJycrJCQEHXv3l3t27fPtH/evHnq37+/pk2bpnr16mny5MkKDw/X/v37VapUKWu/8+fP6/nnn9enn35qdwYAAAAAAAAAAAAAQN5ld2H8yJEjDg3QqlWrm840nzRpknr06KGoqChJ0rRp07R8+XJNnz5dgwcPliSlpKSobdu2Gjx4sBo0aHDT46WkpCglJcW6nZiY6IBXAQAAAAAAAAAAAADIrRx2j3FnSE1N1fbt221moru5ualFixbavHmzpH+Xco+MjNQjjzyi55577pZjjhs3TiNHjnRaZgAAkLd1jIh2dQQAwB22YNYwV0cAAAAAAAA55ObqADdz9uxZpaenKyAgwKY9ICBAp06dkiRt2rRJ8+bN0+LFixUaGqrQ0FD9+uuvNxxzyJAhSkhIsD6OHz/u1NcAAAAAAAAAAAAAAHCtXD1jPDsaNWqkjIyMbPf38vKSl5eXExMBAAAAAAAAAAAAAHKTXD1jvESJEnJ3d9fp06dt2k+fPq3AwMAcjW2xWBQcHKywsLAcjQMAAAAAAAAAAAAAyN2yVRh/4IEHdOHCBUlSdHS0Ll++7NRQ13h6eqpOnTqKiYmxtmVkZCgmJkb169fP0dhms1nx8fGKjY3NaUwAAAAAAAAAAAAAQC6WrcL43r17lZycLEkaOXKkkpKSHBYgKSlJcXFxiouLkyQdOXJEcXFxOnbsmCSpf//++vTTTzVr1izt3btXr7zyipKTkxUVFeWwDAAAAAAAAAAAAACAvCtb9xgPDQ1VVFSUGjVqJMMw9O6778rHxyfLvsOGDbMrwLZt29SsWTPrdv/+/SVJERERmjlzpjp37qy///5bw4YN06lTpxQaGqoVK1YoICDAruP8l8VikcViUXp6eo7GAQAAAAAAAAAAAADkbibDMIxbddq/f7+GDx+uQ4cOaceOHQoODpaHR+aauslk0o4dO5wS1FkSExPl7++vhIQE+fn5uTrObekYEe3qCACAO2zBLPv+EA2Ow/cuAOQ/fO8CAAAAAHD3y9aM8WrVqunrr7+WJLm5uSkmJkalSpVyajAAAAAAAAAAAAAAABwhW4Xx62VkZDgjBwAAAAAAAAAAAAAATmF3YVySDh06pMmTJ2vv3r2SpODgYPXt21eVK1d2aDhn4h7jAAAAAAAAAAAAAJA/uNn7hJUrVyo4OFhbt25VrVq1VKtWLW3ZskU1atTQ6tWrnZHRKcxms+Lj4xUbG+vqKAAAAAAAAAAAAAAAJ7J7xvjgwYP12muvafz48Zna33jjDT366KMOCwcAAAAAAAAAAAAAQE7ZPWN87969euGFFzK1d+/eXfHx8Q4JBQAAAAAAAAAAAACAo9hdGC9ZsqTi4uIytcfFxalUqVKOyHRHWCwWBQcHKywszNVRAAAAAAAAAAAAAABOZPdS6j169NBLL72kw4cPq0GDBpKkTZs2acKECerfv7/DAzqL2WyW2WxWYmKi/P39XR0HAAAAAAAAAAAAAOAkdhfGhw4dKl9fX7333nsaMmSIJKlMmTIaMWKE+vTp4/CAAAAAAAAAAAAAAADkhN2FcZPJpNdee02vvfaaLl26JEny9fV1eDAAAAAAAAAAAAAAABzB7nuMX8/X1/euLYpzj3EAAAAAAAAAAAAAyB9yVBi/m5nNZsXHxys2NtbVUQAAAAAAAAAAAAAATpRvC+MAAAAAAAAAAAAAgPyBwjgAAAAAAAAAAAAAIE+zqzB+9epVNW/eXAcOHHBWHgAAAAAAAAAAAAAAHMquwniBAgW0e/duZ2UBAAAAAAAAAAAAAMDh7F5KvVu3bvr888+dkeWOslgsCg4OVlhYmKujAAAAAAAAAAAAAACcyMPeJ6SlpWn69Olas2aN6tSpI29vb5v9kyZNclg4ZzKbzTKbzUpMTJS/v7+r4wAAAAAAAAAAAAAAnMTuwvhvv/2mBx54QJL0+++/2+wzmUyOSQU
AAAAAAAAAAAAAgIPYXRhft26dM3IAAAAAAAAAAAAAAOAUdt9j/JqDBw9q5cqV+ueffyRJhmE4LBQAAAAAAAAAAAAAAI5id2H83Llzat68uapWrarHH39cJ0+elCS98MILGjBggMMDAgAAAAAAAAAAAACQE3YXxl977TUVKFBAx44dU+HCha3tnTt31ooVKxwaDgAAAAAAAAAAAACAnLL7HuOrVq3SypUrVa5cOZv2e++9V3/88YfDggEAAAAAAAAAAAAA4Ah2zxhPTk62mSl+zfnz5+Xl5eWQUHeCxWJRcHCwwsLCXB0FAAAAAAAAAAAAAOBEdhfGH374YX3xxRfWbZPJpIyMDE2cOFHNmjVzaDhnMpvNio+PV2xsrKujAAAAAAAAAAAAAACcyO6l1CdOnKjmzZtr27ZtSk1N1euvv649e/bo/Pnz2rRpkzMyAgAAAAAAAAAAAABw2+yeMX7//ffr999/V6NGjfTUU08pOTlZ7du3186dO1W5cmVnZAQAAAAAAAAAAAAA4LbZPWNckvz9/fXWW285OgsAAAAAAAAAAAAAAA53W4XxCxcu6PPPP9fevXslScHBwYqKilKxYsUcGg4AAAAAAAAAAAAAgJyyeyn1jRs3KigoSB988IEuXLigCxcu6IMPPlClSpW0ceNGZ2QEAAAAAAAAAAAAAOC22T1j3Gw2q3Pnzvr444/l7u4uSUpPT9err74qs9msX3/91eEhAQAAAAAAAAAAAAC4XXbPGD948KAGDBhgLYpLkru7u/r376+DBw86NBwAAAAAAAAAAAAAADlld2H8gQcesN5b/Hp79+5VSEiIQ0IBAAAAAAAAAAAAAOAo2VpKfffu3daf+/Tpo759++rgwYN66KGHJEm//PKLLBaLxo8f75yUAAAAAAAAAAAAAADcpmwVxkNDQ2UymWQYhrXt9ddfz9Tv2WefVefOnR2XzoksFossFovS09NdHQUAAAAAAAAAAAAA4ETZKowfOXLE2TnuOLPZLLPZrMTERPn7+7s6DgAAAAAAAAAAAADASbJVGK9YsaKzcwAAAAAAAAAAAAAA4BTZKoz/14kTJ/TTTz/pzJkzysjIsNnXp08fhwQDAAAAAAAAAAAAAMAR7C6Mz5w5Uz179pSnp6eKFy8uk8lk3WcymSiMAwAAAAAAAAAAAAByFbsL40OHDtWwYcM0ZMgQubm5OSMTAAAAAAAAAAAAAAAOY3dl+/Lly3rmmWcoigMAAAAAAAAAAAAA7gp2V7dfeOEFzZ8/3xlZAAAAAAAAAAAAAABwOLuXUh83bpyeeOIJrVixQjVr1lSBAgVs9k+aNMlh4QAAAAAAAAAAAAAAyKnbKoyvXLlS1apVkySZTCbrvut/BgAAAAAAAAAAAAAgN7C7MP7ee+9p+vTpioyMdEIcAAAAAAAAAAAAAAAcy+57jHt5ealhw4bOyAIAAAAAAAAAAAAAgMPZXRjv27evpk6d6owst61du3YqWrSoOnbs6OooAAAAAAAAAAAAAIBcxu6l1Ldu3aq1a9fqu+++U40aNVSgQAGb/QsXLnRYuOzq27evunfvrlmzZt3xYwMAAAAAAAAAAAAAcje7C+NFihRR+/btnZHltjVt2lTr1693dQwAAAAAAAAAAAAAQC5kd2F8xowZDg2wceNGvfPOO9q+fbtOnjypRYsWqW3btjZ9LBaL3nnnHZ06dUohISGaOnWqHnzwQYfmAAAAAAAAAAAAAADkTXbfY9zRkpOTFRISIovFkuX+efPmqX///ho+fLh27NihkJAQhYeH68yZM3c4KQAAAAAAAAAAAADgbmT3jPFKlSrJZDLdcP/hw4ftGq9Vq1Zq1arVDfdPmjRJPXr0UFRUlCRp2rRpWr58uaZPn67BgwfbdSxJSklJUUpKinU7MTHR7jEAAAAAAAAAAAAAAHcPuwvj/fr1s9m+evWqdu7cqRUrVmjQoEGOyiVJSk1N1fbt2zVkyBBrm5ubm1q0aKHNmzff1pj
jxo3TyJEjHRURAAAAAAAAAAAAAJDL2V0Y79u3b5btFotF27Zty3Gg6509e1bp6ekKCAiwaQ8ICNC+ffus2y1atNCuXbuUnJyscuXKaf78+apfv36WYw4ZMkT9+/e3bicmJqp8+fIOzQ0AAAAAAAAAAAAAyD3sLozfSKtWrTRkyBDNmDHDUUNm25o1a7Ld18vLS15eXk5MAwAAAAAAAAAAAADITdwcNdCCBQtUrFgxRw0nSSpRooTc3d11+vRpm/bTp08rMDAwR2NbLBYFBwcrLCwsR+MAAAAAAAAAAAAAAHI3u2eM165dWyaTybptGIZOnTqlv//+Wx999JFDw3l6eqpOnTqKiYlR27ZtJUkZGRmKiYlRr169cjS22WyW2WxWYmKi/P39HZAWAAAAAAAAAAAAAJAb2V0Yv1agvsbNzU0lS5ZU06ZNdd9999kdICkpSQcPHrRuHzlyRHFxcSpWrJgqVKig/v37KyIiQnXr1tWDDz6oyZMnKzk5WVFRUXYfCwAAAAAAAAAAAACQ/9hdGB8+fLhDA2zbtk3NmjWzbvfv31+SFBERoZkzZ6pz5876+++/NWzYMJ06dUqhoaFasWKFAgICcnRci8Uii8Wi9PT0HI0DAAAAAAAAAAAAAMjdTIZhGK4O4UrXllJPSEiQn5+fq+Pclo4R0a6OAAC4wxbMGubqCPkW37sAkP/wvQsAAAAAwN0v2zPG3dzcbO4tnhWTyaS0tLQchwIAAAAAAAAAAAAAwFGyXRhftGjRDfdt3rxZH3zwgTIyMhwS6k5gKXUAAAAAAAAAAAAAyB+yXRh/6qmnMrXt379fgwcP1rJly9S1a1dFR989S4uazWaZzWbrUuoAAAAAAAAAAAAAgLzJ7XaedOLECfXo0UM1a9ZUWlqa4uLiNGvWLFWsWNHR+QAAAAAAAAAAAAAAyBG7CuMJCQl64403VKVKFe3Zs0cxMTFatmyZ7r//fmflAwAAAAAAAAAAAAAgR7K9lPrEiRM1YcIEBQYG6quvvspyaXUAAAAAAAAAAAAAAHKbbBfGBw8erEKFCqlKlSqaNWuWZs2alWW/hQsXOiycM1ksFlksFqWnp7s6CgAAAAAAAAAAAADAibJdGH/++edlMpmcmeWOMpvNMpvNSkxMlL+/v6vjAAAAAAAAAAAAAACcJNuF8ZkzZzoxBgAAAAAAAABJ6t27txYvXqyEhAT5+vrq6aef1sSJE+Xp6enqaAAAAMBdy83VAQAAAAAAAAD8n1dffVX79u1TYmKidu3apV27dmnixImujgUAAADc1fJtYdxisSg4OFhhYWGujgIAAAAAAABYVa9eXd7e3pIkwzDk5uamAwcOuDgVAAAAcHfLt4Vxs9ms+Ph4xcbGujoKAAAAAAAAYGP8+PHy8fFRqVKltGvXLvXu3dvVkQAAAIC7Wr4tjAMAAAAAAAC51eDBg5WUlKT4+Hi9/PLLCgwMdHUkAAAA4K5GYRwAAAAAAADIpapXr66QkBBFRka6OgoAAABwV6MwDgAAAAAAAORiV69e5R7jAAAAQA5RGAcAAAAAAAByiaSkJM2YMUMXL16UYRj69ddfNXr0aIWHh7s6GgAAAHBXy7eFcYvFouDgYIWFhbk6CgAAAAAAACBJMplMmjt3ripXrixfX1899dRTat26tSZPnuzqaAAAAMBdzcPVAVzFbDbLbDYrMTFR/v7+ro4DAAAAAAAAyNvbW6tXr3Z1DAAAACDPybczxgEAAAAAAAAAAAAA+QOFcQAAAAAAAAAAAABAnkZhHAAAAAAAAAAAAACQp1EYBwAAAAAAAAAAAADkaRTGAQAAAAAAAAAAAAB5GoVxAAAAAAAAAAAAAECe5uHqAK5isVhksViUnp7u6igAAAAAAOAGOkZEuzoCAOAOWzBrmKsjAACAPCjfzhg3m82Kj49XbGysq6MAAAAAAAAAAAAAAJwo3xbGAQAAAAAAAAAAAAD5A4VxAAAAAAAAAAAAAECeRmEcAAAAAAA
AAAAAAJCnURgHAAAAAAAAAAAAAORpFMYBAAAAAAAAAAAAAHkahXEAAAAAAAAAAAAAQJ5GYRwAAAAAAAAAAAAAkKdRGAcAAAAAAAAAAAAA5GkUxgEAAAAAAAAAAAAAeVq+LYxbLBYFBwcrLCzM1VEAAAAAAAAAAAAAAE6UbwvjZrNZ8fHxio2NdXUUAAAAAAAAAAAAAIAT5dvCOAAAAAAAAAAAAPK3Dz/8UHXr1pWXl5fatm3r6jgAnMjD1QEAAAAAAAAAAAAAVyhTpozefvttrVmzRn/++aer4wBwIgrjAAAAAAAAAAAAyJfat28vSYqLi6MwDuRxLKUOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPI0CuMAAAAAAAAAAAAAgDyNwjgAAAAAAAAAAAAAIE/zcHUAAAAAAAAAAAAAwBXS0tKsj4yMDF25ckVubm7y9PR0dTQADkZhHAAAAAAAAAAAAPnS6NGjNXLkSOt2oUKF1KRJE61fv951oQA4BUupAwAAAAAAAAAAIF8aMWKEDMOweVAUB/KmPFEY/+6771StWjXde++9+uyzz1wdBwAAAAAAAAAAAACQi9z1S6mnpaWpf//+Wrdunfz9/VWnTh21a9dOxYsXd3U0AAAAAAAAAAAAAEAucNfPGN+6datq1KihsmXLysfHR61atdKqVatcHQsAAAAAAAAAAAAAkEu4vDC+ceNGtWnTRmXKlJHJZNLixYsz9bFYLAoKClLBggVVr149bd261brvxIkTKlu2rHW7bNmy+uuvv+5EdAAAAAAAAAAAAADAXcDlhfHk5GSFhITIYrFkuX/evHnq37+/hg8frh07digkJETh4eE6c+bMHU4KAAAAAAAAAAAAALgbubww3qpVK40ePVrt2rXLcv+kSZPUo0cPRUVFKTg4WNOmTVPhwoU1ffp0SVKZMmVsZoj/9ddfKlOmzA2Pl5KSosTERJsHAAAAAAAAAAAAACDv8nB1gJtJTU3V9u3bNWTIEGubm5ubWrRooc2bN0uSHnzwQf3222/666+/5O/vrx9++EFDhw694Zjjxo3TyJEjnZ4dAAAAAAAAAIC7TceIaFdHAADcYQtmDXN1hDvC5TPGb+bs2bNKT09XQECATXtAQIBOnTolSfLw8NB7772nZs2aKTQ0VAMGDFDx4sVvOOaQIUOUkJBgfRw/ftyprwEAAAAAAAAAAAAA4Fq5esZ4dj355JN68skns9XXy8tLXl5eTk4EAAAAAAAAAAAAAMgtcvWM8RIlSsjd3V2nT5+2aT99+rQCAwNzNLbFYlFwcLDCwsJyNA4AAAAAAAAAAAAAIHfL1YVxT09P1alTRzExMda2jIwMxcTEqH79+jka22w2Kz4+XrGxsTmNCQAAAAAAAAAAAADIxVy+lHpSUpIOHjxo3T5y5Iji4uJUrFgxVahQQf3791dERITq1q2rBx98UJMnT1ZycrKioqJcmBoAAAAAAAAAAAAAcLdweWF827ZtatasmXW7f//+kqSIiAjNnDlTnTt31t9//61hw4bp1KlTCg0N1YoVKxQQEJCj41osFlksFqWnp+doHAAAAAAAAAAAAABA7ubywnjTpk1lGMZN+/Tq1Uu9evVy6HHNZrPMZrMSExPl7+/v0LEBAAAAAAAAAAAAALlHrr7HOAAAAAAAAAAAAAAAOZVvC+MWi0XBwcEKCwtzdRQAAAAAAAAAAAAAgBPl28K42WxWfHy8YmNjXR0FAAAAAAAAAAAAAOBE+bYwDgAAAAAAAAAAAADIHyiMAwAAAAAAAAAAAADyNA9XB3AVi8Uii8WitLQ0SVJiYqKLE92+q6lXXB0BAHCH3c3fW3c7vncBIP/he9e1+O4FgPyH717X4rsXAPKfvPDd6+vrK5PJdNM+JsMwjDuUJ1f6888/Vb58eVfHAAAAAAAAAAAAAADchoSEBPn5+d20T74vjGdkZOjEiRPZ+isCALlHYmKiypcvr+PHj9/yFx0AAMg5vnsBALi
z+O4FAODO4rsXuLtlp9abb5dSv8bNzU3lypVzdQwAt8nPz4//SAEA4A7iuxcAgDuL714AAO4svnuBvMvN1QEAAAAAAAAAAAAAAHAmCuMAAAAAAAAAAAAAgDyNwjiAu5KXl5eGDx8uLy8vV0cBACBf4LsXAIA7i+9eAADuLL57gbzPZBiG4eoQAAAAAAAAAAAAAAA4CzPGAQAAAAAAAAAAAAB5GoVxAAAAAAAAAAAAAECeRmEcAAAAAAAAAAAAAJCnURgHcFfZuHGj2rRpozJlyshkMmnx4sWujgQAQJ42btw4hYWFydfXV6VKlVLbtm21f/9+V8cCACDPunTpkvr166eKFSuqUKFCatCggWJjY10dCwCAPOFW/74cGRkpk8lk82jZsqVrwgJwOArjAO4qycnJCgkJkcVicXUUAADyhQ0bNshsNuuXX37R6tWrdfXqVT322GNKTk52dTQAAPKkF198UatXr9bs2bP166+/6rHHHlOLFi30119/uToaAAB3vez8+3LLli118uRJ6+Orr766gwkBOJPJMAzD1SEA4HaYTCYtWrRIbdu2dXUUAADyjb///lulSpXShg0b1LhxY1fHAQAgT/nnn3/k6+urJUuWqHXr1tb2OnXqqFWrVho9erQL0wEAkLdk9e/LkZGRunjxIiuVAnkUM8YBAAAAZFtCQoIkqVixYi5OAgBA3pOWlqb09HQVLFjQpr1QoUL66aefXJQKAID8Zf369SpVqpSqVaumV155RefOnXN1JAAOQmEcAAAAQLZkZGSoX79+atiwoe6//35XxwEAIM/x9fVV/fr1NWrUKJ04cULp6en68ssvtXnzZp08edLV8QAAyPNatmypL774QjExMZowYYI2bNigVq1aKT093dXRADiAh6sDAAAAALg7mM1m/fbbb8xYAwDAiWbPnq3u3burbNmycnd31wMPPKAuXbpo+/btro4GAECe98wzz1h/rlmzpmrVqqXKlStr/fr1at68uQuTAXAEZowDAAAAuKVevXrpu+++07p161SuXDlXxwEAIM+qXLmyNmzYoKSkJB0/flxbt27V1atXdc8997g6GgAA+c4999yjEiVK6ODBg66OAsABKIwDAAAAuCHDMNSrVy8tWrRIa9euVaVKlVwdCQCAfMHb21ulS5fWhQsXtHLlSj311FOujgQAQL7z559/6ty5cypdurSrowBwAJZSB3BXSUpKsvnrvCNHjiguLk7FihVThQoVXJgMAIC8yWw2a+7cuVqyZIl8fX116tQpSZK/v78KFSrk4nQAAOQ9K1eulGEYqlatmg4ePKhBgwbpvvvuU1RUlKujAQBw17vZvy8XK1ZMI0eOVIcOHRQYGKhDhw7p9ddfV5UqVRQeHu7C1AAcxWQYhuHqEACQXevXr1ezZs0ytUdERGjmzJl3PhAAAHmcyWTKsn3GjBmKjIy8s2EAAMgHvvnmGw0ZMkR//vmnihUrpg4dOmjMmDHy9/d3dTQAAO56N/v35Y8//lht27bVzp07dfHiRZUpU0aPPfaYRo0apYCAABekBeBoFMYBAAAAAAAAAAAAAHka9xgHAAAAAAAAAAAAAORpFMYBAAAAAAAAAAAAAHkahXEAAAAAAAAAAAAAQJ5GYRwAAAAAAAAAAAAAkKdRGAcAAAAAAAAAAAAA5GkUxgEAAAAAAAAAAAAAeRqFcQAAAAAAAAAAAABAnkZhHAAAAAAAAAAAAACQp1EYBwAAAAAXOHr0qEwmk+Li4lwdxWrfvn166KGHVLBgQYWGhro6jtNERkaqbdu2ro6RqwQFBWny5MkuOfbMmTNVpEgRh43XtGlT9evX744e83Y463dAbn5/r1+/XiaTSRcvXnR1FAAAAAD5EIVxAAAAAPlSZGSkTCaTxo8fb9O+ePFimUwmF6VyreHDh8vb21v79+9XTExMln2yU3TM7aZMmaKZM2fa9RxXFo7vhNjYWL300kvZ7p8bCss3snDhQo0aNcq6nVuvXfn
y5XXy5Endf//9ro5yxzRo0EAnT56Uv7+/Q8c1mUxavHixQ8cEAAAAkPdQGAcAAACQbxUsWFATJkzQhQsXXB3FYVJTU2/7uYcOHVKjRo1UsWJFFS9e/LbHMQxDaWlpt/18Z0lPT1dGRob8/f1zbVHXVUqWLKnChQu7OoZDFCtWTL6+vq6OcUvu7u4KDAyUh4eHq6PcEVevXpWnp6cCAwPz7R8fAQAAAHAtCuMAAAAA8q0WLVooMDBQ48aNu2GfESNGZFpWfPLkyQoKCrJuX1u6eOzYsQoICFCRIkUUHR2ttLQ0DRo0SMWKFVO5cuU0Y8aMTOPv27dPDRo0UMGCBXX//fdrw4YNNvt/++03tWrVSj4+PgoICNBzzz2ns2fPWvc3bdpUvXr1Ur9+/VSiRAmFh4dn+ToyMjIUHR2tcuXKycvLS6GhoVqxYoV1v8lk0vbt2xUdHS2TyaQRI0ZkGiMyMlIbNmzQlClTZDKZZDKZdPToUevyyD/88IPq1KkjLy8v/fTTTzp06JCeeuopBQQEyMfHR2FhYVqzZo3NmEFBQRo7dqy6d+8uX19fVahQQZ988ol1f2pqqnr16qXSpUurYMGCqlixos31unjxonr27KmAgADrOfzuu+8k/d+s5qVLlyo4OFheXl46duxYpqWmr53DXr16yd/fXyVKlNDQoUNlGIZ1/x9//KHXXnvN+rol6Y8//lCbNm1UtGhReXt7q0aNGvr++++zPP+SNHv2bNWtW1e+vr4KDAzUs88+qzNnzlj3X7hwQV27dlXJkiVVqFAh3Xvvvdb3zK3Ow7Fjx/TUU0/Jx8dHfn5+6tSpk06fPm1z/GXLliksLEwFCxZUiRIl1K5dO5vrcP2s6kmTJqlmzZry9vZW+fLl9eqrryopKUnSv8thR0VFKSEhwXo+rr1fUlJSNHDgQJUtW1be3t6qV6+e1q9fb5Nj5syZqlChggoXLqx27drp3LlzNzxnktSxY0f16tXLut2vXz+ZTCbt27fPem68vb2t763rVzW40bW7ZuXKlapevbp8fHzUsmVLnTx58oY50tPT9cILL6hSpUoqVKiQqlWrpilTptw0+82u6X+XUr/2OYqJiVHdunVVuHBhNWjQQPv377cZc/To0SpVqpR8fX314osvavDgwTe99UFGRobGjRtnzR0SEqIFCxbcsP+bb76pevXqZWoPCQlRdHS0pH9XGHj00UdVokQJ+fv7q0mTJtqxY4dNf5PJpI8//lhPPvmkvL29NWbMmExLqZ87d05dunRR2bJlVbhwYdWsWVNfffWVzThNmzZVnz599Prrr6tYsWIKDAy0+f107Xdxu3btZDKZbH43AwAAAMD1KIwDAAAAyLfc3d01duxYTZ06VX/++WeOxlq7dq1OnDihjRs3atKkSRo+fLieeOIJFS1aVFu2bNHLL7+snj17ZjrOoEGDNGDAAO3cuVP169dXmzZtrIXCixcv6pFHHlHt2rW1bds2rVixQqdPn1anTp1sxpg1a5Y8PT21adMmTZs2Lct8U6ZM0Xvvvad3331Xu3fvVnh4uJ588kkdOHBAknTy5EnVqFFDAwYM0MmTJzVw4MAsx6hfv7569OihkydP6uTJkypfvrx1/+DBgzV+/Hjt3btXtWrVUlJSkh5//HHFxMRo586datmypdq0aaNjx47ZjPvee++pbt262rlzp1599VW98sor1mLgBx98oKVLl+qbb77R/v37NWfOHGvhKyMjQ61atdKmTZv05ZdfKj4+XuPHj5e7u7t17MuXL2vChAn67LPPtGfPHpUqVSrL8zNr1ix5eHho69atmjJliiZNmqTPPvtM0r9Lc5crV07R0dHW1y1JZrNZKSkp2rhxo3799VdNmDBBPj4+WY4v/TtjdtSoUdq1a5cWL16so0ePKjIy0rp/6NChio+P1w8//KC9e/fq448/VokSJbJ1Hp566imdP39eGzZs0OrVq3X48GF17tzZOvby5cvVrl07Pf7449q
5c6diYmL04IMP3jCrm5ubPvjgA+3Zs0ezZs3S2rVr9frrr0v6dznsyZMny8/Pz3o+rr1fevXqpc2bN+vrr7/W7t279fTTT6tly5bW99mWLVv0wgsvqFevXoqLi1OzZs00evToG+aQpCZNmtgU1zds2KASJUpY22JjY3X16lU1aNAg03NvdO2kf98b7777rmbPnq2NGzfq2LFjWb7vr8nIyFC5cuU0f/58xcfHa9iwYXrzzTf1zTff3PA5N7umN/LWW2/pvffe07Zt2+Th4aHu3btb982ZM0djxozRhAkTtH37dlWoUEEff/zxTccbN26cvvjiC02bNk179uzRa6+9pm7dumX6I5xrunbtqq1bt+rQoUPWtj179mj37t169tlnJUmXLl1SRESEfvrpJ/3yyy+699579fjjj+vSpUs2Y40YMULt2rXTr7/+avM6rrly5Yrq1Kmj5cuX67ffftNLL72k5557Tlu3brXpN2vWLHl7e2vLli2aOHGioqOjtXr1akn/Xn9JmjFjhk6ePGndBgAAAIBMDAAAAADIhyIiIoynnnrKMAzDeOihh4zu3bsbhmEYixYtMq7/X6Xhw4cbISEhNs99//33jYoVK9qMVbFiRSM9Pd3aVq1aNePhhx+2bqelpRne3t7GV199ZRiGYRw5csSQZIwfP97a5+rVq0a5cuWMCRMmGIZhGKNGjTIee+wxm2MfP37ckGTs37/fMAzDaNKkiVG7du1bvt4yZcoYY8aMsWkLCwszXn31Vet2SEiIMXz48JuO06RJE6Nv3742bevWrTMkGYsXL75ljho1ahhTp061blesWNHo1q2bdTsjI8MoVaqU8fHHHxuGYRi9e/c2HnnkESMjIyPTWCtXrjTc3Nys5+K/ZsyYYUgy4uLibNqvv/bXXlP16tVtjvHGG28Y1atXt8n5/vvv24xTs2ZNY8SIEbd8zTcSGxtrSDIuXbpkGIZhtGnTxoiKisqy783Ow6pVqwx3d3fj2LFj1rY9e/YYkoytW7cahmEY9evXN7p27XrDLFm9vuvNnz/fKF68uHV7xowZhr+/v02fP/74w3B3dzf++usvm/bmzZsbQ4YMMQzDMLp06WI8/vjjNvs7d+6caazr7d692zCZTMaZM2eM8+fPG56ensaoUaOMzp07G4ZhGKNHjzYaNGhg7f/f92hWr+3ae+PgwYPWNovFYgQEBNwwR1bMZrPRoUOHG+6/2TW99jtg586dhmH83+dozZo11j7Lly83JBn//POPYRiGUa9ePcNsNtuM07BhQ5vfUde/v69cuWIULlzY+Pnnn22e88ILLxhdunS5Ye6QkBAjOjrauj1kyBCjXr16N+yfnp5u+Pr6GsuWLbO2STL69etn0+/aa7xw4cINx2rdurUxYMAA63aTJk2MRo0a2fQJCwsz3njjDZtjLVq06IZjAgAAAIBhGAYzxgEAAADkexMmTNCsWbO0d+/e2x6jRo0acnP7v//FCggIUM2aNa3b7u7uKl68uM3S2ZJUv359688eHh6qW7euNceuXbu0bt06+fj4WB/33XefJNnM5qxTp85NsyUmJurEiRNq2LChTXvDhg1z9Jr/q27dujbbSUlJGjhwoKpXr64iRYrIx8dHe/fuzTRjvFatWtafTSaTAgMDrecpMjJScXFxqlatmvr06aNVq1ZZ+8bFxalcuXKqWrXqDTN5enrajH8jDz30kM0y2/Xr19eBAweUnp5+w+f06dNHo0ePVsOGDTV8+HDt3r37psfYvn272rRpowoVKsjX11dNmjSRJOv5eOWVV/T1118rNDRUr7/+un7++Wfrc292Hvbu3avy5cvbzN4PDg5WkSJFrNc3Li5OzZs3v+V5uGbNmjVq3ry5ypYtK19fXz333HM6d+6cLl++fMPn/Prrr0pPT1fVqlVt3rMbNmywvl/37t2baZnu6z8DWbn//vtVrFgxbdiwQT/++KNq166tJ554wjrjecOGDWratGm2X9s1hQs
XVuXKla3bpUuXzvT5/C+LxaI6deqoZMmS8vHx0SeffJLp/Xy9m13TG7n+/Vq6dGlJsubav39/ppn+N5v5f/DgQV2+fFmPPvqozTX54osvbH6H/FfXrl01d+5cSZJhGPrqq6/UtWtX6/7Tp0+rR48euvfee+Xv7y8/Pz8lJSVlOhf//Z3wX+np6Ro1apRq1qypYsWKycfHRytXrrzp7wgpe9cKAAAAAP6LwjgAAACAfK9x48YKDw/XkCFDMu1zc3Oz3mv6mqtXr2bqV6BAAZttk8mUZVtGRka2cyUlJalNmzaKi4uzeRw4cECNGze29vP29s72mM703xwDBw7UokWLNHbsWP3444+Ki4tTzZo1lZqaatPvZufpgQce0JEjRzRq1Cj9888/6tSpkzp27ChJKlSo0C0zFSpUKNN9pR3lxRdf1OHDh/Xcc8/p119/Vd26dTV16tQs+yYnJys8PFx+fn6aM2eOYmNjtWjRIkmyno9WrVpZ74d94sQJNW/e3Lq0983OQ3Zk51xdc/ToUT3xxBOqVauWvv32W23fvl0Wi8Uma1aSkpLk7u6u7du327xf9+7de8t7cd+MyWRS48aNtX79emsRvFatWkpJSdFvv/2mn3/+2fpHBvbI6n3338/69b7++msNHDhQL7zwglatWqW4uDhFRUXd9Jzc7JpmJ9e19649vzeud+2+8MuXL7e5JvHx8Te9z3iXLl20f/9+7dixQz///LOOHz9uszR/RESE4uLiNGXKFP3888+Ki4tT8eLFM52LW/1ueueddzRlyhS98cYbWrduneLi4hQeHm7X7wgAAAAAyC4K4wAAAAAgafz48Vq2bJk2b95s016yZEmdOnXKpmAWFxfnsOP+8ssv1p/T0tK0fft2Va9eXdK/xdA9e/YoKChIVapUsXnYUwz38/NTmTJltGnTJpv2TZs2KTg42K68np6eN51F/d/xIyMj1a5dO9WsWVOBgYE6evSoXceT/s3fuXNnffrpp5o3b56+/fZbnT9/XrVq1dKff/6p33//3e4x/2vLli0229fum3ztfuU3et3ly5fXyy+/rIULF2rAgAH69NNPsxx/3759OnfunMaPH6+HH35Y9913X5YzXkuWLKmIiAh9+eWXmjx5sj755BPrvhudh+rVq+v48eM6fvy4tW98fLwuXrxovb61atVSTExMts7F9u3blZGRoffee08PPfSQqlatqhMnTtj0yep81K5dW+np6Tpz5kym92tgYKAkqXr16lme61u5dp/x9evXq2nTpnJzc1Pjxo31zjvvKCUlJdNqCLfKejs2bdqkBg0a6NVXX1Xt2rVVpUqVm866vuZm19Re1apVy3QP7ZvdUzs4OFheXl46duxYpmty/QoD/1WuXDk1adJEc+bM0Zw5c/Too4+qVKlS1v2bNm1Snz599Pjjj6tGjRry8vLS2bNn7X49mzZt0lNPPaVu3bopJCRE99xzz219ngsUKOCQawwAAAAgb6MwDgAAAACSatasqa5du+qDDz6waW/atKn+/vtvTZw4UYcOHZLFYtEPP/zgsONaLBYtWrRI+/btk9ls1oULF9S9e3dJktls1vnz59WlSxfFxsbq0KFDWrlypaKiouwuAg0aNEgTJkzQvHnztH//fg0ePFhxcXHq27evXeMEBQVpy5YtOnr0qM6ePXvTWZv33nuvFi5cqLi4OO3atUvPPvus3bM8J02apK+++kr79u3T77//rvnz5yswMFBFihRRkyZN1LhxY3Xo0EGrV6/WkSNH9MMPP2jFihV2HUP6dznz/v37a//+/frqq680depUm3MTFBSkjRs36q+//rIWAPv166eVK1fqyJEj2rFjh9atW2f9o4b/qlChgjw9PTV16lQdPnxYS5cu1ahRo2z6DBs2TEuWLNHBgwe1Z88efffdd9bxbnYeWrRoYX3/7tixQ1u3btXzzz+vJk2aWJeyHj58uL766isNHz5ce/fu1a+//qoJEyZkmbVKlSq
6evWqNevs2bM1bdo0mz5BQUFKSkpSTEyMzp49q8uXL6tq1arq2rWrnn/+eS1cuFBHjhzR1q1bNW7cOC1fvlzSv8vPr1ixQu+++64OHDigDz/8MFvXq2nTpoqPj9eePXvUqFEja9ucOXNUt27dm/6hSFbX7nbce++92rZtm1auXKnff/9dQ4cOvWlRWrr5Nb0dvXv31ueff65Zs2bpwIEDGj16tHbv3n3DVRF8fX01cOBAvfbaa5o1a5YOHTqkHTt2aOrUqZo1a9ZNj9W1a1d9/fXXmj9/vs0y6tK/52L27Nnau3evtmzZoq5du9q1KsH146xevVo///yz9u7dq549e+r06dN2jxMUFKSYmBidOnVKFy5csPv5AAAAAPIHCuMAAAAA8P9FR0dnKtxWr15dH330kSwWi0JCQrR169ZbLoVsj/Hjx2v8+PEKCQnRTz/9pKVLl6pEiRKSZJ3lnZ6erscee0w1a9ZUv379VKRIEZv7mWdHnz591L9/fw0YMEA1a9bUihUrtHTpUt177712jTNw4EC5u7srODhYJUuWvOn9lSdNmqSiRYuqQYMGatOmjcLDw/XAAw/YdTxfX19NnDhRdevWVVhYmI4eParvv//e+vq//fZbhYWFqUuXLgoODtbrr79+WzNHn3/+ef3zzz968MEHZTab1bdvX7300kvW/dHR0Tp69KgqV66skiVLSvr3/shms1nVq1dXy5YtVbVqVX300UdZjl+yZEnNnDlT8+fPV3BwsMaPH693333Xpo+np6eGDBmiWrVqqXHjxnJ3d9fXX399y/NgMpm0ZMkSFS1aVI0bN1aLFi10zz33aN68edaxmzZtqvnz52vp0qUKDQ3VI488oq1bt2aZNSQkRJMmTdKECRN0//33a86cORo3bpxNnwYNGujll19W586dVbJkSU2cOFGSNGPGDD3//PMaMGCAqlWrprZt2yo2NlYVKlSQ9O+93D/99FNNmTJFISEhWrVqld5+++1bXp+aNWuqSJEiCg0NlY+Pj/U1paen3/L+4lldu9vRs2dPtW/fXp07d1a9evV07tw5vfrqqzd9zs2u6e3o2rWrhgwZooEDB1qX14+MjFTBggVv+JxRo0Zp6NChGjdunPW9unz5clWqVOmmx+rYsaP1vvJt27a12ff555/rwoULeuCBB/Tcc8+pT58+NjPKs+vtt9/WAw88oPDwcDVt2lSBgYGZjpUd7733nlavXq3y5curdu3adj8fAAAAQP5gMm52Ay0AAAAAAPK4pk2bKjQ0VJMnT3Z1FMBujz76qAIDAzV79mxXRwEAAACAXM3D1QEAAAAAAABwa5cvX9a0adMUHh4ud3d3ffXVV1qzZo1Wr17t6mgAAAAAkOtRGAcAAAAAALgLmEwmff/99xozZoyuXLmiatWq6dtvv1WLFi1cHQ0AAAAAcj2WUgcAAAAAAAAAAAAA5Glurg4AAAAAAAAAAAAAAIAzURgHAAAAAAAAAAAAAORpFMYBAAAAAAAAAAAAAHkahXEAAAAAAAAAAAAAQJ5GYRwAAAAAAAAAAAAAkKdRGAcAAAAAAAAAAAAA5GkUxgEAAAAAAAAAAAAAeRqFcQAAAAAAAAAAAABAnkZhHAAAAAAAAAAAAACQp/0/l+Z9bE2aRhsAAAAASUVORK5CYII=", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAB8YAAAGGCAYAAAAJj+sGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABoCElEQVR4nO3deVhU5f//8dcAgsrmDu6YpkkpmJKp5ZIWmpmappkmYJnVuGullRvuVqbZlJ8Wl0zLNNcsN9zKTHFBTdRc03LLDQRSBM7vj37O1wlURmYchOfjuua6OPe55z6vOWeGKd/c9zEZhmEIAAAAAAAAAAAAAIA8ys3VAQAAAAAAAAAAAAAAcCYK4wAAAAAAAAAAAACAPI3COAAAAAAAAAAAAAAgT6MwDgAAAAAAAAAAAADI0yiMAwAAAAAAAAAAAADyNArjAAAAAAAAAAAAAIA8jcI4AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPI0CuMAAAAAcJcZPny4TCbTHTlW48aN1bhxY+v2unXrZDKZNH/+/Dty/MjISAUFBd2RY92upKQkvfTSSwoMDJTJZFLfvn1dHcnljh49KpPJpBkzZrg6Sr70388tAAAAAIDCOAAAAAC41IwZM2QymayPggULqkyZMgoPD9eHH36oS5cuOeQ4J06c0PDhwxUXF+eQ8RwpN2fLjjFjxmjGjBl69dVXNWvWLL3wwgs37bto0aI7F+4ulJKSouHDh2vdunWujoKb+OGHHzR8+HBXxwAAAACAbDMZhmG4OgQAAAAA5FczZsxQVFSUoqOjValSJV29elWnTp3SunXrtGrVKlWoUEFLlixRzZo1rc9JS0tTWlqaChYsmO3jbN26VWFhYZo+fboiIyOz/bzU1FRJkqenp6R/Z4w3adJE8+bNU/v27bM9zu1mu3r1qjIyMuTl5eWQYznDww8/LA8PD/3888+37Ovj46P27dvn+ZnUhmHoypUrKlCggNzd3e167tmzZ1WyZEkNGzaMwutt+u/n1hl69uwpi8Ui/lkJAAAAwN3Cw9UBAAAAAABSixYtVKdOHev24MGDtWbNGj311FN6+umntXfvXhUqVEiS5OHhIQ8P5/7vXEpKigoXLuzUwlp2FChQwKXHz44zZ84oODjY4eMmJyfL29vb4eM6U1pamjIyMuTp6WnXH27kBYZh6PLly9bPqSvkls8tAAAAAORGLKUOAAAAALnUY489piFDhuiPP/7QV199ZW3P6h7jq1at0iOPPKIiRYrIx8dH1apV01tvvSXp31neYWFhkqSoqCjrsu3XZi03btxYDzzwgLZt26aGDRuqcOHC1ufe6F7F6enpeuuttxQYGChvb289/fTTOn78uE2foKCgLGenXz/mrbJldY/x5ORkDRgwQOXLl5eXl5eqVaum9957L9PMVZPJpJ49e2rRokV64IEH5OXlpfvvv1/Lly/P+oT/x5kzZ/Tiiy8qICBABQsWVEhIiGbOnGndf+1+60eOHNGyZcus2Y8ePZrleCaTScnJyZo5c6a177Xzc+2axsfH6/nnn1fRokX1yCOPSJJ27dqlyMhI3XPPPSpYsKACAwPVrVs3nTt3zmb8a2McPHhQkZGRKlKkiPz9/RUVFaWUlBSbvjd7v1xz+fJlDR8+XFWrVlXBggVVunRpPfPMMzp06JCk/7uP+HvvvadJkyapcuXK8vLyUnx8fJb3GI+MjJSPj48OHz6s8PBweXt7q0yZMoqOjrZeu6NHj6pkyZKSpBEjRljP07WZ46dOnVJUVJTKlSsnLy8vlS5dWq1bt77hObfn2NdkZGRo0qRJuv/++1WwYEEFBASoR48eunDhgk2/oKAgPfXUU1qxYoXq1KmjQoUK6X//+1+Wx+/Zs6d8fHwyXQdJ6tSpkwIDA5Weni5JWrx4sVq2bKkyZcrIy8tLlStX1siRI637r7Hnc5uamqqhQ4eqdu3a8vf3l7e3tx599FG
tXbvWZszrr+mnn35qvaZhYWGKjY21OZ8Wi0WSbG4FAQAAAAC5GTPGAQAAACAXe+GFF/TWW29p5cqV6t69e5Z99uzZo6eeeko1a9ZUdHS0vLy8dPDgQW3cuFGSVL16dUVHR2vo0KF6+eWX9eijj0qS6tevbx3j3LlzatGihZ577jl16dJFAQEBN801evRomUwmvfnmmzpz5owmTZqkZs2aKS4uzq4Zs9nJdj3DMPT0009r7dq1evHFFxUaGqoVK1bo9ddf119//aUPPvjApv/PP/+sBQsW6LXXXpOvr68+/PBDtWvXTseOHVPx4sVvmOuff/5R48aNdfDgQfXs2VOVKlXSvHnzFBkZqYsXL6pPnz6qXr26Zs2apX79+qlcuXIaMGCAJFkLu/81a9YsvfTSS3rooYf08ssvS5IqV65s0+fZZ5/VvffeqzFjxlgLtqtWrdLhw4cVFRWlwMBA7dmzR59++qn27NmjX3/9NVNBskOHDqpUqZLGjh2r7du36/PPP1epUqU0fvx4Sbd+v0j//uHDU089pZiYGD333HPq06ePLl26pFWrVum3336zyT19+nRdvnxZL7/8sry8vFSsWDFlZGRkeQ7S09PVvHlzPfzww5owYYKWL1+uYcOGKS0tTdHR0SpZsqQ++eQTvfrqq2rbtq2eeeYZSbLeSqBdu3bas2ePevXqpaCgIJ05c0arVq3SsWPHMv0Bhb3HvqZHjx7WWxz07t1bR44c0UcffaQdO3Zo48aNNqsY7N+/X506dVKPHj3UvXt3VatWLctjd+zYURaLRcuWLdOzzz5rbU9JSdHSpUsVGRlpXXJ+xowZ8vHxUf/+/eXj46M1a9Zo6NChSkxM1LvvvmszbnY/t4mJifr888/VqVMnde/eXZcuXdIXX3yh8PBwbdmyRaGhoTb958yZo0uXLqlHjx4ymUyaMGGCnnnmGR0+fFgFChRQjx49dOLECa1atUqzZs266XkHAAAAgFzDAAAAAAC4zPTp0w1JRmxs7A37+Pv7G7Vq1bJuDxs2zLj+f+c++OADQ5Lx999/33CM2NhYQ5Ixffr0TPsaNWpkSDKmTp2a5b5GjRpZt9euXWtIMsqWLWskJiZa27/99ltDkjF58mRrW8WKFY2IiIhbjnmzbBEREUbFihWt24sWLTIkGaNGjbLp1759e8NkMhkHDx60tkkyPD09bdp27txpSDKmTJmS6VjXmzRpkiHJ+Oqrr6xtqampRr169QwfHx+b116xYkWjZcuWNx3vGm9v7yzPybVr2qlTp0z7UlJSMrV9/fXXhiRjw4YNmcbo1q2bTd+2bdsaxYsXt25n5/0ybdo0Q5IxceLETPsyMjIMwzCMI0eOGJIMPz8/48yZMzZ9ru27/ppGREQYkoxevXrZjNWyZUvD09PTmufvv/82JBnDhg2zGfPChQuGJOPdd9+9Ye4bye6xf/rpJ0OSMXv2bJvnL1++PFN7xYoVDUnG8uXLb3n8jIwMo2zZska7du1s2q99bq6/jlld7x49ehiFCxc2Ll++bG2z53OblpZmXLlyxabPhQsXjICAAJv3y7XrVrx4ceP8+fPW9sWLFxuSjKVLl1rbzGazwT8rAQAAALibsJQ6AAAAAORyPj4+unTp0g33FylSRNK/SzDfaKburXh5eSkqKirb/bt27SpfX1/rdvv27VW6dGn98MMPt3X87Prhhx/k7u6u3r1727QPGDBAhmHoxx9/tGlv1qyZzezmmjVrys/PT4cPH77lcQIDA9WpUydrW4ECBdS7d28lJSVp/fr1Dng1mb3yyiuZ2q6fgX/58mWdPXtWDz/8sCRp+/bttxzj0Ucf1blz55SYmCgpe++X7777TiVKlFCvXr0y7fvvDPV27drdcJZ8Vnr27GkzVs+ePZWamqrVq1ff9HmFChWSp6en1q1bl2lZc0cde968efL399fjjz+us2fPWh+1a9eWj49PpqXHK1WqpPDw8Fse12Qy6dlnn9UPP/ygpKQ
ka/vcuXNVtmxZ67L5117nNZcuXdLZs2f16KOPKiUlRfv27bMZN7ufW3d3d+t9xzMyMnT+/HmlpaWpTp06Wb6HOnbsqKJFi1q3r63kcKvPDQAAAADkZhTGAQAAACCXS0pKsilC/1fHjh3VoEEDvfTSSwoICNBzzz2nb7/91q4iedmyZa2Fs+y49957bbZNJpOqVKlyy3s959Qff/yhMmXKZDof1atXt+6/XoUKFTKNUbRo0VsWVv/44w/de++9cnOz/d/mGx3HUSpVqpSp7fz58+rTp48CAgJUqFAhlSxZ0tovISEhU///vuZrBc5rrzk775dDhw6pWrVq8vC49R3Yssp8I25ubrrnnnts2qpWrSpJt3zveHl5afz48frxxx8VEBCghg0basKECTp16pTDjn3gwAElJCSoVKlSKlmypM0jKSlJZ86csXm+Pa+9Y8eO+ueff7RkyRJJ/36uf/jhBz377LM2f2ywZ88etW3bVv7+/vLz81PJkiXVpUsXSZmvtz2f25kzZ6pmzZoqWLCgihcvrpIlS2rZsmW39R4CAAAAgLsR9xgHAAAAgFzszz//VEJCgqpUqXLDPoUKFdKGDRu0du1aLVu2TMuXL9fcuXP12GOPaeXKldZ7F9+MPfcFz67/ziy+Jj09PVuZHOFGxzH+//27c5usrkOHDh30yy+/6PXXX1doaKh8fHyUkZGh5s2bZ/nHD7d6zY54v9wqs7P07dtXrVq10qJFi7RixQoNGTJEY8eO1Zo1a1SrVq0cj5+RkaFSpUpp9uzZWe7/78x4e177ww8/rKCgIH377bd6/vnntXTpUv3zzz/q2LGjtc/FixfVqFEj+fn5KTo6WpUrV1bBggW1fft2vfnmm5mud3aP/9VXXykyMlJt2rTR66+/rlKlSsnd3V1jx47VoUOHMvW/2z43AAAAAJAdFMYBAAAAIBebNWuWJN1yuWY3Nzc1bdpUTZs21cSJEzVmzBi9/fbbWrt2rZo1a3bDIvXtOnDggM22YRg6ePCgatasaW0rWrSoLl68mOm5f/zxh83MXXuyVaxYUatXr9alS5dsZo1fW2K6YsWK2R7rVsfZtWuXMjIybGaN5/Q49l6HCxcuKCYmRiNGjNDQoUOt7f89//a61fulcuXK2rx5s65evaoCBQrk6FjXy8jI0OHDh60ztSXp999/lyQFBQVJuvU5qly5sgYMGKABAwbowIEDCg0N1fvvv6+vvvoqx8euXLmyVq9erQYNGjil4N+hQwdNnjxZiYmJmjt3roKCgqzL4kvSunXrdO7cOS1YsEANGza0th85ciRHx50/f77uueceLViwwOb8Dhs27LbHdPTvFAAAAABwNpZSBwAAAIBcas2aNRo5cqQqVaqkzp0737Df+fPnM7WFhoZKkq5cuSJJ8vb2lqQsC9W348svv7S57/n8+fN18uRJtWjRwtpWuXJl/frrr0pNTbW2ff/99zp+/LjNWPZke/LJJ5Wenq6PPvrIpv2DDz6QyWSyOX5OPPnkkzp16pTmzp1rbUtLS9OUKVPk4+OjRo0a3da43t7edl2DazN3/ztTd9KkSbd1fCl775d27drp7Nmzmc5zVlnsdf2YhmHoo48+UoECBdS0aVNJUuHChSVlfj+kpKTo8uXLNm2VK1eWr6+vNXdOj92hQwelp6dr5MiRmZ6blpaW489Px44ddeXKFc2cOVPLly9Xhw4dbPZndb1TU1P18ccf5+i4WY27efNmbdq06bbHdPTvFAAAAABwNmaMAwAAAEAu8OOPP2rfvn1KS0vT6dOntWbNGq1atUoVK1bUkiVLVLBgwRs+Nzo6Whs2bFDLli1VsWJFnTlzRh9//LHKlSunRx55RNK/BcQiRYpo6tSp8vX1lbe3t+rWrWvXPZKvV6xYMT3yyCOKiorS6dOnNWnSJFWpUkXdu3e39nnppZc0f/58NW/eXB06dNChQ4f01VdfqXLlyjZj2ZOtVatWatKkid5++20dPXpUISE
hWrlypRYvXqy+fftmGvt2vfzyy/rf//6nyMhIbdu2TUFBQZo/f742btyoSZMm3fSe7zdTu3ZtrV69WhMnTlSZMmVUqVIl1a1b94b9/fz8rPfSvnr1qsqWLauVK1fmaAZxdt4vXbt21Zdffqn+/ftry5YtevTRR5WcnKzVq1frtddeU+vWrW/r2AULFtTy5csVERGhunXr6scff9SyZcv01ltvWZcpL1SokIKDgzV37lxVrVpVxYoV0wMPPKC0tDQ1bdpUHTp0UHBwsDw8PLRw4UKdPn1azz33nEOO3ahRI/Xo0UNjx45VXFycnnjiCRUoUEAHDhzQvHnzNHnyZLVv3/62XrskPfjgg6pSpYrefvttXblyxWYZdUmqX7++ihYtqoiICPXu3Vsmk0mzZs3K8R8jPPXUU1qwYIHatm2rli1b6siRI5o6daqCg4OVlJR0W2PWrl1bktS7d2+Fh4fL3d09W9cBAAAAAFyFwjgAAAAA5ALXlsn29PRUsWLFVKNGDU2aNElRUVG3LMI+/fTTOnr0qKZNm6azZ8+qRIkSatSokUaMGCF/f39JUoECBTRz5kwNHjxYr7zyitLS0jR9+vTbLoy/9dZb2rVrl8aOHatLly6padOm+vjjj62zfaV/l39///33NXHiRPXt21d16tTR999/rwEDBtiMZU82Nzc3LVmyREOHDtXcuXM1ffp0BQUF6d133800bk4UKlRI69at06BBgzRz5kwlJiaqWrVqmj59uiIjI2973IkTJ+rll1/WO++8o3/++cdapL2ZOXPmqFevXrJYLDIMQ0888YR+/PFHlSlT5rYyZOf94u7urh9++EGjR4/WnDlz9N1336l48eJ65JFHVKNGjds67rVxly9frldffVWvv/66fH19NWzYMJtl4iXp888/V69evdSvXz+lpqZq2LBh6tWrlzp16qSYmBjNmjVLHh4euu+++/Ttt9+qXbt2Djv21KlTVbt2bf3vf//TW2+9JQ8PDwUFBalLly5q0KDBbb/2azp27KjRo0erSpUqevDBB232FS9e3PoZeeedd1S0aFF16dJFTZs2veXtFG4mMjJSp06d0v/+9z+tWLFCwcHB+uqrrzRv3jytW7futsZ85pln1KtXL33zzTf66quvZBgGhXEAAAAAuZrJyOmfHQMAAAAAANxCZGSk5s+ff9szlO/WYwMAAAAAcgfuMQ4AAAAAAAAAAAAAyNMojAMAAAAAAAAAAAAA8jQK4wAAAAAAAAAAAACAPI17jAMAAAAAAAAAAAAA8jRmjAMAAAAAAAAAAAAA8jQK4wDumF69eql8+fLy8/NT2bJl1bdvX6WmpkqSGjduLC8vL/n4+FgfJ06cuOFYiYmJev755+Xn56eAgACNHDnSZn98fLyaNm2qokWLKjAwUC+//LJSUlIkSWfOnFHnzp1Vrlw5+fn5qVatWlqyZInzXjgAAAAAAAAAAABcKt8Xxg3DUGJiolhRHnC+1157Tfv27VNiYqJ27typnTt3asKECdb948ePV1JSkvVRpkyZG47Vq1cvnT9/XseOHdNPP/2kzz77TF9++aV1//PPP69q1arp9OnT2r17t3bu3GktniclJalWrVr69ddfdfHiRUVHR6tTp06Kj4933osHAAAAAAAAAACAy+T7wvilS5fk7++vS5cuuToKkOdVr15d3t7ekv79oxQ3NzcdOHDA7nFSUlL0zTffaNSoUSpSpIiqVq2qXr166YsvvrD2OXz4sLp06SJPT0+VLFlSTz/9tHbv3i1JuueeezRw4ECVK1dObm5uatWqlapVq6Zff/3VMS8UAAAAAAAAAAAAuUq+L4wDuLPGjRsnHx8flSpVSjt37lSvXr2s+0aNGqVixYqpVq1aNrO//2v//v1KTU1VaGiotS00NFS7du2ybg8cOFBffvml/vnnH506dUoLFy5Uq1atshzvzJkz2rt3r2rWrJnzFwgAAAAAAAAAAIBch8I4gDtq0KBBSkpKUnx
8vF555RUFBgZKksaOHatDhw7p9OnTGjdunHr16qWFCxdmOUZSUpK8vb3l4eFhbStSpIjNyg8tWrTQzz//LF9fX5UuXVrly5dXt27dMo2Vmpqq5557Th06dFCdOnUc/GoBAAAAAAAAAACQG1AYB+AS1atXV0hIiCIjIyVJ9erVk7+/vwoUKKDw8HD16NFDc+fOzfK5Pj4+SklJUVpamrUtISFBvr6+kqQLFy6oWbNm6t69u1JSUnT+/Hl5e3urS5cuNuOkpqaqffv2Kly4sD777DPnvFAAAAAAAAAAAAC4HIVxAC5z9erVG95j3M3txr+eqlWrpgIFCmjnzp3Wtri4ONWoUUOSdOjQIf3zzz/q3bu3PD09VbRoUfXo0UPLli2z9k9NTdWzzz6r1NRUfffdd/L09HTQqwIAAAAAAAAAAEBuk28L4xaLRcHBwQoLC3N1FCBfSEpK0vTp03Xx4kUZhqHdu3dr1KhRCg8P18WLF/XDDz8oJSVF6enpiomJ0dSpU9WuXTvr8yMjI62zywsXLqyOHTtqyJAhSkhI0IEDBzRlyhS99NJLkqT77rtPPj4++vjjj5WWlqZLly7ps88+U61atST9W5Dv0KGDkpOTtWjRInl5ed3x8wEAAAAAAAAAAIA7J98Wxs1ms+Lj4xUbG+vqKEC+YDKZNGfOHFWuXFm+vr5q3bq1WrZsqUmTJunq1asaMWKEAgMDVbRoUfXr108TJ07Us88+a33+sWPH1KBBA+v2Rx99JH9/f5UrV04NGjTQiy++qK5du0r6d6n1pUuX6uuvv1aJEiUUFBSkixcvaubMmZKkX375RYsXL9bGjRtVokQJ+fj4yMfHR2PGjLmzJwUAAAAAAAAAAAB3hMkwDMPVIVwpMTFR/v7+SkhIkJ+fn6vjAMjClStXVLNmTf32228qUKCAq+MAAAAAAAAAAADgLuPh6gAAcCteXl7av3+/q2MAAAAAAAAAAADgLpVvl1IHAAAAAAAAAAAAAOQPFMYBAAAAAAAAAAAAAHlavi2MWywWBQcHKywszNVRAAAAAAAAAAAAAABOZDIMw3B1CFdKTEyUv7+/EhIS5Ofn5+o4AAAAAAAAAAAAAAAHy7czxgEAAAAAAAAAAAAA+YOHqwMg59pHRLs6AgDgDps/c6irIwAAAAAAAAAAcNdgxjgAAAAAAAAAAAAAIE+jMA4AAAAAAAAAAAAAyNMojAMAAAAAAAAAAAAA8rR8Wxi3WCwKDg5WWFiYq6MAAAAAAAAAAAAAAJwo3xbGzWaz4uPjFRsb6+ooAAAAAAAAAAAAAAAnyreFcQAAAAAAAAAAAABA/kBhHAAAAAAAAAAAAACQp1EYBwAAAAAAAAAAAADkaRTGAQAAAAAAAAAAAAB5GoVxAAAAAAAAAAAAAECeRmEcAAAAAAAAAAAAAJCnURgHAAAAAAAAAAAAAORpFMYBAAAAAAAAAAAAAHkahXEAAAAAAAAAAAAAQJ6WbwvjFotFwcHBCgsLc3UUAAAAAAAAAAAAAIAT5dvCuNlsVnx8vGJjY10dBQAAAAAAAAAAAADgRPm2MA4AAAAAAAAAAAAAyB8ojAMAAAAAAAAAAAAA8jQK4wAAAAAAAAAAAACAPI3COAAAAAAAAAAAAAAgT6MwDgAAAAAAAAAAAADI0yiMAwAAAAAAAAAAAADyNArjAAAAAAAAAAAAAIA8jcI4AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPK0PFMYT0lJUcWKFTVw4EBXRwEAAAAAAAAAAAAA5CJ5pjA+evRoPfzww66OAQAAAAAAAAAAAADIZfJEYfzAgQPat2+fWrRo4eooAAAAAAAAAAAAAIBcxuWF8Q0bNqhVq1YqU6aMTCaTFi1alKmPxWJRUFCQChYsqLp162rLli02+wcOHKixY8feocQAAAAAAAAAAAAAgLuJywvjycnJCgkJkcViyXL/3Llz1b9/fw0bNkzbt29XSEi
IwsPDdebMGUnS4sWLVbVqVVWtWvVOxgYAAAAAAAAAAAAA3CU8XB2gRYsWN10CfeLEierevbuioqIkSVOnTtWyZcs0bdo0DRo0SL/++qu++eYbzZs3T0lJSbp69ar8/Pw0dOjQLMe7cuWKrly5Yt1OTEx07AsCAAAAAAAAAAAAAOQqLp8xfjOpqanatm2bmjVrZm1zc3NTs2bNtGnTJknS2LFjdfz4cR09elTvvfeeunfvfsOi+LX+/v7+1kf58uWd/joAAAAAAAAAAAAAAK6TqwvjZ8+eVXp6ugICAmzaAwICdOrUqdsac/DgwUpISLA+jh8/7oioAAAAAAAAAAAAAIBcyuVLqTtSZGTkLft4eXnJy8vL+WEAAAAAAAAAAAAAALlCrp4xXqJECbm7u+v06dM27adPn1ZgYGCOxrZYLAoODlZYWFiOxgEAAAAAAAAAAAAA5G65ujDu6emp2rVrKyYmxtqWkZGhmJgY1atXL0djm81mxcfHKzY2NqcxAQAAAAAAAAAAAAC5mMuXUk9KStLBgwet20eOHFFcXJyKFSumChUqqH///oqIiFCdOnX00EMPadKkSUpOTlZUVJQLUwMAAAAAAAAAAAAA7hYuL4xv3bpVTZo0sW73799fkhQREaEZM2aoY8eO+vvvvzV06FCdOnVKoaGhWr58uQICAnJ0XIvFIovFovT09ByNAwAAAAAAAAAAAADI3UyGYRiuDuFKiYmJ8vf3V0JCgvz8/Fwd57a0j4h2dQQAwB02f+ZQV0cAAAAAAAAAAOCukavvMQ4AAAAAAAAAAAAAQE5RGAcAAAAAAAAAAAAA5Gn5tjBusVgUHByssLAwV0cBAAAAAAAAAAAAADhRvi2Mm81mxcfHKzY21tVRAAAAAAAAAAAAAABOlG8L4wAAAAAAAAAAAACA/IHCOAAAAAAAAAAAAAAgT7O7ML59+3bt3r3bur148WK1adNGb731llJTUx0azpm4xzgAAAAAAAAAAAAA5A92F8Z79Oih33//XZJ0+PBhPffccypcuLDmzZunN954w+EBnYV7jAMAAAAAAAAAAABA/mB3Yfz3339XaGioJGnevHlq2LCh5syZoxkzZui7775zdD4AAAAAAAAAAAAAAHLE7sK4YRjKyMiQJK1evVpPPvmkJKl8+fI6e/asY9MBAAAAAAAAAAAAAJBDdhfG69Spo1GjRmnWrFlav369WrZsKUk6cuSIAgICHB4QAAAAAAAAAAAAAICcsLsw/sEHH2j79u3q2bOn3n77bVWpUkWSNH/+fNWvX9/hAZ3FYrEoODhYYWFhro4CAAAAAAAAAAAAAHAik2EYhiMGunz5sjw8POTh4eGI4e6YxMRE+fv7KyEhQX5+fq6Oc1vaR0S7OgIA4A6bP3OoqyMAAAAAAAAAAHDXsHvG+D333KNz585lar98+bKqVq3qkFAAAAAAAAAAAAAAADiK3YXxo0ePKj09PVP7lStX9OeffzokFAAAAAAAAAAAAAAAjpLtdc+XLFli/XnFihXy9/e3bqenpysmJkaVKlVybDoAAAAAAAAAAAAAAHIo24XxNm3aSJJMJpMiIiJs9hUoUEBBQUF6//33HRoOAAAAAAAAAAAAAICcynZhPCMjQ5JUqVIlxcbGqkSJEk4LdSdYLBZZLJYsl4UHAAAAAAAAAAAAAOQddt9j/MiRI3d9UVySzGaz4uPjFRsb6+ooAAAAAAAAAAAAAAAnyvaM8evFxMQoJiZGZ86csc4kv2batGkOCQYAAAAAAAAAAAAAgCPYXRgfMWKEoqOjVadOHZUuXVomk8kZuQAAAAAAAAAAAAAAcAi7C+NTp07VjBkz9MILLzgjDwAAAAAAAAAAAAAADmX3PcZTU1NVv359Z2QBAAAAAAAAAAAAAMDh7C6Mv/TSS5ozZ44zsgAAAAAAAAAAAAAA4HB2L6V++fJlffrpp1q9erVq1qypAgUK2OyfOHGiw8I5k8VikcViUXp6uqujAAAAAAA
AAAAAAACcyO7C+K5duxQaGipJ+u2332z2mUwmh4S6E8xms8xmsxITE+Xv7+/qOAAAAAAAAAAAAAAAJ7G7ML527Vpn5AAAAAAAAAAAAAAAwCnsvsc4AAAAAAAAAAAAAAB3E7tnjEvS1q1b9e233+rYsWNKTU212bdgwQKHBAMAAAAAAAAAAAAAwBHsnjH+zTffqH79+tq7d68WLlyoq1evas+ePVqzZg336gYAAAAAAAAAAAAA5Dp2F8bHjBmjDz74QEuXLpWnp6cmT56sffv2qUOHDqpQoYIzMgIAAAAAAAAAAAAAcNvsLowfOnRILVu2lCR5enoqOTlZJpNJ/fr106effurwgAAAAAAAAAAAAAAA5ITdhfGiRYvq0qVLkqSyZcvqt99+kyRdvHhRKSkpjk0HAAAAAAAAAAAAAEAOedj7hIYNG2rVqlWqUaOGnn32WfXp00dr1qzRqlWr1LRpU2dkBAAAAAAAAAAAAADgttldGP/oo490+fJlSdLbb7+tAgUK6JdfflG7du30zjvvODwgAAAAAAAAAAAAAAA5YXdhvFixYtaf3dzcNGjQIIcGulMsFossFovS09NdHQUAAAAAAAAAAAAA4ETZKownJibKz8/P+vPNXOuX25nNZpnNZiUmJsrf39/VcQAAAAAAAAAAAAAATpKtwnjRokV18uRJlSpVSkWKFJHJZMrUxzAMmUwmZmADAAAAAAAAAAAAAHKVbBXG16xZY11Cfe3atU4NBAAAAAAAAAAAAACAI2WrMN6oUSNJUlpamtavX69u3bqpXLlyTg0GAAAAAAAAAAAAAIAjuNnT2cPDQ++++67S0tKclQcAAAAAAAAAAAAAAIeyqzAuSY899pjWr1/vjCwAAAAAAAAAAAAAADhctpZSv16LFi00aNAg7d69W7Vr15a3t7fN/qefftph4QAAAAAAAAAAAAAAyCm7C+OvvfaaJGnixImZ9plMJqWnp+c8FQAAAAAAAAAAAAAADmJ3YTwjI8MZOQAAAAAAAAAAAAAAcAq77zEOAAAAAAAAAAAAAMDdxO4Z45KUnJys9evX69ixY0pNTbXZ17t3b4cEAwAAAAAAAAAAAADAEewujO/YsUNPPvmkUlJSlJycrGLFiuns2bMqXLiwSpUqdccL4xcvXlSzZs2UlpamtLQ09enTR927d7+jGQAAAAAAAAAAAAAAuZfdS6n369dPrVq10oULF1SoUCH9+uuv+uOPP1S7dm299957zsh4U76+vtqwYYPi4uK0efNmjRkzRufOnbvjOQAAAAAAAAAAAAAAuZPdhfG4uDgNGDBAbm5ucnd315UrV1S+fHlNmDBBb731ljMy3pS7u7sKFy4sSbpy5YoMw5BhGHc8BwAAAAAAAAAAAAAgd7K7MF6gQAG5uf37tFKlSunYsWOSJH9/fx0/ftzuABs2bFCrVq1UpkwZmUwmLVq0KFMfi8WioKAgFSxYUHXr1tWWLVts9l+8eFEhISEqV66cXn/9dZUoUcLuHAAAAAAAAAAAAACAvMnuwnitWrUUGxsrSWrUqJGGDh2q2bNnq2/fvnrggQfsDpCcnKyQkBBZLJYs98+dO1f9+/fXsGHDtH37doWEhCg8PFxnzpyx9ilSpIh27typI0eOaM6cOTp9+rTdOQAAAAAAAAAAAAAAeZPdhfExY8aodOnSkqTRo0eraNGievXVV/X333/r008/tTtAixYtNGrUKLVt2zbL/RMnTlT37t0VFRWl4OBgTZ06VYULF9a0adMy9Q0ICFBISIh++umnGx7vypUrSkxMtHkAAAAAAAAAAAAAAPIuuwvjderUUZMmTST9u5T68uXLlZiYqG3btikkJMSh4VJTU7Vt2zY1a9bM2ubm5qZmzZpp06ZNkqTTp0/r0qVLkqSEhARt2LBB1apVu+GYY8eOlb+/v/VRvnx5h2YGAAAAAAAAAAAAAOQudhfGR40apSNHjjgjSyZnz55Venq6AgICbNoDAgJ06tQpSdI
ff/yhRx99VCEhIXr00UfVq1cv1ahR44ZjDh48WAkJCdbH7dwXHQAAAAAAAAAAAABw9/Cw9wnz5s3TsGHDVLduXXXp0kUdOnRQiRIlnJEtWx566CHFxcVlu7+Xl5e8vLycFwgAAAAAAAAAAAAAkKvYPWN8586d2rVrlxo3bqz33ntPZcqUUcuWLTVnzhylpKQ4NFyJEiXk7u6u06dP27SfPn1agYGBORrbYrEoODhYYWFhORoHAAAAAAAAAAAAAJC72V0Yl6T7779fY8aM0eHDh7V27VoFBQWpb9++OS5W/5enp6dq166tmJgYa1tGRoZiYmJUr169HI1tNpsVHx+v2NjYnMYEAAAAAAAAAAAAAORidi+l/l/e3t4qVKiQPD09denSJbufn5SUpIMHD1q3jxw5ori4OBUrVkwVKlRQ//79FRERoTp16uihhx7SpEmTlJycrKioqJxGBwAAAAAAAAAAAADkA7dVGD9y5IjmzJmjOXPmaP/+/WrUqJFGjBih9u3b2z3W1q1b1aRJE+t2//79JUkRERGaMWOGOnbsqL///ltDhw7VqVOnFBoaquXLlysgIOB2oltZLBZZLBalp6fnaBwAAAAAAAAAAAAAQO5mMgzDsOcJDz/8sGJjY1WzZk117txZnTp1UtmyZZ2Vz+kSExPl7++vhIQE+fn5uTrObWkfEe3qCACAO2z+zKGujgAAAAAAAAAAwF3D7hnjTZs21bRp0xQcHOyMPAAAAAAAAAAAAAAAOJTdhfHRo0c7IwcAAAAAAAAAAAAAAE7h5uoArmKxWBQcHKywsDBXRwEAAAAAAAAAAAAAOFG+LYybzWbFx8crNjbW1VEAAAAAAAAAAAAAAE6UbwvjAAAAAAAAAAAAAID8gcI4AAAAAAAAAAAAACBPu63C+E8//aQuXbqoXr16+uuvvyRJs2bN0s8//+zQcM7EPcYBAAAAAAAAAAAAIH+wuzD+3XffKTw8XIUKFdKOHTt05coVSVJCQoLGjBnj8IDOwj3GAQAAAAAAAAAAACB/sLswPmrUKE2dOlWfffaZChQoYG1v0KCBtm/f7tBwAAAAAAAAAAAAAADklN2F8f3796thw4aZ2v39/XXx4kVHZAIAAAAAAAAAAAAAwGHsLowHBgbq4MGDmdp//vln3XPPPQ4JBQAAAAAAAAAAAACAo9hdGO/evbv69OmjzZs3y2Qy6cSJE5o9e7YGDhyoV1991RkZncJisSg4OFhhYWGujgIAAAAAAAAAAAAAcCIPe58waNAgZWRkqGnTpkpJSVHDhg3l5eWlgQMHqlevXs7I6BRms1lms1mJiYny9/d3dRwAAAAAAAAAAAAAgJPYXRg3mUx6++239frrr+vgwYNKSkpScHCwfHx8nJEPAAAAAAAAAAAAAIAcsbswfo2np6eCg4MdmQUAAAAAAAAAAAAAAIezuzDepEkTmUymG+5fs2ZNjgIBAAAAAAAAAAAAAOBIdhfGQ0NDbbavXr2quLg4/fbbb4qIiHBULqezWCyyWCxKT093dRQAAAAAAAAAAAAAgBPZXRj/4IMPsmwfPny4kpKSchzoTjGbzTKbzUpMTJS/v7+r4wAAAAAAAAAAAAAAnMTNUQN16dJF06ZNc9RwAAAAAAAAAAAAAAA4hMMK45s2bVLBggUdNRwAAAAAAAAAAAAAAA5h91LqzzzzjM22YRg6efKktm7dqiFDhjgsGAAAAAAAAAAAAAAAjmB3Yfy/9+N2c3NTtWrVFB0drSeeeMJhwQAAAAAAAAAAAAAAcAS7C+PTp093Rg4AAAAAAAAAAAAAAJzCYfcYBwAAAAAAAAAAAAAgN7J7xnjRokVlMpmy1ff8+fN2B7pTLBaLLBaL0tPTXR0FAAAAAAAAAAAAAOBEdhfGhwwZolGjRik8PFz16tWTJG3atEkrVqzQkCFDVKxYMYeHdAaz2Syz2azExMRM900HAAAAAAAAAAAAAOQddhfGN27cqOjoaPXs2dPa1rt3b33
00UdavXq1Fi1a5Mh8AAAAAAAAAAAAAADkiN33GF+xYoWaN2+eqb158+ZavXq1Q0IBAAAAAAAAAAAAAOAodhfGixcvrsWLF2dqX7x4sYoXL+6QUAAAAAAAAAAAAAAAOIrdS6mPGDFCL730ktatW6e6detKkjZv3qzly5frs88+c3hAAAAAAAAAAAAAAABywu7CeGRkpKpXr64PP/xQCxYskCRVr15dP//8s7VQDgAAAAAAAAAAAABAbmF3YVyS6tatq9mzZzs6CwAAAAAAAAAAAAAADpetwnhiYqL8/PysP9/MtX4AAAAAAAAAAAAAAOQG2SqMFy1aVCdPnlSpUqVUpEgRmUymTH0Mw5DJZFJ6errDQwIAAAAAAAAAAAAAcLuyVRhfs2aNihUrJklau3atUwMBAAAAAAAAAAAAAOBI2SqMN2rUKMuf72YWi0UWi4UZ7gAAAAAAAAAAAACQx7ndzpMuXryolStX6quvvtKXX35p87hbmM1mxcfHKzY21tVRAAAAAJf56KOPVKdOHXl5ealNmzY2+xITE/X888/Lz89PAQEBGjlypHXfmTNn1LlzZ5UrV05+fn6qVauWlixZcofTAwAAAAAAANmTrRnj11u6dKk6d+6spKQk+fn52dxv3GQyqWvXrg4NCAAAAMB5ypQpo3feeUerV6/Wn3/+abOvV69eOn/+vI4dO6YzZ86oWbNmqlixorp27aqkpCTVqlVL48ePV5kyZbRs2TI999xzio2NVXBwsIteDQAAAAAAAJA1u2eMDxgwQN26dVNSUpIuXryoCxcuWB/nz593RkYAAAAATvLMM8+oTZs2KlGihE17SkqKvvnmG40aNUpFihRR1apV1atXL33xxReSpHvuuUcDBw5UuXLl5ObmplatWqlatWr69ddfXfEyAAAAAAAAgJuyuzD+119/qXfv3ipcuLAz8gAAAADIBfbv36/U1FSFhoZa20JDQ7Vr164s+585c0Z79+5VzZo171BCAAAAAAAAIPvsLoyHh4dr69atzsgCAAAAIJdISkqSt7e3PDz+7+5LRYoU0aVLlzL1TU1N1XPPPacOHTqoTp06dzImAAAAAAAAkC1232O8ZcuWev311xUfH68aNWqoQIECNvuffvpph4UDAAAA4Bo+Pj5KSUlRWlqatTiekJAgX19fm36pqalq3769ChcurM8++8wVUQEAAAAAAIBbsrsw3r17d0lSdHR0pn0mk0np6ek5TwUAAADApapVq6YCBQpo586dql27tiQpLi5ONWrUsPZJTU3Vs88+q9TUVC1evFienp6uigsAAAAAAADclN1LqWdkZNzwQVEcAAAAuLukpaXp8uXLSktLU0ZGhi5fvqzU1FQVLlxYHTt21JAhQ5SQkKADBw5oypQpeumllyRJV69eVYcOHZScnKxFixbJy8vLxa8EAAAAAAAAuDG7C+MAAAAA8o5Ro0apUKFCGj16tJYuXapChQrpiSeekCR99NFH8vf3V7ly5dSgQQO9+OKL6tq1qyTpl19+0eLFi7Vx40aVKFFCPj4+8vHx0ZgxY1z5cgAAAAAAAIAsmQzDMOx5QlZLqF9v6NChOQp0pyUmJsrf318JCQny8/NzdZzb0j7i5tcEAJD3zJ95d33fAgAAAAAAAADgSnbfY3zhwoU221evXtWRI0fk4eGhypUr33WFcQAAAAAAAAAAAABA3mZ3YXzHjh2Z2hITExUZGam2bds6JJQ9jh8/rhdeeEFnzpyRh4eHhgwZomefffaO5wAAAAAAAAAAAAAA5E4Ouce4n5+fRowYoSFDhjhiOLt4eHho0qRJio+P18qVK9W3b18lJyff8RwAAAAAAAAAAAAAgNzJ7hnjN5KQkKCEhARHDZdtpUuXVunSpSVJgYGBKlGihM6fPy9vb+87ngUAAAAAAAAAAAAAkPvYXRj/8MMPbbYNw9DJkyc1a9YstWjRwu4AGzZs0Lvvvqtt27bp5MmTWrhwodq0aWPTx2Kx6N1
339WpU6cUEhKiKVOm6KGHHso01rZt25Senq7y5cvbnQMAAAAAAAAAAAAAkDfZXRj/4IMPbLbd3NxUsmRJRUREaPDgwXYHSE5OVkhIiLp166Znnnkm0/65c+eqf//+mjp1qurWratJkyYpPDxc+/fvV6lSpaz9zp8/r65du+qzzz6zOwMAAAAAAAAAAAAAIO+yuzB+5MgRhwZo0aLFTWeaT5w4Ud27d1dUVJQkaerUqVq2bJmmTZumQYMGSZKuXLmiNm3aaNCgQapfv/5Nj3flyhVduXLFup2YmOiAVwEAAAAAAAAAAAAAyK0cdo9xZ0hNTdW2bdtsZqK7ubmpWbNm2rRpk6R/l3KPjIzUY489phdeeOGWY44dO1YjRoxwWmYAAJC3tY+IdnUEAMAdNn/mUFdHAAAAAAAAOeTm6gA3c/bsWaWnpysgIMCmPSAgQKdOnZIkbdy4UXPnztWiRYsUGhqq0NBQ7d69+4ZjDh48WAkJCdbH8ePHnfoaAAAAAAAAAAAAAACulatnjGfHI488ooyMjGz39/LykpeXlxMTAQAAAAAAAAAAAAByk1w9Y7xEiRJyd3fX6dOnbdpPnz6twMDAHI1tsVgUHByssLCwHI0DAAAAAAAAAAAAAMjdslUYf/DBB3XhwgVJUnR0tFJSUpwa6hpPT0/Vrl1bMTEx1raMjAzFxMSoXr16ORrbbDYrPj5esbGxOY0JAAAAAAAAAAAAAMjFslUY37t3r5KTkyVJI0aMUFJSksMCJCUlKS4uTnFxcZKkI0eOKC4uTseOHZMk9e/fX5999plmzpypvXv36tVXX1VycrKioqIclgEAAAAAAAAAAAAAkHdl6x7joaGhioqK0iOPPCLDMPTee+/Jx8cny75Dhw61K8DWrVvVpEkT63b//v0lSREREZoxY4Y6duyov//+W0OHDtWpU6cUGhqq5cuXKyAgwK7j/JfFYpHFYlF6enqOxgEAAAAAAAAAAAAA5G4mwzCMW3Xav3+/hg0bpkOHDmn79u0KDg6Wh0fmmrrJZNL27dudEtRZEhMT5e/vr4SEBPn5+bk6zm1pHxHt6ggAgDts/kz7/hANjsP3LgDkP3zvAgAAAABw98vWjPFq1arpm2++kSS5ubkpJiZGpUqVcmowAAAAAAAAAAAAAAAcIVuF8etlZGQ4IwcAAAAAAAAAAAAAAE5hd2Fckg4dOqRJkyZp7969kqTg4GD16dNHlStXdmg4Z+Ie4wAAAAAAAAAAAACQP7jZ+4QVK1YoODhYW7ZsUc2aNVWzZk1t3rxZ999/v1atWuWMjE5hNpsVHx+v2NhYV0cBAAAAAAAAAAAAADiR3TPGBw0apH79+mncuHGZ2t988009/vjjDgsHAAAAAAAAAAAAAEBO2T1jfO/evXrxxRcztXfr1k3x8fEOCQUAAAAAAAAAAAAAgKPYXRgvWbKk4uLiMrXHxcWpVKlSjsh0R1gsFgUHByssLMzVUQAAAAAAAAAAAAAATmT3Uurdu3fXyy+/rMOHD6t+/fqSpI0bN2r8+PHq37+/wwM6i9lsltlsVmJiovz9/V0dBwAAAAAAAAAAAADgJHYXxocMGSJfX1+9//77Gjx4sCSpTJkyGj58uHr37u3wgAAAAAAAAAAAAAAA5ITdhXGTyaR+/fqpX79+unTpkiTJ19fX4cEAAAAAAAAAAAAAAHAEu+8xfj1fX9+7tijOPcYBAAAAAAAAAAAAIH/IUWH8bmY2mxUfH6/Y2FhXRwEAAAAAAAAAAAAAOFG+LYwDAAAAAAAAAAAAAPIHCuMAAAAAAAAAAAAAgDzNrsL41atX1bRpUx04cMBZeQAAAAAAAAAAAAAAcCi7CuMFChTQrl27nJUFAAAAAAAAAAAAAACHs3sp9S5duuiLL75wRpY7ymKxKDg4WGFhYa6OAgAAAAAAAAAAAABwIg97n5CWlqZp06Zp9erVql27try9vW32T5w40WHhnMlsNstsNisxMVH+/v6ujgMAAAA
AAAAAAAAAcBK7C+O//fabHnzwQUnS77//brPPZDI5JhUAAAAAAAAAAAAAAA5id2F87dq1zsgBAAAAAAAAAAAAAIBT2H2P8WsOHjyoFStW6J9//pEkGYbhsFAAAAAAAAAAAAAAADiK3YXxc+fOqWnTpqpataqefPJJnTx5UpL04osvasCAAQ4PCAAAAAAAAAAAAABATthdGO/Xr58KFCigY8eOqXDhwtb2jh07avny5Q4NBwAAAAAAAAAAAABATtl9j/GVK1dqxYoVKleunE37vffeqz/++MNhwQAAAAAAAAAAAAAAcAS7Z4wnJyfbzBS/5vz58/Ly8nJIqDvBYrEoODhYYWFhro4CAAAAAAAAAAAAAHAiuwvjjz76qL788kvrtslkUkZGhiZMmKAmTZo4NJwzmc1mxcfHKzY21tVRAAAAAAAAAAAAAABOZPdS6hMmTFDTpk21detWpaam6o033tCePXt0/vx5bdy40RkZAQAAAAAAAAAAAAC4bXbPGH/ggQf0+++/65FHHlHr1q2VnJysZ555Rjt27FDlypWdkREAAAAAAAAAAAAAgNtm94xxSfL399fbb7/t6CwAAAAAAAAAAAAAADjcbRXGL1y4oC+++EJ79+6VJAUHBysqKkrFihVzaDgAAAAAAAAAAAAAAHLK7qXUN2zYoKCgIH344Ye6cOGCLly4oA8//FCVKlXShg0bnJERAAAAAAAAAAAAAIDbZveMcbPZrI4dO+qTTz6Ru7u7JCk9PV2vvfaazGazdu/e7fCQAAAAAAAAAAAAAADcLrtnjB88eFADBgywFsUlyd3dXf3799fBgwcdGg4AAAAAAAAAAAAAgJyyuzD+4IMPWu8tfr29e/cqJCTEIaEAAAAAAAAAAAAAAHCUbC2lvmvXLuvPvXv3Vp8+fXTw4EE9/PDDkqRff/1VFotF48aNc05KAAAAAAAAAAAAAABuU7YK46GhoTKZTDIMw9r2xhtvZOr3/PPPq2PHjo5L50QWi0UWi0Xp6emujgIAAAAAAAAAAAAAcKJsFcaPHDni7Bx3nNlsltlsVmJiovz9/V0dBwAAAAAAAAAAAADgJNkqjFesWNHZOQAAAAAAAAAAAAAAcIpsFcb/68SJE/r555915swZZWRk2Ozr3bu3Q4IBAAAAAAAAAAAAAOAIdhfGZ8yYoR49esjT01PFixeXyWSy7jOZTBTGAQAAAAAAAAAAAAC5it2F8SFDhmjo0KEaPHiw3NzcnJEJAAAAAAAAAAAAAACHsbuynZKSoueee46iOAAAAAAAAAAAAADgrmB3dfvFF1/UvHnznJEFAAAAAAAAAAAAAACHs3sp9bFjx+qpp57S8uXLVaNGDRUoUMBm/8SJEx0WDgAAAAAAAAAAAACAnLqtwviKFStUrVo1SZLJZLLuu/5nAAAAAAAAAAAAAAByA7sL4++//76mTZumyMhIJ8QBAAAAAAAAAAAAAMCx7L7HuJeXlxo0aOCMLAAAAAAAAAAAAAAAOJzdhfE+ffpoypQpzshy29q2bauiRYuqffv2ro4CAAAAAAAAAAAAAMhl7F5KfcuWLVqzZo2+//573X///SpQoIDN/gULFjgsXHb16dNH3bp108yZM+/4sQEAAAAAAAAAAAAAuZvdhfEiRYromWeecUaW29a4cWOtW7fO1TEAAAAAAAAAAAAAALmQ3YXx6dOnOzTAhg0b9O6772rbtm06efKkFi5cqDZt2tj0sVgsevfdd3Xq1CmFhIRoypQpeuihhxyaAwAAAAAAAAAAAACQN9l9j3FHS05OVkhIiCwWS5b7586dq/79+2vYsGHavn27QkJCFB4erjNnztzhpAAAAAAAAAAAAACAu5HdM8YrVaokk8l0w/2HDx+2a7wWLVqoRYsWN9w/ceJEde/eXVFRUZKkqVOnatmyZZo2bZoGDRpk17Ek6cqVK7py5Yp1OzEx0e4xAAAAAAAAAAAAAAB3D7sL43379rXZvnr1qnbs2KHly5f
r9ddfd1QuSVJqaqq2bdumwYMHW9vc3NzUrFkzbdq06bbGHDt2rEaMGOGoiAAAAAAAAAAAAACAXM7uwnifPn2ybLdYLNq6dWuOA13v7NmzSk9PV0BAgE17QECA9u3bZ91u1qyZdu7cqeTkZJUrV07z5s1TvXr1shxz8ODB6t+/v3U7MTFR5cuXd2huAAAAAAAAAAAAAEDuYXdh/EZatGihwYMHa/r06Y4aMttWr16d7b5eXl7y8vJyYhoAAAAAAAAAAAAAQG7i5qiB5s+fr2LFijlqOElSiRIl5O7urtOnT9u0nz59WoGBgTka22KxKDg4WGFhYTkaBwAAAAAAAAAAAACQu9k9Y7xWrVoymUzWbcMwdOrUKf3999/6+OOPHRrO09NTtWvXVkxMjNq0aSNJysjIUExMjHr27Jmjsc1ms8xmsxITE+Xv7++AtAAAAAAAAAAAAACA3Mjuwvi1AvU1bm5uKlmypBo3bqz77rvP7gBJSUk6ePCgdfvIkSOKi4tTsWLFVKFCBfXv318RERGqU6eOHnroIU2aNEnJycmKioqy+1gAAAAAAAAAAAAAgPzH7sL4sGHDHBpg69atatKkiXW7f//+kqSIiAjNmDFDHTt21N9//62hQ4fq1KlTCg0N1fLlyxUQEJCj41osFlksFqWnp+doHAAAAAAAAAAAAABA7mYyDMNwdQhXuraUekJCgvz8/Fwd57a0j4h2dQQAwB02f+ZQV0fIt/jeBYD8h+9dAAAAAADuftmeMe7m5mZzb/GsmEwmpaWl5TgUAAAAAAAAAAAAAACOku3C+MKFC2+4b9OmTfrwww+VkZHhkFB3AkupAwAAAAAAAAAAAED+kO3CeOvWrTO17d+/X4MGDdLSpUvVuXNnRUffPUuLms1mmc1m61LqAAAAAAAAAAAAAIC8ye12nnTixAl1795dNWrUUFpamuLi4jRz5kxVrFjR0fkAAAAAAAAAAAAAAMgRuwrjCQkJevPNN1WlShXt2bNHMTExWrp0qR544AFn5QMAAAAAAAAAAAAAIEeyvZT6hAkTNH78eAUGBurrr7/Ocml1AAAAAAAAAAAAAABym2wXxgcNGqRChQqpSpUqmjlzpmbOnJllvwULFjgsnDNZLBZZLBalp6e7OgoAAAAAAAAAAAAAwImyXRjv2rWrTCaTM7PcUWazWWazWYmJifL393d1HAAAAAAAAAAAAACAk2S7MD5jxgwnxgAAAAAAAAAgSb169dKiRYuUkJAgX19fPfvss5owYYI8PT1dHQ0AAAC4a7m5OgAAAAAAAACA//Paa69p3759SkxM1M6dO7Vz505NmDDB1bEAAACAu1q+LYxbLBYFBwcrLCzM1VEAAAAAAAAAq+rVq8vb21uSZBiG3NzcdODAARenAgAAAO5u+bYwbjabFR8fr9jYWFdHAQAAAAAAAGyMGzdOPj4+KlWqlHbu3KlevXq5OhIAAABwV8u3hXEAAAAAAAAgtxo0aJCSkpIUHx+vV155RYGBga6OBAAAANzVKIwDAAAAAAAAuVT16tUVEhKiyMhIV0cBAAAA7moUxgEAAAAAAIBc7OrVq9xjHAAAAMghCuMAAAAAAABALpGUlKTp06fr4sWLMgxDu3fv1qhRoxQeHu7qaAAAAMBdLd8Wxi0Wi4KDgxUWFubqKAAAAAAAAIAkyWQyac6cOapcubJ8fX3VunVrtWzZUpMmTXJ1NAAAAOCu5uHqAK5iNptlNpuVmJgof39/V8cBAAAAAAAA5O3trVWrVrk6BgAAAJDn5NsZ4wAAAAAAAAAAAACA/IHCOAAAAAAAAAAAAAAgT6MwDgAAAAAAAAAAAADI0yiMAwAAAAAAAAAAAADyNArjAAAAAAAAAAAAAIA8jcI4AAAAAAAAAAAAACBP83B1AFexWCyyWCxKT093dRQAAAAAAHAD7SOiXR0BAHCHzZ851NURAABAHpRvZ4ybzWbFx8crNjbW1VEAAAAAAAAAAAAAAE6UbwvjAAA
AAAAAAAAAAID8gcI4AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPI0CuMAAAAAAAAAAAAAgDyNwjgAAAAAAAAAAAAAIE+jMA4AAAAAAAAAAAAAyNMojAMAAAAAAAAAAAAA8jQK4wAAAAAAAAAAAACAPC3fFsYtFouCg4MVFhbm6igAAAAAAAAAAAAAACfKt4Vxs9ms+Ph4xcbGujoKAAAAAAAAAAAAAMCJ8m1hHAAAAAAAAAAAAPnbRx99pDp16sjLy0tt2rRxdRwATuTh6gAAAAAAAAAAAACAK5QpU0bvvPOOVq9erT///NPVcQA4EYVxAAAAAAAAAAAA5EvPPPOMJCkuLo7COJDHsZQ6AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPI0CuMAAAAAAAAAAAAAgDzNw9UBAAAAAAAAAAAAAFdIS0uzPjIyMnT58mW5ubnJ09PT1dEAOBiFcQAAAAAAAAAAAORLo0aN0ogRI6zbhQoVUqNGjbRu3TrXhQLgFCylDgAAAAAAAAAAgHxp+PDhMgzD5kFRHMib8kRh/Pvvv1e1atV077336vPPP3d1HAAAAAAAAAAAAABALnLXL6Welpam/v37a+3atfL391ft2rXVtm1bFS9e3NXRAAAAAAAAAAAAAAC5wF0/Y3zLli26//77VbZsWfn4+KhFixZauXKlq2MBAAAAAAAAAAAAAHIJlxfGN2zYoFatWqlMmTIymUxatGhRpj4Wi0VBQUEqWLCg6tatqy1btlj3nThxQmXLlrVuly1bVn/99dediA4AAAAAAAAAAAAAuAu4vDCenJyskJAQWSyWLPfPnTtX/fv317Bhw7R9+3aFhIQoPDxcZ86cucNJAQAAAAAAAAAAAAB3I5cXxlu0aKFRo0apbdu2We6fOHGiunfvrqioKAUHB2vq1KkqXLiwpk2bJkkqU6aMzQzxv/76S2XKlLnh8a5cuaLExESbBwAAAAAAAAAAAAAg7/JwdYCbSU1N1bZt2zR48GBrm5ubm5o1a6ZNmzZJkh566CH99ttv+uuvv+Tv768ff/xRQ4YMueGYY8eO1YgRI5yeHQAAAAAAAACAu037iGhXRwAA3GHzZw51dYQ7wuUzxm/m7NmzSk9PV0BAgE17QECATp06JUny8PDQ+++/ryZNmig0NFQDBgxQ8eLFbzjm4MGDlZCQYH0cP37cqa8BAAAAAAAAAAAAAOBauXrGeHY9/fTTevrpp7PV18vLS15eXk5OBAAAAAAAAAAAAADILXL1jPESJUrI3d1dp0+ftmk/ffq0AgMDczS2xWJRcHCwwsLCcjQOAAAAAAAAAAAAACB3y9WFcU9PT9WuXVsxMTHWtoyMDMXExKhevXo5GttsNis+Pl6xsbE5jQkAAAAAAAAAAAAAyMVcvpR6UlKSDh48aN0+cuSI4uLiVKxYMVWoUEH9+/dXRESE6tSpo4ceekiTJk1ScnKyoqKiXJgaAAAAAAAAAAAAAHC3cHlhfOvWrWrSpIl1u3///pKkiIgIzZgxQx07dtTff/+toUOH6tSpUwoNDdXy5csVEBCQo+NaLBZZLBalp6fnaBwAAAAAAAAAAAAAQO7m8sJ448aNZRjGTfv07NlTPXv2dOhxzWazzGazEhMT5e/v79CxAQAAAAAAAAAAAAC5R66+xzgAAAAAAAAAAAAAADmVbwvjFotFwcHBCgsLc3UUAAAAAAAAAAAAAIAT5dvCuNlsVnx8vGJjY10dBQAAAAAAAAAAAADgRPm2MA4AAAAAAAAAAAAAyB8ojAMAAAAAAAAAAAAA8jQPVwdwFYvFIovForS0NElSYmKiixPdvqupl10dAQBwh93N31t3O753ASD/4XvXtfjuBYD8h+9e1+K7FwDyn7zw3evr6yuTyXTTPibDMIw7lCdX+vPPP1W+fHlXxwAAAAAAAAAAAAAA3IaEhAT5+fndtE++L4xnZGT
oxIkT2forAgC5R2JiosqXL6/jx4/f8hcdAADIOb57AQC4s/juBQDgzuK7F7i7ZafWm2+XUr/Gzc1N5cqVc3UMALfJz8+P/0gBAOAO4rsXAIA7i+9eAADuLL57gbzLzdUBAAAAAAAAAAAAAABwJgrjAAAAAAAAAAAAAIA8jcI4gLuSl5eXhg0bJi8vL1dHAQAgX+C7FwCAO4vvXgAA7iy+e4G8z2QYhuHqEAAAAAAAAAAAAAAAOAszxgEAAAAAAAAAAAAAeRqFcQAAAAAAAAAAAABAnkZhHAAAAAAAAAAAAACQp1EYB3BX2bBhg1q1aqUyZcrIZDJp0aJFro4EAECeNnbsWIWFhcnX11elSpVSmzZttH//flfHAgAgz7p06ZL69u2rihUrqlChQqpfv75iY2NdHQsAgDzhVv++HBkZKZPJZPNo3ry5a8ICcDgK4wDuKsnJyQoJCZHFYnF1FAAA8oX169fLbDbr119/1apVq3T16lU98cQTSk5OdnU0AADypJdeekmrVq3SrFmztHv3bj3xxBNq1qyZ/vrrL1dHAwDgrpedf19u3ry5Tp48aX18/fXXdzAhAGcyGYZhuDoEANwOk8mkhQsXqk2bNq6OAgBAvvH333+rVKlSWr9+vRo2bOjqOAAA5Cn//POPfH19tXjxYrVs2dLaXrt2bbVo0UKjRo1yYToAAPKWrP59OTIyUhcvXmSlUiCPYsY4AAAAgGxLSEiQJBUrVszFSQAAyHvS0tKUnp6uggUL2rQXKlRIP//8s4tSAQCQv6xbt06lSpVStWrV9Oqrr+rcuXOujgTAQSiMAwAAAMiWjIwM9e3bVw0aNNADDzzg6jgAAOQ5vr6+qlevnkaOHKkTJ04oPT1dX331lTZt2qSTJ0+6Oh4AAHle8+bN9eWXXyomJkbjx4/X+vXr1aJFC6Wnp7s6GgAH8HB1AAAAAAB3B7PZrN9++40ZawAAONGsWbPUrVs3lS1bVu7u7nrwwQfVqVMnbdu2zdXRAADI85577jnrzzVq1FDNmjVVuXJlrVu3Tk2bNnVhMgCOwIxxAAAAALfUs2dPff/991q7dq3KlSvn6jgAAORZlStX1vr165WUlKTjx49ry5Ytunr1qu655x5XRwMAIN+55557VKJECR08eNDVUQA4AIVxAAAAADdkGIZ69uyphQsXas2aNapUqZKrIwEAkC94e3urdOnSunDhglasWKHWrVu7OhIAAPnOn3/+qXPnzql06dKujgLAAVhKHcBdJSkpyeav844cOaK4uDgVK1ZMFSpUcGEyAADyJrPZrDlz5mjx4sXy9fXVqVOnJEn+/v4qVKiQi9MBAJD3rFixQoZhqFq1ajp48KBef/113XfffYqKinJ1NAAA7no3+/flYsWKacSIEWrXrp0CAwN16NAhvfHGG6pSpYrCw8NdmBqAo5gMwzBcHQIAsmvdunVq0qRJpvaIiAjNmDHjzgcCACCPM5lMWbZPnz5dkZGRdzYMAAD5wLfffqvBgwfrzz//VLFixdSuXTuNHj1a/v7+ro4GAMBd72b/vvzJJ5+oTZs22rFjhy5evKgyZcroiSee0MiRIxUQEOCCtAAcjcI4AAAAAAAAAAAAACBP4x7jAAAAAAAAAAAAAIA8jcI4AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPI0CuMAAAAAAAAAAAAAgDyNwjgAAAAAAAAAAAAAIE+jMA4AAAAAAAAAAAAAyNMojAMAAAAAAAAAAAAA8jQK4wAAAADgAkePHpXJZFJcXJyro1jt27dPDz/8sAoWLKjQ0FBXx3GayMhItWnTxtUxcpWgoCBNmjTJJceeMWOGihQp4rDxGjdurL59+97RY94OZ/0OyM3v73Xr1slkMunixYuujgIAAAAgH6IwDgAAACBfioyMlMlk0rhx42zaFy1aJJPJ5KJUrjVs2DB5e3tr//79iomJybJPdoqOud3kyZM1Y8YMu57
jysLxnRAbG6uXX3452/1zQ2H5RhYsWKCRI0dat3PrtStfvrxOnjypBx54wNVR7pj69evr5MmT8vf3d+i4JpNJixYtcuiYAAAAAPIeCuMAAAAA8q2CBQtq/PjxunDhgqujOExqauptP/fQoUN65JFHVLFiRRUvXvy2xzEMQ2lpabf9fGdJT09XRkaG/P39c21R11VKliypwoULuzqGQxQrVky+vr6ujnFL7u7uCgwMlIeHh6uj3BFXr16Vp6enAgMD8+0fHwEAAABwLQrjAAAAAPKtZs2aKTAwUGPHjr1hn+HDh2daVnzSpEkKCgqybl9bunjMmDEKCAhQkSJFFB0drbS0NL3++usqVqyYypUrp+nTp2caf9++fapfv74KFiyoBx54QOvXr7fZ/9tvv6lFixby8fFRQECAXnjhBZ09e9a6v3HjxurZs6f69u2rEiVKKDw8PMvXkZGRoejoaJUrV05eXl4KDQ3V8uXLrftNJpO2bdum6OhomUwmDR8+PNMYkZGRWr9+vSZPniyTySSTyaSjR49al0f+8ccfVbt2bXl5eennn3/WoUOH1Lp1awUEBMjHx0dhYWFavXq1zZhBQUEaM2aMunXrJl9fX1WoUEGffvqpdX9qaqp69uyp0qVLq2DBgqpYsaLN9bp48aJ69OihgIAA6zn8/vvvJf3frOYlS5YoODhYXl5eOnbsWKalpq+dw549e8rf318lSpTQkCFDZBiGdf8ff/yhfv36WV+3JP3xxx9q1aqVihYtKm9vb91///364Ycfsjz/kjRr1izVqVNHvr6+CgwM1PPPP68zZ85Y91+4cEGdO3dWyZIlVahQId17773W98ytzsOxY8fUunVr+fj4yM/PTx06dNDp06dtjr906VKFhYWpYMGCKlGihNq2bWtzHa6fVT1x4kTVqFFD3t7eKl++vF577TUlJSVJ+nc57KioKCUkJFjPx7X3y5UrVzRw4ECVLVtW3t7eqlu3rtatW2eTY8aMGapQoYIKFy6stm3b6ty5czc8Z5LUvn179ezZ07rdt29fmUwm7du3z3puvL29re+t61c1uNG1u2bFihWqXr26fHx81Lx5c508efKGOdLT0/Xiiy+qUqVKKlSokKpVq6bJkyffNPvNrul/l1K/9jmKiYlRnTp1VLhwYdWvX1/79++3GXPUqFEqVaqUfH199dJLL2nQoEE3vfVBRkaGxo4da80dEhKi+fPn37D/W2+9pbp162ZqDwkJUXR0tKR/Vxh4/PHHVaJECfn7+6tRo0bavn27TX+TyaRPPvlETz/9tLy9vTV69OhMS6mfO3dOnTp1UtmyZVW4cGHVqFFDX3/9tc04jRs3Vu/evfXGG2+oWLFiCgwMtPn9dO13cdu2bWUymWx+NwMAAADA9SiMAwAAAMi33N3dNWbMGE2ZMkV//vlnjsZas2aNTpw4oQ0bNmjixIkaNmyYnnrqKRUtWlSbN2/WK6+8oh49emQ6zuuvv64BAwZox44dqlevnlq1amUtFF68eFGPPfaYatWqpa1bt2r58uU6ffq0OnToYDPGzJkz5enpqY0bN2rq1KlZ5ps8ebLef/99vffee9q1a5fCw8P19NNP68CBA5KkkydP6v7779eAAQN08uRJDRw4MMsx6tWrp+7du+vkyZM6efKkypcvb90/aNAgjRs3Tnv37lXNmjWVlJSkJ598UjExMdqxY4eaN2+uVq1a6dixYzbjvv/++6pTp4527Nih1157Ta+++qq1GPjhhx9qyZIl+vbbb7V//37Nnj3bWvjKyMhQixYttHHjRn311VeKj4/XuHHj5O7ubh07JSVF48eP1+eff649e/aoVKlSWZ6fmTNnysPDQ1u2bNHkyZM1ceJEff7555L+XZq7XLlyio6Otr5uSTKbzbpy5Yo2bNig3bt3a/z48fLx8clyfOnfGbMjR47Uzp07tWjRIh09elSRkZHW/UOGDFF8fLx+/PFH7d27V5988olKlCiRrfPQunV
rnT9/XuvXr9eqVat0+PBhdezY0Tr2smXL1LZtWz355JPasWOHYmJi9NBDD90wq5ubmz788EPt2bNHM2fO1Jo1a/TGG29I+nc57EmTJsnPz896Pq69X3r27KlNmzbpm2++0a5du/Tss8+qefPm1vfZ5s2b9eKLL6pnz56Ki4tTkyZNNGrUqBvmkKRGjRrZFNfXr1+vEiVKWNtiY2N19epV1a9fP9Nzb3TtpH/fG++9955mzZqlDRs26NixY1m+76/JyMhQuXLlNG/ePMXHx2vo0KF666239O23397wOTe7pjfy9ttv6/3339fWrVvl4eGhbt26WffNnj1bo0eP1vjx47Vt2zZVqFBBn3zyyU3HGzt2rL788ktNnTpVe/bsUb9+/dSlS5dMf4RzTefOnbVlyxYdOnTI2rZnzx7t2rVLzz//vCTp0qVLioiI0M8//6xff/1V9957r5588kldunTJZqzhw4erbdu22r17t83ruOby5cuqXbu2li1bpt9++00vv/yyXnjhBW3ZssWm38yZM+Xt7a3NmzdrwoQJio6O1qpVqyT9e/0lafr06Tp58qR1GwAAAAAyMQAAAAAgH4qIiDBat25tGIZhPPzww0a3bt0MwzCMhQsXGtf/r9KwYcOMkJAQm+d+8MEHRsWKFW3GqlixopGenm5tq1atmvHoo49at9PS0gxvb2/j66+/NgzDMI4cOWJIMsaNG2ftc/XqVaNcuXLG+PHjDcMwjJEjRxpPPPGEzbGPHz9uSDL2799vGIZhNGrUyKhVq9YtX2+ZMmWM0aNH27SFhYUZr732mnU7JCTEGDZs2E3HadSokdGnTx+btrVr1xqSjEWLFt0yx/33329MmTLFul2xYkWjS5cu1u2MjAyjVKlSxieffGIYhmH06tXLeOyxx4yMjIxMY61YscJwc3Oznov/mj59uiHJiIuLs2m//tpfe03Vq1e3Ocabb75pVK9e3SbnBx98YDNOjRo1jOHDh9/yNd9IbGysIcm4dOmSYRiG0apVKyMqKirLvjc7DytXrjTc3d2NY8eOWdv27NljSDK2bNliGIZh1KtXz+jcufMNs2T1+q43b948o3jx4tbt6dOnG/7+/jZ9/vjjD8Pd3d3466+/bNqbNm1qDB482DAMw+jUqZPx5JNP2uzv2LFjprGut2vXLsNkMhlnzpwxzp8/b3h6ehojR440OnbsaBiGYYwaNcqoX7++tf9/36NZvbZr742DBw9a2ywWixEQEHDDHFkxm81Gu3btbrj/Ztf02u+AHTt2GIbxf5+j1atXW/ssW7bMkGT8888/hmEYRt26dQ2z2WwzToMGDWx+R13//r58+bJRuHBh45dffrF5zosvvmh06tTphrlDQkKM6Oho6/bgwYONunXr3rB/enq64evrayxdutTaJsno27evTb9rr/HChQs3HKtly5bGgAEDrNuNGjUyHnnkEZs+YWFhxptvvmlzrIULF95wTAAAAAAwDMNgxjgAAACAfG/8+PGaOXOm9u7de9tj3H///XJz+7//xQoICFCNGjWs2+7u7ipevLjN0tmSVK9ePevPHh4eqlOnjjXHzp07tXbtWvn4+Fgf9913nyTZzOasXbv2TbMlJibqxIkTatCggU17gwYNcvSa/6tOnTo220lJSRo4cKCqV6+uIkWKyMfHR3v37s00Y7xmzZrWn00mkwIDA63nKTIyUnFxcapWrZp69+6tlStXWvvGxcWpXLlyqlq16g0zeXp62ox/Iw8//LDNMtv16tXTgQMHlJ6efsPn9O7dW6NGjVKDBg00bNgw7dq166bH2LZtm1q1aqUKFSrI19dXjRo1kiTr+Xj11Vf1zTffKDQ0VG+88YZ++eUX63Nvdh727t2r8uXL28zeDw4OVpEiRazXNy4uTk2bNr3lebhm9erVatq0qcqWLStfX1+98MILOnfunFJSUm74nN27dys9PV1Vq1a1ec+uX7/e+n7du3dvpmW6r/8MZOWBBx5QsWL
FtH79ev3000+qVauWnnrqKeuM5/Xr16tx48bZfm3XFC5cWJUrV7Zuly5dOtPn878sFotq166tkiVLysfHR59++mmm9/P1bnZNb+T692vp0qUlyZpr//79mWb632zm/8GDB5WSkqLHH3/c5pp8+eWXNr9D/qtz586aM2eOJMkwDH399dfq3Lmzdf/p06fVvXt33XvvvfL395efn5+SkpIynYv//k74r/T0dI0cOVI1atRQsWLF5OPjoxUrVtz0d4SUvWsFAAAAAP9FYRwAAABAvtewYUOFh4dr8ODBmfa5ublZ7zV9zdWrVzP1K1CggM22yWTKsi0jIyPbuZKSktSqVSvFxcXZPA4cOKCGDRta+3l7e2d7TGf6b46BAwdq4cKFGjNmjH766SfFxcWpRo0aSk1Ntel3s/P04IMP6siRIxo5cqT++ecfdejQQe3bt5ckFSpU6JaZChUqlOm+0o7y0ksv6fDhw3rhhRe0e/du1alTR1OmTMmyb3JyssLDw+Xn56fZs2crNjZWCxculCTr+WjRooX1ftgnTpxQ06ZNrUt73+w8ZEd2ztU1R48e1VNPPaWaNWvqu+++07Zt22SxWGyyZiUpKUnu7u7atm2bzft17969t7wX982YTCY1bNhQ69atsxbBa9asqStXrui3337TL7/8Yv0jA3tk9b7772f9et98840GDhyoF198UStXrlRcXJyioqJuek5udk2zk+vae9ee3xvXu3Zf+GXLltlck/j4+JveZ7xTp07av3+/tm/frl9++UXHjx+3WZo/IiJCcXFxmjx5sn755RfFxcWpePHimc7FrX43vfvuu5o8ebLefPNNrV27VnFxcQoPD7frdwQAAAAAZBeFcQAAAACQNG7cOC1dulSbNm2yaS9ZsqROnTplUzCLi4tz2HF//fVX689paWnatm2bqlevLunfYuiePXsUFBSkKlWq2DzsKYb7+fmpTJky2rhxo037xo0bFRwcbFdeT0/Pm86i/u/4kZGRatu2rWrUqKHAwEAdPXrUruNJ/+bv2LGjPvvsM82dO1ffffedzp8/r5o1a+rPP//U77//bveY/7V582ab7Wv3Tb52v/Ibve7y5cvrlVde0YIFCzRgwAB99tlnWY6/b98+nTt3TuPGjdOjjz6q++67L8sZryVLllRERIS++uorTZo0SZ9++ql1343OQ/Xq1XX8+HEdP37c2jc+Pl4XL160Xt+aNWsqJiYmW+di27ZtysjI0Pvvv6+HH35YVatW1YkTJ2z6ZHU+atWqpfT0dJ05cybT+zUwMFCSVL169SzP9a1cu8/4unXr1LhxY7m5ualhw4Z69913deXKlUyrIdwq6+3YuHGj6tevr9dee021atVSlSpVbjrr+pqbXVN7VatWLdM9tG92T+3g4GB5eXnp2LFjma7J9SsM/Fe5cuXUqFEjzZ49W7Nnz9bjjz+uUqVKWfdv3LhRvXv31pNPPqn7779fXl5eOnv2rN2vZ+PGjWrdurW6dOmikJAQ3XPPPbf1eS5QoIBDrjEAAACAvI3COAAAAABIqlGjhjp37qwPP/zQpr1x48b6+++/NWHCBB06dEgWi0U//vijw45rsVi0cOFC7du3T2azWRcuXFC3bt0kSWazWefPn1enTp0UGxurQ4cOacWKFYqKirK7CPT6669r/Pjxmjt3rvbv369BgwYpLi5Offr0sWucoKAgbd68WUePHtXZs2dvOmvz3nvv1YIFCxQXF6edO3fq+eeft3uW58SJE/X1119r3759+v333zVv3jwFBgaqSJEiatSokRo2bKh27dpp1apVOnLkiH788UctX77crmNI/y5n3r9/f+3fv19ff/21pkyZYnNugoKCtGHDBv3111/WAmDfvn21YsUKHTlyRNu3b9fatWutf9TwXxUqVJCnp6emTJmiw4cPa8mSJRo5cqRNn6FDh2rx4sU6ePCg9uzZo++//9463s3OQ7Nmzazv3+3bt2vLli3q2rWrGjV
qZF3KetiwYfr66681bNgw7d27V7t379b48eOzzFqlShVdvXrVmnXWrFmaOnWqTZ+goCAlJSUpJiZGZ8+eVUpKiqpWrarOnTura9euWrBggY4cOaItW7Zo7NixWrZsmaR/l59fvny53nvvPR04cEAfffRRtq5X48aNFR8frz179uiRRx6xts2ePVt16tS56R+KZHXtbse9996rrVu3asWKFfr99981ZMiQmxalpZtf09vRq1cvffHFF5o5c6YOHDigUaNGadeuXTdcFcHX11cDBw5Uv379NHPmTB06dEjbt2/XlClTNHPmzJseq3Pnzvrmm280b948m2XUpX/PxaxZs7R3715t3rxZnTt3tmtVguvHWbVqlX755Rft3btXPXr00OnTp+0eJygoSDExMTp16pQuXLhg9/MBAAAA5A8UxgEAAADg/4uOjs5UuK1evbo+/vhjWSwWhYSEaMuWLbdcCtke48aN07hx4xQSEqKff/5ZS5YsUYkSJSTJOss7PT1dTzzxhGrUqKG+ffuqSJEiNvczz47evXurf//+GjBggGrUqKHly5dryZIluvfee+0aZ+DAgXJ3d1dwcLBKlix50/srT5w4UUWLFlX9+vXVqlUrhYeH68EHH7TreL6+vpowYYLq1KmjsLAwHT16VD/88IP19X/33XcKCwtTp06dFBwcrDfeeOO2Zo527dpV//zzjx566CGZzWb16dNHL7/8snV/dHS0jh49qsqVK6tkyZKS/r0/stlsVvXq1dW8eXNVrVpVH3/8cZbjlyxZUjNmzNC8efMUHByscePG6b333rPp4+npqcGDB6tmzZpq2LCh3N3d9c0339zyPJhMJi1evFhFixZVw4YN1axZM91zzz2aO3eudezGjRtr3rx5WrJkiUJDQ/XYY49py5YtWWYNCQnRxIkTNX78eD3wwAOaPXu2xo4da9Onfv36euWVV9SxY0eVLFlSEyZMkCRNnz5dXbt21YABA1StWjW1adNGsbGxqlChgqR/7+X+2WefafLkyQoJCdHKlSv1zjvv3PL61KhRQ0WKFFFoaKh8fHysryk9Pf2W9xfP6trdjh49euiZZ55Rx44dVbduXZ07d06vvfbaTZ9zs2t6Ozp37qzBgwdr4MCB1uX1IyMjVbBgwRs+Z+TIkRoyZIjGjh1rfa8uW7ZMlSpVuumx2rdvb72vfJs2bWz2ffHFF7pw4YIefPBBvfDCC+rdu7fNjPLseuedd/Tggw8qPDxcjRs3VmBgYKZjZcf777+vVatWqXz58qpVq5bdzwcAAACQP5iMm91ACwAAAACAPK5x48YKDQ3VpEmTXB0FsNvjjz+uwMBAzZo1y9VRAAAAACBX83B1AAAAAAAAANxaSkqKpk6dqvDwcLm7u+vrr7/W6tWrtWrVKldHAwAAAIBcj8I4AAAAAADAXcBkMumHH37Q6NGjdfnyZVWrVk3fffedmjVr5upoAAAAAJDrsZQ6AAAAAAAAAAAAACBPc3N1AAAAAAAAAAAAAAAAnInCOAAAAAAAAAAAAAAgT6MwDgAAAAAAAAAAAADI0yiMAwAAAAAAAAAAAADyNArjAAAAAAAAAAAAAIA8jcI4AAAAAAAAAAAAACBPozAOAAAAAAAAAAAAAMjTKIwDAAAAAAAAAAAAAPI0CuMAAAAAAAAAAAAAgDzt/wHZ8mipKCuUGgAAAABJRU5ErkJggg==", "text/plain": [ "
" ] @@ -2043,7 +2356,7 @@ { "data": { "text/plain": [ - "(35339, 31)" + "(35328, 31)" ] }, "execution_count": 22, @@ -2149,53 +2462,24 @@ "variants.head(2)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "ee3f08ed", - "metadata": {}, - "outputs": [], - "source": [ - "# Get the reference sequence and CDS context annotation of the variants from the GTF annotation and FASTA files\n", - "# Using same annotation file that the authors used\n", - "from tqdm import tqdm\n", - "\n", - "\n", - "# Extract assembly from first variant_id (e.g. chr1_925969_C_T_hg38 -> hg38)\n", - "assembly = variants[\"variant_id\"].iloc[0].split(\"_\")[-1]\n", - "assert assembly == \"hg38\"\n", - "# Extract genomic coordinates from the variant_id\n", - "variants[[\"chrom\", \"pos\", \"ref\", \"alt\"]] = variants[\"variant_id\"].str.extract(\n", - " r\"(chr\\d+|chrX|chrY)_(\\d+)_([ACGT])_([ACGT])\"\n", - ")\n", - "variants[\"pos\"] = variants[\"pos\"].astype(int)\n", - "variants = variants.sort_values(by=[\"chrom\", \"pos\"]).reset_index(drop=True).reset_index()\n", - "# Remove version numbers after dot in transcript_id\n", - "variants[\"transcript_id\"] = variants[\"transcript_id\"].str.split(\".\").str[0]\n", - "gtf_s, fasta = process_gtf(f\"{DATA_DIR}/ucsc_gencodev32_hg38.tsv\", f\"{DATA_DIR}/reference/{assembly}/{assembly}.fa\")\n", - "print(f\"Processed {gtf_s.shape[0]} GTF CDS sequences\")\n", - "display(gtf_s[[\"name\", \"chrom\", \"strand\", \"cdsStart\", \"cdsEnd\", \"cds_starts\", \"cds_ends\", \"cds_length\"]].head(2))" - ] - }, { "cell_type": "code", "execution_count": 26, - "id": "a45df45b", + "id": "ee3f08ed", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Processing chromosomes: 100%|██████████| 24/24 [00:04<00:00, 5.01it/s]\n" + "Processing transcripts: 100%|██████████| 110025/110025 [00:16<00:00, 6700.35it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - " Processed 312994 mutations with CDS 
context:\n" + "Processed 110025 GTF CDS sequences\n" ] }, { @@ -2219,65 +2503,189 @@ " \n", " \n", " \n", + " name\n", " chrom\n", - " pos\n", - " variant_id\n", - " ref\n", - " alt\n", - " tx_name\n", + " strand\n", " cdsStart\n", " cdsEnd\n", - " tx_strand\n", - " var_rel_dist_in_cds\n", - " ref_codon\n", - " alt_codon\n", - " ref_aa\n", - " alt_aa\n", - " codon_position\n", - " index\n", - " transcript_id\n", - " protein_variant\n", - " AlphaMissense\n", - " label\n", + " cds_starts\n", + " cds_ends\n", + " cds_length\n", " \n", " \n", " \n", " \n", " 0\n", + " ENST00000641515\n", " chr1\n", - " 925969\n", - " chr1_925969_C_T_hg38\n", - " C\n", - " T\n", - " ENST00000420190\n", - " 924431\n", - " 939291\n", " +\n", - " 564\n", - " CCT\n", - " TCT\n", - " P\n", - " S\n", - " 188\n", - " 0\n", - " ENST00000342066\n", - " Q96NU1:P10S\n", - " 0.967398\n", - " 0.0\n", + " 65564\n", + " 70008\n", + " (65564, 69036)\n", + " (65573, 70008)\n", + " 981\n", " \n", " \n", " 1\n", + " ENST00000335137\n", " chr1\n", - " 930165\n", - " chr1_930165_G_A_hg38\n", - " G\n", - " A\n", - " ENST00000420190\n", - " 924431\n", - " 939291\n", " +\n", - " 619\n", - " CGG\n", + " 69090\n", + " 70008\n", + " (69090,)\n", + " (70008,)\n", + " 918\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " name chrom strand cdsStart cdsEnd cds_starts \\\n", + "0 ENST00000641515 chr1 + 65564 70008 (65564, 69036) \n", + "1 ENST00000335137 chr1 + 69090 70008 (69090,) \n", + "\n", + " cds_ends cds_length \n", + "0 (65573, 70008) 981 \n", + "1 (70008,) 918 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Get the reference sequence and CDS context annotation of the variants from the GTF annotation and FASTA files\n", + "# Using same annotation file that the authors used\n", + "from tqdm import tqdm\n", + "\n", + "\n", + "# Extract assembly from first variant_id (e.g. 
chr1_925969_C_T_hg38 -> hg38)\n", + "assembly = variants[\"variant_id\"].iloc[0].split(\"_\")[-1]\n", + "assert assembly == \"hg38\"\n", + "# Extract genomic coordinates from the variant_id\n", + "variants[[\"chrom\", \"pos\", \"ref\", \"alt\"]] = variants[\"variant_id\"].str.extract(\n", + " r\"(chr\\d+|chrX|chrY)_(\\d+)_([ACGT])_([ACGT])\"\n", + ")\n", + "variants[\"pos\"] = variants[\"pos\"].astype(int)\n", + "variants = variants.sort_values(by=[\"chrom\", \"pos\"]).reset_index(drop=True).reset_index()\n", + "# Remove version numbers after dot in transcript_id\n", + "variants[\"transcript_id\"] = variants[\"transcript_id\"].str.split(\".\").str[0]\n", + "gtf_s, fasta = process_gtf(\n", + " f\"{DATA_DIR}/reference/ucsc_gencodev32_hg38.tsv\", f\"{DATA_DIR}/reference/{assembly}/{assembly}.fa\"\n", + ")\n", + "print(f\"Processed {gtf_s.shape[0]} GTF CDS sequences\")\n", + "display(gtf_s[[\"name\", \"chrom\", \"strand\", \"cdsStart\", \"cdsEnd\", \"cds_starts\", \"cds_ends\", \"cds_length\"]].head(2))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "a45df45b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing chromosomes: 0%| | 0/24 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2362,7 +2770,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "id": "0b0e27df", "metadata": {}, "outputs": [], @@ -2381,7 
+2789,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "id": "38723b3a", "metadata": {}, "outputs": [ @@ -2394,7 +2802,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 28, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -2413,7 +2821,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "id": "527c3739", "metadata": {}, "outputs": [ @@ -2438,6 +2846,7 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -2447,13 +2856,12 @@ " \n", " \n", " \n", - " \n", " \n", - " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -2465,6 +2873,7 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -2473,14 +2882,13 @@ " \n", " \n", " \n", - " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -2489,6 +2897,7 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -2497,14 +2906,13 @@ " \n", " \n", " \n", - " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -2512,30 +2920,34 @@ " \n", " \n", "
chromposvariant_idrefalttx_namecdsStartcdsEndtx_strandvar_rel_dist_in_cdsref_codonalt_codonref_aaalt_aacodon_positionindextranscript_idprotein_variantAlphaMissenselabel
0chr1925969chr1_925969_C_T_hg38CTENST00000420190924431939291+564CCTTCTPS1880ENST00000342066Q96NU1:P10S0.9673980.0
1chr1930165chr1_930165_G_A_hg38GAENST00000420190924431939291+619CGGCAGRQ
level_0idvariant_idtranscript_idchromposrefalt...alt_codonref_aaalt_aaalt_seqcodon_positionlevel_0_yindex_ytranscript_id_yprotein_variant_y
000chr1_925969_C_T_hg38ENST00000342066Q96NU1:P10Schr1925969CT...TCTPSATGTCCAAGGGGATCCTGCAGGTGCATTCTCCGATCTGCGACTGCC...900ENST00000342066Q96NU1:P10S0.967398
111chr1_930165_G_A_hg38ENST00000342066Q96NU1:R28Qchr1930165GA...CAGRQATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC...2711ENST00000342066Q96NU1:R28Q0.662765
\n", - "

2 rows × 27 columns

\n", + "

2 rows × 29 columns

\n", "" ], "text/plain": [ - " id variant_id transcript_id protein_variant AlphaMissense \\\n", - "0 0 chr1_925969_C_T_hg38 ENST00000342066 Q96NU1:P10S 0.967398 \n", - "1 1 chr1_930165_G_A_hg38 ENST00000342066 Q96NU1:R28Q 0.662765 \n", + " level_0 id variant_id transcript_id protein_variant \\\n", + "0 0 0 chr1_925969_C_T_hg38 ENST00000342066 Q96NU1:P10S \n", + "1 1 1 chr1_930165_G_A_hg38 ENST00000342066 Q96NU1:R28Q \n", "\n", - " label chrom pos ref alt ... alt_codon ref_aa alt_aa \\\n", - "0 0.0 chr1 925969 C T ... TCT P S \n", - "1 0.0 chr1 930165 G A ... CAG R Q \n", + " AlphaMissense label chrom pos ref ... ref_aa alt_aa \\\n", + "0 0.967398 0.0 chr1 925969 C ... P S \n", + "1 0.662765 0.0 chr1 930165 G ... R Q \n", "\n", - " alt_seq codon_position index_y \\\n", - "0 ATGTCCAAGGGGATCCTGCAGGTGCATTCTCCGATCTGCGACTGCC... 9 0 \n", - "1 ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC... 27 1 \n", + " alt_seq codon_position \\\n", + "0 ATGTCCAAGGGGATCCTGCAGGTGCATTCTCCGATCTGCGACTGCC... 9 \n", + "1 ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCC... 
27 \n", "\n", - " transcript_id_y protein_variant_y AlphaMissense_y label_y \n", - "0 ENST00000342066 Q96NU1:P10S 0.967398 0.0 \n", - "1 ENST00000342066 Q96NU1:R28Q 0.662765 0.0 \n", + " level_0_y index_y transcript_id_y protein_variant_y AlphaMissense_y \\\n", + "0 0 0 ENST00000342066 Q96NU1:P10S 0.967398 \n", + "1 1 1 ENST00000342066 Q96NU1:R28Q 0.662765 \n", "\n", - "[2 rows x 27 columns]" + " label_y \n", + "0 0.0 \n", + "1 0.0 \n", + "\n", + "[2 rows x 29 columns]" ] }, - "execution_count": 29, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -2546,7 +2958,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "id": "7fbe8ebf", "metadata": {}, "outputs": [ @@ -2567,7 +2979,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "id": "7d6dc737", "metadata": {}, "outputs": [], @@ -2585,7 +2997,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "id": "2b5660bb", "metadata": {}, "outputs": [ @@ -2648,7 +3060,7 @@ "1 1 " ] }, - "execution_count": 32, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -2661,57 +3073,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "9c9927e0", "metadata": {}, - "outputs": [], - "source": [ - "# Get the reference sequence and CDS context annotation of the variants from the GTF annotation and FASTA files #\n", - "# Using same annotation file that the authors used\n", - "\n", - "## Extract assembly from first variant_id (e.g. 
chr1_925969_C_T_hg38 -> hg38)\n", - "assembly = variants[\"variant_id\"].iloc[0].split(\"_\")[-1]\n", - "assert assembly == \"hg38\"\n", - "## Extract genomic coordinates from the variant_id\n", - "variants[[\"chrom\", \"pos\", \"ref\", \"alt\"]] = variants[\"variant_id\"].str.extract(\n", - " r\"(chr\\d+|chrX|chrY)_(\\d+)_([ACGT])_([ACGT])\"\n", - ")\n", - "variants[\"pos\"] = variants[\"pos\"].astype(int)\n", - "variants = variants.sort_values(by=[\"chrom\", \"pos\"]).reset_index(drop=True).reset_index()\n", - "## Remove version numbers after dot in transcript_id\n", - "variants[\"transcript_id\"] = variants[\"transcript_id\"].str.split(\".\").str[0]\n", - "## Get the CDS sequences and annotations from the GTF and FASTA files\n", - "gtf_s, fasta = process_gtf(f\"{DATA_DIR}/ucsc_gencodev32_hg38.tsv\", f\"{DATA_DIR}/reference/{assembly}/{assembly}.fa\")\n", - "print(f\"Processed {gtf_s.shape[0]} GTF CDS sequences\")\n", - "display(gtf_s[[\"name\", \"chrom\", \"strand\", \"cdsStart\", \"cdsEnd\", \"cds_starts\", \"cds_ends\", \"cds_length\"]].head(2))" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "89778ae1", - "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Processing chromosomes: 100%|██████████| 23/23 [00:00<00:00, 66.11it/s]" + "Processing transcripts: 100%|██████████| 110025/110025 [00:16<00:00, 6591.19it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - " Processed 11615 mutations with CDS context:\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" + "Processed 110025 GTF CDS sequences\n" ] }, { @@ -2735,58 +3112,188 @@ " \n", " \n", " \n", + " name\n", " chrom\n", - " pos\n", - " variant_id\n", - " ref\n", - " alt\n", - " tx_name\n", + " strand\n", " cdsStart\n", " cdsEnd\n", - " tx_strand\n", - " var_rel_dist_in_cds\n", - " ref_codon\n", - " alt_codon\n", - " ref_aa\n", - " alt_aa\n", - " codon_position\n", - " index\n", - " 
transcript_id\n", - " protein_variant\n", - " AlphaMissense\n", - " label\n", + " cds_starts\n", + " cds_ends\n", + " cds_length\n", " \n", " \n", " \n", " \n", " 0\n", + " ENST00000641515\n", " chr1\n", - " 2557810\n", - " chr1_2557810_G_A_hg38\n", - " G\n", - " A\n", - " ENST00000409119\n", - " 2556664\n", - " 2561524\n", " +\n", - " 153\n", - " GAG\n", - " AAG\n", - " E\n", - " K\n", - " 51\n", - " 0\n", - " ENST00000355716\n", - " Q92956:E52K\n", - " 0.232843\n", - " 0\n", + " 65564\n", + " 70008\n", + " (65564, 69036)\n", + " (65573, 70008)\n", + " 981\n", " \n", " \n", " 1\n", + " ENST00000335137\n", " chr1\n", - " 2558346\n", - " chr1_2558346_A_G_hg38\n", - " A\n", + " +\n", + " 69090\n", + " 70008\n", + " (69090,)\n", + " (70008,)\n", + " 918\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " name chrom strand cdsStart cdsEnd cds_starts \\\n", + "0 ENST00000641515 chr1 + 65564 70008 (65564, 69036) \n", + "1 ENST00000335137 chr1 + 69090 70008 (69090,) \n", + "\n", + " cds_ends cds_length \n", + "0 (65573, 70008) 981 \n", + "1 (70008,) 918 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Get the reference sequence and CDS context annotation of the variants from the GTF annotation and FASTA files #\n", + "# Using same annotation file that the authors used\n", + "\n", + "## Extract assembly from first variant_id (e.g. 
chr1_925969_C_T_hg38 -> hg38)\n", + "assembly = variants[\"variant_id\"].iloc[0].split(\"_\")[-1]\n", + "assert assembly == \"hg38\"\n", + "## Extract genomic coordinates from the variant_id\n", + "variants[[\"chrom\", \"pos\", \"ref\", \"alt\"]] = variants[\"variant_id\"].str.extract(\n", + " r\"(chr\\d+|chrX|chrY)_(\\d+)_([ACGT])_([ACGT])\"\n", + ")\n", + "variants[\"pos\"] = variants[\"pos\"].astype(int)\n", + "variants = variants.sort_values(by=[\"chrom\", \"pos\"]).reset_index(drop=True).reset_index()\n", + "## Remove version numbers after dot in transcript_id\n", + "variants[\"transcript_id\"] = variants[\"transcript_id\"].str.split(\".\").str[0]\n", + "## Get the CDS sequences and annotations from the GTF and FASTA files\n", + "gtf_s, fasta = process_gtf(\n", + " f\"{DATA_DIR}/reference/ucsc_gencodev32_hg38.tsv\", f\"{DATA_DIR}/reference/{assembly}/{assembly}.fa\"\n", + ")\n", + "print(f\"Processed {gtf_s.shape[0]} GTF CDS sequences\")\n", + "display(gtf_s[[\"name\", \"chrom\", \"strand\", \"cdsStart\", \"cdsEnd\", \"cds_starts\", \"cds_ends\", \"cds_length\"]].head(2))" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "89778ae1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing chromosomes: 0%| | 0/23 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2878,7 +3385,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 40, "id": "0db2abb9", "metadata": {}, "outputs": [], 
@@ -2897,7 +3404,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 41, "id": "cda7deab", "metadata": {}, "outputs": [ @@ -2922,6 +3429,7 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -2931,13 +3439,12 @@ " \n", " \n", " \n", - " \n", " \n", - " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -2949,6 +3456,7 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -2957,14 +3465,13 @@ " \n", " \n", " \n", - " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -2973,6 +3480,7 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -2981,14 +3489,13 @@ " \n", " \n", " \n", - " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -2996,30 +3503,34 @@ " \n", " \n", "
chromposvariant_idrefalttx_namecdsStartcdsEndtx_strandvar_rel_dist_in_cdsref_codonalt_codonref_aaalt_aacodon_positionindextranscript_idprotein_variantAlphaMissenselabel
0chr12557810chr1_2557810_G_A_hg38GAENST0000040911925566642561524+153GAGAAGEK510ENST00000355716Q92956:E52K0.2328430
1chr12558346chr1_2558346_A_G_hg38AGENST000004091192556664
level_0idvariant_idtranscript_idchromposrefalt...alt_codonref_aaalt_aaalt_seqcodon_positionlevel_0_yindex_ytranscript_id_yprotein_variant_y
000chr1_2557810_G_A_hg38ENST00000355716Q92956:E52Kchr12557810GA...AAGEKATGGAGCCTCCTGGAGACTGGGGGCCTCCTCCCTGGAGATCCACCC...5100ENST00000355716Q92956:E52K0.232843
111chr1_2558346_A_G_hg38ENST00000355716Q92956:Y61Cchr12558346AG...TGTYCATGGAGCCTCCTGGAGACTGGGGGCCTCCTCCCTGGAGATCCACCC...6011ENST00000355716Q92956:Y61C0.839032
\n", - "

2 rows × 27 columns

\n", + "

2 rows × 29 columns

\n", "" ], "text/plain": [ - " id variant_id transcript_id protein_variant AlphaMissense \\\n", - "0 0 chr1_2557810_G_A_hg38 ENST00000355716 Q92956:E52K 0.232843 \n", - "1 1 chr1_2558346_A_G_hg38 ENST00000355716 Q92956:Y61C 0.839032 \n", + " level_0 id variant_id transcript_id protein_variant \\\n", + "0 0 0 chr1_2557810_G_A_hg38 ENST00000355716 Q92956:E52K \n", + "1 1 1 chr1_2558346_A_G_hg38 ENST00000355716 Q92956:Y61C \n", + "\n", + " AlphaMissense label chrom pos ref ... ref_aa alt_aa \\\n", + "0 0.232843 0 chr1 2557810 G ... E K \n", + "1 0.839032 0 chr1 2558346 A ... Y C \n", "\n", - " label chrom pos ref alt ... alt_codon ref_aa alt_aa \\\n", - "0 0 chr1 2557810 G A ... AAG E K \n", - "1 0 chr1 2558346 A G ... TGT Y C \n", + " alt_seq codon_position \\\n", + "0 ATGGAGCCTCCTGGAGACTGGGGGCCTCCTCCCTGGAGATCCACCC... 51 \n", + "1 ATGGAGCCTCCTGGAGACTGGGGGCCTCCTCCCTGGAGATCCACCC... 60 \n", "\n", - " alt_seq codon_position index_y \\\n", - "0 ATGGAGCCTCCTGGAGACTGGGGGCCTCCTCCCTGGAGATCCACCC... 51 0 \n", - "1 ATGGAGCCTCCTGGAGACTGGGGGCCTCCTCCCTGGAGATCCACCC... 
60 1 \n", + " level_0_y index_y transcript_id_y protein_variant_y AlphaMissense_y \\\n", + "0 0 0 ENST00000355716 Q92956:E52K 0.232843 \n", + "1 1 1 ENST00000355716 Q92956:Y61C 0.839032 \n", "\n", - " transcript_id_y protein_variant_y AlphaMissense_y label_y \n", - "0 ENST00000355716 Q92956:E52K 0.232843 0 \n", - "1 ENST00000355716 Q92956:Y61C 0.839032 0 \n", + " label_y \n", + "0 0 \n", + "1 0 \n", "\n", - "[2 rows x 27 columns]" + "[2 rows x 29 columns]" ] }, - "execution_count": 36, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -3037,7 +3548,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 42, "id": "de460bcf", "metadata": {}, "outputs": [ @@ -3050,7 +3561,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 37, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -3061,7 +3572,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 43, "id": "0d4628a1", "metadata": {}, "outputs": [ @@ -3082,7 +3593,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 44, "id": "cc60a940", "metadata": {}, "outputs": [], @@ -3100,7 +3611,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 48, "id": "e64b85f3", "metadata": {}, "outputs": [ @@ -3130,7 +3641,7 @@ "└──────────┴─────────────────────┴────────────────────┴────────┴───┴───────┴───────────┴─────┴─────┘" ] }, - "execution_count": 7, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -3184,7 +3695,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 49, "id": "9a0f952c", "metadata": {}, "outputs": [ @@ -3198,7 +3709,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 17)
#binnamechromstrandtxStarttxEndcdsStartcdsEndexonCountexonStartsexonEndsscorename2cdsStartStatcdsEndStatexonFramescds_sequence
i64strstrstri64i64i64i64i64strstri64strstrstrstrstr
14"NM_021079.5""chr17""+"4506131645109016450613294510563912"45061316,45081643,45086507,450…"45061460,45081752,45086652,450…0"NMT1""cmpl""cmpl""0,2,0,1,0,2,2,2,0,0,0,0,""ATGGCGGACGAGAGTGAGACAGCAGTGAAG…
1010"NR_026723.1""chr12""-"5575246255817756558177565581775612"55752462,55757382,55760550,557…"55752840,55757553,55760640,557…0"CIP29""none""none""-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,…""
" + "shape: (2, 17)
#binnamechromstrandtxStarttxEndcdsStartcdsEndexonCountexonStartsexonEndsscorename2cdsStartStatcdsEndStatexonFramescds_sequence
i64strstrstri64i64i64i64i64strstri64strstrstrstrstr
1098"NR_136665.1""chr16""+"6724887167272204672722046727220415"67248871,67252541,67255034,672…"67249201,67252805,67255184,672…0"SLC9A5""none""none""-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,…""
1172"NR_169510.1""chr14""-"770315587703420677034206770342062"77031558,77033839,""77033215,77034206,"0"LOC105370579""none""none""-1,-1,"""
" ], "text/plain": [ "shape: (2, 17)\n", @@ -3208,29 +3719,25 @@ "│ i64 ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ --- │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ str │\n", "╞══════╪═════════════╪═══════╪════════╪═══╪══════════════╪════════════╪══════════════╪═════════════╡\n", - "│ 14 ┆ NM_021079.5 ┆ chr17 ┆ + ┆ … ┆ cmpl ┆ cmpl ┆ 0,2,0,1,0,2, ┆ ATGGCGGACGA │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2,2,0,0,0,0, ┆ GAGTGAGACAG │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ CAGTGAAG… │\n", - "│ 1010 ┆ NR_026723.1 ┆ chr12 ┆ - ┆ … ┆ none ┆ none ┆ -1,-1,-1,-1, ┆ │\n", + "│ 1098 ┆ NR_136665.1 ┆ chr16 ┆ + ┆ … ┆ none ┆ none ┆ -1,-1,-1,-1, ┆ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ -1,-1,-1,-1, ┆ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ -1,-1,… ┆ │\n", + "│ 1172 ┆ NR_169510.1 ┆ chr14 ┆ - ┆ … ┆ none ┆ none ┆ -1,-1, ┆ │\n", "└──────┴─────────────┴───────┴────────┴───┴──────────────┴────────────┴──────────────┴─────────────┘" ] }, - "execution_count": 12, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get the reference sequence and CDS context annotation of the variants from the GTF and FASTA files\n", - "refseq = pl.read_csv(f\"{DATA_DIR}/clinvar_syn/ucsc_refseq_hg38.tsv\", separator=\"\\t\")\n", - "refseq.head(2)\n", "\n", "# Build CDS sequences for synonymous variants\n", "valid_chroms = [\"chr\" + str(i) for i in range(1, 23)]\n", - "refseq = pl.read_csv(f\"{DATA_DIR}/clinvar_syn/ucsc_refseq_hg38.tsv\", separator=\"\\t\")\n", - "refseq_hist = pl.read_csv(f\"{DATA_DIR}/clinvar_syn/ucsc_refseq_hist_hg38.tsv\", separator=\"\\t\")\n", + "refseq = pl.read_csv(f\"{DATA_DIR}/reference/ucsc_refseq_hg38.tsv\", separator=\"\\t\")\n", + "refseq_hist = pl.read_csv(f\"{DATA_DIR}/reference/ucsc_refseq_hist_hg38.tsv\", separator=\"\\t\")\n", "refseq = pl.concat([refseq, refseq_hist])\n", "refseq = refseq.filter(pl.col(\"chrom\").is_in(valid_chroms)).unique()\n", "fasta = {}\n", @@ -3248,7 +3755,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 50, "id": "90a4cf3e", "metadata": {}, "outputs": [ @@ -3281,7 
+3788,7 @@ "└──────────┴────────────┴─────────────────┴────────┴───┴─────┴─────┴─────────────┴─────────────────┘" ] }, - "execution_count": 13, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -3351,169 +3858,2650 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "id": "97de04a0", "metadata": {}, - "outputs": [], - "source": [ - "# Process variants per chromosome and add additional features: pLI, PhyloP, codon frequencies\n", - "import re\n", - "\n", - "\n", - "result = []\n", - "\n", - "for row in tqdm(dset.rows(named=True)):\n", - " s = row[\"Name\"].split(\":\")[1].split(\" \")[0]\n", - " m = re.fullmatch(r\"c\\.(\\d+)([ACGT])>([ACGT])\", s)\n", - " pos_cds, ref_cds, alt_cds = int(m.group(1)), m.group(2), m.group(3)\n", - "\n", - " tx = refseq.filter((pl.col(\"name\") == row[\"tx\"]) & (pl.col(\"chrom\") == row[\"chrom\"]))[0]\n", - " try:\n", - " pos_cds0 = tx_gposes[(row[\"chrom\"], row[\"tx\"])].index(row[\"pos\"] - 1)\n", - " except:\n", - " continue\n", - " seq = tx[0, \"cds_sequence\"]\n", - " if pos_cds0 + 1 != pos_cds:\n", - " print(str(row))\n", - " assert seq[pos_cds0] == ref_cds\n", - " assert ref_cds == row[\"ref\"] if tx[0, \"strand\"] == \"+\" else get_reverse_complement(row[\"ref\"])\n", - " assert alt_cds == row[\"alt\"] if tx[0, \"strand\"] == \"+\" else get_reverse_complement(row[\"alt\"])\n", - "\n", - " codon_position = pos_cds0 // 3\n", - " ref_codon = seq[codon_position * 3 : (codon_position + 1) * 3]\n", - " remainder = pos_cds0 % 3\n", - " alt_nuc = list(ref_codon)\n", - " alt_nuc[remainder] = alt_cds\n", - " alt_codon = \"\".join(alt_nuc)\n", - " item = {\n", - " \"chrom\": row[\"chrom\"],\n", - " \"pos\": row[\"pos\"],\n", - " \"ref\": row[\"ref\"],\n", - " \"alt\": row[\"alt\"],\n", - " \"var_rel_dist_in_cds\": pos_cds0,\n", - " \"codon_position\": codon_position,\n", - " \"ref_codon\": ref_codon,\n", - " \"alt_codon\": alt_codon,\n", - " \"tx\": row[\"tx\"],\n", - " 
\"label\": row[\"ClinicalSignificance\"],\n", - " \"in_splice_junction\": row[\"in_splice_junction\"],\n", - " \"ref_seq\": seq,\n", - " \"alt_seq\": seq[:pos_cds0] + alt_cds + seq[pos_cds0 + 1 :],\n", - " }\n", - " result.append(item)\n", - "\n", - "\n", - "result_df = pl.from_dicts(result).with_row_index(\"id\")\n", - "frame = result_df.to_pandas()\n", - "(frame[\"ref_seq\"].apply(lambda x: len(x) == 0)).sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "f7ecb7ae", - "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 5%|▌ | 6731/129454 [00:06<01:58, 1032.67it/s]" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Adding additional features (pLI, PhyloP, codon frequencies)...\n" + "{'AlleleID': 178069, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3588C>T (p.Ser1196=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'May 01, 2025', 'RS# (dbSNP)': 200077311, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002313002|RCV001668310|RCV000735080', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900|MedGen:CN169374', 'PhenotypeList': 'Inborn genetic diseases|not provided|not specified', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721196, 'Stop': 50721196, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 5, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA180434', 'SubmitterCategories': 2, 'VariationID': 167684, 'PositionVCF': 50721196, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 
'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000847412|SCV000863275|SCV001882588|SCV004011423|SCV005277460', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721196, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "100%|███████████████████████████████████████████████████████████| 129384/129384 [00:12<00:00, 10168.75it/s]\n" + " 9%|▉ | 11706/129454 [00:11<01:54, 1025.35it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Dataset with additional features: 129384 variants\n" + "{'AlleleID': 237006, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2028G>A (p.Thr676=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 01, 2025', 'RS# (dbSNP)': 73892912, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV000224793|RCV001726056|RCV002315674', 'PhenotypeIDS': 'MedGen:C3661900|MedGen:CN169374|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|not specified|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50704769, 'Stop': 50704769, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 6, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325660', 'SubmitterCategories': 2, 'VariationID': 235319, 'PositionVCF': 50704769, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 
'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000280798|SCV000847390|SCV001759587|SCV005331035', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50704769, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" ] }, { - "data": { - "text/html": [ - "
\n", - "shape: (2, 27)
idchromposrefaltvar_rel_dist_in_cdscodon_positionref_codonalt_codontxlabelin_splice_junctionref_seqalt_seqref_aaalt_aaref_codon_freqalt_codon_freqcodon_freq_ratiogene_nameplipli_binphylopphylop_bincds_lengthcds_offset_fraccds_offset_frac_bin
u32stri64strstri64i64strstrstrstrboolstrstrstrstrf64f64f64strf64i32f64i32u32f64i32
0"chr1"45015006"G""A"941313"GAG""GAA""NM_000374.5""Likely pathogenic"true"ATGGAAGCGAATGGGTTGGGACCTCAGGGT…"ATGGAAGCGAATGGGTTGGGACCTCAGGGT…"E""E"4.6414453e73.7827281e70.20458"UROD"0.007.998811040.8523558
1"chr10"124400865"G""A"1133377"AAC""AAT""NM_000274.4""Benign"false"ATGTTTTCCAAACTAGCACATTTGCAGAGG…"ATGTTTTCCAAACTAGCACATTTGCAGAGG…"N""N"2.0900468e72.0353876e70.0265"OAT"0.00-2.351-213200.8583338
" - ], - "text/plain": [ - "shape: (2, 27)\n", - "┌─────┬───────┬───────────┬─────┬───┬────────────┬────────────┬─────────────────┬──────────────────┐\n", - "│ id ┆ chrom ┆ pos ┆ ref ┆ … ┆ phylop_bin ┆ cds_length ┆ cds_offset_frac ┆ cds_offset_frac_ │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ bin │\n", - "│ u32 ┆ str ┆ i64 ┆ str ┆ ┆ i32 ┆ u32 ┆ f64 ┆ --- │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ i32 │\n", - "╞═════╪═══════╪═══════════╪═════╪═══╪════════════╪════════════╪═════════════════╪══════════════════╡\n", - "│ 0 ┆ chr1 ┆ 45015006 ┆ G ┆ … ┆ 8 ┆ 1104 ┆ 0.852355 ┆ 8 │\n", - "│ 1 ┆ chr10 ┆ 124400865 ┆ G ┆ … ┆ -2 ┆ 1320 ┆ 0.858333 ┆ 8 │\n", - "└─────┴───────┴───────────┴─────┴───┴────────────┴────────────┴─────────────────┴──────────────────┘" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + " 16%|█▌ | 20263/129454 [00:19<01:44, 1047.00it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 346769, 'Type': 'single nucleotide variant', 'Name': 'NM_001379500.1(COL18A1):c.2832A>C (p.Pro944=)', 'GeneID': 80781, 'GeneSymbol': 'COL18A1', 'HGNC_ID': 'HGNC:2195', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Feb 03, 2025', 'RS# (dbSNP)': 751825604, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV000349746|RCV002057775|RCV004549794', 'PhenotypeIDS': 'MONDO:MONDO:0800166,MedGen:C1849409,OMIM:PS267750,Orphanet:1571|MedGen:C3661900|', 'PhenotypeList': 'Knobloch syndrome|not provided|COL18A1-related disorder', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000021.9', 'Chromosome': '21', 'Start': 45504529, 'Stop': 45504529, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '21q22.3', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 5, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10650647', 
'SubmitterCategories': 2, 'VariationID': 340260, 'PositionVCF': 45504529, 'ReferenceAlleleVCF': 'A', 'AlternateAlleleVCF': 'C', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000436429|SCV002323680|SCV004146729|SCV004772740|SCV005207582', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr21', 'pos': 45504529, 'ref': 'A', 'alt': 'C', 'tx': 'NM_001379500.1', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 23%|██▎ | 29339/129454 [00:27<01:36, 1038.97it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 431503, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4038C>T (p.Gly1346=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Apr 01, 2024', 'RS# (dbSNP)': 367676023, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV000590960|RCV001662502|RCV002358390', 'PhenotypeIDS': 'Human Phenotype Ontology:HP:0000717,MONDO:MONDO:0005260,MeSH:D001321,MedGen:C0004352,OMIM:209850|MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'Autism|not provided|Inborn genetic diseases', 'Origin': 'germline;unknown', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721646, 'Stop': 50721646, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326151', 'SubmitterCategories': 2, 'VariationID': 437882, 'PositionVCF': 50721646, 'ReferenceAlleleVCF': 'C', 
'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001875419|SCV002620213|SCV004155335', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721646, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 28%|██▊ | 36418/129454 [00:34<01:19, 1168.23it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 486315, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.5043C>G (p.Pro1681=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jul 01, 2024', 'RS# (dbSNP)': 958460783, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV000585318', 'PhenotypeIDS': 'MedGen:C3661900', 'PhenotypeList': 'not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50730934, 'Stop': 50730934, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA325591023', 'SubmitterCategories': 2, 'VariationID': 493352, 'PositionVCF': 50730934, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'G', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000693096|SCV001801058', 'SCVsForAggregateSomaticClinicalImpact': 
'-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50730934, 'ref': 'C', 'alt': 'G', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 35%|███▍ | 45075/129454 [00:42<01:27, 967.44it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 580539, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2118C>T (p.Ile706=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Sep 01, 2024', 'RS# (dbSNP)': 182897668, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001683645|RCV002313699', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50704859, 'Stop': 50704859, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 3, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325682', 'SubmitterCategories': 2, 'VariationID': 588762, 'PositionVCF': 50704859, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000848960|SCV001903881|SCV004155306', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50704859, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580555, 'Type': 'single nucleotide variant', 'Name': 
'NM_001372044.2(SHANK3):c.2820G>A (p.Ala940=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Apr 01, 2024', 'RS# (dbSNP)': 758217731, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001545823|RCV002318678', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720428, 'Stop': 50720428, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 3, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325923', 'SubmitterCategories': 2, 'VariationID': 589256, 'PositionVCF': 50720428, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000849956|SCV001765227|SCV005041872', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720428, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580575, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3876C>T (p.Asn1292=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'May 01, 2024', 'RS# (dbSNP)': 371876840, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001644789|RCV002316080', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 
'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721484, 'Stop': 50721484, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326094', 'SubmitterCategories': 2, 'VariationID': 588161, 'PositionVCF': 50721484, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000847568|SCV001856550|SCV004155330|SCV005207863', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721484, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580581, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4671G>A (p.Gly1557=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Sep 20, 2021', 'RS# (dbSNP)': 191010623, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001571309|RCV002318819', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50722279, 'Stop': 50722279, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326286', 'SubmitterCategories': 2, 'VariationID': 589408, 'PositionVCF': 50722279, 
'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000850229|SCV001795752', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50722279, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580587, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.5172C>T (p.Pro1724=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jun 01, 2025', 'RS# (dbSNP)': 557669600, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002314485|RCV001531381', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50731063, 'Stop': 50731063, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 3, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326348', 'SubmitterCategories': 2, 'VariationID': 588394, 'PositionVCF': 50731063, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000848147|SCV001746453|SCV001831170', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50731063, 'ref': 'C', 
'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580670, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2268C>G (p.Pro756=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jun 01, 2025', 'RS# (dbSNP)': 61731160, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002314483|RCV001531378|RCV001701434', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900|MedGen:CN169374', 'PhenotypeList': 'Inborn genetic diseases|not provided|not specified', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50706085, 'Stop': 50706085, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 5, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325755', 'SubmitterCategories': 2, 'VariationID': 588392, 'PositionVCF': 50706085, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'G', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000848145|SCV001746449|SCV001889731', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50706085, 'ref': 'C', 'alt': 'G', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580674, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2667G>C (p.Pro889=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jun 01, 2016', 'RS# (dbSNP)': 1569114747, 'nsv/esv (dbVar)': '-', 'RCVaccession': 
'RCV002312412|RCV004704195', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720275, 'Stop': 50720275, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA515258776', 'SubmitterCategories': 2, 'VariationID': 587929, 'PositionVCF': 50720275, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'C', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000846762|SCV005207860', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720275, 'ref': 'G', 'alt': 'C', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580676, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2535G>A (p.Pro845=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Feb 01, 2025', 'RS# (dbSNP)': 117066889, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002318081|RCV001573322|RCV001701435', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900|MedGen:CN169374', 'PhenotypeList': 'Inborn genetic diseases|not provided|not specified', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50715713, 'Stop': 50715713, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, 
multiple submitters, no conflicts', 'NumberSubmitters': 6, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325884', 'SubmitterCategories': 2, 'VariationID': 589141, 'PositionVCF': 50715713, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000849750|SCV001871856|SCV002496764|SCV005277454', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50715713, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580677, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2676G>C (p.Pro892=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Oct 01, 2023', 'RS# (dbSNP)': 1173390690, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002312413|RCV003424305', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720284, 'Stop': 50720284, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA515258805', 'SubmitterCategories': 2, 'VariationID': 587930, 'PositionVCF': 50720284, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'C', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 
'SCVsForAggregateGermlineClassification': 'SCV000846764|SCV004155315', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720284, 'ref': 'G', 'alt': 'C', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580680, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2673G>C (p.Pro891=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Nov 01, 2022', 'RS# (dbSNP)': 1569114751, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002312411|RCV001566431', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720281, 'Stop': 50720281, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA515258795', 'SubmitterCategories': 2, 'VariationID': 587928, 'PositionVCF': 50720281, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'C', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000846759|SCV001789944|SCV004155314|SCV005207861', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720281, 'ref': 'G', 'alt': 'C', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580682, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3222C>T (p.Tyr1074=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 
'HGNC:14294', 'ClinicalSignificance': 'Benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Nov 13, 2019', 'RS# (dbSNP)': 144470529, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001572895|RCV001700296|RCV002312344', 'PhenotypeIDS': 'MedGen:C3661900|MedGen:CN169374|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|not specified|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720830, 'Stop': 50720830, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 5, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325943', 'SubmitterCategories': 2, 'VariationID': 587855, 'PositionVCF': 50720830, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000846468|SCV001895077|SCV005277457', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720830, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580688, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3372C>T (p.Pro1124=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Mar 04, 2021', 'RS# (dbSNP)': 200572899, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001577348|RCV002312432', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 
'NC_000022.11', 'Chromosome': '22', 'Start': 50720980, 'Stop': 50720980, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325963', 'SubmitterCategories': 2, 'VariationID': 587951, 'PositionVCF': 50720980, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000846864|SCV001804705', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720980, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580701, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3762G>A (p.Lys1254=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign', 'ClinSigSimple': 0, 'LastEvaluated': 'May 01, 2025', 'RS# (dbSNP)': 145196448, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001653983|RCV002312271', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721370, 'Stop': 50721370, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326059', 'SubmitterCategories': 2, 'VariationID': 587769, 'PositionVCF': 50721370, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 
'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000845947|SCV001868546|SCV002496765|SCV005277461', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721370, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580708, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4569C>T (p.His1523=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'May 03, 2020', 'RS# (dbSNP)': 368142005, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001655575|RCV002316168', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50722177, 'Stop': 50722177, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326260', 'SubmitterCategories': 2, 'VariationID': 588251, 'PositionVCF': 50722177, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000847795|SCV001868143', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50722177, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + 
"{'AlleleID': 580806, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2296C>T (p.Leu766=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Sep 04, 2020', 'RS# (dbSNP)': 201094179, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002312267|RCV001644784', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50706113, 'Stop': 50706113, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325759', 'SubmitterCategories': 2, 'VariationID': 587765, 'PositionVCF': 50706113, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000845934|SCV001857925', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50706113, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580808, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2436C>T (p.Ala812=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 01, 2024', 'RS# (dbSNP)': 61729465, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001655577|RCV002314482', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn 
genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50714993, 'Stop': 50714993, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325846', 'SubmitterCategories': 2, 'VariationID': 588391, 'PositionVCF': 50714993, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000848142|SCV001861741|SCV004155312|SCV005207858', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50714993, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580824, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3114G>C (p.Ala1038=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jun 01, 2024', 'RS# (dbSNP)': 772152761, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001644788|RCV002312783', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720722, 'Stop': 50720722, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 3, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325936', 
'SubmitterCategories': 2, 'VariationID': 588063, 'PositionVCF': 50720722, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'C', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000847249|SCV001858079|SCV002544750', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720722, 'ref': 'G', 'alt': 'C', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 580832, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3559C>T (p.Leu1187=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 19, 2021', 'RS# (dbSNP)': 376858991, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002312773|RCV001592915', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721167, 'Stop': 50721167, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 3, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326004', 'SubmitterCategories': 2, 'VariationID': 588053, 'PositionVCF': 50721167, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV000847222|SCV001823167|SCV005207862', 'SCVsForAggregateSomaticClinicalImpact': '-', 
'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721167, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 49%|████▊ | 63068/129454 [00:59<01:01, 1071.16it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 728536, 'Type': 'single nucleotide variant', 'Name': 'NM_001401501.2(MUC16):c.44076G>A (p.Glu14692=)', 'GeneID': 94025, 'GeneSymbol': 'MUC16', 'HGNC_ID': 'HGNC:15582', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 01, 2023', 'RS# (dbSNP)': 187392925, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV000891417|RCV003940689', 'PhenotypeIDS': 'MedGen:C3661900|', 'PhenotypeList': 'not provided|MUC16-related disorder', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000019.10', 'Chromosome': '19', 'Start': 8882863, 'Stop': 8882863, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '19p13.2', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA9162760', 'SubmitterCategories': 2, 'VariationID': 718464, 'PositionVCF': 8882863, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001035235|SCV004146564|SCV005309160', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr19', 'pos': 8882863, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001401501.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 60%|██████ | 78067/129454 [01:13<00:49, 1033.55it/s]" + ] 
+ }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1001214, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4083A>G (p.Pro1361=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 01, 2021', 'RS# (dbSNP)': 371543035, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001310466|RCV002366158', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721691, 'Stop': 50721691, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326158', 'SubmitterCategories': 2, 'VariationID': 1012485, 'PositionVCF': 50721691, 'ReferenceAlleleVCF': 'A', 'AlternateAlleleVCF': 'G', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001500271|SCV002623691', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721691, 'ref': 'A', 'alt': 'G', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 71%|███████▏ | 92335/129454 [01:27<00:38, 974.15it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1173520, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4233G>A (p.Pro1411=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 
'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'May 17, 2021', 'RS# (dbSNP)': 369083529, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001540436|RCV002377903', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721841, 'Stop': 50721841, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326194', 'SubmitterCategories': 2, 'VariationID': 1182740, 'PositionVCF': 50721841, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001758323|SCV002624534', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721841, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 72%|███████▏ | 93021/129454 [01:28<00:37, 974.89it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1196260, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4104C>T (p.Ser1368=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Feb 01, 2025', 'RS# (dbSNP)': 201793890, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001572736|RCV001701200|RCV002368594', 'PhenotypeIDS': 'MedGen:C3661900|MedGen:CN169374|MeSH:D030342,MedGen:C0950123', 
'PhenotypeList': 'not provided|not specified|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721712, 'Stop': 50721712, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 7, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326166', 'SubmitterCategories': 2, 'VariationID': 1205883, 'PositionVCF': 50721712, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001856828|SCV002624886|SCV004155337|SCV005277462', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721712, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 1199396, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3885G>A (p.Glu1295=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Dec 14, 2020', 'RS# (dbSNP)': 546313986, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001575039|RCV002458542', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721493, 'Stop': 50721493, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 
'OtherIDs': 'ClinGen:CA10326099', 'SubmitterCategories': 2, 'VariationID': 1207144, 'PositionVCF': 50721493, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001801950|SCV002617661', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721493, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 72%|███████▏ | 93616/129454 [01:28<00:36, 971.95it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1215858, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.1635C>T (p.Ala545=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 01, 2024', 'RS# (dbSNP)': 780922475, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001609046', 'PhenotypeIDS': 'MedGen:C3661900', 'PhenotypeList': 'not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50697627, 'Stop': 50697627, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 3, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325523', 'SubmitterCategories': 2, 'VariationID': 1227172, 'PositionVCF': 50697627, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 
'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001834698|SCV004698485|SCV005277433', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50697627, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 1217481, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2184T>C (p.Gly728=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 01, 2025', 'RS# (dbSNP)': 747708688, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001608387|RCV001821925|RCV002421229', 'PhenotypeIDS': 'MedGen:C3661900|MedGen:CN169374|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|not specified|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50705026, 'Stop': 50705026, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 5, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325717', 'SubmitterCategories': 2, 'VariationID': 1224929, 'PositionVCF': 50705026, 'ReferenceAlleleVCF': 'T', 'AlternateAlleleVCF': 'C', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001835133|SCV002068754|SCV002718228|SCV004155307|SCV005277443', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50705026, 'ref': 'T', 'alt': 'C', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 1221491, 'Type': 'single nucleotide variant', 
'Name': 'NM_001372044.2(SHANK3):c.4392C>T (p.Ser1464=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Aug 01, 2024', 'RS# (dbSNP)': 767710495, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001616570', 'PhenotypeIDS': 'MedGen:C3661900', 'PhenotypeList': 'not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50722000, 'Stop': 50722000, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326213', 'SubmitterCategories': 2, 'VariationID': 1228939, 'PositionVCF': 50722000, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001840328|SCV005330831', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50722000, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 73%|███████▎ | 93923/129454 [01:29<00:35, 1003.77it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1227669, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3705T>C (p.Ala1235=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Mar 03, 2021', 'RS# (dbSNP)': 576803553, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001638944|RCV002334637', 'PhenotypeIDS': 
'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721313, 'Stop': 50721313, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326045', 'SubmitterCategories': 2, 'VariationID': 1238541, 'PositionVCF': 50721313, 'ReferenceAlleleVCF': 'T', 'AlternateAlleleVCF': 'C', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001848243|SCV002618596', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721313, 'ref': 'T', 'alt': 'C', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 73%|███████▎ | 94337/129454 [01:29<00:34, 1020.26it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1244004, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2745G>A (p.Pro915=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Mar 01, 2022', 'RS# (dbSNP)': 1453397190, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001665211|RCV002425018', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720353, 'Stop': 50720353, 
'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 3, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA515259020', 'SubmitterCategories': 2, 'VariationID': 1254067, 'PositionVCF': 50720353, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001874947|SCV002742586|SCV004155319', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720353, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 1247765, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4170C>T (p.Asn1390=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Oct 22, 2020', 'RS# (dbSNP)': 558643743, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001666132|RCV002370258', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50721778, 'Stop': 50721778, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326178', 'SubmitterCategories': 2, 'VariationID': 1256982, 'PositionVCF': 50721778, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 
'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001883431|SCV002626073', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50721778, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 73%|███████▎ | 94640/129454 [01:29<00:36, 965.54it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1252960, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4410C>T (p.Thr1470=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Apr 01, 2024', 'RS# (dbSNP)': 376136109, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001671868|RCV002329704', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50722018, 'Stop': 50722018, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326218', 'SubmitterCategories': 2, 'VariationID': 1263051, 'PositionVCF': 50722018, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001887974|SCV002626993|SCV004011424|SCV005277463', 'SCVsForAggregateSomaticClinicalImpact': '-', 
'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50722018, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n", + "{'AlleleID': 1253191, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2799C>T (p.Gly933=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jul 01, 2023', 'RS# (dbSNP)': 907713706, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001669520', 'PhenotypeIDS': 'MedGen:C3661900', 'PhenotypeList': 'not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720407, 'Stop': 50720407, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA325578069', 'SubmitterCategories': 2, 'VariationID': 1260703, 'PositionVCF': 50720407, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001887411|SCV004155320', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720407, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 73%|███████▎ | 94955/129454 [01:30<00:33, 1021.17it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1263803, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.3144C>T (p.Ser1048=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 
'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Aug 21, 2021', 'RS# (dbSNP)': 760688077, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001682504|RCV002440835', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720752, 'Stop': 50720752, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325938', 'SubmitterCategories': 2, 'VariationID': 1275625, 'PositionVCF': 50720752, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001905306|SCV002751012', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720752, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 74%|███████▎ | 95467/129454 [01:30<00:34, 985.75it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1279830, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.5160C>T (p.Pro1720=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Benign/Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jul 30, 2024', 'RS# (dbSNP)': 751652089, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001714909|RCV001775182|RCV002343801', 'PhenotypeIDS': 
'MedGen:C3661900|MONDO:MONDO:0011652,MedGen:C1853490,OMIM:606232,Orphanet:48652|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Phelan-McDermid syndrome|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50731051, 'Stop': 50731051, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 4, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326347', 'SubmitterCategories': 2, 'VariationID': 1290001, 'PositionVCF': 50731051, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV001942614|SCV002011909|SCV002646862|SCV004155346', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50731051, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 74%|███████▍ | 96077/129454 [01:31<00:35, 952.85it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1319371, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2250C>T (p.Arg750=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Feb 11, 2021', 'RS# (dbSNP)': 188450024, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001797316|RCV002422854', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 
'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50705092, 'Stop': 50705092, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325737', 'SubmitterCategories': 2, 'VariationID': 1328684, 'PositionVCF': 50705092, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV002038764|SCV002718632', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50705092, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 75%|███████▍ | 96813/129454 [01:32<00:32, 1019.92it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1334709, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2022G>A (p.Thr674=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Sep 06, 2021', 'RS# (dbSNP)': 147941361, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV001843666|RCV002406904', 'PhenotypeIDS': 'MedGen:C3661900|MeSH:D030342,MedGen:C0950123', 'PhenotypeList': 'not provided|Inborn genetic diseases', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50704763, 'Stop': 50704763, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 
'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10325659', 'SubmitterCategories': 2, 'VariationID': 1343064, 'PositionVCF': 50704763, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV002102746|SCV002714984', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50704763, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 85%|████████▌ | 110267/129454 [01:44<00:19, 1003.82it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1798062, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.4656C>T (p.Pro1552=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Jan 01, 2025', 'RS# (dbSNP)': 750023626, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002333979|RCV004809819', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50722264, 'Stop': 50722264, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA10326281', 'SubmitterCategories': 2, 'VariationID': 1740560, 'PositionVCF': 50722264, 'ReferenceAlleleVCF': 'C', 'AlternateAlleleVCF': 'T', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 
'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV002628299|SCV005434732', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50722264, 'ref': 'C', 'alt': 'T', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 89%|████████▉ | 115017/129454 [01:49<00:13, 1093.56it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 1845299, 'Type': 'single nucleotide variant', 'Name': 'NM_001372044.2(SHANK3):c.2703G>A (p.Ala901=)', 'GeneID': 85358, 'GeneSymbol': 'SHANK3', 'HGNC_ID': 'HGNC:14294', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'May 01, 2025', 'RS# (dbSNP)': 925909458, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV002455586|RCV003427481', 'PhenotypeIDS': 'MeSH:D030342,MedGen:C0950123|MedGen:C3661900', 'PhenotypeList': 'Inborn genetic diseases|not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000022.11', 'Chromosome': '22', 'Start': 50720311, 'Stop': 50720311, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '22q13.33', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA325578045', 'SubmitterCategories': 2, 'VariationID': 1791866, 'PositionVCF': 50720311, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV002738318|SCV004155317', 'SCVsForAggregateSomaticClinicalImpact': '-', 
'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr22', 'pos': 50720311, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001372044.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 95%|█████████▍| 122824/129454 [01:57<00:06, 980.15it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AlleleID': 2815385, 'Type': 'single nucleotide variant', 'Name': 'NM_001401501.2(MUC16):c.44484C>T (p.Asn14828=)', 'GeneID': 94025, 'GeneSymbol': 'MUC16', 'HGNC_ID': 'HGNC:15582', 'ClinicalSignificance': 'Likely benign', 'ClinSigSimple': 0, 'LastEvaluated': 'Mar 01, 2022', 'RS# (dbSNP)': 372141764, 'nsv/esv (dbVar)': '-', 'RCVaccession': 'RCV003423296', 'PhenotypeIDS': 'MedGen:C3661900', 'PhenotypeList': 'not provided', 'Origin': 'germline', 'OriginSimple': 'germline', 'Assembly': 'GRCh38', 'ChromosomeAccession': 'NC_000019.10', 'Chromosome': '19', 'Start': 8871641, 'Stop': 8871641, 'ReferenceAllele': 'na', 'AlternateAllele': 'na', 'Cytogenetic': '19p13.2', 'ReviewStatus': 'criteria provided, multiple submitters, no conflicts', 'NumberSubmitters': 2, 'Guidelines': '-', 'TestedInGTR': 'N', 'OtherIDs': 'ClinGen:CA9162557', 'SubmitterCategories': 2, 'VariationID': 2649218, 'PositionVCF': 8871641, 'ReferenceAlleleVCF': 'G', 'AlternateAlleleVCF': 'A', 'SomaticClinicalImpact': '-', 'SomaticClinicalImpactLastEvaluated': '-', 'ReviewStatusClinicalImpact': '-', 'Oncogenicity': '-', 'OncogenicityLastEvaluated': '-', 'ReviewStatusOncogenicity': '-', 'SCVsForAggregateGermlineClassification': 'SCV004146562|SCV005208164', 'SCVsForAggregateSomaticClinicalImpact': '-', 'SCVsForAggregateOncogenicityClassification': '-', 'chrom': 'chr19', 'pos': 8871641, 'ref': 'G', 'alt': 'A', 'tx': 'NM_001401501.2', 'in_splice_junction': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 129454/129454 [02:03<00:00, 1048.95it/s]\n" + ] + }, + { + "data": { + "text/plain": [ + 
"0" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Process variants per chromosome and add additional features: pLI, PhyloP, codon frequencies\n", + "import re\n", + "\n", + "\n", + "result = []\n", + "\n", + "for row in tqdm(dset.rows(named=True)):\n", + " s = row[\"Name\"].split(\":\")[1].split(\" \")[0]\n", + " m = re.fullmatch(r\"c\\.(\\d+)([ACGT])>([ACGT])\", s)\n", + " pos_cds, ref_cds, alt_cds = int(m.group(1)), m.group(2), m.group(3)\n", + "\n", + " tx = refseq.filter((pl.col(\"name\") == row[\"tx\"]) & (pl.col(\"chrom\") == row[\"chrom\"]))[0]\n", + " try:\n", + " pos_cds0 = tx_gposes[(row[\"chrom\"], row[\"tx\"])].index(row[\"pos\"] - 1)\n", + " except:\n", + " continue\n", + " seq = tx[0, \"cds_sequence\"]\n", + " if pos_cds0 + 1 != pos_cds:\n", + " print(str(row))\n", + " assert seq[pos_cds0] == ref_cds\n", + " assert ref_cds == row[\"ref\"] if tx[0, \"strand\"] == \"+\" else reverse_complement_dna(row[\"ref\"])\n", + " assert alt_cds == row[\"alt\"] if tx[0, \"strand\"] == \"+\" else reverse_complement_dna(row[\"alt\"])\n", + "\n", + " codon_position = pos_cds0 // 3\n", + " ref_codon = seq[codon_position * 3 : (codon_position + 1) * 3]\n", + " remainder = pos_cds0 % 3\n", + " alt_nuc = list(ref_codon)\n", + " alt_nuc[remainder] = alt_cds\n", + " alt_codon = \"\".join(alt_nuc)\n", + " item = {\n", + " \"chrom\": row[\"chrom\"],\n", + " \"pos\": row[\"pos\"],\n", + " \"ref\": row[\"ref\"],\n", + " \"alt\": row[\"alt\"],\n", + " \"var_rel_dist_in_cds\": pos_cds0,\n", + " \"codon_position\": codon_position,\n", + " \"ref_codon\": ref_codon,\n", + " \"alt_codon\": alt_codon,\n", + " \"tx\": row[\"tx\"],\n", + " \"label\": row[\"ClinicalSignificance\"],\n", + " \"in_splice_junction\": row[\"in_splice_junction\"],\n", + " \"ref_seq\": seq,\n", + " \"alt_seq\": seq[:pos_cds0] + alt_cds + seq[pos_cds0 + 1 :],\n", + " }\n", + " result.append(item)\n", + "\n", + "\n", + "result_df = 
pl.from_dicts(result).with_row_index(\"id\")\n", + "frame = result_df.to_pandas()\n", + "(frame[\"ref_seq\"].apply(lambda x: len(x) == 0)).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "f7ecb7ae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Adding additional features (pLI, PhyloP, codon frequencies)...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 129384/129384 [00:02<00:00, 45206.96it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset with additional features: 129384 variants\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (2, 27)
idchromposrefaltvar_rel_dist_in_cdscodon_positionref_codonalt_codontxlabelin_splice_junctionref_seqalt_seqref_aaalt_aaref_codon_freqalt_codon_freqcodon_freq_ratiogene_nameplipli_binphylopphylop_bincds_lengthcds_offset_fraccds_offset_frac_bin
u64stri64strstri64i64strstrstrstrboolstrstrstrstrf64f64f64strf64i32f64i32u32f64i32
0"chr1"45015006"G""A"941313"GAG""GAA""NM_000374.5""Likely pathogenic"true"ATGGAAGCGAATGGGTTGGGACCTCAGGGT…"ATGGAAGCGAATGGGTTGGGACCTCAGGGT…"E""E"4.6414453e73.7827281e70.20458"UROD"0.007.998811040.8523558
1"chr10"124400865"G""A"1133377"AAC""AAT""NM_000274.4""Benign"false"ATGTTTTCCAAACTAGCACATTTGCAGAGG…"ATGTTTTCCAAACTAGCACATTTGCAGAGG…"N""N"2.0900468e72.0353876e70.0265"OAT"0.00-2.351-213200.8583338
" + ], + "text/plain": [ + "shape: (2, 27)\n", + "┌─────┬───────┬───────────┬─────┬───┬────────────┬────────────┬─────────────────┬──────────────────┐\n", + "│ id ┆ chrom ┆ pos ┆ ref ┆ … ┆ phylop_bin ┆ cds_length ┆ cds_offset_frac ┆ cds_offset_frac_ │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ bin │\n", + "│ u64 ┆ str ┆ i64 ┆ str ┆ ┆ i32 ┆ u32 ┆ f64 ┆ --- │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ i32 │\n", + "╞═════╪═══════╪═══════════╪═════╪═══╪════════════╪════════════╪═════════════════╪══════════════════╡\n", + "│ 0 ┆ chr1 ┆ 45015006 ┆ G ┆ … ┆ 8 ┆ 1104 ┆ 0.852355 ┆ 8 │\n", + "│ 1 ┆ chr10 ┆ 124400865 ┆ G ┆ … ┆ -2 ┆ 1320 ┆ 0.858333 ┆ 8 │\n", + "└─────┴───────┴───────────┴─────┴───┴────────────┴────────────┴─────────────────┴──────────────────┘" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Adding additional features (pLI, PhyloP, codon frequencies)...\")\n", + "dset = process_dset(result_df, refseq, remove_non_pli=False)\n", + "print(f\"Dataset with additional features: {dset.shape[0]} variants\")\n", + "dset.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "a6fe62c7", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAnYAAAHDCAYAAACpu1eiAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABTsUlEQVR4nO3deVwVZf//8TcgB3BBQAXEUFHLfUncMFNLBJU0y1zKysw0CyuzrGxRRLs1LbfcslJb9E7NtFIzSS1LyS3NNTNvva27wDsVcAWE6/eHvzO3R8AFUHC+r+fjwUPPzDUz1+fMnDPvMzNnjpsxxggAAAA3PPei7gAAAAAKB8EOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMHOplauXKlGjRrJ29tbbm5uSklJKeouIRdVq1bVI488UtTdwDV0pa/FuLg4ubm56e+//76+HSwkhb0tL1y4UAEBATp58mShzdNu3NzcFBcXV9TduKy2bduqXr16Rd2NG0qvXr3Uo0ePfE1bKMFuxowZ6t69uypXriw3N7c8X9zr1q1Tly5dFBoaKm9vbwUHB6tDhw5av359ru03bNigVq1aqWTJkgoODtbTTz+d40W+efNmDRo0SHXr1lWpUqVUuXJl9ejRQ7/++muO+bm5ueX51759+wI/D1drw4YNiouLK/TQdfToUfXo0UM+Pj6aNm2aPvroI5UqVSrXtidPntSIESPUoUMHBQQEyM3NTXPnzs3RLjs7W3PnzrXWX6lSpVSvXj2NHj1aZ8+ezdE+OTlZffv2VWBgoHx8fNS4cWMtWrTosn1v37693NzcNGjQoBzj8lp3Y8eOdWm3ZMkSRUdHKyQkRF5eXrrpppt03333adeuXZddvh3Nnz9fkyZNKupuFGvF4bWI/8nKytKIESP01FNPqXTp0kXalz///FNxcXHavn37NV/W9OnTc33/vRFcz+cpv652PyZJ77//vmrXri1vb2/dfPPNevvtty+7nEvtx6Tz+8fHH39clSpVkre3t6pWrap+/fq5tHnxxRe1ePFi/fzzz1ddZ4mrniIXb7zxhk6cOKFmzZrpr7/+yrPdr7/+Knd3dw0cOFDBwcE6fvy4Pv74Y7Vu3VrLly9Xhw4drLbbt29Xu3btVLt2bU2YMEF//PGH3nzzTe3fv19fffWVy7LXr1+v7t27q0GDBkpKStLUqVPVuHFj/fjjjy6fEj766KMcfdqyZYsmT56sqKiowngqrsqGDRs0cuRIPfLII/Lz8yu0+W7evFknTpzQqFGjFBkZecm2f//9t+Lj41W5cmU1bNhQ3377ba7tTp8+rb59+6pFixYaOHCgAgMDlZiYqBEjRmj16tVas2aN3NzcJElpaWlq1aqVkpOT9cwzzyg4OFgLFy5Ujx49NG/ePD3wwAO5LuOzzz5TYmLiJfvbvn17Pfzwwy7Dbr31VpfHO3fulL+/v5555hmVL19eSUlJmj17tpo1a6bExEQ1bNjwksu4nvbt2yd392t74Hz+/PnatWuXBg8efE2XcyMrDq9F/M+XX36pffv2acCAAUXdFf35558aOXKkqlatqkaNGl3TZU2fPl3ly5e/IY/iX8/nKb+uZj8mSe+8844GDhyobt26aciQIfr+++/19NNP6/Tp03rxxRdzXcbl9mO///67brvtNknSwIEDValSJf3555/atGmTS7tbb71VTZo00VtvvaUPP/zw6go1heDQoUMmOzvbGGNMqVKlTJ8+fa542lOnTpmgoCATHR3tMrxjx46mYsWKJjU11Rr27rvvGknm66+/toatX7/epKenu0z766+/Gi8vL9O7d+/LLr9fv37Gzc3N/P7771fc58Iyfvx4I8kcPHiwUOf7wQcfGElm8+bNl2179uxZ89dffxljjNm8ebORZObMmZOjXXp6ulm/fn2O4SNHjjSSTEJCgjVs3LhxRpJZvXq1NSwrK8s0bdrUBAc
H51hfxhhz5swZU7VqVRMfH28kmdjY2Bxt8hp+JZKSkkyJEiXM448/nq/pC1N2drY5ffr0dVteTEyMqVKlynVb3qVkZmbmuv6LWnF4LY4YMcJIMv/9738LtQ/XS5UqVa7qvf9SunTpYlq1alUo8yqoS70vFra6deuaNm3aXHF7SWbEiBHXrD9X41LPU5s2bUzdunWvf6cucjX7sdOnT5ty5cqZmJgYl7a9e/c2pUqVMseOHcsxnyvZj3Xs2NGEhYWZv//++7L9ffPNN02pUqXMiRMnrqQ8S6EEuwtdbbAzxph69eqZ5s2bW49TU1NNiRIlzNChQ13apaenm9KlS5t+/fpddp6NGzc2jRs3vmSbs2fPGj8/P9O2bdsr6ufJkyfNkCFDzE033WQcDoe55ZZbzPjx461Qa4wxBw8ezHPjvvBF6HwTv/jvcjuWhQsXmsaNGxtvb29Trlw507t3b/PHH39Y49u0aZNjnle6PvLzBrZjxw4jyUyZMsUa1rlzZ1OhQoUcbZ07z1WrVuUYN3LkSFO5cmVz+vTpywa706dPmzNnzlxxH405H6Z8fX1Nz549L9kuJibGhIWF5TquRYsWJjw83Ho8e/Zsc8cdd5gKFSoYh8NhateubaZPn55juipVqpiYmBizcuVKEx4ebry8vMzEiROtcReun6NHj5rnnnvO1KtXz5QqVcqUKVPGdOjQwWzfvt1lnmvXrjWSzIIFC8zo0aNNpUqVjJeXl7nzzjvN/v37rXa5bQ8XhrwpU6aYOnXqGB8fH+Pn52fCw8PNvHnzLvkcpaenm9dee800btzY+Pr6mpIlS5pWrVqZNWvWuLRzvhbGjx9vJk6caKpVq2bc3d3Ntm3bjDHG7N2713Tr1s34+/sbLy8vEx4ebj7//PNLLtvJjq9FZz/27t1runfvbsqUKWMCAgLM008/nWN7z8zMNPHx8aZatWrG4XCYKlWqmGHDhpmzZ8/mWeeFLt7u5syZYySZH374wTz77LOmfPnypmTJkqZr167myJEjLtNmZ2ebUaNGmUqVKhkfHx/Ttm1bs2vXrhzzzMjIMHFxcaZGjRrGy8vLBAQEmNtuuy3X1/+Fzpw5YxwOh4mLi8t1/EcffWSaNm1qbbO33367y4d9Y4yZNm2aqVOnjnE4HKZixYrmySefNMePH3dp4wwbu3fvNm3btjU+Pj4mJCTEvPHGG1Yb5+vs4r8Lt6kff/zRREdHG19fX+Pj42Nat25tfvjhB2v8nj17jLe3t3nooYdclv/9998bd3d388ILLxhjzq+Ti5dzuZCX2/r9448/TN++fU1gYKBxOBymTp065v3333dpc6XvH05Tp041YWFhxtvb2zRt2tSsW7fOtGnTxurf5Z6nK3mui1Ju+7Hly5cbSWb58uUubTds2GAkmY8++ijHfC63H9u7d6+RZO0nzpw5YzIyMvLs188//2wkmc8+++yq6imUU7FXKy0tTRkZGfr777/14YcfateuXXr55Zet8Tt37tS5c+fUpEkTl+kcDocaNWqkbdu2XXL+xhglJyerbt26l2y3YsUKpaSkqHfv3pftszFGXbp00dq1a9WvXz81atRIX3/9tYYOHar//Oc/mjhx4mXncaF7771Xv/76q/75z39q4sSJKl++vCSpQoUKeU4zd+5c9e3bV02bNtWYMWOUnJysyZMna/369dq2bZv8/Pz0yiuvqGbNmpo1a5bi4+MVFham6tWrX1XfrkZSUpIkWf2XpPT0dPn4+ORoW7JkSUnS1q1bXa5pPHz4sMaOHavZs2fnOt2F5s6dq+nTp8sYo9q1a+vVV1/N89RuSkqKMjMzlZSUpEmTJiktLU3t2rW75Px79uyphx9+WJs3b1bTpk2t4f/+97/1448/avz48dawGTNmqG7duurSpYtKlCihL7/8Uk8++aSys7MVGxvrMt99+/bp/vvv1+OPP67+/furZs2auS7/X//6l5YuXaru3bsrLCxMycnJeuedd9SmTRv
t2bNHISEhLu3Hjh0rd3d3Pf/880pNTdW4cePUu3dvbdy4UZL0yiuvKDU1VX/88Ye1jTqvWXr33Xf19NNP67777tMzzzyjs2fPaseOHdq4cWOez6l0/vX73nvv6f7771f//v114sQJvf/++4qOjtamTZtynIaZM2eOzp49qwEDBsjLy0sBAQHavXu3brvtNlWqVEkvvfSSSpUqpYULF6pr165avHix7rnnnjyXb/fXYo8ePVS1alWNGTNGP/74o6ZMmaLjx4+7nI557LHH9MEHH+i+++7Tc889p40bN2rMmDHau3evlixZclX1X+ipp56Sv7+/RowYoUOHDmnSpEkaNGiQFixYYLUZPny4Ro8erU6dOqlTp0766aefFBUVpYyMDJd5xcXFacyYMXrsscfUrFkzpaWlacuWLfrpp58ueU3z1q1blZGRocaNG+cYN3LkSMXFxally5aKj4+Xw+HQxo0btWbNGutymri4OI0cOVKRkZF64okntG/fPs2YMUObN2/W+vXr5enpac3v+PHj6tChg+6991716NFDn376qV588UXVr19fHTt2VO3atRUfH6/hw4drwIABuv322yVJLVu2lCStWbNGHTt2VHh4uEaMGCF3d3fNmTNHd955p77//ns1a9ZMtWvX1qhRozR06FDdd9996tKli06dOqVHHnlEtWrVUnx8vCRp0qRJ1jWFr7zyiiQpKCjoqtZfcnKyWrRoYV3fVaFCBX311Vfq16+f0tLSclyOcbn3D+n8+9ygQYN0++2369lnn9WhQ4fUtWtX+fv766abbpKkyz5PV/JcX0pqaqoyMzMvW7+3t3e+rsnMbT/mzBkX55Dw8HC5u7tr27ZtevDBB63hV7If++abbySdX6/t2rXTmjVr5OHhofbt22vGjBmqWrWqS/s6derIx8dH69evv+R7Yg5XFQOvwJUcsYuOjrYSvcPhMI8//rjLJ9JFixYZSWbdunU5pu3evbsJDg6+5Pw/+ugjIynHp5SLdevWzXh5eeX4JJebpUuXGklm9OjRLsPvu+8+4+bmZn777TdjzJUfJTDm6k7/ZGRkmMDAQFOvXj2X52rZsmVGkhk+fLg1zPnp+0pO/1woP0fsIiMjja+vr8tz+NRTTxl3d3dz6NAhl7a9evUyksygQYNcht93332mZcuW1mPlccSuZcuWZtKkSebzzz83M2bMMPXq1XP59HOxmjVrWttZ6dKlzauvvmqysrIuWU9qaqrx8vIyzz33nMvwcePGGTc3N/Pvf//bGpbb6dTo6GhTrVo1l2HOT+IrV67M0f7ioxxnz57N0ceDBw8aLy8vEx8fbw1zfkKuXbu2y6nNyZMnG0lm586d1rC8TsXefffd+To9cu7cuRynU48fP26CgoLMo48+6tJvScbX1zfHUZ927dqZ+vXruxxhys7ONi1btjQ333zzJZdv19ei84hdly5dXIY/+eSTRpL5+eefjTHGbN++3Ugyjz32mEu7559/3khyOXJ6cZ1OeR2xi4yMdDnq+eyzzxoPDw+TkpJijDHmyJEjxuFwmJiYGJd2L7/8co4jkg0bNsxxGutKvPfeezm2YWOM2b9/v3F3dzf33HNPjteIsy/O/kVFRbm0mTp1qpFkZs+ebQ1zHlH98MMPrWHp6ekmODjYdOvWzRqW1/tidna2ufnmm010dLTLc3H69GkTFhZm2rdvbw3LysoyrVq1MkFBQebvv/82sbGxpkSJEjm2i4Keiu3Xr5+pWLFijtN8vXr1MmXLlrXes670/SM9Pd2UK1fONG3a1GRmZlrt5s6dm+OI4uVOxV7Jc52X3I5+5/aX30sBctuPxcbGGg8Pj1zbV6hQwfTq1ctl2JXsx55++mkjyZQrV8506NDBLFiwwIwfP96ULl3aVK9e3Zw6dSrHsm655RbTsWPHq6qnSG53MnbsWK1atUrvv/++WrRooYyMDJ07d84af+bMGUmSl5dXjmm9vb2t8bn55ZdfFBsbq4iICPX
p0yfPdmlpaVq+fLk6dep0RRdLr1ixQh4eHnr66addhj/33HMyxrh8oeNa2LJli44cOaInn3xS3t7e1vCYmBjVqlVLy5cvv6bLz80//vEPffPNNxo7dqzLc/jYY4/Jw8NDPXr00IYNG3TgwAGNGTPGOpJw4fpbu3atFi9efEXf2ly/fr2eeeYZdenSRQMHDtTWrVtVr149vfzyy7luE3PmzNHKlSs1ffp01a5dW2fOnFFWVtYll+Hr66uOHTtq4cKFOv/aPG/BggVq0aKFKleubA278FNZamqq/v77b7Vp00b/+te/lJqa6jLfsLAwRUdHX7ZGLy8v68sUWVlZOnr0qEqXLq2aNWvqp59+ytG+b9++cjgc1mPnJ+V//etfl12Wn5+f/vjjD23evPmybS/k4eFhLTM7O1vHjh2zjrDn1sdu3bq5HP06duyY1qxZox49eujEiRP6+++/9ffff+vo0aOKjo7W/v379Z///CfP5dv9tXjx0d6nnnpK0vm6L/x3yJAhLu2ee+45SSrQ8gcMGOBy8fjtt9+urKws/fvf/5Z0/ohDRkaGnnrqKZd2uX0xx8/PT7t379b+/fuvqg9Hjx6VJPn7+7sMX7p0qbKzszV8+PAcXzhy9sXZv8GDB7u06d+/v3x9fXM8N6VLl3Y56uJwONSsWbMrev1s375d+/fv1wMPPKCjR49a2/GpU6fUrl07rVu3TtnZ2ZIkd3d3zZ07VydPnlTHjh01ffp0DRs2LMfRoIIwxmjx4sXq3LmzjDFWf/7++29FR0crNTU1x+vzcu8fW7Zs0dGjR9W/f3+VKPG/E3y9e/fOsX4upyDP9VtvvaWEhITL/r3wwgtX1Scp7/3YmTNnXJ6bC12cQ650P+a8q0dwcLCWL1+uHj166Pnnn9e7776rAwcOaP78+Tmm8ff3v+pbIBXJqdgLT9U8+OCDaty4sR555BF9+umnkv63w0xPT88x7dmzZ/M8zJmUlKSYmBiVLVtWn376qTw8PPLsw+LFi3X27NkrOg0rnT8VFxISojJlyrgMr127tjX+WnLOP7dTeLVq1dIPP/xwTZd/sQULFujVV19Vv3799MQTT7iMa9CggebPn6+BAwda3/4JDg7WpEmT9MQTT1iHys+dO6enn35aDz30kMtpzyvlcDg0aNAgK+S1atXKZXxERIT1/169elnr6s0337zkfHv27KmlS5cqMTFRLVu21IEDB7R169YcL9r169drxIgRSkxM1OnTp13GpaamqmzZstbjsLCwK6opOztbkydP1vTp03Xw4EGXIFquXLkc7S8MmtL/dobHjx+/7LJefPFFffPNN2rWrJlq1KihqKgoPfDAA9Y6u5QPPvhAb731ln755ReXUyS51XnxsN9++03GGL322mt67bXXcp3/kSNHVKlSpVzH2f21ePPNN7s8rl69utzd3XXo0CFr+e7u7qpRo4ZLu+DgYPn5+RWo/sttT855X9zHChUq5NjRx8fH6+6779Ytt9yievXqqUOHDnrooYfUoEGDK+rLhR+sJOnAgQNyd3dXnTp18pwmr3XjcDhUrVq1HM/NTTfd5BJQpfM179ix47L9cwbWSx1ASE1NtZ6X6tWrKy4uTkOHDlW9evXy3Pbz67///a9SUlI0a9YszZo1K9c2R44ccXl8pev74m2tRIkSOU4bXk5Bnuvw8PCrWtaVutR+zMfHJ8flBU4X5pCr2Y85p+nRo4fLB4/u3bvroYce0oYNG/TYY4+5TGOMyfG8XU6RBLsLORwOdenSRWPHjtWZM2fk4+OjihUrSlKut07566+/clxnJJ1/AXXs2FEpKSn6/vvvc21zoXnz5qls2bK66667CqeQ/y+vFXC5I0U3koSEBD388MOKiYnRzJkzc23jvJbk559/VlZWlho3bmzdSuWWW26RJH344Yfat2+f3nnnHWun5XTixAkdOnRIgYGB1rV5uQkNDZV0/ijQpfj7++vOO+/UvHnzLhvsOnfurJIlS2r
hwoVq2bKlFi5cKHd3d3Xv3t1qc+DAAbVr1061atXShAkTFBoaKofDoRUrVmjixInWJ3Wny1076PSPf/xDr732mh599FGNGjVKAQEBcnd31+DBg3PMU1KeH14u3inmpnbt2tq3b5+WLVumlStXavHixZo+fbqGDx+ukSNH5jndxx9/rEceeURdu3bV0KFDFRgYKA8PD40ZM0YHDhzI0f7i2p11PP/883kexbx4R5Ifdnkt5lXH1b7ZXyiv56Ag29PFWrdurQMHDujzzz/XqlWr9N5772nixImaOXNmjp3XhZwfYI4fP25dw3WtFKRe53Y8fvz4PG/vcfH1XqtWrZJ0/tYgR48eVXBw8FX09sr68+CDD+YZNi8O1YW5vi+nIMs6duxYniHrQj4+Pi4fqC/lcvuxihUrKisrS0eOHFFgYKA1PCMjQ0ePHrUyxtXsx5zTXHztpIeHh8qVK5frB/Ljx4/n+CB1OUUe7KTzhzyNMTpx4oR8fHxUr149lShRQlu2bHG583JGRoa2b9+e427MZ8+eVefOnfXrr7/qm2++ueQnOul8OFy7dq0eeeSRXE/35qZKlSr65ptvdOLECZcjBb/88os1XvrfJ56Lb3Sa26foq3ljds5/3759uvPOO13G7du3zxp/rW3cuFH33HOPmjRpooULF7ocnr+Yw+Fw+QTjvHDUeT+vw4cPKzMzM9cjRB9++KE+/PBDLVmyRF27ds1zGc7D+Je60N3pzJkzOU6R5qZUqVK66667tGjRIk2YMEELFizQ7bff7vJh4csvv1R6erq++OILl0+9a9euvez8L+XTTz/VHXfcoffff99leEpKisuFvVfjUttZqVKl1LNnT/Xs2VMZGRm699579frrr2vYsGEupxkv7mO1atX02Wefucx7xIgRV9SfatWqSZI8PT3zdW83u78W9+/f73KU87ffflN2drZ1hKRKlSrKzs7W/v37raOU0vkL51NSUlyW7+/vn6P+jIyMS95v9FKc896/f7+1HqXzR4ty2ykFBASob9++6tu3r06ePKnWrVsrLi7uksGuVq1akqSDBw+qfv361vDq1asrOztbe/bsyTNIXbhuLuxfRkaGDh48mK/tLa9tw/lFGF9f3yua78yZM5WQkKDXX39dY8aM0eOPP67PP//8ipZ1JSpUqKAyZcooKyur0O6Z6Hw+f/vtN91xxx3W8HPnzunQoUMuQbEgfb+ce++9V999991l2/Xp0+eKbvB8Jfsx5za2ZcsWderUyRq+ZcsWZWdnW+OvZj/mPPJ48aUmzi+TXrwfO3funH7//Xd16dLlsjVd6LpeY3fxYWDp/Jvu4sWLFRoaaqXismXLKjIyUh9//LFOnDhhtf3oo4908uRJlyMnWVlZ6tmzpxITE7Vo0SKX0295+eSTT5SdnX3Fp2ElqVOnTsrKytLUqVNdhk+cOFFubm7Wt3p8fX1Vvnx5rVu3zqXd9OnTc8zTeQf6K7nbfZMmTRQYGKiZM2e6nKL+6quvtHfvXsXExFxxLfnlXE7VqlW1bNmyKz4KJZ3fEcycOVN33XWXdcSuV69eWrJkSY4/6fzzvWTJEjVv3lzS+R3HxU6cOKFJkyapfPnyLofqc9vODh06pNWrV1/xNS09e/bUn3/+qffee08///yzevbs6TLe+enzwk+bqampmjNnzhXNPy8eHh45PsEuWrToktecXU6pUqVyDbTOa5mcHA6H6tSpI2PMJb+BllvtGzduvOzNpZ0CAwPVtm1bvfPOO7kGjNzW9YXs/lqcNm2ay2Pnne6ddTl3MhdfGjBhwgRJcll+9erVc9Q/a9asfB+1jIyMlKenp95++22X9Z/btUUXb1+lS5dWjRo1cr3E5kLh4eFyOBzasmWLy/CuXbvK3d1d8fHxOY5eO/sSGRkph8OhKVOmuPTv/fffV2pqar7WTV7bRnh4uKpXr64333wz1589u3A7PnjwoIYOHapu3brp5Zdf1ptvvqkvvvgix41nS5U
qle9fP/Hw8FC3bt20ePHiXH9l53Kvq9w0adJE5cqV07vvvutyHfy8efNyBPmreQ1drcK8xu5K92N33nmnAgICNGPGDJfhM2bMUMmSJa1t6Wr2Y23btlVgYKDmzZvn8ksXc+fOVVZWVo5vi+/Zs0dnz551+XbxlSiUI3Zffvml9bMXmZmZ2rFjh0aPHi1J6tKli5XqO3bsqJtuuknNmzdXYGCgDh8+rDlz5ujPP/90+Tq9JL3++utq2bKl2rRpowEDBuiPP/7QW2+9paioKJdfqHjuuef0xRdfqHPnzjp27Jg+/vhjl/lceLGm07x58xQSEqK2bdtecY2dO3fWHXfcoVdeeUWHDh1Sw4YNtWrVKn3++ecaPHiwy20MHnvsMY0dO1aPPfaYmjRponXr1uX6E2fOMPLKK6+oV69e8vT0VOfOnXP9ySFPT0+98cYb6tu3r9q0aaP777/fusVC1apV9eyzz15xLRebOnWqUlJS9Oeff0o6vz7/+OMPSecv3C5btqxOnDih6OhoHT9+XEOHDs1xEXL16tVdQnWdOnWsn5k7ePCgZsyYoYCAAJdD3rVq1bI+nV8sLCzM5UjdtGnTtHTpUnXu3FmVK1fWX3/9pdmzZ+vw4cP66KOPXC5yrV+/vtq1a6dGjRrJ399f+/fv1/vvv6/MzMwcPz+Wl06dOqlMmTJ6/vnnrTfMC0VFRcnhcKhz5856/PHHdfLkSb377rsKDAzM99EQSbrrrrsUHx+vvn37qmXLltq5c6fmzZvncvThaoWHh2vBggUaMmSImjZtqtKlS6tz586KiopScHCwbrvtNgUFBWnv3r2aOnWqYmJicly/dnEfP/vsM91zzz2KiYnRwYMHNXPmTNWpU+eKf9dz2rRpatWqlerXr6/+/furWrVqSk5OVmJiov74449L/oyOnV+L0vkQ0KVLF3Xo0EGJiYn6+OOP9cADD1i/mNKwYUP16dNHs2bNUkpKitq0aaNNmzbpgw8+UNeuXV2OrDz22GPWnfPbt2+vn3/+WV9//XW+j/5WqFBBzz//vMaMGaO77rpLnTp10rZt2/TVV1/lmGedOnXUtm1bhYeHKyAgQFu2bNGnn36a588sOXl7eysqKkrffPONdSsQ6fzp+VdeeUWjRo3S7bffrnvvvVdeXl7avHmzQkJCNGbMGFWoUEHDhg3TyJEj1aFDB3Xp0kX79u3T9OnT1bRp01z3B5dTvXp1+fn5aebMmSpTpoxKlSql5s2bKywsTO+99546duyounXrqm/fvqpUqZL+85//aO3atfL19dWXX34pY4weffRR+fj4WCHh8ccf1+LFi/XMM88oMjLSOhsQHh6uGTNmaPTo0apRo4YCAwNzHBW+lLFjx2rt2rVq3ry5+vfvrzp16ujYsWP66aef9M0331z2kpWLORwOxcXF6amnntKdd96pHj166NChQ5o7d66qV6/ucpTuUs9TQRXWNXZXsx/z8fHRqFGjFBsbq+7duys6Olrff/+9Pv74Y73++usKCAiQdHX7MS8vL40fP159+vRR69at9dBDD+nw4cOaPHmytU1fKCEhQSVLlrz6nzy9qu/Q5qFPnz55fv34wq8+T5061bRq1cqUL1/elChRwlSoUMF07tw519uaGHP+Bo4tW7Y03t7epkKFCiY2NtakpaW5tLnc16Av9ssvvxhJZsiQIVdd54kTJ8yzzz5rQkJCjKenp7n55ptz3BTVmPNfd+/Xr58pW7asKVOmjOnRo4c5cuRIrrcecN7o093d/Yput7BgwQJz6623Wjf8vPimqMZc/e1OcrsxpvPP2R/nrSPy+rv4a+a9evUyoaGhxuFwmJCQEDNw4ECTnJx8Rf1RLl8TX7VqlWnfvr0JDg42np6exs/Pz0RFRbn8uoXTiBEjTJMmTYy/v78pUaKECQkJMb169TI7duy4ouU79e7d27oFRG6++OIL06BBA+Pt7W2qVq1q3njjDTN79uwc69F5g+Lc5Ha7k+eee85
UrFjR+Pj4mNtuu80kJia63AzUmP/drmDRokUu88vtFh8nT540DzzwgPHz8zPS/25Q/M4775jWrVubcuXKGS8vL1O9enUzdOhQl197yU12drb5xz/+YapUqWK8vLzMrbfeapYtW2b69OnjcluVC29QnJsDBw6Yhx9+2FqnlSpVMnfddZf59NNPL7l8Y+z5WnTe7mTPnj3mvvvuM2XKlDH+/v5m0KBBud6geOTIkSYsLMx4enqa0NDQXG9QnJWVZV588UXrhsPR0dHmt99+y/N2Jxf307mdrV271mWeI0eOtLbRvG5QPHr0aNOsWTPj5+dnfHx8TK1atczrr79+yRuyOn322WfGzc3NHD58OMe42bNnW8+7v7+/adOmjcsvBhhzfl9Tq1Yt4+npaYKCgswTTzyR5w2KL3bxdmyMMZ9//rmpU6eOKVGiRI7X17Zt28y9995rvY6qVKlievToYb03OW8hsnjxYpd5Hj582Pj6+ppOnTpZw5KSkkxMTIwpU6ZMjtuJ5Ca37Tg5OdnExsaa0NBQ4+npaYKDg027du3MrFmzrDZX8/5hzPkbmTtf782aNTPr16834eHhpkOHDlf0PF3Nc30tXe1+zBhjZs2aZWrWrGkcDoepXr26mThxYo73mdzkth9z+uc//2kaNmxovLy8TFBQkBk0aFCObGOMMc2bNzcPPvjgVdfp9v87AABAsZCVlaU6deqoR48eGjVqVFF3BxfJzs5WhQoVdO+99+rdd98t6u7Y0vbt29W4cWP99NNPV/3bu0VyHzsAAPLi4eGh+Ph4TZs27YpP7+PaOHv2bI7rfj/88EMdO3bsqi5nwtUZO3as7rvvvqsOdZLEETsAAJCrb7/9Vs8++6y6d++ucuXK6aefftL777+v2rVra+vWrXnexBdFp1jc7gQAABQ/VatWVWhoqKZMmaJjx44pICBADz/8sMaOHUuoK6Y4YgcAAGATXGMHAABgEwQ7AAAAm+AaO5vLzs7Wn3/+qTJlylzTn3wBABRv5v//dGdISIjLj9DDXgh2Nvfnn38qNDS0qLsBACgmfv/9d910001F3Q1cIwQ7m3P+NNTvv/8uX1/fIu5N/mRmZmrVqlWKioqSp6dnUXenwOxUj51qkainuKOegklLS1NoaOglfzIQNz6Cnc05T7/6+vre0MGuZMmS8vX1tc2buV3qsVMtEvUUd9RTOLgsx944yQ4AAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyiRFF3ADeOqi8tL5LlenkYjWsm1Yv7WulZbpKkQ2NjiqQvAAAUZxyxAwAAsAmCXS7WrVunzp07KyQkRG5ublq6dKnLeGOMhg8frooVK8rHx0eRkZHav3+/S5tjx46pd+/e8vX1lZ+fn/r166eTJ0+6tNmxY4duv/12eXt7KzQ0VOPGjcvRl0WLFqlWrVry9vZW/fr1tWLFikKvFwAA2APBLhenTp1Sw4YNNW3atFzHjxs3TlOmTNHMmTO1ceNGlSpVStHR0Tp79qzVpnfv3tq9e7cSEhK0bNkyrVu3TgMGDLDGp6WlKSoqSlWqVNHWrVs1fvx4xcXFadasWVabDRs26P7771e/fv20bds2de3aVV27dtWuXbuuXfEAAOCGxTV2uejYsaM6duyY6zhjjCZNmqRXX31Vd999tyTpww8/VFBQkJYuXapevXpp7969WrlypTZv3qwmTZpIkt5++2116tRJb775pkJCQjRv3jxlZGRo9uzZcjgcqlu3rrZv364JEyZYAXDy5Mnq0KGDhg4dKkkaNWqUEhISNHXqVM2cOfM6PBMAAOBGQrC7SgcPHlRSUpIiIyOtYWXLllXz5s2VmJioXr16KTExUX5+flaok6TIyEi5u7tr48aNuueee5SYmKjWrVvL4XBYbaK
jo/XGG2/o+PHj8vf3V2JiooYMGeKy/Ojo6Bynhi+Unp6u9PR063FaWpokKTMzU5mZmQWq3cvDFGj6fC/X3bj8K6nAtRQlZ99v5Bqc7FSLRD3FHfUUzvJgbwS7q5SUlCRJCgoKchkeFBRkjUtKSlJgYKDL+BIlSiggIMClTVhYWI55OMf5+/srKSnpksvJzZgxYzRy5Mgcw1etWqWSJUteSYl5GtesQJMX2Kgm2db/7XCtYUJCQlF3odDYqRaJeoo76smf06dPX5floGgR7Gxm2LBhLkf50tLSFBoaqqioKPn6+hZo3vXivi5o9/LFy91oVJNsvbbFXenZ5293sisuukj6UhgyMzOVkJCg9u3by9PTs6i7UyB2qkWinuKOegrGeQYH9kawu0rBwcGSpOTkZFWsWNEanpycrEaNGlltjhw54jLduXPndOzYMWv64OBgJScnu7RxPr5cG+f43Hh5ecnLyyvHcE9PzwK/cTjvIVdU0rPdrD7Y4U29MNZJcWGnWiTqKe6oJ//Lgf3xrdirFBYWpuDgYK1evdoalpaWpo0bNyoiIkKSFBERoZSUFG3dutVqs2bNGmVnZ6t58+ZWm3Xr1rlc85CQkKCaNWvK39/fanPhcpxtnMsBAAC4EMEuFydPntT27du1fft2See/MLF9+3YdPnxYbm5uGjx4sEaPHq0vvvhCO3fu1MMPP6yQkBB17dpVklS7dm116NBB/fv316ZNm7R+/XoNGjRIvXr1UkhIiCTpgQcekMPhUL9+/bR7924tWLBAkydPdjmN+swzz2jlypV666239MsvvyguLk5btmzRoEGDrvdTAgAAbgCcis3Fli1bdMcdd1iPnWGrT58+mjt3rl544QWdOnVKAwYMUEpKilq1aqWVK1fK29vbmmbevHkaNGiQ2rVrJ3d3d3Xr1k1TpkyxxpctW1arVq1SbGyswsPDVb58eQ0fPtzlXnctW7bU/Pnz9eqrr+rll1/WzTffrKVLl6pevXrX4VkAAAA3GoJdLtq2bStj8r61h5ubm+Lj4xUfH59nm4CAAM2fP/+Sy2nQoIG+//77S7bp3r27unfvfukOAwAAiFOxAAAAtkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBLt8ysrK0muvvaawsDD5+PioevXqGjVqlIwxVhtjjIYPH66KFSvKx8dHkZGR2r9/v8t8jh07pt69e8vX11d+fn7q16+fTp486dJmx44duv322+Xt7a3Q0FCNGzfuutQIAABuLAS7fHrjjTc0Y8YMTZ06VXv37tUbb7yhcePG6e2337bajBs3TlOmTNHMmTO1ceNGlSpVStHR0Tp79qzVpnfv3tq9e7cSEhK0bNkyrVu3TgMGDLDGp6WlKSoqSlWqVNHWrVs1fvx4xcXFadasWde1XgAAUPyVKOoO3Kg2bNigu+++WzExMZKkqlWr6p///Kc2bdok6fzRukmTJunVV1/V3XffLUn68MMPFRQUpKVLl6pXr17au3evVq5cqc2bN6tJkya
SpLfffludOnXSm2++qZCQEM2bN08ZGRmaPXu2HA6H6tatq+3bt2vChAkuARAAAIBgl08tW7bUrFmz9Ouvv+qWW27Rzz//rB9++EETJkyQJB08eFBJSUmKjIy0pilbtqyaN2+uxMRE9erVS4mJifLz87NCnSRFRkbK3d1dGzdu1D333KPExES1bt1aDofDahMdHa033nhDx48fl7+/v0u/0tPTlZ6ebj1OS0uTJGVmZiozM7NANXt5mMs3uga83I3Lv5IKXEtRcvb9Rq7ByU61SNRT3FFP4SwP9kawy6eXXnpJaWlpqlWrljw8PJSVlaXXX39dvXv3liQlJSVJkoKCglymCwoKssYlJSUpMDDQZXyJEiUUEBDg0iYsLCzHPJzjLg52Y8aM0ciRI3P0d9WqVSpZsmR+y5UkjWtWoMkLbFSTbOv/K1asKMKeFI6EhISi7kKhsVMtEvUUd9STP6dPn74uy0HRItjl08KFCzVv3jzNnz/fOj06ePBghYSEqE+fPkXWr2HDhmnIkCHW47S0NIWGhioqKkq+vr4Fmne9uK8L2r188XI3GtUkW69tcVd6tpskaVdcdJH0pTBkZmYqISFB7du3l6enZ1F3p0DsVItEPcUd9RSM8wwO7I1gl09Dhw7VSy+9pF69ekmS6tevr3//+98aM2aM+vTpo+DgYElScnKyKlasaE2XnJysRo0aSZKCg4N15MgRl/meO3dOx44ds6YPDg5WcnKySxvnY2ebC3l5ecnLyyvHcE9PzwK/caRnuRVo+oJKz3az+mCHN/XCWCfFhZ1qkainuKOe/C8H9se3YvPp9OnTcnd3ffo8PDyUnX3+dGFYWJiCg4O1evVqa3xaWpo2btyoiIgISVJERIRSUlK0detWq82aNWuUnZ2t5s2bW23WrVvncm1EQkKCatasmeM0LAAA+L+NYJdPnTt31uuvv67ly5fr0KFDWrJkiSZMmKB77rlHkuTm5qbBgwdr9OjR+uKLL7Rz5049/PDDCgkJUdeuXSVJtWvXVocOHdS/f39t2rRJ69ev16BBg9SrVy+FhIRIkh544AE5HA7169dPu3fv1oIFCzR58mSX060AAAASp2Lz7e2339Zrr72mJ598UkeOHFFISIgef/xxDR8+3Grzwgsv6NSpUxowYIBSUlLUqlUrrVy5Ut7e3labefPmadCgQWrXrp3c3d3VrVs3TZkyxRpftmxZrVq1SrGxsQoPD1f58uU1fPhwbnUCAAByINjlU5kyZTRp0iRNmjQpzzZubm6Kj49XfHx8nm0CAgI0f/78Sy6rQYMG+v777/PbVQAA8H8Ep2IBAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgXwn//8Rw8++KDKlSsnHx8f1a9fX1u2bLHGG2M0fPhwVaxYUT4+PoqMjNT+/ftd5nHs2DH17t1bvr6+8vPzU79+/XTy5EmXNjt27NDtt98ub29vhYaGaty4cdelPgAAcGMh2OXT8ePHddttt8nT01NfffWV9uz
Zo7feekv+/v5Wm3HjxmnKlCmaOXOmNm7cqFKlSik6Olpnz5612vTu3Vu7d+9WQkKCli1bpnXr1mnAgAHW+LS0NEVFRalKlSraunWrxo8fr7i4OM2aNeu61gsAAIq/EkXdgRvVG2+8odDQUM2ZM8caFhYWZv3fGKNJkybp1Vdf1d133y1J+vDDDxUUFKSlS5eqV69e2rt3r1auXKnNmzerSZMmkqS3335bnTp10ptvvqmQkBDNmzdPGRkZmj17thwOh+rWravt27drwoQJLgEQAACAYJdPX3zxhaKjo9W9e3d99913qlSpkp588kn1799fknTw4EElJSUpMjLSmqZs2bJq3ry5EhMT1atXLyUmJsrPz88KdZIUGRkpd3d3bdy4Uffcc48SExPVunVrORwOq010dLTeeOMNHT9+3OUIoSSlp6crPT3depyWliZJyszMVGZmZoFq9vIwBZo+38t1Ny7/SipwLUXJ2fcbuQYnO9UiUU9xRz2FszzYG8Eun/71r39pxowZGjJkiF5++WVt3rxZTz/9tBwOh/r06aOkpCRJUlBQkMt0QUFB1rikpCQFBga6jC9RooQCAgJc2lx4JPDCeSYlJeUIdmPGjNHIkSNz9HfVqlUqWbJkASqWxjUr0OQFNqpJtvX/FStWFGFPCkdCQkJRd6HQ2KkWiXqKO+rJn9OnT1+X5aBoEezyKTs7W02aNNE//vEPSdKtt96qXbt2aebMmerTp0+R9WvYsGEaMmSI9TgtLU2hoaGKioqSr69vgeZdL+7rgnYvX7zcjUY1ydZrW9yVnu0mSdoVF10kfSkMmZmZSkhIUPv27eXp6VnU3SkQO9UiUU9xRz0F4zyDA3sj2OVTxYoVVadOHZdhtWvX1uLFiyVJwcHBkqTk5GRVrFjRapOcnKxGjRpZbY4cOeIyj3PnzunYsWPW9MHBwUpOTnZp43zsbHMhLy8veXl55Rju6elZ4DeO9Cy3Ak1fUOnZblYf7PCmXhjrpLiwUy0S9RR31JP/5cD++FZsPt12223at2+fy7Bff/1VVapUkXT+ixTBwcFavXq1NT4tLU0bN25URESEJCkiIkIpKSnaunWr1WbNmjXKzs5W8+bNrTbr1q1zuTYiISFBNWvWzHEaFgAA/N9GsMunZ599Vj/++KP+8Y9/6LffftP8+fM1a9YsxcbGSpLc3Nw0ePBgjR49Wl988YV27typhx9+WCEhIeratauk80f4OnTooP79+2vTpk1av369Bg0apF69eikkJESS9MADD8jhcKhfv37avXu3FixYoMmTJ7ucbgUAAJA4FZtvTZs21ZIlSzRs2DDFx8crLCxMkyZNUu/eva02L7zwgk6dOqUBAwYoJSVFrVq10sqVK+Xt7W21mTdvngYNGqR27drJ3d1d3bp105QpU6zxZcuW1apVqxQbG6vw8HCVL19ew4cP51YnAAAgB4JdAdx1112666678hzv5uam+Ph4xcfH59kmICBA8+fPv+RyGjRooO+//z7f/QQAAP83cCoWAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBME
OAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYFcIxo4dKzc3Nw0ePNgadvbsWcXGxqpcuXIqXbq0unXrpuTkZJfpDh8+rJiYGJUsWVKBgYEaOnSozp0759Lm22+/VePGjeXl5aUaNWpo7ty516EiAABwIyLYFdDmzZv1zjvvqEGDBi7Dn332WX355ZdatGiRvvvuO/3555+69957rfFZWVmKiYlRRkaGNmzYoA8++EBz587V8OHDrTYHDx5UTEyM7rjjDm3fvl2DBw/WY489pq+//vq61QcAAG4cBLsCOHnypHr37q13331X/v7+1vDU1FS9//77mjBhgu68806Fh4drzpw52rBhg3788UdJ0qpVq7Rnzx59/PHHatSokTp27KhRo0Zp2rRpysjIkCTNnDlTYWFheuutt1S7dm0NGjRI9913nyZOnFgk9QIAgOKtRFF34EYWGxurmJgYRUZGavTo0dbwrVu3KjMzU5GRkdawWrVqqXLlykpMTFSLFi2UmJio+vXrKygoyGoTHR2tJ554Qrt379att96qxMREl3k421x4yvdi6enpSk9Ptx6npaVJkjIzM5WZmVmger08TIGmz/dy3Y3Lv5IKXEtRcvb9Rq7ByU61SNRT3FFP4SwP9kawy6dPPvlEP/30kzZv3pxjXFJSkhwOh/z8/FyGBwUFKSkpyWpzYahzjneOu1SbtLQ0nTlzRj4+PjmWPWbMGI0cOTLH8FWrVqlkyZJXXmAuxjUr0OQFNqpJtvX/FStWFGFPCkdCQkJRd6HQ2KkWiXqKO+rJn9OnT1+X5aBoEezy4ffff9czzzyjhIQEeXt7F3V3XAwbNkxDhgyxHqelpSk0NFRRUVHy9fUt0LzrxRXNtX1e7kajmmTrtS3uSs92kyTtiosukr4UhszMTCUkJKh9+/by9PQs6u4UiJ1qkainuKOegnGewYG9EezyYevWrTpy5IgaN25sDcvKytK6des0depUff3118rIyFBKSorLUbvk5GQFBwdLkoKDg7Vp0yaX+Tq/NXthm4u/SZucnCxfX99cj9ZJkpeXl7y8vHIM9/T0LPAbR3qWW4GmL6j0bDerD3Z4Uy+MdVJc2KkWiXqKO+rJ/3Jgf3x5Ih/atWunnTt3avv27dZfkyZN1Lt3b+v/np6eWr16tTXNvn37dPjwYUVEREiSIiIitHPnTh05csRqk5CQIF9fX9WpU8dqc+E8nG2c8wAAALgQR+zyoUyZMqpXr57LsFKlSqlcuXLW8H79+mnIkCEKCAiQr6+vnnrqKUVERKhFixaSpKioKNWpU0cPPfSQxo0bp6SkJL366quKjY21jrgNHDhQU6dO1QsvvKBHH31Ua9as0cKFC7V8+fLrWzAAALghEOyukYkTJ8rd3V3dunVTenq6oqOjNX36dGu8h4eHli1bpieeeEIREREqVaqU+vTpo/j4eKtNWFiYli9frmeffVaTJ0/WTTfdpPfee0/R0Tfu9WUAAODaIdgVkm+//dblsbe3t6ZNm6Zp06blOU2VKlUu++3Otm3batu2bYXRRQAAYHNcYwcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAAD
AJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwy6cxY8aoadOmKlOmjAIDA9W1a1ft27fPpc3Zs2cVGxurcuXKqXTp0urWrZuSk5Nd2hw+fFgxMTEqWbKkAgMDNXToUJ07d86lzbfffqvGjRvLy8tLNWrU0Ny5c691eQAA4AZEsMun7777TrGxsfrxxx+VkJCgzMxMRUVF6dSpU1abZ599Vl9++aUWLVqk7777Tn/++afuvfdea3xWVpZiYmKUkZGhDRs26IMPPtDcuXM1fPhwq83BgwcVExOjO+64Q9u3b9fgwYP12GOP6euvv76u9QIAgOKvRFF34Ea1cuVKl8dz585VYGCgtm7dqtatWys1NVXvv/++5s+frzvvvFOSNGfOHNWuXVs//vijWrRooVWrVmnPnj365ptvFBQUpEaNGmnUqFF68cUXFRcXJ4fDoZkzZyosLExvvfWWJKl27dr64YcfNHHiREVHR1/3ugEAQPFFsCskqampkqSAgABJ0tatW5WZmanIyEirTa1atVS5cmUlJiaqRYsWSkxMVP369RUUFGS1iY6O1hNPPKHdu3fr1ltvVWJioss8nG0GDx6caz/S09OVnp5uPU5LS5MkZWZmKjMzs0A1enmYAk2f7+W6G5d/JRW4lqLk7PuNXIOTnWqRqKe4o57CWR7sjWBXCLKzszV48GDddtttqlevniQpKSlJDodDfn5+Lm2DgoKUlJRktbkw1DnHO8ddqk1aWprOnDkjHx8fl3FjxozRyJEjc/Rx1apVKlmyZP6LlDSuWYEmL7BRTbKt/69YsaIIe1I4EhISiroLhcZOtUjUU9xRT/6cPn36uiwHRYtgVwhiY2O1a9cu/fDDD0XdFQ0bNkxDhgyxHqelpSk0NFRRUVHy9fUt0LzrxRXNdX1e7kajmmTrtS3uSs92kyTtirtxT0NnZmYqISFB7du3l6enZ1F3p0DsVItEPcUd9RSM8wwO7I1gV0CDBg3SsmXLtG7dOt10003W8ODgYGVkZCglJcXlqF1ycrKCg4OtNps2bXKZn/Nbsxe2ufibtMnJyfL19c1xtE6SvLy85OXllWO4p6dngd840rPcCjR9QaVnu1l9sMObemGsk+LCTrVI1FPcUU/+lwP741ux+WSM0aBBg7RkyRKtWbNGYWFhLuPDw8Pl6emp1atXW8P27dunw4cPKyIiQpIUERGhnTt36siRI1abhIQE+fr6qk6dOlabC+fhbOOcBwAAgBNH7PIpNjZW8+fP1+eff64yZcpY18SVLVtWPj4+Klu2rPr166chQ4YoICBAvr6+euqppxQREaEWLVpIkqKiolSnTh099NBDGjdunJKSkvTqq68qNjbWOuo2cOBATZ06VS+88IIeffRRrVmzRgsXLtTy5cuLrHYAAFA8ccQun2bMmKHU1FS1bdtWFStWtP4WLFhgtZk4caLuuusudevWTa1bt1ZwcLA+++wza7yHh4eWLVsmDw8PRURE6MEHH9TDDz+s+Ph4q01YWJiWL1+uhIQENWzYUG+99Zbee+89bnUCAABy4IhdPhlz+Vt/eHt7a9q0aZo2bVqebapUqXLZb3i2bdtW27Ztu+o+AgCA/1s4YgcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGA
TBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbKFHUHQDyo+pLy4u6C5ZDY2OKugsAAEjiiB0AAIBtcMQOAFBsXOpovJeH0bhmUr24r5We5XbN+8LReNyICHYA/k+4ktP31zs4XGt2qwfA5RHsAFwzxelaSOBqXevt92qCN0cPcaW4xg4AAMAmOGIH2Mz1PErGqT4AKF44YgcAAGATBDsAAACbINgBAADYBNfYAQV0tde0cV0aAOBa4YjdDWLatGmqWrWqvL291bx5c23atKmouwQAAIoZgt0NYMGCBRoyZIhGjBihn376SQ0bNlR0dLSOHDlS1F0DAADFCMHuBjBhwgT1799fffv2VZ06dTRz5kyVLFlSs2fPLuquAQCAYoRr7Iq5jIwMbd26VcOGDbOGubu7KzIyUomJiTnap6enKz093XqcmpoqSTp27JgyMzML1JcS504VaPp8Lzfb6PTpbJXIdFdW9o1/TZqd6rFTLRL1FHf/l+s5evRogZd34sQJSZIxpsDzQvFFsCvm/v77b2VlZSkoKMhleFBQkH755Zcc7ceMGaORI0fmGB4WFnbN+ng9PFDUHShkdqrHTrVI1FPc/V+tp/xbhbfMEydOqGzZsoU3QxQrBDubGTZsmIYMGWI9zs7O1rFjx1SuXDm5ud2Yn3DT0tIUGhqq33//Xb6+vkXdnQKzUz12qkWinuKOegrGGKMTJ04oJCTkmi8LRYdgV8yVL19eHh4eSk5OdhmenJys4ODgHO29vLzk5eXlMszPz+9advG68fX1tcWbuZOd6rFTLRL1FHfUk38cqbM/vjxRzDkcDoWHh2v16tXWsOzsbK1evVoRERFF2DMAAFDccMTuBjBkyBD16dNHTZo0UbNmzTRp0iSdOnVKffv2LequAQCAYoRgdwPo2bOn/vvf/2r48OFKSkpSo0aNtHLlyhxfqLArLy8vjRgxIscp5huVneqxUy0S9RR31ANcnpvhe88AAAC2wDV2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYIdrLi4uTm5ubi5/tWrVssafPXtWsbGxKleunEqXLq1u3brluCHz4cOHFRMTo5IlSyowMFBDhw7VuXPnXNp8++23aty4sby8vFSjRg3NnTu3UPq/bt06de7cWSEhIXJzc9PSpUtdxhtjNHz4cFWsWFE+Pj6KjIzU/v37XdocO3ZMvXv3lq+vr/z8/NSvXz+dPHnSpc2OHTt0++23y9vbW6GhoRo3blyOvixatEi1atWSt7e36tevrxUrVhR6PY888kiO9dWhQ4diWc+YMWPUtGlTlSlTRoGBgeratav27dvn0uZ6bl/Tpk1T1apV5e3trebNm2vTpk2FXk/btm1zrJ+BAwcWy3pmzJihBg0aWDfgjYiI0FdffWWNv5HWzZXUcyOtG9iYAa6xESNGmLp165q//vrL+vvvf/9rjR84cKAJDQ01q1evNlu2bDEtWrQwLVu2tMafO3fO1KtXz0RGRppt27aZFStWmPLly5thw4ZZbf71r3+ZkiVLmiFDhpg9e/aYt99+23h4eJiVK1cWuP8rVqwwr7zyivnss8+MJLNkyRKX8WPHjjVly5Y1S5cuNT///LPp0qWLCQsLM2fOnLHadOjQwTRs2ND8+OOP5vvvvzc1atQw999/vzU+NTXVBAUFmd69e5tdu3aZf/7zn8bHx8e88847Vpv169cbDw8PM27cOLNnzx7z6quvGk9PT7Nz585CradPnz6mQ4cOLuvr2LFjLm2KSz3R0dFmzpw5ZteuXWb79u2mU6dOpnLlyubkyZNWm+u1fX3yySfG4XCY2bNnm927d5v+/fsbPz8/k5ycXKj1tGnTxvTv399l/aSmphbLer744guzfPly8+uvv5p9+/aZl19+2Xh6eppdu3YZY26sdXMl9dxI6wb2RbDDNTdixAjTsGHDXMelpKQYT09Ps2jRImv
Y3r17jSSTmJhojDkfRNzd3U1SUpLVZsaMGcbX19ekp6cbY4x54YUXTN26dV3m3bNnTxMdHV2otVwchLKzs01wcLAZP368S01eXl7mn//8pzHGmD179hhJZvPmzVabr776yri5uZn//Oc/xhhjpk+fbvz9/a16jDHmxRdfNDVr1rQe9+jRw8TExLj0p3nz5ubxxx8vtHqMOR/s7r777jynKc71HDlyxEgy3333nTHm+m5fzZo1M7GxsdbjrKwsExISYsaMGVNo9RhzPjw888wzeU5TnOsxxhh/f3/z3nvv3fDr5uJ6jLnx1w3sgVOxuC7279+vkJAQVatWTb1799bhw4clSVu3blVmZqYiIyOttrVq1VLlypWVmJgoSUpMTFT9+vVdbsgcHR2ttLQ07d6922pz4TycbZzzuFYOHjyopKQkl2WXLVtWzZs3d+m/n5+fmjRpYrWJjIyUu7u7Nm7caLVp3bq1HA6HS//37dun48ePW22uV43ffvutAgMDVbNmTT3xxBM6evSoNa4415OamipJCggIkHT9tq+MjAxt3brVpY27u7siIyMLtR6nefPmqXz58qpXr56GDRum06dPW+OKaz1ZWVn65JNPdOrUKUVERNzw6+biepxuxHUDe+GXJ3DNNW/eXHPnzlXNmjX1119/aeTIkbr99tu1a9cuJSUlyeFwyM/Pz2WaoKAgJSUlSZKSkpJy/MqG8/Hl2qSlpenMmTPy8fG5JrU5l5/bsi/sW2BgoMv4EiVKKCAgwKVNWFhYjnk4x/n7++dZo3MehaVDhw669957FRYWpgMHDujll19Wx44dlZiYKA8Pj2JbT3Z2tgYPHqzbbrtN9erVs5Z1Pbav48ePKysrK9c2v/zyS6HVI0kPPPCAqlSpopCQEO3YsUMvvvii9u3bp88++6xY1rNz505FRETo7NmzKl26tJYsWaI6depo+/btN+S6yase6cZbN7Angh2uuY4dO1r/b9CggZo3b64qVapo4cKF1yxwIf969epl/b9+/fpq0KCBqlevrm+//Vbt2rUrwp5dWmxsrHbt2qUffvihqLtSKPKqZ8CAAdb/69evr4oVK6pdu3Y6cOCAqlevfr27eVk1a9bU9u3blZqaqk8//VR9+vTRd999V9Tdyre86qlTp84Nt25gT5yKxXXn5+enW265Rb/99puCg4OVkZGhlJQUlzbJyckKDg6WJAUHB+f4ppzz8eXa+Pr6XtPw6Fx+bsu+sG9HjhxxGX/u3DkdO3asUGp0jr9WqlWrpvLly+u3336z+lHc6hk0aJCWLVumtWvX6qabbrKGX6/tq3z58vLw8Ljm9eSmefPmkuSyfopTPQ6HQzVq1FB4eLjGjBmjhg0bavLkyTfsusmrntwU93UDeyLY4bo7efKkDhw4oIoVKyo8PFyenp5avXq1NX7fvn06fPiwdd1KRESEdu7c6RImEhIS5Ovra50CiYiIcJmHs82F175cC2FhYQoODnZZdlpamjZu3OjS/5SUFG3dutVqs2bNGmVnZ1tv/BEREVq3bp0yMzNd+l+zZk35+/tbbYqixj/++ENHjx5VxYoVi109xhgNGjRIS5Ys0Zo1a3Kc/r1e25fD4VB4eLhLm+zsbK1evbpQ68nN9u3bJcll/RSXenKTnZ2t9PT0G27dXK6e3Nxo6wY2UdTf3oD9Pffcc+bbb781Bw8eNOvXrzeRkZGmfPny5siRI8aY87c8qFy5slmzZo3ZsmWLiYiIMBEREdb0zlsEREVFme3bt5uVK1eaChUq5HqLgKFDh5q9e/eaadOmFdrtTk6cOGG2bdtmtm3bZiSZCRMmmG3btpl///vfxpjztzvx8/Mzn3/+udmxY4e5++67c73dya233mo2btxofvjhB3PzzTe73B4kJSXFBAUFmYceesjs2rXLfPLJJ6ZkyZI5bg9SokQJ8+abb5q9e/eaESNG5Ot2J5eq58SJE+b55583iYmJ5uDBg+abb74xjRs3NjfffLM
5e/ZssavniSeeMGXLljXffvutyy0mTp8+bbW5XtvXJ598Yry8vMzcuXPNnj17zIABA4yfn5/LNyALWs9vv/1m4uPjzZYtW8zBgwfN559/bqpVq2Zat25dLOt56aWXzHfffWcOHjxoduzYYV566SXj5uZmVq1adcOtm8vVc6OtG9gXwQ7XXM+ePU3FihWNw+EwlSpVMj179jS//fabNf7MmTPmySefNP7+/qZkyZLmnnvuMX/99ZfLPA4dOmQ6duxofHx8TPny5c1zzz1nMjMzXdqsXbvWNGrUyDgcDlOtWjUzZ86cQun/2rVrjaQcf3369DHGnL/lyWuvvWaCgoKMl5eXadeundm3b5/LPI4ePWruv/9+U7p0aePr62v69u1rTpw44dLm559/Nq1atTJeXl6mUqVKZuzYsTn6snDhQnPLLbcYh8Nh6tata5YvX16o9Zw+fdpERUWZChUqGE9PT1OlShXTv3//HDuM4lJPbnVIcln313P7evvtt03lypWNw+EwzZo1Mz/++GOh1nP48GHTunVrExAQYLy8vEyNGjXM0KFDXe6VVpzqefTRR02VKlWMw+EwFSpUMO3atbNCnTE31rq5XD032rqBfbkZY8z1Oz4IAACAa4Vr7AAAAGyCYAcAAGATBDsAAACbINgBAADYBMEOAADAJgh2AAAANkGwAwAAsAmCHQAAgE0Q7AAAAGyCYAcAAGATBDsAAACbINgBAADYxP8DF+7GYo5LDP8AAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "context_to_check = 2046\n", + "checks = check_mutation_positions(result_df.to_pandas(), context_to_check)\n", + "checks[checks[\"out_of_bounds\"]].codon_position.hist(figsize=(5, 5))\n", + "plt.title(\n", + " f\" {checks['out_of_bounds'].sum()} out of {len(checks)} variants are out of bounds (context length = {context_to_check})\"\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "6b85ad06", + "metadata": {}, + "outputs": [], + "source": [ + "# Save processed results, dset, and refseq tables\n", + "dset.write_csv(f\"{OUTPUT_DIR}/clinvar_synom.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "98d131bc", + "metadata": {}, + "source": [ + "# 5. CHD missense dataset" + ] + }, + { + "cell_type": "markdown", + "id": "6e59856c", + "metadata": {}, + "source": [ + "- Download the variant tables from the publication [Jin et al. Contribution of rare inherited and de novo variants in 2,871 congenital heart disease probands](https://pmc.ncbi.nlm.nih.gov/articles/PMC5675000/#SD1). \n", + "\n", + "- The excel table with variants information can be downloaded from this [link](https://pmc.ncbi.nlm.nih.gov/articles/instance/5675000/bin/NIHMS906719-supplement-supp_datasets.xlsx) \n", + "\n", + "- We saved the `S9` table (cases) as `chd_rare_mutation.csv`, and `S10` table (controls) as `chd_mutation_ctrl.csv` \n" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "fcca5699", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- 1. Loading and Filtering Variants ---\n", + "Initial: 2776, Missense: 1773, Removed 4 duplicates, Final: 1769\n", + "class\n", + "chd 1769\n", + "Name: count, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Blinded IDchromposrefaltensembl_gene_idgene_nameclassclassificationpLI ScoreAA_changeRadialSVM_scorevariant_id
01-01849chr1898217CTENSG00000187961KLHL17chdmis0.0p.P321L-0.492chr1_898217_C_T
11-03030chr11425984CGENSG00000160072ATAD3Bchdmis0.0p.S516W-0.917chr1_1425984_C_G
\n", + "
" + ], + "text/plain": [ + " Blinded ID chrom pos ref alt ensembl_gene_id gene_name class \\\n", + "0 1-01849 chr1 898217 C T ENSG00000187961 KLHL17 chd \n", + "1 1-03030 chr1 1425984 C G ENSG00000160072 ATAD3B chd \n", + "\n", + " classification pLI Score AA_change RadialSVM_score variant_id \n", + "0 mis 0.0 p.P321L -0.492 chr1_898217_C_T \n", + "1 mis 0.0 p.S516W -0.917 chr1_1425984_C_G " + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def load_and_filter_variants(chd_path: str) -> pd.DataFrame:\n", + " \"\"\"\n", + " Load CHD and control cases, clean, merge, and filter to keep only missense mutations.\n", + " \"\"\"\n", + " print(\"--- 1. Loading and Filtering Variants ---\")\n", + " cols = [\n", + " \"Blinded ID\",\n", + " \"CHROM\",\n", + " \"POS\",\n", + " \"REF\",\n", + " \"ALT\",\n", + " \"Ensemble_GENEID\",\n", + " \"Gene\",\n", + " \"class\",\n", + " \"Variant_Class\",\n", + " \"pLI Score\",\n", + " \"AA_change\",\n", + " \"RadialSVM_score\",\n", + " ]\n", + "\n", + " # Load and clean pathogenic (CHD)\n", + " pathogenic = pd.read_csv(chd_path, header=1).dropna()\n", + " pathogenic[\"class\"] = \"chd\"\n", + " pathogenic.rename(columns={\"pLI score\": \"pLI Score\", \"AA change\": \"AA_change\"}, inplace=True)\n", + "\n", + " # Concatenate and standardize columns\n", + " variants = pathogenic[cols]\n", + " variants = variants.sort_values(by=[\"CHROM\", \"POS\"]).reset_index(drop=True).copy()\n", + " variants.rename(\n", + " columns={\n", + " \"CHROM\": \"chrom\",\n", + " \"POS\": \"pos\",\n", + " \"REF\": \"ref\",\n", + " \"ALT\": \"alt\",\n", + " \"Ensemble_GENEID\": \"ensembl_gene_id\",\n", + " \"Gene\": \"gene_name\",\n", + " \"Variant_Class\": \"classification\",\n", + " },\n", + " inplace=True,\n", + " )\n", + "\n", + " # Format chrom and pos\n", + " variants[\"chrom\"] = \"chr\" + variants[\"chrom\"].astype(str)\n", + " variants[\"pos\"] = variants[\"pos\"].astype(int)\n", + "\n", + " # 
Filter missense mutations only (single base change and classification)\n", + " initial_count = variants.shape[0]\n", + " variants = variants.loc[\n", + " (variants[\"ref\"].str.len() == 1) & (variants[\"alt\"].str.len() == 1)\n", + " ].copy() # single mutation only\n", + " variants = variants.loc[variants[\"classification\"].isin([\"misD\", \"mis\"])] # only missense mutations\n", + " final_count = variants.shape[0]\n", + "\n", + " variants[\"variant_id\"] = (\n", + " variants[\"chrom\"] + \"_\" + variants[\"pos\"].astype(str) + \"_\" + variants[\"ref\"] + \"_\" + variants[\"alt\"]\n", + " )\n", + "\n", + " # Remove duplicate variants from published data\n", + " pre_dedup = variants.shape[0]\n", + " variants = variants.drop_duplicates(subset=[\"variant_id\", \"gene_name\"], keep=\"first\").reset_index(drop=True)\n", + " print(\n", + " f\"Initial: {initial_count}, Missense: {final_count}, Removed {pre_dedup - variants.shape[0]} duplicates, Final: {variants.shape[0]}\"\n", + " )\n", + " print(variants[\"class\"].value_counts())\n", + "\n", + " return variants\n", + "\n", + "\n", + "chd_path = f\"{DATA_DIR}/chd_rare_mutation.csv\"\n", + "variants = load_and_filter_variants(chd_path)\n", + "variants.head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "88428334", + "metadata": {}, + "source": [ + "- Load fasta and annotation file, filter the gtf table by gene names and canonical transcripts" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "ebafa2bf", + "metadata": {}, + "outputs": [], + "source": [ + "def filter_canonical_gtf(gtf_s: pd.DataFrame, gtf_path: str) -> pd.DataFrame:\n", + " \"\"\"\n", + " Filter a GTF-derived DataFrame to keep only canonical transcripts.\n", + " Only keeps genes with explicit canonical annotations (Ensembl_canonical or MANE_Select).\n", + " Genes without canonical annotations are dropped.\n", + " \"\"\"\n", + " print(\"Filtering GTF for canonical transcripts...\")\n", + " canonical_map_pl = pl.read_csv(\n", + " 
gtf_path,\n", + " comment_prefix=\"#\",\n", + " separator=\"\\t\",\n", + " has_header=False,\n", + " columns=[2, 8],\n", + " new_columns=[\"feature\", \"attrs\"],\n", + " ).filter(pl.col(\"feature\") == \"transcript\")\n", + "\n", + " # Extract gene and transcript, then filter for canonical tags in full attrs string\n", + " # (GTF files can have multiple tag entries, so we check the full string)\n", + " canonical_map_pl = (\n", + " canonical_map_pl.with_columns(\n", + " [\n", + " pl.col(\"attrs\").str.extract(r'gene_name \"([^\"]+)\"', 1).alias(\"gene\"),\n", + " pl.col(\"attrs\").str.extract(r'transcript_id \"([^\"]+)\"', 1).alias(\"transcript\"),\n", + " ]\n", + " )\n", + " # Filter for explicit canonical tags in the full attrs string\n", + " .filter(pl.col(\"attrs\").str.contains(\"Ensembl_canonical\") | pl.col(\"attrs\").str.contains(\"MANE_Select\"))\n", + " )\n", + "\n", + " # Prioritize MANE_Select over Ensembl_canonical if both exist for a gene\n", + " canonical_map_pl = (\n", + " canonical_map_pl.with_columns(\n", + " pl.when(pl.col(\"attrs\").str.contains(\"MANE_Select\"))\n", + " .then(2)\n", + " .when(pl.col(\"attrs\").str.contains(\"Ensembl_canonical\"))\n", + " .then(1)\n", + " .otherwise(0)\n", + " .alias(\"priority\")\n", + " )\n", + " .sort(\"priority\", descending=True)\n", + " .group_by(\"gene\")\n", + " .first()\n", + " .select([\"gene\", \"transcript\"])\n", + " )\n", + "\n", + " genes_with_canonical = canonical_map_pl.shape[0]\n", + " print(f\"Found {genes_with_canonical} genes with explicit canonical transcripts\")\n", + "\n", + " canonical_map_df = canonical_map_pl.to_pandas()\n", + " canonical_map_df[\"transcript\"] = canonical_map_df[\"transcript\"].str.split(\".\").str[0] # remove version\n", + " gtf_s[\"transcript_id\"] = gtf_s[\"name\"].str.split(\".\").str[0]\n", + "\n", + " original_shape = gtf_s.shape[0]\n", + " original_genes = gtf_s[\"gene_name\"].nunique()\n", + "\n", + " gtf_filtered = gtf_s.merge(\n", + " canonical_map_df, 
left_on=[\"gene_name\", \"transcript_id\"], right_on=[\"gene\", \"transcript\"], how=\"inner\"\n", + " ).drop(columns=[\"gene\", \"transcript\"])\n", + "\n", + " filtered_genes = gtf_filtered[\"gene_name\"].nunique()\n", + " dropped_genes = original_genes - filtered_genes\n", + "\n", + " print(f\"GTF size before canonical filter: {original_shape} entries from {original_genes} genes\")\n", + " print(f\"GTF size after canonical filter: {gtf_filtered.shape[0]} entries from {filtered_genes} genes\")\n", + " print(f\"Dropped {dropped_genes} genes without canonical annotations\")\n", + "\n", + " return gtf_filtered\n", + "\n", + "\n", + "def prepare_annotations(variants: pd.DataFrame, gtf_path: str, fasta_path: str):\n", + " \"\"\"\n", + " Load GTF/FASTA, subset GTF to genes in variant table, filter for CDS length, and canonical transcripts.\n", + " \"\"\"\n", + " print(\"\\n--- 2. Preparing Annotations (GTF & FASTA) ---\")\n", + " # Get reference and annotation files (hg19 assembly)\n", + " gtf_s, fasta = process_gtf(gtf_path, fasta_path)\n", + " # Subset GTF to genes present in the variant table (using ENSEMBL ID)\n", + " variant_gene_ids = variants[\"ensembl_gene_id\"].unique()\n", + " gtf_gene_ids = gtf_s[\"gene_id\"].unique()\n", + " missing_gene_ids = set(variant_gene_ids) - set(gtf_gene_ids)\n", + " if missing_gene_ids:\n", + " print(f\"⚠️ Warning: {len(missing_gene_ids)} variant Ensembl IDs are missing from the GTF table.\")\n", + " # Printing IDs is less useful, but we can print the corresponding gene names if needed\n", + " missing_names = variants[variants[\"ensembl_gene_id\"].isin(missing_gene_ids)][\"gene_name\"].unique()\n", + " print(f\" Missing {len(missing_names)} names: {list(missing_names)}\")\n", + " gtf_subset = gtf_s[gtf_s[\"gene_id\"].isin(variant_gene_ids)].copy()\n", + " print(f\"GTF subset to variant genes (by Ensembl ID): {gtf_subset.shape[0]} rows.\")\n", + " # Check 2: Filter for CDS length multiple of 3\n", + " gtf_subset = 
gtf_subset[gtf_subset[\"cds\"].str.len() % 3 == 0]\n", + " print(f\"After filtering CDS length multiple of 3: {gtf_subset.shape[0]}\")\n", + " # Check 3: Filter for canonical transcripts only\n", + " gtf_filtered = filter_canonical_gtf(gtf_subset, gtf_path=gtf_path.replace(\".processed.tsv\", \".gtf.gz\"))\n", + " # Check 4: Validate reference allele in fasta matches variants (hg19 assembly)\n", + " print(\"\\nRunning FASTA reference allele validation...\")\n", + " for i in range(variants.shape[0]):\n", + " t = variants.iloc[i]\n", + " chrom = t[\"chrom\"]\n", + " pos = t[\"pos\"]\n", + " ref = t[\"ref\"]\n", + " try:\n", + " hg19_ref = fasta[chrom][pos - 1]\n", + " if hg19_ref != ref:\n", + " print(f\"Mismatch at {chrom}:{pos}, {ref} (variants) != {hg19_ref} (fasta), {t['variant_id']}\")\n", + " except KeyError:\n", + " print(f\"Warning: Chromosome {chrom} not found in FASTA.\")\n", + " print(\"FASTA reference allele validation complete.\")\n", + " return gtf_filtered, fasta" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "4f3c68c1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- 2. 
Preparing Annotations (GTF & FASTA) ---\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing transcripts: 100%|██████████| 64779/64779 [00:10<00:00, 6443.67it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⚠️ Warning: 10 variant Ensembl IDs are missing from the GTF table.\n", + " Missing 10 names: ['OTUD7B', 'OR8B4', 'SLCO1B7', 'CYFIP1', 'LENG9', 'ADRA2B', 'TPTE', 'SSTR3', 'ATP6AP1L', 'SLC25A53']\n", + "GTF subset to variant genes (by Ensembl ID): 6200 rows.\n", + "After filtering CDS length multiple of 3: 6181\n", + "Filtering GTF for canonical transcripts...\n", + "Found 64705 genes with explicit canonical transcripts\n", + "GTF size before canonical filter: 6181 entries from 1548 genes\n", + "GTF size after canonical filter: 1544 entries from 1544 genes\n", + "Dropped 4 genes without canonical annotations\n", + "\n", + "Running FASTA reference allele validation...\n", + "FASTA reference allele validation complete.\n" + ] + } + ], + "source": [ + "GTF_PROCESSED_PATH = f\"{DATA_DIR}/reference/gencode.v47lift37.basic.annotation.processed.tsv\"\n", + "FASTA_PATH = f\"{DATA_DIR}/reference/hg19/hg19.fa\"\n", + "gtf_filtered, fasta = prepare_annotations(variants=variants, gtf_path=GTF_PROCESSED_PATH, fasta_path=FASTA_PATH)" + ] + }, + { + "cell_type": "markdown", + "id": "82fc6cb2", + "metadata": {}, + "source": [ + "- Get missense variant table with CDS sequences for ref and alt codons, filtered to canonical transcripts:" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "d43ada2e", + "metadata": {}, + "outputs": [], + "source": [ + "def get_results_and_filter_canonical(\n", + " variants: pd.DataFrame, gtf_filtered: pd.DataFrame, filter_canonical: bool = True\n", + "):\n", + " \"\"\"\n", + " Run annotation processing, merge with variant data, and filter for the canonical transcript\n", + " for each unique variant.\n", + " \"\"\"\n", + " print(\"\\n--- 3. 
Running Annotation and Canonical Transcript Filtering ---\")\n", + " all_results = []\n", + " chroms = variants[\"chrom\"].unique()\n", + " # Annotation loop\n", + " for chrom in tqdm(chroms, desc=\"Processing chromosomes\", total=len(chroms)):\n", + " curr_variants = (\n", + " variants[variants[\"chrom\"] == chrom][[\"variant_id\", \"chrom\", \"pos\", \"ref\", \"alt\"]].drop_duplicates().copy()\n", + " )\n", + " chrom_gtf = gtf_filtered[gtf_filtered[\"chrom\"] == chrom]\n", + " chrom_results = process_a_chrom(curr_variants, chrom_gtf, return_alt_cds=True) # Assumed defined elsewhere\n", + " all_results.append(chrom_results)\n", + " all_results = pd.concat(all_results).reset_index(drop=True)\n", + " all_results.insert(0, \"id\", np.arange(all_results.shape[0])) # Add row ID\n", + " # Merge with original variant metadata\n", + " assembly = \"hg19\"\n", + " all_results[\"variant_id\"] = all_results[\"variant_id\"] + \"_\" + assembly\n", + " all_results = all_results.merge(variants.drop(\"variant_id\", axis=1), on=[\"chrom\", \"pos\", \"ref\", \"alt\"])\n", + " if filter_canonical:\n", + " print(f\"Total results rows before canonical filtering: {all_results.shape[0]}\")\n", + " # Since gtf_filtered is already canonical, we can use it to filter the results.\n", + " canonical_transcripts = gtf_filtered[[\"gene_name\", \"name\"]].copy()\n", + " canonical_transcripts.rename(columns={\"name\": \"transcript_name\"}, inplace=True)\n", + " all_results = all_results.merge(\n", + " canonical_transcripts,\n", + " left_on=[\"gene_name\", \"tx_name\"], # 'tx_name' in all_results is matched against the canonical 'transcript_name'\n", + " right_on=[\"gene_name\", \"transcript_name\"],\n", + " how=\"inner\",\n", + " )\n", + " print(f\"Total results rows after canonical filtering: {all_results.shape[0]}\")\n", + " # Keep only variants where ref_aa != alt_aa\n", + " all_results = all_results[all_results[\"ref_aa\"] != all_results[\"alt_aa\"]]\n", + " print(f\"Total results rows after filtering ref_aa != alt_aa: 
{all_results.shape[0]}\")\n", + "\n", + " # Remove nonsense variants: ref_aa != \"*\" & alt_aa !=\"*\"\n", + " all_results = all_results[(all_results[\"ref_aa\"] != \"*\") & (all_results[\"alt_aa\"] != \"*\")]\n", + " print(f\"Total results rows after filtering nonsense variants: {all_results.shape[0]}\")\n", + "\n", + " # Remove variants where ref_aa or alt_aa are different from AA_change\n", + " all_results = all_results[all_results[\"ref_aa\"] == all_results[\"AA_change\"].apply(lambda x: x[2])]\n", + " all_results = all_results[all_results[\"alt_aa\"] == all_results[\"AA_change\"].apply(lambda x: x[-1])]\n", + " print(\n", + " f\"Total results rows after filtering variants where ref_aa or alt_aa are different from provided AA_change: {all_results.shape[0]}\"\n", + " )\n", + " return all_results" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "eda9517b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- 3. 
Running Annotation and Canonical Transcript Filtering ---\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing chromosomes: 100%|██████████| 23/23 [00:00<00:00, 324.53it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total results rows before canonical filtering: 1721\n", + "Total results rows after canonical filtering: 1628\n", + "Total results rows after filtering ref_aa != alt_aa: 1624\n", + "Total results rows after filtering nonsense variants: 1624\n", + "Total results rows after filtering variants where ref_aa or alt_aa are different from provided AA_change: 1623\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "final_results = get_results_and_filter_canonical(variants=variants, gtf_filtered=gtf_filtered, filter_canonical=True)" + ] + }, + { + "cell_type": "markdown", + "id": "092cb6d7", + "metadata": {}, + "source": [ + "- Add alpha missense scores" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "1c14a684", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variants with AlphaMissense scores: 1114 / 1623\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idchromposvariant_idrefalttx_namecdsStartcdsEndtx_strand...ensembl_gene_idgene_nameclassclassificationpLI ScoreAA_changeRadialSVM_scoretranscript_nameAlphaMissenseam_class
00chr1898217chr1_898217_C_T_hg19CTENST00000338591896073900571+...ENSG00000187961KLHL17chdmis0.0p.P321L-0.492ENST000003385910.8814pathogenic
22chr13389648chr1_3389648_C_G_hg19CGENST0000037837833796483397151+...ENSG00000130762ARHGEF16chdmis0.0p.F343L-0.963ENST000003783780.8865pathogenic
\n", + "

2 rows × 29 columns

\n", + "
" + ], + "text/plain": [ + " id chrom pos variant_id ref alt tx_name \\\n", + "0 0 chr1 898217 chr1_898217_C_T_hg19 C T ENST00000338591 \n", + "2 2 chr1 3389648 chr1_3389648_C_G_hg19 C G ENST00000378378 \n", + "\n", + " cdsStart cdsEnd tx_strand ... ensembl_gene_id gene_name class \\\n", + "0 896073 900571 + ... ENSG00000187961 KLHL17 chd \n", + "2 3379648 3397151 + ... ENSG00000130762 ARHGEF16 chd \n", + "\n", + " classification pLI Score AA_change RadialSVM_score transcript_name \\\n", + "0 mis 0.0 p.P321L -0.492 ENST00000338591 \n", + "2 mis 0.0 p.F343L -0.963 ENST00000378378 \n", + "\n", + " AlphaMissense am_class \n", + "0 0.8814 pathogenic \n", + "2 0.8865 pathogenic \n", + "\n", + "[2 rows x 29 columns]" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "af_hg19 = pl.read_csv(f\"{DATA_DIR}/alphamissense_data/AlphaMissense_hg19.tsv.gz\", separator=\"\\t\", skip_rows=3)\n", + "af_hg19 = af_hg19.rename({\"#CHROM\": \"chrom\", \"POS\": \"pos\", \"REF\": \"ref\", \"ALT\": \"alt\"})\n", + "af_hg19 = af_hg19.with_columns(\n", + " pl.concat_str(\n", + " [pl.col(\"chrom\"), pl.col(\"pos\").cast(str), pl.col(\"ref\"), pl.col(\"alt\"), pl.lit(\"hg19\")], separator=\"_\"\n", + " ).alias(\"variant_id\")\n", + ")\n", + "af_hg19 = af_hg19.with_columns(pl.col(\"transcript_id\").str.split(\".\").list.first().alias(\"tx_name\"))\n", + "# Join with final_results\n", + "final_results_pl = pl.from_pandas(final_results)\n", + "final_results = final_results_pl.join(\n", + " af_hg19.select([\"variant_id\", \"tx_name\", \"am_pathogenicity\", \"am_class\"]), on=[\"variant_id\", \"tx_name\"], how=\"left\"\n", + ").rename({\"am_pathogenicity\": \"AlphaMissense\"})\n", + "print(\n", + " f\"Variants with AlphaMissense scores: {final_results.filter(pl.col('AlphaMissense').is_not_null()).shape[0]} / {final_results.shape[0]}\"\n", + ")\n", + "final_results = final_results.to_pandas().dropna(subset=[\"AlphaMissense\"])\n", + 
"final_results.head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "ebde35a9", + "metadata": {}, + "source": [ + "- Add DDD/ASD control variants" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "aa312b44", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of benign variants in DDD/ASD dataset: 3590\n", + "\n", + "Concatenated dataframe shape: (4704, 19)\n", + "Columns: ['id', 'variant_id', 'chrom', 'pos', 'ref', 'alt', 'class', 'ref_codon', 'alt_codon', 'codon_position', 'var_rel_dist_in_cds', 'classification', 'ref_aa', 'alt_aa', 'ref_seq', 'alt_seq', 'tx_name', 'AlphaMissense', 'am_class']\n", + "\n", + "First few rows:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvariant_idchromposrefaltclassref_codonalt_codoncodon_positionvar_rel_dist_in_cdsclassificationref_aaalt_aaref_seqalt_seqtx_nameAlphaMissenseam_class
059chr10_101163334_A_G_hg19chr10101163334AGcontrol_ddd_asdGTCGCC283850misVAATGGCACCTCCGTCAGTCTTTGCCGAGGTTCCGCAGGCCCAGCCTG...ATGGCACCTCCGTCAGTCTTTGCCGAGGTTCCGCAGGCCCAGCCTG...ENST000003705080.3736ambiguous
173chr10_101371064_G_A_hg19chr10101371064GAcontrol_ddd_asdCGGTGG212636misRWATGGAGTTGGAGGGGCGGGGTGCTGGCGGTGTGGCGGGGGGGCCGG...ATGGAGTTGGAGGGGCGGGGTGCTGGCGGTGTGGCGGGGGGGCCGG...ENST000003704950.1444benign
\n", + "
" + ], + "text/plain": [ + " id variant_id chrom pos ref alt class \\\n", + "0 59 chr10_101163334_A_G_hg19 chr10 101163334 A G control_ddd_asd \n", + "1 73 chr10_101371064_G_A_hg19 chr10 101371064 G A control_ddd_asd \n", + "\n", + " ref_codon alt_codon codon_position var_rel_dist_in_cds classification \\\n", + "0 GTC GCC 283 850 mis \n", + "1 CGG TGG 212 636 mis \n", + "\n", + " ref_aa alt_aa ref_seq \\\n", + "0 V A ATGGCACCTCCGTCAGTCTTTGCCGAGGTTCCGCAGGCCCAGCCTG... \n", + "1 R W ATGGAGTTGGAGGGGCGGGGTGCTGGCGGTGTGGCGGGGGGGCCGG... \n", + "\n", + " alt_seq tx_name \\\n", + "0 ATGGCACCTCCGTCAGTCTTTGCCGAGGTTCCGCAGGCCCAGCCTG... ENST00000370508 \n", + "1 ATGGAGTTGGAGGGGCGGGGTGCTGGCGGTGTGGCGGGGGGGCCGG... ENST00000370495 \n", + "\n", + " AlphaMissense am_class \n", + "0 0.3736 ambiguous \n", + "1 0.1444 benign " + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load ctrls\n", + "missense_variants_ddd_asd = pd.read_csv(f\"{OUTPUT_DIR}/ddd_asd_zhouetal_processed_am.csv\")\n", + "missense_variants_ddd_asd_ctrl = missense_variants_ddd_asd[\n", + " missense_variants_ddd_asd[\"classification\"] == \"control\"\n", + "].copy()\n", + "missense_variants_ddd_asd_ctrl[\"classification\"] = \"mis\"\n", + "missense_variants_ddd_asd_ctrl[\"class\"] = \"control_ddd_asd\"\n", + "print(f\"Number of benign variants in DDD/ASD dataset: {missense_variants_ddd_asd_ctrl.shape[0]}\")\n", + "cols = [\n", + " \"id\",\n", + " \"variant_id\",\n", + " \"chrom\",\n", + " \"pos\",\n", + " \"ref\",\n", + " \"alt\",\n", + " \"class\",\n", + " \"ref_codon\",\n", + " \"alt_codon\",\n", + " \"codon_position\",\n", + " \"var_rel_dist_in_cds\",\n", + " \"classification\",\n", + " \"ref_aa\",\n", + " \"alt_aa\",\n", + " \"ref_seq\",\n", + " \"alt_seq\",\n", + " \"tx_name\",\n", + " \"AlphaMissense\",\n", + " \"am_class\",\n", + "]\n", + "# Concatenate\n", + "variants = pd.concat([missense_variants_ddd_asd_ctrl[cols], final_results[cols]], 
ignore_index=True)\n", + "print(f\"\\nConcatenated dataframe shape: {variants.shape}\")\n", + "print(f\"Columns: {variants.columns.tolist()}\")\n", + "print(\"\\nFirst few rows:\")\n", + "variants.head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "6a93d71e", + "metadata": {}, + "source": [ + "- QC of the variants:" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "5f4723e8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- 4. Quality Control and Display ---\n", + "A. Initial variant type and class counts:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count
classificationclass
miscontrol_ddd_asd3590
chd827
misDchd287
\n", + "
" + ], + "text/plain": [ + " count\n", + "classification class \n", + "mis control_ddd_asd 3590\n", + " chd 827\n", + "misD chd 287" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "B. Unique Variant Overlaps and Totals:\n", + "CHD-Control overlap (same variant_id): 0\n", + "Total unique CHD variants: 1114\n", + "Total unique Control variants: 0\n", + "\n", + "Position Overlaps (same chrom:pos, potentially different alleles):\n", + "CHD-Control position overlap: 0\n", + "CHD-Control_DDD_ASD position overlap: 1\n", + "Total unique CHD positions: 1113\n", + "Total unique Control positions: 0\n", + "Total unique Control_DDD_ASD positions: 3570\n", + "\n", + "C. Unique Variants and Total Rows by Class/Type:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variant_id
unique_variantstotal_rows
classclassification
chdmis827827
misD287287
control_ddd_asdmis35733590
\n", + "
" + ], + "text/plain": [ + " variant_id \n", + " unique_variants total_rows\n", + "class classification \n", + "chd mis 827 827\n", + " misD 287 287\n", + "control_ddd_asd mis 3573 3590" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "D. Final Missense-specific QC (matching original 'mis'/'misD' annotation):\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variant_id
unique_variantstotal_rows
classclassification
chdmis827827
misD287287
control_ddd_asdmis35733590
\n", + "
" + ], + "text/plain": [ + " variant_id \n", + " unique_variants total_rows\n", + "class classification \n", + "chd mis 827 827\n", + " misD 287 287\n", + "control_ddd_asd mis 3573 3590" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "E. Venn Diagram - Overlap of Missense Variants Between Classes:\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAbcAAAHDCAYAAACnJFQ8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACEUklEQVR4nO3dd3xT5f4H8M9JmtF0pOmedFCglFUsFNkg5XIVFVwMURCvOEC9WCfee0VwAOLAwVAEQdQLPxEXekEtw4WAIIrs0UIpbTrTtE2b+fz+CI1Nd9okJzn5vl+vvCAnJyff0+Sc73me8wyOMcZACCGECIiI7wAIIYQQZ6PkRgghRHAouRFCCBEcSm6EEEIEh5IbIYQQwaHkRgghRHAouRFCCBEcSm6EEEIEh5IbIYQQwaHk5gIbNmwAx3HIz8/nO5Q2LV++HCkpKRCLxcjIyHD69vfs2QOO47Bnzx6nb5u0jeM4PPvss3yHQQhvvDq5HTt2DHfccQfi4uIgk8kQGxuLGTNm4NixY3yH5vG++eYbPPHEExg+fDjee+89vPjii62ue9ddd4HjOAQHB6Ourq7Z62fOnAHHceA4Di+//LIrw/ZKr776KjiOw3fffdfqOmvXrgXHcfjiiy/cGFnXrFq1Chs2bHDa9houhho/QkNDcfXVV+PDDz/s9HY/+ugjrFixwmlxeqqSkhI89dRT6NevHwIDAyGXy5GamorZs2fjxx9/5Ds8t/PjO4DO2rZtG6ZPn47Q0FD84x//QHJyMvLz87Fu3Tps3boVmzdvxk033cR3mB5r165dEIlEWLduHaRSabvr+/n5QafT4csvv8SUKVPsXvvwww8hl8tRX19vt3zUqFGoq6vr0PaFbNq0aXj88cfx0UcfITs7u8V1PvroI4SFheHaa691ymfW1dXBz8+1h/eqVasQHh6Ou+66y6nbffjhhzF48GAAQHl5ObZs2YI77rgDGo0G8+bNc3h7H330Ef7880/Mnz/fqXF6kgMHDmDixImorq7GtGnTcP/990MmkyEvLw+fffYZNmzYgL1792LUqFF8h+o+zAudPXuWKRQKlpaWxkpKSuxeKy0tZWlpaSwgIICdO3fOrXHV1NQwxhh77733GACWl5fn1s93xOzZs1lAQECH1p01axYLCAhgf/vb39jkyZObvd6jRw92yy23MABs+fLlzg5VEMaNG8eUSiWrr69v9tqlS5eYSCRi999/f5c+w2w2s7q6ui5twxF9+vRho0ePdtr2du/ezQCwjz/+2G65Xq9ncXFxbNiwYZ3a7sSJE1liYqITIvRMFRUVLCYmhkVHR7MTJ040e91isbCPPvqIHThwoM3tNJy/hMIrqyWXL18OnU6Hd955BxEREXavhYeH4+2330ZtbS1eeuklAMDWrVvBcRz27t3bbFtvv/02OI7Dn3/+aVt28uRJ3HrrrQgNDYVcLsegQYOaVRc13Ffbu3cv5s6di8jISMTHx7ca8+eff46JEyciNjYWMpkM3bt3x3PPPQez2Wy33pgxY9C3b18cOnQIw4YNg7+/P5KTk7FmzZoO/W1MJhO
ee+45dO/eHTKZDElJSXj66aeh1+tt63Ach/feew+1tbW26p+OVC/dfvvt+N///geNRmNbdvDgQZw5cwa33357s/Vbuud25swZ3HLLLYiOjoZcLkd8fDymTZuGqqoq2zrffvstRowYgZCQEAQGBqJXr154+umn7bat1+uxcOFCpKamQiaTISEhAU888YTdfjbs64MPPojPPvsMffv2hUwmQ58+fbBjxw679aqrqzF//nwkJSVBJpMhMjIS48ePx+HDh+3W279/P/7+979DqVRCoVBg9OjR+Omnn9r9291xxx2oqqrCV1991ey1zZs3w2KxYMaMGQCAl19+GcOGDUNYWBj8/f2RmZmJrVu3Nntfw759+OGH6NOnD2QymW2/mt5zu3DhAubOnYtevXrB398fYWFhuO2225rdF274Xf/000/IyclBREQEAgICcNNNN6G0tNS2XlJSEo4dO4a9e/fafkNjxowBABiNRixatAg9evSAXC5HWFgYRowYgW+//bbdv1NLpFIpVCpViyXRDz74AJmZmfD390doaCimTZuGgoIC2+tjxozBV199hQsXLtjiTEpKAmMM4eHhyMnJsa1rsVgQEhICsVhs9xtftmwZ/Pz8UFNTY1vWkXMEAGg0GsyfPx8JCQmQyWRITU3FsmXLYLFYbOvk5+fbqvTfeecd27E7ePBgHDx4sN2/z5o1a1BUVIQVK1YgLS2t2escx2H69Om20jAAPPvss+A4DsePH8ftt98OlUqFESNGAOjYOaRhuy3d101KSrIrzTf8pr7//nvcd999CAsLQ3BwMGbOnInKykq79/7666+YMGECwsPDbee+u+++u92/QYv4zq6dERsby5KSktpcJykpicXHxzPGGNPpdCwwMJDNnTu32Xpjx45lffr0sT3/888/mVKpZOnp6WzZsmXsrbfeYqNGjWIcx7Ft27bZ1msonaWnp7PRo0ezN998ky1dutTutcYlt8mTJ7MpU6aw5cuXs9WrV7PbbruNAWCPPfaYXTyjR49msbGxLDIykj344IPsjTfeYCNGjGAA2Lp169r928yaNYsBYLfeeitbuXIlmzlzJgNgV+LatGkTGzlyJJPJZGzTpk1s06ZNbZZyG0puWq2WyeVyuzjmz5/P0tLSWF5eXrOSW8OV+O7duxlj1ivw5ORkFhsby55//nn27rvvskWLFrHBgwez/Px8299fKpWyQYMGsddff52tWbOGPfbYY2zUqFG27ZrNZva3v/2NKRQKNn/+fPb222+zBx98kPn5+bFJkybZxQ6ADRgwgMXExLDnnnuOrVixgqWkpDCFQsHKysps691+++1MKpWynJwc9u6777Jly5axG264gX3wwQe2dXJzc5lUKmVDhw5lr7zyCnvttddY//79mVQqZfv372/ze6mqqmJyuZzdcsstzV676qqrWGJiIrNYLIwxxuLj49ncuXPZW2+9xV599VWWlZXFALDt27c327fevXuziIgItmjRIrZy5Ur222+/2V5buHChbd2PP/6YDRgwgD3zzDPsnXfeYU8//TRTqVQsMTGR1dbW2tZr+O0OHDiQXXPNNezNN99kjz76KBOLxWzKlCm29T799FMWHx/P0tLSbL+hb775hjHG2NNPP804jmNz5sxha9euZa+88gqbPn267fhoTcPvZf369ay0tJSVlpayU6dOsYULF7b4+3/++ecZx3Fs6tSpbNWqVWzRokUsPDycJSUlscrKSsYYY9988w3LyMhg4eHhtjg//fRTxhhjN954I8vMzLRt77fffmMAmEgksvtbT5w4kQ0aNMj2vKPniNraWta/f38WFhbGnn76abZmzRo2c+ZMxnEc++c//2lbr+HYGThwIEtNTWXLli1jL730EgsPD2fx8fHMYDC0+XcbOnQo8/f3b3e9xhr+punp6WzSpEls1apVbOXKlYyxjp1DGGv+G2uQmJjIZs2aZXve8Jvq168fGzlyJHvjjTfYvHnzmEgkYqNGjbL
97tVqNVOpVKxnz55s+fLlbO3atexf//oX6927d4f3yy6+Tr2LRxqNhgFodhJr6sYbb2QAmFarZYwxNn36dBYZGclMJpNtnaKiIiYSidjixYtty8aNG8f69etnV31ksVjYsGHDWI8ePWzLGr6wESNG2G2z8WuNk5tOp2sW43333ccUCoXdZ40ePZoBYK+88optmV6vZxkZGSwyMrLNH/CRI0cYAHbPPffYLX/ssccYALZr1y7bsoaE1RGN17311lvZuHHjGGPWJBMdHc0WLVrUoeTWcPJoWu3U2GuvvcYAsNLS0lbX2bRpExOJROyHH36wW75mzRoGgP3000+2ZQCYVCplZ8+etS37/fffGQD25ptv2pYplUo2b968Vj/TYrGwHj16sAkTJtgORsas32tycjIbP358q+9tcNtttzG5XM6qqqpsy06ePMkAsAULFthtszGDwcD69u3LrrnmGrvlDSfiY8eONfuspieeln5/+/btYwDY+++/b1vW8NvNzs62289HHnmEicViptFobMtaq5YcMGAAmzhxYgt/gbY1/F6aPkQiEXvhhRfs1s3Pz2disbjZ8qNHjzI/Pz+75a1VSy5fvpyJxWLbOeKNN95giYmJLCsriz355JOMMetvPCQkhD3yyCO293X0HPHcc8+xgIAAdvr0abvPfeqpp5hYLGYXL15kjP2V3MLCwlhFRYVtvc8//5wBYF9++WWbfzeVSsUyMjKaLddqtbaLhNLSUrtqx4bkNn36dLv3OHIOcTS5ZWZm2p2/XnrpJQaAff7554wx6wUTAHbw4ME297ejvK5asrq6GgAQFBTU5noNr2u1WgDA1KlTUVJSYldFtnXrVlgsFkydOhUAUFFRgV27dmHKlCmorq5GWVkZysrKUF5ejgkTJuDMmTMoLCy0+5w5c+ZALBa3G7e/v7/dPpSVlWHkyJHQ6XQ4efKk3bp+fn647777bM+lUinuu+8+lJSU4NChQ61+xtdffw0AdlUtAPDoo48CQItVYo66/fbbsWfPHhQXF2PXrl0oLi5usUqyJUqlEgCwc+dO6HS6FtcJCQkBYK3GbVx109jHH3+M3r17Iy0tzfYdlZWV4ZprrgEA7N6922797OxsdO/e3fa8f//+CA4Oxvnz5+0+d//+/bh8+XKLn3nkyBFb9Wt5ebntM2trazFu3Dh8//33rcbb4I477kB9fT22bdtmW/bRRx8BgK1KErD/rVRWVqKqqgojR45sVkUKAKNHj0Z6enqbn9t0m0ajEeXl5UhNTUVISEiL27333nvBcZzt+ciRI2E2m3HhwoV2PyskJATHjh3DmTNn2l23Jc888wy+/fZbfPvtt9iyZQumT5+Of/3rX3j99ddt62zbtg0WiwVTpkyx+w1ER0ejR48ezX4DLWnYp59//hkA8MMPP2DkyJEYOXIkfvjhBwDAn3/+CY1Gg5EjRwJw7Bzx8ccfY+TIkVCpVHYxZmdnw2w24/vvv7eLZ+rUqVCpVHbxAbD7nbZEq9UiMDCw2fI777wTERERtseTTz7ZbJ3777/f7rkrzyH33nsvJBKJ7fkDDzwAPz8/22c2HPvbt2+H0Wjs9Oc08Lrk1pC0GpJca5omwYb7JFu2bLGts2XLFmRkZKBnz54AgLNnz4Ixhv/85z92P4qIiAgsXLgQgLW5bWPJyckdivvYsWO46aaboFQqERwcjIiICNxxxx0AYHe/CQBiY2MREBBgt6whxrb6zl24cAEikQipqal2y6OjoxESEtKhE1N7rrvuOgQFBWHLli348MMPMXjw4Gaf15rk5GTk5OTg3XffRXh4OCZMmICVK1fa7f/UqVMxfPhw3HPPPYiKisK0adPwf//3f3aJ48yZMzh27Fiz76jhb9T0O+rWrVuzWFQqlV19/0svvYQ///wTCQkJyMrKwrPPPmt3Umk4Uc+aNavZ57777rvQ6/XNvsemrr32WoSGhtoSGgD897//xYABA9CnTx/bsu3bt+Pqq6+GXC5
HaGgoIiIisHr16ha339HfX11dHZ555hnbvZ/w8HBERERAo9G0uN2mf7OGk27TeyQtWbx4MTQaDXr27Il+/frh8ccfxx9//NGhOAGgX79+yM7ORnZ2NqZMmYIPPvgA119/PZ566inbfb8zZ86AMYYePXo0+z5OnDjR7DfQkquuugoKhcKWyBqS26hRo/Drr7+ivr7e9lrD/ShHzhFnzpzBjh07mq3X0GK2vd9pR//mQUFBdvcDGyxevNh2kdCapr8fV55DevToYfc8MDAQMTExtnPa6NGjccstt2DRokUIDw/HpEmT8N577zW719dRXtcVQKlUIiYmpt2D5Y8//kBcXByCg4MBADKZDJMnT8ann36KVatWQa1W46effrLr39VwAn3ssccwYcKEFrfb9EtvfEXcGo1Gg9GjRyM4OBiLFy9G9+7dIZfLcfjwYTz55JPtXvE7qvEVt7PJZDLcfPPN2LhxI86fP+9wR+FXXnkFd911Fz7//HN88803ePjhh7FkyRL88ssviI+Ph7+/P77//nvs3r0bX331FXbs2IEtW7bgmmuuwTfffAOxWAyLxYJ+/frh1VdfbfEzEhIS7J63VrK21qxYTZkyBSNHjsSnn36Kb775BsuXL8eyZcuwbds2XHvttbbvaPny5a12eG/p6rkxiUSCKVOmYO3atVCr1bh48SLOnDlja/gEWE+wN954I0aNGoVVq1YhJiYGEokE7733nl1SbNCR3x8APPTQQ3jvvfcwf/58DB06FEqlEhzHYdq0aS3+/jryN2vNqFGjcO7cOdt3/O677+K1117DmjVrcM8993Qo3qbGjRuH7du325q8WywWcByH//3vfy3G2t53AVi/jyFDhuD777/H2bNnUVxcjJEjRyIqKgpGoxH79+/HDz/8gLS0NFvDNUfOERaLBePHj8cTTzzR4noNF2MNOvs3T0tLw++//w6j0WhXMurfv3+b7wNa//105RzStJFcR3Ech61bt+KXX37Bl19+iZ07d+Luu+/GK6+8gl9++aVD32ljXpfcAOD666/H2rVr8eOPP9quqBr74YcfkJ+fb1e1B1hLBRs3bkRubi5OnDgBxpitShIAUlJSAFh/9K31R+qMPXv2oLy8HNu2bbPrZ5KXl9fi+pcvX0Ztba1d6e306dMArC2RWpOYmAiLxYIzZ86gd+/etuVqtRoajQaJiYld3BOr22+/HevXr4dIJMK0adMcfn+/fv3Qr18//Pvf/8bPP/+M4cOHY82aNXj++ecBACKRCOPGjcO4cePw6quv4sUXX8S//vUv7N6921bF+Pvvv2PcuHFOTeQxMTGYO3cu5s6di5KSElx11VV44YUXcO2119qqNYODg7v025gxYwbWrFmDLVu2IC8vz9aSrcEnn3wCuVyOnTt3QiaT2Za/9957nd8xWKvgZ82ahVdeecW2rL6+3q5VoKPa+tuHhoZi9uzZmD17NmpqajBq1Cg8++yznU5uJpMJAGwllO7du4MxhuTk5GZJwpE4R44ciWXLluG7775DeHg40tLSwHEc+vTpgx9++AE//PADrr/+etv6jpwjunfvjpqaGqeeS1py/fXX45dffsGnn37arA+qoxw5h6hUqma/H4PBgKKioha3febMGYwdO9b2vKamBkVFRbjuuuvs1rv66qtx9dVX44UXXsBHH32EGTNmYPPmzQ7/dryuWhIAHn/8cfj7++O+++5DeXm53WsVFRW4//77oVAo8Pjjj9u9lp2djdDQUGzZsgVbtmxBVlaWXbE8MjISY8aMwdtvv93iF9S4KbQjGq7IGl+BGQwGrFq1qsX1TSYT3n77bbt13377bURERCAzM7PVz2n4kTQdjaGhhDNx4sROxd/U2LFj8dxzz+Gtt95CdHR0h9+n1WptJ6kG/fr1g0gkslU9VFRUNHtfQ0mpYZ0pU6agsLAQa9eubbZuXV0damtrOxwTYL3SbFo1FxkZidjYWNtnZmZmonv37nj55ZdbrALq6G9
j+PDhSEpKwgcffIAtW7Zg9OjRdl1IxGIxOI6zu/rNz8/HZ5995tA+NSUWi5uVAN58881OX2UDQEBAQIvJsekxGRgYiNTU1E5XLwHWqloAGDBgAADg5ptvhlgsxqJFi5rtF2PMLoaAgIBWq4xHjhwJvV6PFStWYMSIEbZEOHLkSGzatAmXL1+23fsCHDtHTJkyBfv27cPOnTubrafRaJodC531wAMPICoqCo888ojtIrixjpS2GzhyDunevXuz+4bvvPNOq7+pd955x+5e2urVq2EymWwDF1RWVjaLtemx7wivLLn16NEDGzduxIwZM9CvX79mI5SUlZXhv//9r10jAsB6tXXzzTdj8+bNqK2tbXGoqJUrV2LEiBHo168f5syZg5SUFKjVauzbtw+XLl3C77//7nC8w4YNg0qlwqxZs/Dwww+D4zhs2rSp1R9dbGwsli1bhvz8fPTs2RNbtmzBkSNH8M4779hVOzQ1YMAAzJo1C++8846tKvTAgQPYuHEjJk+ebHfV1BUikQj//ve/HX7frl278OCDD+K2225Dz549YTKZsGnTJojFYtxyyy0ArPcJvv/+e0ycOBGJiYkoKSnBqlWrEB8fbyul33nnnfi///s/3H///di9ezeGDx8Os9mMkydP4v/+7/+wc+dODBo0qMNxVVdXIz4+HrfeeisGDBiAwMBAfPfddzh48KCtpCMSifDuu+/i2muvRZ8+fTB79mzExcWhsLAQu3fvRnBwML788st2P4vjONx+++226vDFixfbvT5x4kS8+uqr+Pvf/47bb78dJSUlWLlyJVJTUx26b9XU9ddfj02bNkGpVCI9PR379u3Dd999h7CwsE5vMzMzE6tXr8bzzz+P1NRUREZG4pprrkF6ejrGjBmDzMxMhIaG4tdff8XWrVvx4IMPdmi7P/zwg220m4qKCnzxxRfYu3cvpk2bZuvH1b17dzz//PNYsGAB8vPzMXnyZAQFBSEvLw+ffvop7r33Xjz22GO2OLds2YKcnBwMHjwYgYGBuOGGGwAAQ4cOhZ+fH06dOoV7773XFsOoUaOwevVqALBLbkDHzxGPP/44vvjiC1x//fW46667kJmZidraWhw9ehRbt25Ffn4+wsPDO/33bxAaGopPP/0UN9xwAwYMGIBp06Zh8ODBkEgkKCgowMcffwyg5XvPTTlyDrnnnntw//3345ZbbsH48ePx+++/Y+fOna3uk8FgwLhx4zBlyhScOnUKq1atwogRI3DjjTcCADZu3IhVq1bhpptuQvfu3VFdXY21a9ciODi4WemuQ5zS5pInf/zxB5s+fTqLiYlhEomERUdHs+nTp7OjR4+2+p5vv/2WAWAcx7GCgoIW1zl37hybOXMmi46OZhKJhMXFxbHrr7+ebd261bZOQ/PWlpqtttQV4KeffmJXX3018/f3Z7GxseyJJ55gO3futGsqz5i1K0CfPn3Yr7/+yoYOHcrkcjlLTExkb731Vof+JkajkS1atIglJycziUTCEhIS2IIFC5qNjNHZrgCt6UhXgPPnz7O7776bde/encnlchYaGsrGjh3LvvvuO9t7cnNz2aRJk1hsbCyTSqUsNjaWTZ8+vVlzaoPBwJYtW8b69OnDZDIZU6lULDMzky1atMiuqT2AFpv4N26urNfr2eOPP84GDBjAgoKCWEBAABswYABbtWpVs/f99ttv7Oabb2ZhYWFMJpOxxMRENmXKFJabm9vu37HBsWPHGAAmk8ls/bEaW7duHevRoweTyWQsLS2Nvffee7am2421tm8NrzVupl1ZWclmz57NwsPDWWBgIJswYQI7efJkq822m/6um36XjDFWXFzMJk6cyIKCghgAW7eA559/nmVlZbGQkBDm7+/P0tLS2AsvvNBuP6yWugJIpdI23//JJ5+wESNGsICAABYQEMDS0tLYvHnz2KlTp2zr1NTUsNtvv52FhIQwAM26BQwePJgBsOureOnSJQaAJSQktBhrR84RjDF
WXV3NFixYwFJTU5lUKmXh4eFs2LBh7OWXX7btT0vHToOm32NbioqK2OOPP87S09OZv78/k8lkLCUlhc2cOZN9//33dus2/J5a6nLT0XOI2WxmTz75JAsPD2cKhYJNmDCBnT17ttXf1N69e9m9997LVCoVCwwMZDNmzGDl5eW29Q4fPsymT5/OunXrxmQyGYuMjGTXX389+/XXXzu0/01xjDlQZiUuN2bMGJSVldmNmEIIId5qw4YNmD17Ng4ePOhQjUpXeeU9N0IIIaQtlNwIIYQIDiU3QgghgkP33AghhAgOldwIIYQIDiU3QgghguMVnbgtFgsuX76MoKAgl46bSAghxHMxxlBdXY3Y2FiIRG2XzbwiuV2+fLnZYLiEEEJ8U0FBgd2wdS3xiuTWMG1NQUGBbZR/QgghvkWr1SIhIaHd+TwBL0luDVWRwcHBlNwIIcTHdeT2FDUoIYQQIjiU3AghhAgOJTdCCCGC4xX33DrCYrHAYDDwHQYhHSaVStttzkwI6RxBJDeDwYC8vDxYLBa+QyGkw0QiEZKTkyGVSvkOhRDB8frkxhhDUVERxGIxEhIS6EqYeIWGgQmKiorQrVs3GpyAECfz+uRmMpmg0+kQGxsLhULBdziEdFhERAQuX74Mk8kEiUTCdziECIrXF3PMZjMAUNUO8ToNv9mG3zAhxHm8Prk1oGod4m3oN0uI63Qqua1cuRJJSUmQy+UYMmQIDhw40Oq6Y8aMAcdxzR4TJ07sdNCEEEJIWxxOblu2bEFOTg4WLlyIw4cPY8CAAZgwYQJKSkpaXH/btm0oKiqyPf7880+IxWLcdtttXQ6eEEIIaYnDDUpeffVVzJkzB7NnzwYArFmzBl999RXWr1+Pp556qtn6oaGhds83b94MhULh8uT25Zcu3XwzN9zQufcVFxfjhRdewFdffYXCwkJERkYiIyMD8+fPx7hx45CUlIT58+dj/vz5du979tln8dlnn+HIkSO254sWLQIAiMVihISEID09HTfffDMeeOAByGSyLuwdIYR4F4dKbgaDAYcOHUJ2dvZfGxCJkJ2djX379nVoG+vWrcO0adMQEBDgWKQClJ+fj8zMTOzatQvLly/H0aNHsWPHDowdOxbz5s1zeHt9+vRBUVERLl68iN27d+O2227DkiVLMGzYMFRXV7tgDwghxDM5VHIrKyuD2WxGVFSU3fKoqCicPHmy3fcfOHAAf/75J9atW9fmenq9Hnq93vZcq9U6EqbXmDt3LjiOw4EDB+ySfZ8+fXD33Xc7vD0/Pz9ER0cDAGJjY9GvXz+MHz8eAwYMwLJly/D88887LXbiAiYToNcD9fWAxWJ9MGZ9NB6gQCQCOM76EIkAsRiQyQC53Pp/Qoh7+7mtW7cO/fr1Q1ZWVpvrLVmyxFbFJlQVFRXYsWMHXnjhhRZLsSEhIU75nLS0NFx77bXYtm0bJTe+NCSnhmTV8G9dnTWZ/fKL9f8mU9c/y8/PmuQakl3T/8vlgEJhTYyECJhDyS08PBxisRhqtdpuuVqttpUYWlNbW4vNmzdj8eLF7X7OggULkJOTY3veMEGdkJw9exaMMaSlpbW77pNPPol///vfdssMBgPS09M79FlpaWn45ptvOhUncRBjgNnc/NESk8n6mrMSW8M2a2qsj9aIRIBS+dcjJAQICqKERwTFoeQmlUqRmZmJ3NxcTJ48GYB1GKHc3Fw8+OCDbb73448/hl6vxx133NHu58hkMsE3gGCMdXjdxx9/HHfddZfdsjfeeAPff/99hz+L+lS5gCOJzJNYLEBlpfXRgBIeERiHqyVzcnIwa9YsDBo0CFlZWVixYgVqa2ttrSdnzpyJuLg4LFmyxO5969atw+TJkxEWFuacyL1cjx49wHFch+5VhoeHIzU11W5Z01aobTlx4gSSk5MdjpG0wGwGjEZrCclZpS1P0FrCCw0FoqKA6GhrdSYhXsLh5DZ16lSUlpbimWeeQXFxMTIyMrBjxw5bI5O
LFy82G7z41KlT+PHHH6lqrJHQ0FBMmDABK1euxMMPP9zsvptGo3HKfbeTJ09ix44dWLBgQZe35ZMaSmdGo/XhSzNPWCxAWZn1ceyYtSQXFWV9qFRUqiMerVMNSh588MFWqyH37NnTbFmvXr0cqobzFStXrsTw4cORlZWFxYsXo3///jCZTPj222+xevVqnDhxwqHtmUwmFBcXw2KxoLy8HHv27MHzzz+PjIwMPP744y7aCwFi7K/SmdFofU6A6mrr4+xZayOVyEhriS4iglppEo/j9bMCeLOUlBQcPnwYL7zwAh599FEUFRUhIiICmZmZWL16tcPbO3bsGGJiYiAWi6FUKpGeno4FCxZQJ+6OaEhoBoOwqhtdRa8HCgqsD5EICA8H4uOBmBjrc0J4xjEvKFJptVoolUpUVVUhODjY7rX6+nrk5eUhOTkZcrmcpwiJ1zKbrQnNYHB7Ca3eYEBeQQGSS0ogF0pClcmAhAQgKQnw9+c7GiIwbeWCpqjkRnwPY391mBZKUvEUer212vLcOWu1ZVKS9V9C3IySG/EdFstfpTRfahjCB8YAtdr6CAgAEhOBbt0AmpSVuAklNyJ8DaU0o5HvSHxTbS1w/Dhw8iQQF2ctzTlpBB5CWkPJjQiXyWQdp5GqHj2DxfJXI5SoKKB3b2v3AkJcgJIbER6z2ZrUqKTmudRqoKTEWpJLS6PGJ8TpKLkR4bBYrEnNYOA7EtIRjAGXLgGXL1urKnv0AKRSvqMiAkHJjXg/i8V6T42H5vzECSwW4Px54OJFoHt3ICXFOrsBIV1AvyDivRizJjW9npKaEJhMwKlTQH6+tRSXmEgdwkmn0S+HeCej0ToUVH09JTah0euBP/8Edu8GSkv5joZ4KUpuxCF79uwBx3HQaDT8BGCxADqdtXk59VUTNp3OOpHrH39Qi1fiMMFWS375+5du/bwbBtzQqfcVFxfjhRdewFdffYXCwkJERkYiIyMD8+fPx7hx45wS25gxY5CRkYEVK1Y4ZXu8MRqtE3tSUvMtFy5YW1YOGGAdpJmQDhBscvMG+fn5GD58OEJCQrB8+XL069cPRqMRO3fuxLx58zo015uzMMZgNpvh54k38qkVJKmrs5biEhOB9HRqcELaRdWSPJo7dy44jsOBAwdwyy23oGfPnujTpw9ycnLwyy+/ALDOjzdp0iQEBgYiODgYU6ZMgVqttm3j2WefRUZGBjZt2oSkpCQolUpMmzYN1dXVAIC77roLe/fuxeuvvw6O48BxHPLz823Vi//73/+QmZkJmUyGH3/8EXq9Hg8//DAiIyMhl8sxYsQIHDx4kJe/DwBraa2mhhIbsbpwAdizh+7FkXZRcuNJRUUFduzYgXnz5jWbqBQAQkJCYLFYMGnSJFRUVGDv3r349ttvcf78eUydOtVu3XPnzuGzzz7D9u3bsX37duzduxdLly4FALz++usYOnQo5syZg6KiIhQVFSEhIcH23qeeegpLly7FiRMn0L9/fzzxxBP45JNPsHHjRhw+fBipqamYMGECKioqXPsHaYrurZHWNJTi6F4caQOV7Xly9uxZMMaQlpbW6jq5ubk4evQo8vLybAnp/fffR58+fXDw4EEMHjwYAGCxWLBhwwYEXRnK6M4770Rubi5eeOEFKJVKSKVSKBQKREdHN/uMxYsXY/z48QCA2tparF69Ghs2bMC1114LAFi7di2+/fZbrFu3zn0TntK9NdIRDffiBg4EwsL4joZ4GCq58aQj0+idOHECCQkJdiWt9PR0hISE2M3SnZSUZEtsABATE4OSkpIOxTFo0CDb/8+dOwej0Yjhw4fblkkkEmRlZTk8K3in1ddTaY10XF0dsG8fkJfHdyTEw1By40mPHj3AcZxTGo1ImkwjwnEcLB1MDi1VifKCMWtSq6/nOxLibRiz9ov74w+6KCI2lNx4EhoaigkTJmDlypWora1t9rpGo0Hv3r1RUFCAgoIC2/Ljx49Do9EgPT29w58llUp
hNpvbXa979+6QSqX46aefbMuMRiMOHjzo0Oc5zGKxNhqhgY5JV1y4YC3FUeMjAkpuvFq5ciXMZjOysrLwySef4MyZMzhx4gTeeOMNDB06FNnZ2ejXrx9mzJiBw4cP48CBA5g5cyZGjx5tV53YnqSkJOzfvx/5+fkoKytrtVQXEBCABx54AI8//jh27NiB48ePY86cOdDpdPjHP/7hrN22ZzJZRxrpQPIlpF0VFcD33wNaLd+REJ5RcuNRSkoKDh8+jLFjx+LRRx9F3759MX78eOTm5mL16tXgOA6ff/45VCoVRo0ahezsbKSkpGDLli0Ofc5jjz0GsViM9PR0RERE4OLFi62uu3TpUtxyyy248847cdVVV+Hs2bPYuXMnVCpVV3e3OYPBWmKj4bOIM9XVAT/+CBQV8R0J4RHHOtKygWdarRZKpRJVVVUIDg62e62+vh55eXlITk6GXC7nKULiEMas99b0er4j4VW9wYC8ggIkl5RATk3aXaNnT6BXL76jIE7SVi5oikpuxL0sFmvDER9PbMRNTp8Gfv2Vqr19ECU34j4NiY1KKcSdioqsDU2owZJPoeRG3KOhRSRdQRM+VFZSS0ofQ8mNuF5DYqM+SIRPVVXAzz9TlbiPoORGXMtspsRGPEd1tTXB0WABgkfJjbiO2UxDaRHPU1NDCc4HUHIjrtHQeIQSG/FEtbV0D07gKLkR56N7bMQb1NRQK0oBo+RGnItKbMSbaLXWueEowQkOJTfikIYZvDUaTfMXG0b2p+b+xJtoNMCBA3RBJjDCnaz0yy/d+3k33NCptxUXF+OFF17AV199hcLCQkRGRiIjIwPz58/HuHHjnBLamDFjkJGRgRUrVjhle63S6ewS254ff8TYK38XjuMQFBiIlKQkjB8zBo/MnYuYRpOnPrt0KRYtWwYAEIvFCFEqkd6rF26+4QY8cPfdkMlkf+3P9ddj75WZC6RSKcLDwnBV//6YPWMGbu7E93Df/Pl4d9MmbF63DrdNntxkl3R4bvly/N9nn6GwqAhBgYFI79ULOfPmYdJ117kkHsKDigrg6FFgwAC+IyFOQiU3HuXn5yMzMxO7du3C8uXLcfToUezYsQNjx47FvHnz3BoLYwymrowcUl/fatXOqYMHcfnECRzctQtP/vOf+G7vXvQdNgxHjx2zW69PWhqKTp7ExaNHsfuLL3Db5MlY8tprGDZhAqqrq+3WnTNrFopOnsS5w4fxycaNSO/VC9P+8Q/cO3++Q2HrdDps3rYNTzz8MNZ/8EGz1+/PycG27dvx5rJlOHngAHZs3YpbJ01CeUWFS+IhPLp4kSY9FRBKbjyaO3cuOI7DgQMHcMstt6Bnz57o06cPcnJy8MsvvwAALl68iEmTJiEwMBDBwcGYMmUK1Gq1bRvPPvssMjIysGnTJiQlJUGpVGLatGm2ZHDXXXdh7969eP3118FxHDiOQ35+vq168X//+x8yMzMhk8nw448/Qq/X4+GHH0ZkZCTkcjlGjBiBgwcPtr0jRmObzaojIyIQHRWFnqmpmHbLLfhpxw5EhIfjgUcftVvPz88P0VFRiI2JQb8+ffDQvfdi7/bt+PPECSx7/XW7dRX+/oiOikJ8XByuHjwYyxYtwtuvvYa1Gzfiuz17OvwdfPz550hPS8NT8+fj+337UHDpkt3rX/zvf3g6JwfX/e1vSOrWDZkZGXjo3ntx9x13uCQewrNjx4CyMr6jIE5AyY0nFRUV2LFjB+bNm9fibNghISGwWCyYNGkSKioqsHfvXnz77bc4f/48pk6darfuuXPn8Nlnn2H79u3Yvn079u7di6VLlwIAXn/9dQwdOhRz5sxBUVERioqKkJCQYHvvU089haVLl+LEiRPo378/nnjiCXzyySfYuHEjDh8+jNTUVEyYMAEVTUoqNmaztTrSAf7+/rh/9mz8tH8/SkpL21w3rWdPXJudjW3bt7e73VnTp0MVEoJtDlRJr9u0CXfcdhuUSiWuzc7Ghv/
+1+716KgofP3tt81Kjh3RmXgIzxgDDh1y+DdNPA8lN56cPXsWjDGkpaW1uk5ubi6OHj2Kjz76CJmZmRgyZAjef/997N271640ZbFYsGHDBvTt2xcjR47EnXfeidzcXACAUqmEVCqFQqFAdHQ0oqOjIRaLbe9dvHgxxo8fj+7du0Mmk2H16tVYvnw5rr32WqSnp2Pt2rXw9/fHunXrmgfY0DKyE7MmpfXoAQDIb2NuucbrdmQ9kUiEnqmpHVoXAM6cO4dffv0VU2++GQBwx5QpeO/DD9F4Fqh3XnsNP+/fj7Du3TH4mmvwyNNP46crpWpnx0M8hMFgbWBCA3x7NUpuPOnINHonTpxAQkKCXUkrPT0dISEhOHHihG1ZUlISgoKCbM9jYmJQUlLSoTgaz+h97tw5GI1GDB8+3LZMIpEgKyvL7vOu7ID16raTLcwa9p/juA6t25H1HF13/QcfYMI11yA8LAwAcN348ajSarHr++9t64waPhznjxxB7mef4dYbb8Sxkycx8rrr8Nzy5U6Ph3iQ6mrgt9/4joJ0gXBbS3q4Hj16gOM4nDx5ssvbkkgkds85joOlg0mnpSrRDqmvB7pw0j5x+jQAIKlbtw6tm9yB9cxmM86cO4fBAwd2aN2NmzejWK2GX3i43fL1H3yAcaNH25ZJJBKMHDYMI4cNw5Pz5+P5l1/G4pdewpP//CekUqlT4nHEsYsX8exHH+HQuXMorqyEQiZDekICHr/5ZtyQlWVb764VK7Bx165m7+8VF4eTq1fbnj/70UdYtHlzq5/349KlGJ6eDgBYu3MnPtizBycvXYKmthaxoaEY068fFk6bhqSoKCfupQcoLgZOngTaqF0hnouSG09CQ0MxYcIErFy5Eg8//HCzJKPRaNC7d28UFBSgoKDAVno7fvw4NBoN0q+cbDpCKpXC3IG+Z927d4dUKsVPP/2ExMREAIDRaMTBgwcxv2mrP4MB8PfvcAyN1dXV4Z2NGzFq2DBENEosLTl5+jR25OZiwSOPtLvdjf/9Lyo1Gtxy443trvv1N9+guqYGv+3da1dN++eJE5j94IPQVFUhRKls8b3pvXrBZDKhvr6+zeTmSDyOuFBSguq6Osy65hrEhoZCp9fjk59/xo3PP4+3587FvX//u21dmUSCdx980O79yia/tZuHDkVqTEyzz3l60ybU1Ndj8JUqZAD47fx5JEdF4casLKgCA5GnVmPtN99g+8GD+P311xF7pRQsGGfOAMHBQGws35EQB1Fy49HKlSsxfPhwZGVlYfHixejfvz9MJhO+/fZbrF69GsePH0e/fv0wY8YMrFixAiaTCXPnzsXo0aPtqhPbk5SUhP379yM/Px+BgYEIDQ1tcb2AgAA88MADePzxxxEaGopu3brhpZdegk6nwz/+8Q/rSp3ooF1SWor6+npU19Tg0JEjeOmNN1BWXo5t779vt57JZEKxWg2LxYLyigrs+eknPP/yy8jo1w+PP/SQ3bq6ujoUq9UwmUy4dPkyPt2+Ha+tXo0H7r4bY0eObDemdR98gInjx2NAv352y9PT0vDIv/6FD//v/zBvzhyMuf56TL/lFgwaOBBhoaE4fvIknn7uOYwdOdJumvuuxuOI6wYNwnVNvv8HJ05EZk4OXv38c7vk5icW446xY9vcXv/kZPRPTrZbVlBaikvl5bhn/HhIG9UMrHrggWbvn3z11RiUk4P3d+/GU7fe2pld8mxHjgBKJdDZWg7CC0puPEpJScHhw4fxwgsv4NFHH0VRUREiIiKQmZmJ1atXg+M4fP7553jooYcwatQoiEQi/P3vf8ebb77p0Oc89thjmDVrFtLT01FXV4e8NvryLF26FBaLBXfeeSeqq6sxaNAg7Ny5EyqVynqfrRNzYfUaPBgcxyEwMBApiYn429ixyJk3D9FNqrGOnTyJmLQ0iMViKIODkd6rFxY88kizTtwAsHbjRqzduBFSqRRhoaHIHDAAW9avx03XX99uPOqSEnz
1zTf4aO3aZq+JRCLcNHEi1n3wAebNmYMJ11yDjf/9L55+7jno6uoQGx2N6ydMwDNPPOG0eJxBLBYjITwcB8+cafaa2WxGrV6PYIWiw9v77/ffgzGGGWPGtLtuUmQkAEBTW9vh7XsVs9ma4IYN61JVPHEvjnWkZQPPtFotlEolqqqq7K6WAaC+vh55eXlITk6GXC7nKUIfUV9P04Q4Ub3BgLyCAiSXlEDeiZZ5tfX1qDMYUFVbiy8OHMDj772HqSNH4sMr/QfvWrEC7+/eDX+pFDq9HqrAQEwfNQrLZs1CYDtVygMefhiVNTW4sG5diw1iyrVamC0WXCwtxeItW/DlgQP4ZtEijHfy/UWP0qcPkJLCdxQ+ra1c0BSV3EjHmM2U2DzMo+vX4+0dOwBYS5w3X3013rrvPtvrMSoVnrj5ZlzVvTssFgt2HD6MVV9/jd/z8rDnxRfh1+heY2PHLl7EH/n5eOLmm1tt6Rk3ezb0V0akCQsKwhv33ivsxAZYG5dERVH1pJeg5Eba19Dsv4lrb70VP7TS5+vpRx7B001GIHGnF195BS++9lqLr428+mr8b+tWN0fkfPNvvBG3DhuGyxUV+L8ff4TZYoGh0RBoS2bNslt/2qhR6Bkbi3998AG2/vQTpo0a1eJ2P7wyosqMRi1Gm/rfwoWoNxpxoqAAH+zZg1pfuPCh6kmvQtWSpH2tVEcWXr6MulZOaqEqFUJVKldH1qqKykpUVFa2+Jq/XI44D2j91tVqyab+9swz0NTWYv/LL7da4qrT6xE4dSpmjxuHd5s00gGs/fKS58xBkL8/jnbw3u65oiL0feghLL/rLjzopnuMvKLqSd5QtSRxnjaqIz0hQbSG7+TKh1uHDcN9q1bhdGEhesXHt7iOv0yGsKAgVLQynNhPJ07gQkkJlsyc2eHP7R4Tg4EpKfhw717fSG5UPekVOjVCycqVK5GUlAS5XI4hQ4bgwIEDba6v0Wgwb948xMTEQCaToWfPnvj66687FTBxo1aqI4lnqjMYAABVbXxn1TodyrRaRLTSh+/DKwNq395GlWSLn63Xt/m5gtJQPen5lV4+zeHktmXLFuTk5GDhwoU4fPgwBgwYgAkTJrQ63JPBYMD48eORn5+PrVu34tSpU1i7di3i4uK6HHxjXlC76n30epp41IVsv1kHf7slLUwUazSZbC0j0xMSUG8woLqFZPPcli1gjOHvV13V4jY+/uknjOjdG90iIpq9bjKbUVlT02z5gdOncfTCBQxKTXVoP7xaRQVNj+PhHK6WfPXVVzFnzhzMnj0bALBmzRp89dVXWL9+PZ566qlm669fvx4VFRX4+eefbcNEJSUldS3qRiQSCTiOQ2lpKSIiImgcP2cxm62DIhOXYIyhVKsFZzZD4uD4nPetWgWtTodRffogLiwMxZWV+HDvXpy8dAmv3H03Av39ka9WY+D8+Zg+ahTSrlRR7vztN3z966/4+1VXYdKQIc22u/O331BeXd1q37aaujok3H03po4YgT7duiFALsfR/Hy8l5sLZUAA/tNktgrBo+pJj+ZQgxKDwQCFQoGtW7dicqMZi2fNmgWNRoPPP/+82Xuuu+46hIaGQqFQ4PPPP0dERARuv/12PPnkk3bDHjWm1+uhb9RZWKvVIiEhodWbiDU1Nbh06RKV3pyJSm0ux5nNiK+oQGArk7y2ZvP332Pdt9/i6IULKK+uRpC/PzK7d8dD11+PG68kLU1NDR565x38cuoULldUwGyxIDUmBjNGj8ZjN90EiV/z69rpy5fjk337ULxxI0IbDcTdwGA04okNG7D76FHkl5SgzmBAbGgosgcMwL+nTBHe2JIdERMDODBaEOkaRxqUOJTcLl++jLi4OPz8888YOnSobfkTTzyBvXv3Yv/+/c3ek5aWhvz8fMyYMQNz587F2bNnMXfuXDz88MNYuHBhi5/z7LPPYtGiRc2Wt7VDZrMZRgdPEqQVFRXA77/zHYWwMQaJxQIxXZB
5v5EjgZAQvqPwCR7VWtJisSAyMhLvvPMOxGIxMjMzUVhYiOXLl7ea3BYsWICcnBzb84aSW1vEYnGrJUHioLNnaS4rQjrqxAmg0cU+8QwOJbfw8HCIxWKo1Wq75Wq1GtHR0S2+JyYmBhKJxC7x9O7dG8XFxTAYDC2Oqi6TyZqNJUjc5PJloKqK7ygI8R5lZUBpKdBCIxzCH4daS0qlUmRmZtpmeQasJbPc3Fy7asrGhg8fjrNnz9rNL3b69GnExMS0OV0I4QFj1pvkhBDHNJ3Ml/DO4a4AOTk5WLt2LTZu3IgTJ07ggQceQG1tra315MyZM7FgwQLb+g888AAqKirwz3/+E6dPn8ZXX32FF198EfPmzXPeXhDnuHiRWkgS0hlVVUBhId9RkEYcvuc2depUlJaW4plnnkFxcTEyMjKwY8cORF1pKXXx4kWIRH/lzISEBOzcuROPPPII+vfvj7i4OPzzn//Ek08+6by9IF1nNgNXZscmhHTCqVPW1pOiTo2NQZzM68eWJE5y9ixVrRDSVf36AU7sx0vsOZIL6BKDAEajNbkRQrrm9GnqH+ohKLkRa2KjPoKEdJ1eD5w/z3cUBJTciMkE5OfzHQUhwpGXBzg4pBpxPkpuvu7SJeqwTYgz6fVAURHfUfg8Sm6+7sIFviMgRHioNoR3lNx8WUUFoNXyHQUhwkPHFu8oufkymo+KENeh0huvKLn5Kr0eKC7mOwpChKuwkO5n84iSm6+6eJFadBHiSiYTUFDAdxQ+i5KbL2KMGpIQ4g50nPGGkpsvUquBujq+oyBE+KqrgfJyvqPwSZTcfBHd6CbEfeh44wUlN1+j01knViSEuEdxMWAw8B2Fz6Hk5muohSQh7mWxWG8FELei5OZr6CAjxP3ootLtKLn5EqORbm4TwofSUup642aU3HxJSYm1GwAhxL3MZqCsjO8ofAolN19CVSOE8IeOP7ei5OYrLBZqJUkIn+h+t1tRcvMV5eU02zYhfKqvBzQavqPwGZTcfAVdNRLCPzoO3YaSm6+gg4oQ/tF9N7eh5OYLtFrryCSEEH5ptTSuq5tQcvMFJSV8R0AIaUDHo1tQcvMFdBObEM9Bx6NbUHLzBVVVfEdACGlAyc0tKLkJndFI99sI8SQ1NTQUlxtQchM6ukokxLNYLNaGJcSlKLkJHVVJEuJ56KLT5Si5CR0dRIR4HrrodDlKbkJHBxEhnocuOl2OkpuQUWMSQjwTNSpxOUpuQkZXh4R4JmpU4nKU3ISMqiQJ8Vx08elSlNyEjJIbIZ6Ljk+XouQmZHS/jRDPRQMouxQlNyGrr+c7AkJIa+j4dClKbkLFGKDX8x0FIaQ1lNxcipKbUBkM1gRHCPFMRiN1B3AhSm5CRaU2Qjwfld5chpKbUNFBQ4jno+PUZSi5CRUdNIR4PqphcRlKbkJFBw0hno8uQl2GkptQ0UFDiOej49RlKLkJFR00hHg+qmFxGUpuQkUHDSGejy5CXYaSm1AZDHxHQAhpDx2nLkPJTaiocyghno8GWnCZTiW3lStXIikpCXK5HEOGDMGBAwdaXXfDhg3gOM7uIZfLOx0w6SA6aAjxfHQR6jIOJ7ctW7YgJycHCxcuxOHDhzFgwABMmDABJSUlrb4nODgYRUVFtseFCxe6FDTpAEpuhHg+Ok5dxuHk9uqrr2LOnDmYPXs20tPTsWbNGigUCqxfv77V93Ach+joaNsjKiqqS0GTDqArQkI8HyU3l3EouRkMBhw6dAjZ2dl/bUAkQnZ2Nvbt29fq+2pqapCYmIiEhARMmjQJx44da/Nz9Ho9tFqt3YM4iA4aQjwfHacu4+fIymVlZTCbzc1KXlFRUTh58mSL7+nVqxfWr1+P/v37o6qqCi+//DKGDRuGY8eOIT4+vsX3LFmyBIsWLXIkNOLlGLMOkm40AgYjYDRc+b/hr+Um01/rNn5wnPUBACLRX//6+QFSKSCRWB9N/y8W87OvvsRkNsFoNtoeBpMBBrMRpiv
/N1lMsDALGGNgsJ7oG/7P4cp9+iv/AoCfyA8SPykkYj9IxVJIxVL4+UkgFUus/xf72db1CpTcXMah5NYZQ4cOxdChQ23Phw0bht69e+Ptt9/Gc8891+J7FixYgJycHNtzrVaLhIQEV4cqLB56gOv11gnCa2ut/9qSlwmAm49zkeivZCeXAwEB1odC8VeSJO0zW8zQGXSo1eugM9Si3lhvS2YW5tzqcT30gKG2jTU4SMR+kIglkPrJoJD6I0AWiACpAlI/qVNjcQr6obmMQ8ktPDwcYrEYarXabrlarUZ0dHSHtiGRSDBw4ECcPXu21XVkMhlkMpkjoZGmPCC5NU5kDY+G0pcnsFisMer1QE0NUFZ25QUOUPhbkxwlPHuNE1mtoQY6vQ51xnq4/cqkVcyWWHUGHTS6StsrErEEAbIAKKQKz0l4HnCcCpVDyU0qlSIzMxO5ubmYPHkyAMBisSA3NxcPPvhgh7ZhNptx9OhRXHfddQ4HSxzg5jMxY0B1NaDVemYicwizJmWdruWEFxQEKJXWqk2h0xv10NRpUKP3xETmGKPZCI1OA41OY1vWkPACpAEI9g9GoCzQvdWalNxcxuFqyZycHMyaNQuDBg1CVlYWVqxYgdraWsyePRsAMHPmTMTFxWHJkiUAgMWLF+Pqq69GamoqNBoNli9fjgsXLuCee+5x7p4Qe244aMxmQKP562E2u/wj+dNCwlMoAJUKCAmxlu6EokZfA02tNQnojDq+w3GpxgmvUFMIP5EfQhQhCPEPgVKhhFjk4huzVB3gMg4nt6lTp6K0tBTPPPMMiouLkZGRgR07dtgamVy8eBGiRl9YZWUl5syZg+LiYqhUKmRmZuLnn39Genq68/aCNOeig0avByo1gKbSWlLz5fvhDcmusNBaigsJsSa7oCDvOmdZLBZU1Wuhqa2Epk4Do9nId0i8MVlMKKspQ1lNGThwCPYPhkqhQogixDVVmFRycxmOMc8/PWm1WiiVSlRVVSE4OJjvcLzD998DVVVO2VRNzV+lM52wL+SdQiwGgoP/KtX5ubzZluOMZiM0tRpU1mmgratyesMPIVJIFVApVFApVFDIFM7ZaHg40KjBHWmbI7nAAw874hRyeZeSm9kMlJYBpSVAXZ0T4/IBZjNQWWl9cBwQGgpERQGBgXxHBmjrtCipLkFlbaWt6T3pGJ1BB51Bh0JNIRRSBaKCohAaGNq1qksaitBlKLkJVScPGp0OUKuBigqB30NzE8aA8nLrQ6GwJrnQUPf2sTObzSirLUOJtgR1RrpScQadQYe88jxcrLyI8MBwRAZFwl/q7/iGqFW4y1ByEyoHDhqLxVrKUKutVZDENXQ6IC8PuHgRCI8AoiJde+GuM9RBrS1GRU0FzIyuVFzBbDFDrVVDrVUjWB6MyKBIqAJUHW9xSSU3l6HkJlQdOGj0eqC01Pow+m4bArczmwF1sfURHAxERlrvzzmjbYHFYkGlrhJqrRo1erpScSdtvRbaei0kFRJEBkUiIiii/UYolNxchpKbULVx0NTVAZcuWVs90m0Xfmm11odEAsTEApERnWtpabFYoNaqUawt9unWjp7AaDaiUFOIQs1lhAWEIk4VB7mkleORqiVdhpKbULWQ3PR6a7P1snJQUvMwRiNw8YK1NBcXB4SFdawkxxhDWXUZLmkuUVLzOAzlteWoqK1ARFAE4kLiIPGT2K9CJTeXoeQmVI2uCI1G4HIRUKL27X5p3kCvB86fB4qLgfh4a1eC1lTUVOCS5hLqjfVui484joGhpLoEZTVliA6ORrQyGn7iK6deSm4uQ8lNqGQymC0c1MUMRUXU8tHb6HTA6dPWDuEJCfbdCLR1WhRUFKC2zQGEiaexMAsuV11GSXUJYpQxiAqNg4impnAZSm4CZLEAFy9yKD8uhVmn5zsc0gXV1cDx40CICgiNqEVZXQG09TS/oTczWUwoqCzAZZMWEeX9kBCa4F3T9HgJSm4CU1QEnDhhHbg4FHJ
I4DvJzWAy4p29H+J/R/egur4GqZFJuG/MDAxJGch3aF1itOhx4tIl1OSVIyjIOqiFJ4564goGkwnvfL0X/zt4FNV19UiNjcR9143BkLQUvkPrsjoxw++Xfsf5svPoHdMbUcFR7b+JdJgXjYBH2qLXAwcPAr/+ak1sAGCWdaJTqRdb/MUKfLT/c0zoOxqP/O0eiEQiPLJ5MY5cPM53aJ2m0ZfggvZPVBvKwZi1ZWV+vvVfX7D4wy/w0e79mDCoLx65+W8QcSI88vZmHDl3ke/Quswss3YTqK6vxoG8Azh84TCMJmoU5CyU3ASgsBDYvdvaCKExk0LJT0A8OFZ4Gt8e/wFzx87Ew9mzcdNVf8fKO55HjDICb+3awHd4DjNa9LhUfRIlunxYmnTAtlis33VhoRdPK9QBxy4U4tvDxzH3hrF4eFI2bhp2FVY+eAdiQpV464tdfIfXZaZA+/EpCzWF2H1qN4qrilt5B3EEJTcv1lBaO3y45U7YRh9KbrtO/gQxJ8LkqybYlsn8pLghYzyOXjoJdVUpj9E5pqG0pjO1XTyrrRV2KW7XkZMQizhMHnaVbZlM4ocbrs7A0fxLUFc6Z2BwvhiDms+TpDfpcTD/IJXinICSm5dqrbTWmCkwxG3x8O108XkkhMUhsMlo7X1ie1pfV+fxEZZD2iqttUbIpbjTl4qREBGGQLl9R+c+3WKtrxeq+QjLKRjHwRjY+swCVIrrOh+5LS0cej3wxx9tJ7UGFokMZokcYh/oB1VWU4nwQFWz5WFXlpVWV7g7JIdo9CUoqyvocFJrqqEUFxlpHdJLCMq0NQgPbj6VQpjSuqy0qtrdITmN2V/W7ujZDaW4uJA49Ivr17wDOGkTJTcvUlgIHD3q2DiQpgAlxBrhJze90QCJuPnBL7sytp/e5JmtRo0WPdS1ee1WQXZEQymuuto6+4C3t6jUG42Q+DVPALIrO6Y3em9RtaUqydYUagpRVlOG/vH9Ea2MdmFUwkLVkl7AYgGOHGn93lpbjAEhrgjJ48gk0haHn9KbDNbX/TxvDL9ao6ZD99Yc3m4tcOGC908sK5NIYDQ1L8nqr9S/yiTem71NgR1PbsBfpbijl47CC+aX9giU3DxcfT3w889AQUHn3u8rjUrCA1Uoq6lstrz8yrKIoFB3h9SmirrLuFxzptPVkO0xm60lfY3GJZt3i/DgQJRpm89sUF5lXRahDHJ3SE7jSMmtsfzyfPxy/hcYrly0kdZRcvNgGg3www/WudY6yxTgG8mtR1QKCsoLUaO3L64cKzwFAOgZlcxHWM1YmAVFtedQVn/J5TNhMwaUlFjn6bNYXPpRLtEjPgoFpeWoqbevUj52oRAA0DPOOzs9t9eYpD1lNWX44cwPqK733nuO7kDJzUMVFgI//WQtuXWFRSqHubXpNgTkmt7DYGYWfHZ4p22ZwWTEl7/nok9cT0QpI3iMzspkMeBSzQlUG8rd+rlVVVdaU3rZ+KLXDOgNs4Xhs58P25YZTCZ8uf939EmMQ5TKOy/cOtKYpD06gw4/nvmRWlO2wXsrrQWKMeDkSeDsWedt0xcalfSN64VxvYdj1e73UanTIF4Vg6//2IWiqhL8+/qH+A4PdaYaFNWcgYnx03eprg4ouAjExnrPFGJ9k+IwLqM3Vn25G5XVOsRHqPD1gT9QVFGFf0+/nu/wOq2zVZJNmSwmHMw/iLToNPSI6uGUbQoJx7zg7qRWq4VSqURVVRWChdLOuQUmk7XRiNrJ3XcCCk8j8Er1nJDpTQa8vedD7PhzD6rrapAalYT7Rs/A1d2vav/NLqQ1lEGty/OIhgAcB0RHW2cb8AZ6owlvf70HO379E9W6OqTGRuG+60bj6t7d+Q6t07Sp3VAX59xWj7EhschIyIBYJOxZBhzJBZTcPERtLXDgAFDT/P55l0lqKhF6/Efnb5i0iTGGsvoCVNZ7XtVRWJj1QdyvbHA/mBXOH/dV6a/E4KTB8JcKd0xZR3IB3XPzABU
V1oYjrkhsAGAMVMEi8ZK6KIGwMDMu157xyMQGAOXl1hkkvLGhiTcz+ctcktgAoKquCj+c+QFVOu8elsxZKLnxrKwM+OUXx/uvOUof4p0ty7yRhZlxueYMao0avkNpU3U1JTh304c1H0XHqds36bHv/D5U1nahibVAUHLjUUkJsH+/e2bJpuTmHhaLGZdqTjm9Y7ar1NYClynBuY0+LMTln2E0G/HL+V9QUevZQ865GiU3nhQXW0f0d9dJRR8cASbwm818M1tMuFR7EvUmF9Uvu4iu1tpVgBKca1kkfjC6qeO5yWLCL+d/QWm198yG4WyU3HhQVGSdVNStJxOxGIbgcDd+oG8xW0y4VHMS9aZavkPplLo64NIlSnCupA9VWpuruonZYsaBvAMo0Za47TM9CSU3N1Orrc39+WijqlfRoKuu0FBi05u9ezDH+nrgEpXgXMbV99taYmEW/HrhV5RVl7n9s/lGyc2NSkt5KLE1QvfdnM9iMaOw9jT0Ju9ObA3q64DLlynBORvjOBh4GlHFbDHjQP4Bn7sHR8nNTcrL3XuPrSUWicxnZglwBwuzJjZvu8fWHp2OGpk4myEkCKyF6XvcxWwxY//5/dDoNLzF4G6U3NygstJ9rSLbQ1WTzmFhFlyuOYM6kzAHr9XVWhs9ef4QD96BjyrJphoamWjrvKMlb1dRcnOxujpric0TEhsA1IdQcnOGUt0Fr2nu31k1NdZ+mKTr3NEFoCOMZiMO5B2A3uiZk/c6EyU3FzKbrYlN70G/I7MiCMZA/q8ivVllvRpVBt9oYl1ZCWiFncNdTq8KhkXuOSME1Rnr8OuFX2EReL0zJTcXOnLEOt2Ip9FFJvEdgtfSGbUoq7vIdxhupVYDdcKeVMKl6mI9ryFXRW0FjhYe5TsMl6Lk5iJnzlhbnXmi+tBYWPykfIfhdYxmPS7XnnH5JKOehjGg6LJ11griGLNM6jFVkk1drLiIvNI8vsNwGUpuLlBcbJ2TzWOJRKgLT+A7Cq/S0OTfwjzk5qmbmUzURaAz6mIi3dpx21HHLh8TbB84Sm5OVl0N/PYb31G0ry4qCcyDDzpPU6w7D4O5ju8weFVfD6h9c7CLTmEch7oY/meAbwsDw68XfoVOL4x+mo1RcnMig8E6J5s3VN+YZQoYgj37wPMUZXWXUGOkUdYBoFprnaKJtE8fEQqLVMJ3GO0ymo04kH8AJrMXnLgcQMnNSRgDDh2ydoD1FtSwpH1aQzkq6j305ilPysutswmQtuliI/kOocOq66vx20UvqHJyACU3Jzl+3Pv6BBlCImGWKfgOw2PpTTqodcK94d5ZjFkH/zYY+I7Ec5kCFG6bAcBZirXFOFV8iu8wnIaSmxOUlQHnz/MdRSdwHHQRiXxH4ZEszIJi3XkwRi0oWmKxAMVqGsGkNd5UamvsjPqMYCY6peTWRSaTtT+bt6qL6AbG0c+gqYr6y14/yr+r1dcBGg3fUXgeJhajPjKM7zA6hYHhSMERQXTwprNaFx0/bh1iy1sxiRT14fF8h+FR6k21qKwv4jsMr1BWRtWTTeliIngdJLmravQ1OKX2/upJSm5dUFYGXLjAdxRdVxPXi0pvV1iYBWpdns911O4sxqh6sjGLnxi13WL5DqPLzpWc8/rqSTqjdZK3V0c2ZpHKoYtK5jsMj0DVkY6j6sm/6OJjwCR+fIfRZUKonqTk1kneXh3ZVG1sD1jEnt8nx5X0Jh0q9VQd2RlUPQlYJBLUxnveOJKd5e3Vk51KbitXrkRSUhLkcjmGDBmCAwcOdOh9mzdvBsdxmDx5cmc+1mMIpTqyMeYngS6mO99h8Oav1pFUv9YZVD0J1CTGAmLvvdfWEm+unnQ4uW3ZsgU5OTlYuHAhDh8+jAEDBmDChAkoKWl7XJ78/Hw89thjGDlyZKeD9QRCqo5sqjYqBWaJnO8weEHVkV3ny9WTJn8Z6ry0+X9bvLl60uHk9uqrr2LOnDmYPXs20tPTsWbNGigUCqx
fv77V95jNZsyYMQOLFi1CSkpKlwLmm9CqI+2IxaiN68l3FG5H1ZHO46vVk7WJ8R49QHJXeGv1pEPJzWAw4NChQ8jOzv5rAyIRsrOzsW/fvlbft3jxYkRGRuIf//hH5yP1AJWVwquObKouohtMsgC+w3ArdV0eVUc6CWNAO5U4gmMMVKA+yjv7tXXUuZJzqK6v5jsMhziU3MrKymA2mxEVZX/TNCoqCsXFxS2+58cff8S6deuwdu3aDn+OXq+HVqu1e3iCEyf4jsANOA418Wl8R+E21YYK1JtooERn0ul8a+zJmmThTx/FwHCiyLtOgC5tLVldXY0777wTa9euRXh4eIfft2TJEiiVStsjIYH/H09JiXXAWF+gD4uFUaHkOwyXY4yhvP4S32EIkreNs9pZhpBgGEKFf6wAgFqrRkWt90wJ4VByCw8Ph1gshlqttluuVqsRHR3dbP1z584hPz8fN9xwA/z8/ODn54f3338fX3zxBfz8/HDu3LkWP2fBggWoqqqyPQoKChwJ0yV8otTWSHW3PnyH4HJaQxkM5nq+wxAkvR7QelctlsMYx6E6hf8Lb3fyptKbQ8lNKpUiMzMTubm5tmUWiwW5ubkYOnRos/XT0tJw9OhRHDlyxPa48cYbMXbsWBw5cqTVEplMJkNwcLDdg0+FhYCH1Iy6jTE4TNBT4liYGeV1VGpzpfJyYXcN0CXEwBTkW/enK2oroNaq21/RAzjclT4nJwezZs3CoEGDkJWVhRUrVqC2thazZ88GAMycORNxcXFYsmQJ5HI5+vbta/f+kJAQAGi23FNZLMDJk3xHwY+ahN6QVZVALMBZejX1apiYke8wBM1oAKqqgCuHvKCYFP7Wfm0+6GTRSUQGRYLz8NahDie3qVOnorS0FM888wyKi4uRkZGBHTt22BqZXLx4ESKRcAY+uXDBuyYgdSYm9oM2eQBUJ1tvCeuNzBYTKqjpv1tUVADBwYCATglgHIeqXsnC2ikHaOu1KKwsRHyoZw+4zjEvaAOt1WqhVCpRVVXl1ipKsxnIzbXeP/BlQflHoSjJ5zsMpymtK6BR/90oPBwIDeU7Cuep7RaLmmTPPrG7mkKqwNheY91ekHEkF/jmpUcHnTtHiQ2wVk8KZcZuk8UAjd477hkIRUWF9UJRCHy5OrIxnUGHC+We3emXklsrDAZrciN/VU8KQXldIc2u7WYWizXBeTtfr45s6kzJGZjMJr7DaBV9S604f946jiSxMgSHe33rSaNFD63BRzpgeRiNBjB5eelNFx8NU3Ag32F4DL1Jj/zyfL7DaBUltxZYLMIfZqszvL16skpfSpOQ8oQxQFvFdxSdZ1L4oyYpju8wPE5+Wb7HDl1Hya0Fly/75uCv7fHm6kkLs0CrL+U7DJ9WVeWd/d6oOrJ1dcY6lFR75mCi9G21ID+f7wg8lyE4HDWx3jdzQI2xkvq18cxo9M5uNTUpCVQd2Yb8sny+Q2gRJbcmtFrr6P+kdbXxvVCviuE7DIdQC0nPUKnhOwLH1EWFQxfffGhB8peS6hLoPHCgB0puTeTl8R2Bd6hKGQiTgt9h0TpKb9Kh3lTDdxgEQJ3OWoLzBsbgQGh7JvEdhlfwxIYllNwaMRqt40iSDhCLoekxGBY/Kd+RtItKbZ6DMUDjBQ1LzFIJNH160H22DrpYcdHjZuumb66RggLhdDZ1B7NMAU3qIDDOc39GFosZ1UYBdLISEG2VtUWyp2IiETR9esAilfAditcwmo0o1HhWycDhsSWFjJr/O84YHIbqxL4Izv+D71BaVGUog4UJ74rFaDbhk/178dOpo6jV1yMhLBK3Xj0G/RJS+A6tXWYzUFNjHXPSE2l7JlMDkk7IL8tHQqjnTAHkuZfcblZWZj3giOPqIhM9toN3lcEzmyl31Tu5X2DH7/sxrGdf3DHybxCJRHhl+2acunyR79A6RKPhO4KW1SbEoD4qjO8wvJKmTgONTsN3GDaU3K6g5v9dU53YF4Ygzzo
p6ExaGMx1fIfhdOfUhfjlzHHcdvVYTB+ejWv6XIUFk+5AWKASm/ft4ju8Dqmvtz48iT40xOcHRO4qT+oWQMkN1mG21NTmoGs4DprUQR41gkm1vpzvEFziwLmTEHEcrulzlW2Z1M8Po9MzcLb4EsqrvaDFBoBqD6opMSn8UdW7O+Dhc5R5uqKqIo9pWELJDUBpqWff4PYWTCJFZa+rYZbI+Q4FAFBr1PAdgktcKC1GdEgY/KUyu+XdI62j1V8o844rtVoPSW5muQyV/XuB+Yn5DsXrmSwmlNd6xkUlJTcAxcV8RyAcZnkAKtOGwiKRtb+yC9WZagQ7IolGV4MQRfMGDyEB1mWa2mp3h9QpBgP/w9yZZVJU9k+DReb5XVq8hVrrGRdXPp/cGANKhNnmgDdm/0BU9rqa1z5wQi21AYDRZISfuHkpQyK2Nn42ePA0JE3x2YjLLJWgckAazP78XogJDSU3D1FZyf/VoxCZFMHWBCfmp6+QkJObxE8CUwsdMo1XkppU7D09fGpr+flci8TvSmLzjCp0IdEZdNDWafkOg5IbVUm6jilAyUuCM1r00Js9b6w7ZwlRBEKja17k0Vy5iRUSEOTukDqtvt79AydYJBJU9k+DWeHv3g/2IZ5QevP55EatJF3LFBhivQfnxipKIZfaACAxPArFmnLUGfR2y8+pC22vewvG3Ft6M0slqMhIgynQc1r1ClFxFf+lBp9ObrW11HHbHUwBSlSkDXNbI5Mag7CndRjcvTcsjGHXscO2ZUazCd+f/B3do+IQFqTkMTrHuSu5mWVSVGb0phKbG2jqNNAb9e2v6ELeUznvAlQl6T5mRRAq0oZBdXIfxEbX9d61WMyoM3tHa8HOSo2OQ1b33vj4l93Q1ukQpVThx5N/oKy6Cvdccz3f4TmsttbaFceVYxSb/GXWVpFyajziLmqtGt3CuvH2+T5dcqPk5l5m/0BUpg936VQ5NSaNx05770z3ZU/ChP5Z+OnUUXzww06YLRbkTJyKtNhEvkNzmMUC1LlwIBljUAAqB/SmxOZmxVp+T7A+W3IzGmlSUj6YZQpU9B6O4PO/QV7p/B+/0O+3NZD6+WH68GxMH57NdyhOUVsLBAQ4f7t1UWHQ9kymqWt4UFZTBrPFDLGIn87xPvuNV1RYb2YT92NiP1T1GIya2J5O3zZNSuqdnF1yYxyH6pQEaNO6U2LjidliRlUdf0PB+ey3XuUdw+8JWm18L+t8cE66sjNbTDBa+L2JTTrHYHDeEHgWPzE0fXtAlxDjnA2STuNzlgCfTW6eOuWGr9GHxqAifQTM0q63YKs389QjmHQZY4DeCdclJn8ZKgamwxAa0vWNkS6jkhsPqOTmOUyKYJT3GQVDYGiXtqM3UXLzZl1NbvpQJSoG9qGm/h6ESm5u5olzSfk6JpGiMm0odBGdb+1Xb6Hk5s26ktxq46Kh6dsTTOKzbeQ8Uq2+Fiaexjr1yeRGpTYPJRKhOrk/tEn9O3UfTm8S7pBbvqAzF5xMLEZVWgpqUrvRXGweiIFBW8/POJM+eZlDyc2z1UUmwhAcjuC83yGt7tjcUNSYxPs1NCrpaONGvSoY2p7J1H/Nw2l0GoQGdO2WQ2f4ZMmNGpN4PrM8AJW9h0HbrW+HSnHUmMT7dbRRCROLoe2ZDA2NOOIV+GpUQiU34tHqopNhCIlstxRHjUmEQa8H/NtoD0KlNe/DV6MSnyu56fXUmMTbdKQUR41JhKG1khuV1rwXX41KfK7kRlWS3qutUhw1JhGGli48qbTm3Roalbj7vpvPldxoihvv1lIpzsLM1JhEIAyGv/5v8aPSmlBU17t/pg6fK7lRlaQw1EUnQx8Wi4DC0xAXn+Y7HOIkjAEmxkEfH4XaxBgwiXtncSeuUe/Caa5aQ8mNeC2LRIbqpH7QhYZAe6IAwZqOdRsgnolxHLSqMFweEAeZikpqQqI3ub9mhZIb8Xp1UjGKErujIiIaEcUFCKjmp9Mo6bwapQql0XE
wyBVQigBKbcJCJTc3cMbgrMSzGM3WA0evCMCllDQoqrUIL74Efx3dYPV0usAglEYnoD4g0Las8X03IgyU3NzA10pudXU1+PTT5Th1aj/OnDmAmppK/POf72HcuLvs1jt9+gByczfg9On9yM//A2azCV980fKEd19/vRp//LELp0/vR1lZAa65Zhbmz9/QbixvvTUH33zzLgYNmohnntnuhL2zatqYRBcUjItB6QjUVCCi+BKkeh/70r2AXq5AaUw8aoNDmr1mNLo/Hk9gNBjx4aoPseerPajR1iCpRxJmzJuBgUMH8h1al/FRLelTrSVNJsBs5jsK99Jqy7B582JcunQCyckDWl3v11+/xrffvguO4xAdndLmNrdtW4ajR3ehW7c+EIs7dn105syvyM3dAKlU7lD8HWGwtJy8akJCkderHy536446RWCL6xD30gUG4XJid+T37NNiYgN8N7mteGYFPv/gc4y+djTuefweiEQiLH5oMY7/dpzv0LpMb9SDuXl2aJ8quflaqQ0AQkNjsHFjEVSqaJw58ysefXRwi+tde+0DuOWWJyGT+WPNmgdRWNh6C8QXX9yLiIhu4DgOU6a0nzQYY1i79mGMHTsTf/yR2+l9aY3R3MZVIcehWhWGalUYZLpaqMpLEKQph8hZM2OSdllEYmhVYagMj4RBrmh3fV+sljx99DR+2PEDZj8yGzfNugkAcM0N1+DBWx/Ehtc24KX3X+I5wq5hYDCYDJBJ3Hc31adKbr6Y3CQSGVSq6HbXU6miIJN1bB6syMhEcA6MwL579yZcuPAn7rzzhQ6/xxHGVkpuTekVAShOSMa53hkoie0Gg8z5pUjyF71cAXVcIs6lZ0Adn9ShxAYARn5mSOHVT9/9BJFYhAm3TLAtk8qkGD95PE7+cRKlxaU8Rucc7r7v5lMlN2pM4n46XTU2bnwSt932dIeSbGe0Vi3ZGoufHyojolEZEQ1FtRYh5WoEajXg3FxtIkSM41CtDIUmPBJ1AUGd2obRB0tu50+eR1xiHBSB9hcAPfv2BADkncpDRHQEH6E5Tb2pHkoo3fZ5PpXcfLHkxrctWxZDKvXHpEmPuGT7FmaB2dL5mzS6oGDogoLhZzQg5EqVpZSughyml/tDGxKGqtAImLvY8doXqyUryyqhClc1W96wrKK0wt0hOZ3e6N7jyqeSG52z3Kuw8DS+/PJ1PPbYfyFxUV17QzeArjJJpCiLjkdZdDyk9XUI0GoQpK2EXFdLJboWMI5DXUAQaoJDUBOsglHmvO+XMWvjLz8fOjsZ9AZIWrgokMqkAAB9vfefvKha0oV8raUk39au/SfS0oZh2LBbXPYZFji/YYhB7g+D3B+VkTEQm4wIqK5CYFUlAqq1EFl890dkFotRGxSCmuAQ1AYpYXFh9vG19j5SmRTGFpqJGvTWYqxMAGNrWph7v9RONShZuXIlkpKSIJfLMWTIEBw4cKDVdbdt24ZBgwYhJCQEAQEByMjIwKZNmzodcFfQBbj7/P77Lhw+vAM33PBPqNX5tofZbILBUAe1Oh86XddHEmEuPmDMfhJoVeG4nNQDZ/sMxKXkntCERcIokbr0cz2FUSpDZXgUClLScC59IIoSu6NaFebSxAb43rGqClehsqyy2fKGZaER7p/J2tncndwc/oVu2bIFOTk5WLNmDYYMGYIVK1ZgwoQJOHXqFCIjI5utHxoain/9619IS0uDVCrF9u3bMXv2bERGRmLChAktfILr+NrVIJ/Kyi4CAJYsubnZa+XlhZgzJxn/+MdrmDRpfpc+h8F9Z0EmEqE2OMTWP8vPoIdcVwt5nQ7yuhrI63QQm7y3qZ/JTwK9vwL1/gGoV1gfJh9J4nxL6ZWCo78eha5GZ9eo5NTRUwCA5F7JfIXmNB7fz+3VV1/FnDlzMHv2bADAmjVr8NVXX2H9+vV46qmnmq0/ZswYu+f//Oc/sXHjRvz4449uT26+djXIp/79r8HTT3/abPnKlfciIiI
RU6b8C4mJ/br8Oe5Mbk2ZpDLUSGWoCfnrqtpbEp6nJzJfO1aHjR+GT9//FDs/2Wnr52Y0GJH7eS569uvp9S0lAfcfqw4lN4PBgEOHDmHBggW2ZSKRCNnZ2di3b1+772eMYdeuXTh16hSWLVvW6np6vR76Rq0/tFrnDITrawdMg+3b30JtrQYVFZcBAAcOfImysksAgOuvfwgBAUqUlFzA7t3W6uKzZ38FAGzZ8jwAa7+2sWPvtG3vwIEvkZf3OwDAZDIiP/8P27pZWTciObk/IiK6ISKiW7NY3n13PkJConD11ZOds3Me9qW2lvBk9XXwMxohNhkhMRrgZzJCbDTC78r/ndlohXEcTH4SmCRSmCQSmP0kMF75v0kihUHu71GJrCUe9rW6XK9+vTB8/HC8/+b70FRoEJMQg11f7kJJUQkeevYhvsNzCo+uliwrK4PZbEZUVJTd8qioKJw8ebLV91VVVSEuLg56vR5isRirVq3C+PHjW11/yZIlWLRokSOhdYivHTANPvvsZZSUXLA937dvG/bt2wYAGDPmDgQEKKFW5+HDD/9j976G5337jrZLbj///Al27dpoe37+/G84f/43AEB4eDySk/u7bF+a4rPk1lEmqQwmadsNAsQmoy35+RkN4BgDxyzgGADGwIEBDAAHMHAAx4FxAONEYCLRlWRmTWRmP++fA80Xj9VHnn8EH660H1vyP2/8B30z+/IdmlO4u1qSYw584uXLlxEXF4eff/4ZQ4cOtS1/4oknsHfvXuzfv7/F91ksFpw/fx41NTXIzc3Fc889h88++6xZlWWDlkpuCQkJqKqqQnBwcEfDbebwYaCwsNNvJx6oxlCJ4xU/8h0GcbJ+/QD/jg2YQ7xEYlgi+sd37cJXq9VCqVR2KBc4VHILDw+HWCyGWq22W65WqxEd3froEyKRCKmpqQCAjIwMnDhxAkuWLGk1uclkMsic2G+mgQMjRhFvQV+qINHXKjwizr2jPTr0aVKpFJmZmcjN/WvwW4vFgtzcXLuSXHssFotdycxdRD41kqZv4EBnQSGi5CY87j5WHW4tmZOTg1mzZmHQoEHIysrCihUrUFtba2s9OXPmTMTFxWHJkiUArPfPBg0ahO7du0Ov1+Prr7/Gpk2bsHr1aufuSQfQASM8lNwI8Q6ODLbuDA4nt6lTp6K0tBTPPPMMiouLkZGRgR07dtgamVy8eBGiRkWk2tpazJ07F5cuXYK/vz/S0tLwwQcfYOrUqc7biw6i5CY8nJurOoh70LEqPO6ulnSoQQlfHLmJ2Jbjx4Fz55wYGOGd3lyH30u/4zsM4mRXXeVbY0v6gl5RvdAzumeXtuFILvCpy14XtFEhPJOI6EsVGo6jxCZEcol750/0qeQmp7kpBUfEiSAWeX+/LvIXqWf3Lyed5M5ZuAFKbkQApCL6YoXEwwdPIZ0k96OSm8tQtaQwSSi5CYqEqiQFiaolXYhKbsIkEdNVi5BQtaTwcOAg9XPvF+tTyc3PDxCL+Y6COBtVSwpLCxNSEy8nk8jc3s/Np5IbQKU3IaIWk8JCyU14ZH7uP0YpuRGvJxHTlyokVC0pPO6+3wb4YHKjRiXCQ9WSwkLJTXgoubkBldyEh5KbsAhgOjrShLu7AQCdGFvS21FyEx4/H2wtaTIasfeLD3H0lz2o19UgMi4JYybPQEr6QL5D6xKOo64AQuTuDtyAD5bcgoL4joA4m5gTQyZW8B2GW32xYQX2f/s5+g4Zjb9NvQcikQib31iMi2eO8x1al9AEpcIUJHf/idfnkptSyXcExBUUEt/5YgvzTuP4wR8w9uaZyL51Nq4a9Xfc8ejzUIZFYNcnG/gOr0sCAviOgDgbBw5Kf/cfnz6X3GQyqpoUogC/EL5DcJuTh34CJxLhqpETbMv8JFJkjBiPS+dPoqqilMfoukbhWwVwnxAoD4RY5P4Oxj6X3AAgJITvCIizBfhQya244DzCouIg87fPBLFJ1ulE1AV5fITlFAG
BfEdAnI2PUhvgo8mNqiaFJ0ASwncIblNTVYlAparZ8oZl1ZoKd4fkFBwHKOiem+BQcnMjSm7C4yeS+EyjEqPBAHEL7eX9rgynbzLq3R2SU/j7AyKfPCMJW4gihJfP9cmfElVLCpOvNCqRSKUwm4zNlpuMBgCAHw/Nrp2BGpMID1+NSQAfTW7UqESYfKVRSaBShZqqymbLG5YFhYS6OySnoMYkwsNXYxLAR5MbQKU3IfKVRiVRCSkoVxdCX6ezW16Yd+rK68l8hNVl1JhEePgqtQE+nNzovpvw+Eqjkt5XDQOzWHD4h522ZSajEb//nIu45J5QhkbwGF3nUGMSYeIzufnsQDeU3ISnoVGJ3qxrf2UvFpfSC70zh2P3tveh02qgiozBH/t2oaqsBNfPfIjv8DqFGpMIE1+NSQAfTm6hodarRcb4joQ4U5A0rFl1nRBNuvsR7PncOrZkna4GUfFJmPrQf5DYsy/foXUKDYsnPGKRmEpufJBIrAmuvJzvSIgzhciiUFZXwHcYLucnkSL71tnIvnU236E4Bd0DF56IwAjeGpMAPnzPDQCioviOgDibUhoBjvPpn7XXEYup5CZEUcH8nmB9+iwQHc13BMTZxCI/BEnC+A6DOECppPttQkTJjUcBAUAgNT8WHJWcrlq8iar5SGLEy6kUKl7mcGvMp5MbQKU3IVLJ6Ev1FhxHLZeFiO9SG0DJje67CZBULPeZobi8XWAg4OezzdqEK1rJ/wWmzyc3lQqQSvmOgjhbiIyuWrwBVUkKj0Kq4GXm7aZ8PrlxHJXehIiqJr0DdQEQHk+okgQouQGg5CZEARIlJCIaHduTyeU0gLkQRQd7xoUlJTcAkZHUFFmIqGrSs1GVpPBIxBKEBXpGVxw6pcPaiZRaTQpPuH883yGQNoR5xjmQOFGMMgYcx/EdBgBKbjaJiXxHQJwtSBoKfz/+b2yT5gKDaP42IUoKS+I7BBtKbleEh1OHbiGKVCTxHQJpQVQk3xEQZ1MpVFAqPKcLDiW3RpKS+I6AOFu4PB5ijjpSeRI/P7rfJkSeVGoDKLnZSUigDqVCIxb5IYzuvXmUiAhqwCU0UrEUsSGxfIdhh35ijfj5AXFxfEdBnI2qJj0Hx1lbJxNh6RbWDSIPu2LxrGg8AFVNCo/CLwiBklC+wyCwjiMp43c8XeJkHDgkhnpeizxKbk0EB1snMSXCEkWlN49ApTbhiQiKgELmeU1fKbm1gEpvwqOSx0AioiIDn2QyGm5LiJLCk/gOoUWU3FoQE0NVJ0Ij4kSI8O/Gdxg+jUptwqOQKhAZ5JlfLCW3FohEQDc6DwpOhCIRHEc/eT6IREB4BN9REGdLCkvymBFJmqIjvRXduwMSCd9REGeSif0RLk/gOwyfFBUFSKibjaDI/GQeWyUJUHJrlURiTXBEWOICe0LEifkOw6eIxdaqfiIsPaN6Qizy3GOJklsbUlJoSg6hkYrliFak8B2GT4mJpcERhCZAGoBuoZ5974aSWxvEYqBHD76jIM4WHdAdYhHVObuDRErzJQpRr+heHtdpu6lORbdy5UokJSVBLpdjyJAhOHDgQKvrrl27FiNHjoRKpYJKpUJ2dnab63uaxEQgIIDvKIgz+YkkiA2gqxZ3iI8DxJ59DiQOCpYHI07l+UM5Ofyz27JlC3JycrBw4UIcPnwYAwYMwIQJE1BSUtLi+nv27MH06dOxe/du7Nu3DwkJCfjb3/6GwsLCLgfvDhwH9OrFdxTE2aIUyZCK/fkOQ9DkcutsG0RYesf05juEDuEYY8yRNwwZMgSDBw/GW2+9BQCwWCxISEjAQw89hKeeeqrd95vNZqhUKrz11luYOXNmhz5Tq9VCqVSiqqoKwcHBjoTrNN9/D1RV8fLRxEVKdReRp/2d7zAEKzWVRvsRmrCAMAxLHcbb5zuSCxwquRkMBhw6dAjZ2dl/bUAkQnZ2Nvbt29ehbeh0OhiNRoS28avX6/XQarV2D7719o6LFeKAcP8
EyP1oEj9XCAigxCZE3lJqAxxMbmVlZTCbzYhqcoc4KioKxcXFHdrGk08+idjYWLsE2dSSJUugVCptj4QE/vsmRURQFYvQcByHhEDvOVi9iQccssTJooOjoQrwnon43Hqrd+nSpdi8eTM+/fRTyNtoY79gwQJUVVXZHgUFBW6MsnVUehMelTwagRLvOWC9QbDSOgA5EQ4OHNJi0vgOwyEOJbfw8HCIxWKo1Wq75Wq1GtHR0W2+9+WXX8bSpUvxzTffoH///m2uK5PJEBwcbPfwBCEhNKiyECUF96dhuZxEJAKSPG/2E9JFqZGpCJIH8R2GQxw6oqVSKTIzM5Gbm2tbZrFYkJubi6FDh7b6vpdeegnPPfccduzYgUGDBnU+Wg+Qng4oPG92B9IFCkkwdQ1wkoQEGvhAaILkQegZ1ZPvMBzm8OVqTk4O1q5di40bN+LEiRN44IEHUFtbi9mzZwMAZs6ciQULFtjWX7ZsGf7zn/9g/fr1SEpKQnFxMYqLi1FTU+O8vXAjsRjIyOA7CuJssQE9oJAo+Q7DqwUFUYdtoeHAISMhw+M7bLfE4UFxpk6ditLSUjzzzDMoLi5GRkYGduzYYWtkcvHiRbs/xOrVq2EwGHDrrbfabWfhwoV49tlnuxY9T8LCrNWT+fl8R0KcheM4pARn4FjFD2DMwnc4XkckApKT+Y6COFtqZCpCFCF8h9EpDvdz44Mn9HNrymwG9uwBdDq+IyHOVFhzGoU1p/gOw+skJlKpTWiC5EEY1WOUR5XaXNbPjfyFqieFiaonHUfVkcLjzdWRDbw3cg/QUD1JhKOhepJaT3YMVUcKkzdXRzagI7iLqPWk8FDryY6j1pHC462tI5ui5NZFVD0pTFQ92T6qjhQeDhwGJgz06urIBt6/Bx4gLIxm7RYajuOQohxIs3a3Qiy2TuZLhKVnVE8oFcK4qKPk5iS9e1vHnyTCofALQopyIN9heByOs474L5PxHQlxphhlDHpGe391ZANKbk7CcUBmJk1sKjSh8hjEBgrngHeGhARAKYyLe3JFsDwYA7sJ60KOkpsTSSRAVhbg53DXeOLJ4gN7QSWP4TsMjxAeDrQzjCzxMlKxFFnJWRCLhFUFT8nNyQIDrSU4juM7EuJMKcqBUPh5xgACfAkMoq4vQiPiRBiUNAj+UuHNSk/JzQUiI4E075odgrRDzInRQ5UFP5GU71B4IZUCPVKt/dqIcPSN64uwwDC+w3AJ+qm6SGoqEBfHdxTEmWRif6SGDPK5Dt4iEdCjh7XanQhHUlgSEsOEOz+Rbx2lbpaRYZ0DjghHsDQMiUF9+Q7DrZKTqaGU0IQFhKFvnLB/x5TcXEgkAgYPphEchCZSkYhIRRLfYbhFbKy1HycRDoVUgUFJg8AJvGEAJTcXk8utCY5aUApLYlBfKGWRfIfhUqGhQHw831EQZ5KIJchKzoLUT/j3jim5uUFICDBkiHVUByIMHMehR8hgBEvD+Q7FJVQqGnVHaCRiCYamDEWQPIjvUNyCkpubhIZa+8BRazPhEHEi9FBlIVASyncoTqVUWhObwGutfIqfyA9DkocIZmitjqBTrRuFh1urKCnBCYeYE6OXaggCJSq+Q3GK4GAgtQf9RoVELBJjSMoQqAKE8RvtKPoJu1lkpLWTN508hEMs8kNP1RAESEL4DqVLgoKAHj0BMf02BUMsEiMrKQuhAcKqXegI+hnzIDqaSnBC4yeSoJfqaq8twQUHAz17UWITkoaqyPAgYd4Xbg/9lHkSGWm9B0eNTISjIcEFSb2r7bxSCfSkEpugSMQSXJ1ytWBHH+kI+jnzKCLC2oqSugkIR0MVZbDUO+Y/Uqmso49QLYJwNCQ2X7vH1hT9pHkWFgZcfbV17D4iDGJOjJ6qLKhknj18fliYdZg4SmzCIZfIMaz7MIQoQvgOhXf0s/YAKhUwcqT1vgcRBms3gcGIDejBdyjNcJy1czY19xeWEP8QjOwxEsH+dCIBKLl5DIUCGD6
c5soSmvigNHRXXgUR5xk3V8ViazVkbCzfkRBnilfFY3jqcMglNNZfA0puHsTPz9qKsidN/CwoYf5x6B06HBIRvycemRxIT6fBvIWEA4feMb0xsNtAiKh+2Q79NTxQr17AoEHUklJIAiRK9A0bxVtXgeBgoE864C+8OSl9VsM4kamRqXyH4pEouXmomBhgxAg6GQmJRCxDWugwhPsnuPVzo6KsF0zUKlc4AqQBGJE6ApHBwh68uysouXmw4GBrQ5NQ3xtcQLBEnAgpygx0C+oDDq5tzcFx1rnYEhOp4YiQRARGYGSPkQiUB/Idikej5ObhZDJg6FDrCYoIR3RACnqqhkAscs301hIJkJZm7UtJhCMlPAVDUoZA4kfToreHkpsXEImA/v2t9+FkMr6jIc6ilEWgX9gYp88Lp1IBfftax4okwiCXyDEkeQj6xPUR/CSjzkK18F4kJsba8fboUeDyZb6jIc4gFcvRSzUEpXUFuFh9DGaLsdPb8vMDEpOAMKrGFpQEVQL6xPah0pqDKLl5GanUOqtAbCzwxx+AwcB3RMQZIvwToJRGIE/7O6r0JQ6/X6UCkpKs1ZFEGOQSOQbED6BGI51Eyc1LUSlOeDpTiqPSmjBRaa3rKLl5MSrFCVNHS3FUWhMeuUSO/vH9ERUcxXcoXo+SmwBQKU542irFUWlNmKi05lyU3ASioRQXFwecOAHU1PAdEXGGhlJcQc0JVNQXIjyCIT6OSmtCEiwPRnpsOiKCqN+GM1FyE5joaOuIFAUFwKlTQH093xGRrpKK5RjZeyDiklJxUXsCaq2a75CIEyikCvSK6oU4VRw173cBSm4CxHFAt27WaU3y8oAzZwBj51uYEx5FRFg7Y1sHOw5CdFgWKmorcKLoBCpqK3iOjnSGzE+GHpE9kBiWSIMduxAlNwETiaxzdnXrBpw7B5w/D5jNfEdFOkKpBHr3bnmEkdCAUAxPHQ61Vo2TRSehrde6P0DiMD+RH7pHdEdKRAr8xHTqdTX6C/uAhqGYkpKA06eBixcBxviOirQkIMD6XXVkvrWo4ChEBkWisLIQp9SnoDPoXB8gcZiIEyEpLAk9onpA6iflOxyfQcnNh8jl1mG8une33o+7fJmSnKdQKIDUVGsp25HbLxzHIT40HrEhsbhQfgFnS8+i3kg3Wj2BiBMhLiQOvaJ7wV9K03u4G8eY55/etFotlEolqqqqEBxMU6g7S329tRR34QI1POEDx1mrHZOSgMhI54zczxhDcVUx8svzUVZT1vUNEocppAokhiWiW2g3Kqk5mSO5gEpuPkwut8763aMHUFwM5OcDZXQ+dDmpFEhIsCY1hcK52+Y4DjEhMYgJiUFNfQ0ulF9AQWUBjGZqUeRqkUGRSApPQmRQJLV+9ACU3Ag4ztoRPCbG2j8uPx+4dIlaWDpbw4gisbHWxj6uFigPRJ+4PkiLSUNhZSHyy/NRVVfl+g/2IVKxFAmhCUgMS0SALIDvcEgjVC1JWmQ2A4WF1q4EWmqM12lisbVjfVKStQUk3yprK5Ffno/LmsuwMAvf4XitEP8QJIUnIS4kjprzuxFVS5IuE4utjRu6dbMmt+JiQK0GNBq+I/N8Eon1HlpUlPXh50FHmSpABVWACn1j+0KtVUNdrUaJtgQmi4nv0DwaBw4hihBEBUchWhmNIDlNlufpPOiwI54qONj66NnT2vBErbYmu7IywEIX/wCs984aRocJC3NO4xBXkvhJEB8aj/jQeFgsFpTXlqO4qhhqrRp1xjq+w/MIYpEYEYERiAqOQlRwFGQSminYm3Qqua1cuRLLly9HcXExBgwYgDfffBNZWVktrnvs2DE888wzOHToEC5cuIDXXnsN8+fP70rMhEdyOZCYaH2YzUBpqTXRlZQAej3f0bkPx1lHDWlIaN4867VIJEJEUAQigiLQD/2grdPaEp2mTsN3eG4ll8gRGRSJaGU0IgIjqMrRizmc3LZs2YKcnBysWbMGQ4YMwYoVKzB
hwgScOnUKkZHNJ9XT6XRISUnBbbfdhkceecQpQRPPIBZbT+7R0db+cpWV1tJcVZW1+lJI3QtEImvpVam0NgyJjARkAr2QD/YPRrB/MHpG90S9sR4l2hJU6ipRVVeF6vpqQd2r85f4I0QRAqW/EhFBEQhRhPAdEnEShxuUDBkyBIMHD8Zbb70FALBYLEhISMBDDz2Ep556qs33JiUlYf78+Q6X3KhBiXfS6/9KdN6U8BonspAQ679BQe5p4ejpLBYLtPVaVNVVQaPTeFXCa5zIGv6lfmjexWUNSgwGAw4dOoQFCxbYlolEImRnZ2Pfvn2di5YIlkxmLeE0LtA3TnharTXZ1ddbl7v7/p1EYq1mlcut98wokbVPJBIhRBGCEEUIEsMSATRPeDqDDvXGeuhNerf3rxNxIsglcsj8ZPCX+iNYHkyJzEc5lNzKyspgNpsRFWU/S2xUVBROnjzptKD0ej30jW7gaKktumC0lPAaGAx/JbrGSa/hX6PRWv3JmDURNvyf4+wfIpH1X5nsr+TV8P/GyyiBOUdLCa+B2WKG3qhHvanemvCa/F9v0oOBwWKxgIGBMWb7l+M4iDgROHDgOOtDIpbYkpdcIofcTw6Z5K//00SfpIFHtpZcsmQJFi1axHcYxM2kUuuDCIdYJIZCpoBC5uShWAhph0PXruHh4RCLxVCr7SdLVKvViI6OdlpQCxYsQFVVle1RUFDgtG0TQggRPoeSm1QqRWZmJnJzc23LLBYLcnNzMXToUKcFJZPJEBwcbPcghBBCOsrhasmcnBzMmjULgwYNQlZWFlasWIHa2lrMnj0bADBz5kzExcVhyZIlAKyNUI4fP277f2FhIY4cOYLAwECkpqY6cVcIIYQQK4eT29SpU1FaWopnnnkGxcXFyMjIwI4dO2yNTC5evGjX8fHy5csYOHCg7fnLL7+Ml19+GaNHj8aePXu6vgeEEEJIEzRwMiGEEK/gSC6gxtCEEEIEh5IbIYQQwaHkRgghRHAouRFCCBEcSm6EEEIEh5IbIYQQwaHkRgghRHAouRFCCBEcSm6EEEIExyOnvGmqYRAVmteNEEJ8V0MO6MjAWl6R3KqrqwEACQkJPEdCCCGEb9XV1VAqlW2u4xVjS1osFly+fBlBQUHgOK5T29BqtUhISEBBQYHgx6ekfRUeX9lPgPZVqJyxr4wxVFdXIzY21m6A/pZ4RclNJBIhPj7eKdvypfnhaF+Fx1f2E6B9Faqu7mt7JbYG1KCEEEKI4FByI4QQIjg+k9xkMhkWLlwImUzGdyguR/sqPL6ynwDtq1C5e1+9okEJIYQQ4gifKbkRQgjxHZTcCCGECA4lN0IIIYJDyY0QQojgCCq5rVy5EklJSZDL5RgyZAgOHDjQ6rrHjh3DLbfcgqSkJHAchxUrVrgvUCdwZF/Xrl2LkSNHQqVSQaVSITs7u831PY0j+7pt2zYMGjQIISEhCAgIQEZGBjZt2uTGaDvPkf1sbPPmzeA4DpMnT3ZtgE7kyL5u2LABHMfZPeRyuRuj7RpHv1eNRoN58+YhJiYGMpkMPXv2xNdff+2maLvGkX0dM2ZMs++V4zhMnDjROcEwgdi8eTOTSqVs/fr17NixY2zOnDksJCSEqdXqFtc/cOAAe+yxx9h///tfFh0dzV577TX3BtwFju7r7bffzlauXMl+++03duLECXbXXXcxpVLJLl265ObIHefovu7evZtt27aNHT9+nJ09e5atWLGCicVitmPHDjdH7hhH97NBXl4ei4uLYyNHjmSTJk1yT7Bd5Oi+vvfeeyw4OJgVFRXZHsXFxW6OunMc3Ve9Xs8GDRrErrvuOvbjjz+yvLw8tmfPHnbkyBE3R+44R/e1vLzc7jv9888/mVgsZu+9955T4hFMcsvKymLz5s2zPTebzSw2NpYtWbKk3fcmJiZ6VXLryr4yxpjJZGJBQUFs48aNrgrRabq6r4wxNnDgQPbvf//bFeE5TWf202QysWHDhrF3332
XzZo1y2uSm6P7+t577zGlUumm6JzL0X1dvXo1S0lJYQaDwV0hOk1Xj9XXXnuNBQUFsZqaGqfEI4hqSYPBgEOHDiE7O9u2TCQSITs7G/v27eMxMudzxr7qdDoYjUaEhoa6Kkyn6Oq+MsaQm5uLU6dOYdSoUa4MtUs6u5+LFy9GZGQk/vGPf7gjTKfo7L7W1NQgMTERCQkJmDRpEo4dO+aOcLukM/v6xRdfYOjQoZg3bx6ioqLQt29fvPjiizCbze4Ku1OccV5at24dpk2bhoCAAKfEJIjkVlZWBrPZjKioKLvlUVFRKC4u5ikq13DGvj755JOIjY21+yF6os7ua1VVFQIDAyGVSjFx4kS8+eabGD9+vKvD7bTO7OePP/6IdevWYe3ate4I0Wk6s6+9evXC+vXr8fnnn+ODDz6AxWLBsGHDcOnSJXeE3Gmd2dfz589j69atMJvN+Prrr/Gf//wHr7zyCp5//nl3hNxpXT0vHThwAH/++Sfuuecep8XkFbMCEOdZunQpNm/ejD179njVTXlHBAUF4ciRI6ipqUFubi5ycnKQkpKCMWPG8B2aU1RXV+POO+/E2rVrER4eznc4Ljd06FAMHTrU9nzYsGHo3bs33n77bTz33HM8RuZ8FosFkZGReOeddyAWi5GZmYnCwkIsX74cCxcu5Ds8l1m3bh369euHrKwsp21TEMktPDwcYrEYarXabrlarUZ0dDRPUblGV/b15ZdfxtKlS/Hdd9+hf//+rgzTKTq7ryKRCKmpqQCAjIwMnDhxAkuWLPHY5Obofp47dw75+fm44YYbbMssFgsAwM/PD6dOnUL37t1dG3QnOeNYlUgkGDhwIM6ePeuKEJ2mM/saExMDiUQCsVhsW9a7d28UFxfDYDBAKpW6NObO6sr3Wltbi82bN2Px4sVOjUkQ1ZJSqRSZmZnIzc21LbNYLMjNzbW74hOCzu7rSy+9hOeeew47duzAoEGD3BFqlznre7VYLNDr9a4I0Skc3c+0tDQcPXoUR44csT1uvPFGjB07FkeOHPHoGeud8Z2azWYcPXoUMTExrgrTKTqzr8OHD8fZs2dtFysAcPr0acTExHhsYgO69r1+/PHH0Ov1uOOOO5wblFOapXiAzZs3M5lMxjZs2MCOHz/O7r33XhYSEmJrMnznnXeyp556yra+Xq9nv/32G/vtt99YTEwMe+yxx9hvv/3Gzpw5w9cudJij+7p06VImlUrZ1q1b7ZreVldX87ULHebovr744ovsm2++YefOnWPHjx9nL7/8MvPz82Nr167laxc6xNH9bMqbWks6uq+LFi1iO3fuZOfOnWOHDh1i06ZNY3K5nB07doyvXegwR/f14sWLLCgoiD344IPs1KlTbPv27SwyMpI9//zzfO1Ch3X2NzxixAg2depUp8cjmOTGGGNvvvkm69atG5NKpSwrK4v98ssvttdGjx7NZs2aZXuel5fHADR7jB492v2Bd4Ij+5qYmNjivi5cuND9gXeCI/v6r3/9i6WmpjK5XM5UKhUbOnQo27x5Mw9RO86R/WzKm5IbY47t6/z5823rRkVFseuuu44dPnyYh6g7x9Hv9eeff2ZDhgxhMpmMpaSksBdeeIGZTCY3R905ju7ryZMnGQD2zTffOD0WmvKGEEKI4AjinhshhBDSGCU3QgghgkPJjRBCiOBQciOEECI4lNwIIYQIDiU3QgghgkPJjRBCiOBQciOEECI4lNwIIYQIDiU3QgghgkPJjRBCiOBQciOEECI4/w8wlqjj5Cg6rwAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Total unique missense variants in each group:\n", + "CHD: 1114\n", + "Control: 0\n", + "Control_DDD_ASD: 3573\n" + ] + } + ], + "source": [ + "def display_qc(all_results: pd.DataFrame):\n", + " \"\"\"\n", + " Perform and display final QC checks.\n", + " \"\"\"\n", + " print(\"\\n--- 4. Quality Control and Display ---\")\n", + "\n", + " # A. Initial Variant Type Counts\n", + " print(\"A. Initial variant type and class counts:\")\n", + " display(all_results[[\"classification\", \"class\"]].value_counts().to_frame(\"count\"))\n", + "\n", + " # B. Variant Overlaps\n", + " chd_variants = set(all_results[all_results[\"class\"] == \"chd\"][\"variant_id\"])\n", + " control_variants = set(all_results[all_results[\"class\"] == \"control\"][\"variant_id\"])\n", + "\n", + " chd_control_overlap = chd_variants & control_variants\n", + "\n", + " print(\"\\nB. Unique Variant Overlaps and Totals:\")\n", + " print(f\"CHD-Control overlap (same variant_id): {len(chd_control_overlap)}\")\n", + " print(f\"Total unique CHD variants: {len(chd_variants)}\")\n", + " print(f\"Total unique Control variants: {len(control_variants)}\")\n", + "\n", + " # Check for position overlaps (chrom_pos)\n", + " all_results[\"chrom_pos\"] = all_results[\"chrom\"].astype(str) + \":\" + all_results[\"pos\"].astype(str)\n", + " chd_positions = set(all_results[all_results[\"class\"] == \"chd\"][\"chrom_pos\"])\n", + " control_positions = set(all_results[all_results[\"class\"] == \"control\"][\"chrom_pos\"])\n", + " control_ddd_asd_positions = set(all_results[all_results[\"class\"] == \"control_ddd_asd\"][\"chrom_pos\"])\n", + " position_overlap = chd_positions & control_positions\n", + " position_overlap_ddd_asd = chd_positions & control_ddd_asd_positions\n", + " print(\"\\nPosition Overlaps (same chrom:pos, potentially different alleles):\")\n", + " print(f\"CHD-Control 
position overlap: {len(position_overlap)}\")\n", + " print(f\"CHD-Control_DDD_ASD position overlap: {len(position_overlap_ddd_asd)}\")\n", + " print(f\"Total unique CHD positions: {len(chd_positions)}\")\n", + " print(f\"Total unique Control positions: {len(control_positions)}\")\n", + " print(f\"Total unique Control_DDD_ASD positions: {len(control_ddd_asd_positions)}\")\n", + "\n", + " if len(position_overlap) > 0:\n", + " print(f\" Note: {len(position_overlap)} positions have variants in both CHD and Control groups\")\n", + "\n", + " # C. Unique and Total Rows by Class and Variant Type\n", + " print(\"\\nC. Unique Variants and Total Rows by Class/Type:\")\n", + " variant_counts = (\n", + " all_results.groupby([\"class\", \"classification\"])\n", + " .agg({\"variant_id\": [\"nunique\", \"count\"]})\n", + " .rename(columns={\"nunique\": \"unique_variants\", \"count\": \"total_rows\"})\n", + " )\n", + " display(variant_counts)\n", + "\n", + " # D. Final Missense-specific QC\n", + " print(\"\\nD. Final Missense-specific QC (matching original 'mis'/'misD' annotation):\")\n", + " final_qc_counts = (\n", + " all_results.groupby([\"class\", \"classification\"])\n", + " .agg({\"variant_id\": [\"nunique\", \"count\"]})\n", + " .rename(columns={\"nunique\": \"unique_variants\", \"count\": \"total_rows\"})\n", + " )\n", + " display(final_qc_counts)\n", + "\n", + " # E. Venn Diagram for Missense Variants Overlap\n", + " print(\"\\nE. 
Venn Diagram - Overlap of Missense Variants Between Classes:\")\n", + " # Get sets of variant IDs for each group (missense only)\n", + " chd_varids = set(all_results[(all_results[\"class\"] == \"chd\")][\"variant_id\"].unique())\n", + " control_varids = set(all_results[(all_results[\"class\"] == \"control\")][\"variant_id\"].unique())\n", + " control_ddd_asd_varids = set(all_results[(all_results[\"class\"] == \"control_ddd_asd\")][\"variant_id\"].unique())\n", + " # Create Venn diagram using matplotlib circles\n", + " plt.figure(figsize=(5, 5))\n", + " # Create three circles\n", + " circle1 = plt.Circle((0.3, 0.3), 0.2, alpha=0.3, fc=\"blue\", label=\"CHD\")\n", + " circle2 = plt.Circle((0.5, 0.3), 0.2, alpha=0.3, fc=\"darkgreen\", label=\"Control\")\n", + " circle3 = plt.Circle((0.4, 0.5), 0.2, alpha=0.3, fc=\"red\", label=\"Control_DDD_ASD\")\n", + " plt.gca().add_patch(circle1)\n", + " plt.gca().add_patch(circle2)\n", + " plt.gca().add_patch(circle3)\n", + " # Add counts in appropriate locations\n", + " # Unique to each set\n", + " plt.text(0.2, 0.3, str(len(chd_varids - control_varids - control_ddd_asd_varids)), fontsize=12)\n", + " plt.text(0.6, 0.3, str(len(control_varids - chd_varids - control_ddd_asd_varids)), fontsize=12)\n", + " plt.text(0.4, 0.6, str(len(control_ddd_asd_varids - chd_varids - control_varids)), fontsize=12)\n", + " # Pairwise overlaps\n", + " plt.text(0.4, 0.25, str(len(chd_varids & control_varids - control_ddd_asd_varids)), fontsize=12) # CHD & Control\n", + " plt.text(\n", + " 0.3, 0.45, str(len(chd_varids & control_ddd_asd_varids - control_varids)), fontsize=12\n", + " ) # CHD & Control_DDD_ASD\n", + " plt.text(\n", + " 0.5, 0.45, str(len(control_varids & control_ddd_asd_varids - chd_varids)), fontsize=12\n", + " ) # Control & Control_DDD_ASD\n", + " # Three-way overlap\n", + " plt.text(0.4, 0.35, str(len(chd_varids & control_varids & control_ddd_asd_varids)), fontsize=12)\n", + "\n", + " plt.xlim(0, 1)\n", + " plt.ylim(0, 1)\n", + " 
plt.title(\"Overlap of Missense Variants Between Groups\")\n", + " plt.legend()\n", + " plt.axis(\"equal\")\n", + " plt.show()\n", + " print(\"\\nTotal unique missense variants in each group:\")\n", + " print(\"CHD:\", len(chd_varids))\n", + " print(\"Control:\", len(control_varids))\n", + " print(\"Control_DDD_ASD:\", len(control_ddd_asd_varids))\n", + "\n", + " return all_results # Return final processed table\n", + "\n", + "\n", + "# 4. run the filter and display qc\n", + "_ = display_qc(variants)" + ] + }, + { + "cell_type": "markdown", + "id": "ebea927d", + "metadata": {}, + "source": [ + "- Save to disk" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5da097da", + "metadata": {}, + "outputs": [], + "source": [ + "# Keep only the controls from the DDD/ASD dataset and CHD cases:\n", + "mask = variants[\"class\"].isin([\"control_ddd_asd\", \"chd\"])\n", + "variants = variants[mask].copy()\n", + "# rename control_ddd_asd to control\n", + "variants.loc[variants[\"class\"] == \"control_ddd_asd\", \"class\"] = \"control\"\n", + "variants.to_csv(f\"{OUTPUT_DIR}/chd_dnm_filtered_canonical_transcripts_ddd_asd_ctrls_am_scores.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "39195fb4", + "metadata": {}, + "source": [ + "\n", + "- Add features: pLI, PhyloP, cds length, cds_frac" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "404f5d52", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variants with pLI scores: 4680 / 4704\n", + "\n", + "Adding PhyloP scores from bigWig file...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 4704/4704 [00:00<00:00, 27221.87it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variants with PhyloP scores: 4702 / 4704\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Variants with pLI scores: 31606 / 31738\n", + "\n", + "Adding PhyloP scores from bigWig file...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 31738/31738 [00:00<00:00, 40671.22it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variants with PhyloP scores: 31738 / 31738\n" + ] + } + ], + "source": [ + "def process_variant_features(variants, pli_file_path, phylop_bw_path):\n", + " \"\"\"\n", + " Add pLI, PhyloP conservation scores, and CDS features to variant data.\n", + " \"\"\"\n", + "\n", + " # Read gnomAD pLI metrics by transcript (not by gene)\n", + " df_pli = pl.from_pandas(pd.read_csv(pli_file_path, sep=\"\\t\"))\n", + " # Select relevant columns and process\n", + " df_pli = df_pli.select([\"transcript\", \"pLI\"])\n", + " df_pli = df_pli.with_columns(pl.col(\"transcript\").str.split(\".\").list.first().alias(\"transcript\"))\n", + "\n", + " # Add pLI scores by transcript to variants\n", + " variants_pl = pl.from_pandas(variants)\n", + " variants_pl = variants_pl.with_columns(pl.col(\"tx_name\").str.split(\".\").list.first().alias(\"tx_name_clean\"))\n", + " variants_pl = variants_pl.join(df_pli, left_on=\"tx_name_clean\", right_on=\"transcript\", how=\"left\")\n", + "\n", + " # Create pLI bins (multiply by 10 and cast to int)\n", + " variants_pl = variants_pl.with_columns((pl.col(\"pLI\") * 10).cast(pl.Int32).alias(\"pLI_bin\"))\n", + " print(\n", + " f\"Variants with pLI scores: {variants_pl.filter(pl.col('pLI').is_not_null()).shape[0]} / {variants_pl.shape[0]}\"\n", + " )\n", + "\n", + " # Add PhyloP conservation scores\n", + " print(\"\\nAdding PhyloP scores from bigWig file...\")\n", + " bw = pyBigWig.open(phylop_bw_path)\n", + " phylop = []\n", + " for row in tqdm(variants_pl.rows(named=True), total=variants_pl.shape[0]):\n", + " try:\n", + " phylop_score = bw.values(row[\"chrom\"], row[\"pos\"] - 1, row[\"pos\"])[0]\n", + " 
phylop.append(phylop_score if phylop_score is not None else -1000)\n", + " except:\n", + " phylop.append(-1000)\n", + " bw.close()\n", + "\n", + " # Add phylop column and create bins\n", + " variants_pl = variants_pl.with_columns(pl.Series(values=phylop, name=\"phylop\").fill_nan(-1000))\n", + " variants_pl = variants_pl.with_columns(pl.col(\"phylop\").round().cast(pl.Int32).alias(\"phylop_bin\"))\n", + " print(\n", + " f\"Variants with PhyloP scores: {variants_pl.filter(pl.col('phylop') != -1000).shape[0]} / {variants_pl.shape[0]}\"\n", + " )\n", + "\n", + " # Add cds length and cds offset fraction\n", + " variants_pl = variants_pl.with_columns(pl.col(\"ref_seq\").str.len_chars().alias(\"cds_length\"))\n", + " variants_pl = variants_pl.with_columns(\n", + " (pl.col(\"var_rel_dist_in_cds\") / pl.col(\"cds_length\")).alias(\"cds_offset_frac\")\n", + " )\n", + " variants_pl = variants_pl.with_columns(\n", + " (pl.col(\"cds_offset_frac\") * 10).cast(pl.Int32).alias(\"cds_offset_frac_bin\")\n", + " )\n", + "\n", + " # Convert back to pandas for further analysis\n", + " return variants_pl.to_pandas()\n", + "\n", + "\n", + "phylop_bw_path = f\"{DATA_DIR}/reference/hg19.100way.phyloP100way.bw\"\n", + "pli_file_path = f\"{DATA_DIR}/reference/gnomad.v2.1.1.lof_metrics.by_transcript.txt\"\n", + "\n", + "chd_variants_with_ddd_controls = process_variant_features(variants, pli_file_path, phylop_bw_path)\n", + "ddd_asd_variants = process_variant_features(\n", + " missense_variants_ddd_asd[missense_variants_ddd_asd[\"classification\"] != \"control\"], pli_file_path, phylop_bw_path\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "322a72a1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvariant_idchromposrefaltclassref_codonalt_codoncodon_position...am_classchrom_postx_name_cleanpLIpLI_binphylopphylop_bincds_lengthcds_offset_fraccds_offset_frac_bin
059chr10_101163334_A_G_hg19chr10101163334AGcontrolGTCGCC283...ambiguouschr10:101163334ENST000003705080.0011570.05.229512420.6843806
173chr10_101371064_G_A_hg19chr10101371064GAcontrolCGGTGG212...benignchr10:101371064ENST000003704950.9295409.03.016310950.5808225
\n", + "

2 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " id variant_id chrom pos ref alt class ref_codon \\\n", + "0 59 chr10_101163334_A_G_hg19 chr10 101163334 A G control GTC \n", + "1 73 chr10_101371064_G_A_hg19 chr10 101371064 G A control CGG \n", + "\n", + " alt_codon codon_position ... am_class chrom_pos tx_name_clean \\\n", + "0 GCC 283 ... ambiguous chr10:101163334 ENST00000370508 \n", + "1 TGG 212 ... benign chr10:101371064 ENST00000370495 \n", + "\n", + " pLI pLI_bin phylop phylop_bin cds_length cds_offset_frac \\\n", + "0 0.001157 0.0 5.229 5 1242 0.684380 \n", + "1 0.929540 9.0 3.016 3 1095 0.580822 \n", + "\n", + " cds_offset_frac_bin \n", + "0 6 \n", + "1 5 \n", + "\n", + "[2 rows x 28 columns]" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chd_variants_with_ddd_controls.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "f88aa812", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvariant_idtx_namevariant_typechromposrefaltcdsStartcdsEnd...AlphaMissenseam_classtx_name_cleanpLIpLI_binphylopphylop_bincds_lengthcds_offset_fraccds_offset_frac_bin
00chr10_100011447_C_T_hg19ENST00000260702missensechr10100011447CT100008677100022776...0.1004benignENST000002607029.191200e-200.04.771522710.8643778
11chr10_100017561_C_G_hg19ENST00000260702missensechr10100017561CG100008677100022776...0.1315benignENST000002607029.191200e-200.04.867522710.4865704
\n", + "

2 rows × 39 columns

\n", + "
" + ], + "text/plain": [ + " id variant_id tx_name variant_type chrom \\\n", + "0 0 chr10_100011447_C_T_hg19 ENST00000260702 missense chr10 \n", + "1 1 chr10_100017561_C_G_hg19 ENST00000260702 missense chr10 \n", + "\n", + " pos ref alt cdsStart cdsEnd ... AlphaMissense am_class \\\n", + "0 100011447 C T 100008677 100022776 ... 0.1004 benign \n", + "1 100017561 C G 100008677 100022776 ... 0.1315 benign \n", + "\n", + " tx_name_clean pLI pLI_bin phylop phylop_bin cds_length \\\n", + "0 ENST00000260702 9.191200e-20 0.0 4.771 5 2271 \n", + "1 ENST00000260702 9.191200e-20 0.0 4.867 5 2271 \n", + "\n", + " cds_offset_frac cds_offset_frac_bin \n", + "0 0.864377 8 \n", + "1 0.486570 4 \n", + "\n", + "[2 rows x 39 columns]" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "print(\"Adding additional features (pLI, PhyloP, codon frequencies)...\")\n", - "dset = process_dset(result_df, refseq, remove_non_pli=False)\n", - "print(f\"Dataset with additional features: {dset.shape[0]} variants\")\n", - "dset.head(2)" + "ddd_asd_variants.head(2)" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "a6fe62c7", + "execution_count": 75, + "id": "ba87e543", + "metadata": {}, + "outputs": [], + "source": [ + "# Save to disk\n", + "chd_variants_with_ddd_controls.to_csv(\n", + " f\"{OUTPUT_DIR}/chd_dnm_filtered_canonical_transcripts_ddd_asd_ctrls_am_scores_cds_features.csv\", index=False\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a6b57d57", + "metadata": {}, + "source": [ + "# 6. 
COSMIC synonymous analyses data\n", + "\n", + "This will include the processing of COSMIC variants and gnomAD common variants" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "7d257dc0", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import polars as pl\n", + "import pyfaidx\n", + "import seaborn as sns\n", + "\n", + "\n", + "# Output file paths\n", + "COSMIC_OUTPUT_FILE = f\"{OUTPUT_DIR}/cosmic_mutantcensus_gencode_v47_canonical.csv\"\n", + "GNOMAD_OUTPUT_FILE = f\"{OUTPUT_DIR}/gnomad_af0.01_canonical_genes.csv\"\n", + "\n", + "cosmic_dir = f\"{DATA_DIR}/cosmic/cosmic_raw\"\n", + "cosmic_files = {\n", + " \"cosmic_samples\": f\"{cosmic_dir}/Cosmic_Sample_v102_GRCh38.tsv.gz\",\n", + " \"cosmic_mutant_census\": f\"{cosmic_dir}/Cosmic_MutantCensus_v102_GRCh38.tsv.gz\",\n", + " \"hg38\": f\"{DATA_DIR}/reference/hg38/hg38.fa\",\n", + "}\n", + "\n", + "import os\n", + "\n", + "\n", + "gnomad_dir = f\"{DATA_DIR}/gnomad\"\n", + "gnomad_files = {\n", + " \"gnomad_exomes\": f\"{gnomad_dir}/gnomad.exomes.v4.1\",\n", + " \"gnomad_genomes\": f\"{gnomad_dir}/gnomad.genomes.v4.1\",\n", + "}\n", + "gencode_v47_file = f\"{DATA_DIR}/reference/gencode.v47.basic.annotation.processed.filtered.tsv\"\n", + "hg38 = {}\n", + "with pyfaidx.Fasta(cosmic_files[\"hg38\"]) as f:\n", + " for k in f.keys():\n", + " hg38[k] = f[k][:].seq\n", + "\n", + "\n", + "valid_chroms = [\"chr\" + str(x) for x in range(1, 23)]" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "6f9e7839", + "metadata": {}, + "outputs": [], + "source": [ + "def map_variants_to_genes_by_exons_efficient(\n", + " genes_df, variants_df, variant_columns=[\"pos\", \"ref\", \"alt\", \"af\", \"ac\", \"an\"]\n", + "):\n", + " \"\"\"\n", + " Efficient mapping using sorted variants and binary search for range queries.\n", + " \"\"\"\n", + " import bisect\n", + "\n", + " 
gene_variant_mapping = {}\n", + "\n", + " print(f\"Processing {len(genes_df)} genes and {len(variants_df)} variants...\")\n", + "\n", + " # Pre-sort variants by chromosome and position for efficient range queries\n", + " chrom_sorted_variants = {}\n", + " unique_chroms = variants_df.select(\"chrom\").unique().to_series().to_list()\n", + "\n", + " for chrom in unique_chroms:\n", + " chrom_variants = variants_df.filter(pl.col(\"chrom\") == chrom).sort(\"pos\")\n", + " if len(chrom_variants) > 0:\n", + " # Extract positions and variant data separately for efficient binary search\n", + " positions = chrom_variants.select(\"pos\").to_series().to_numpy() - 1\n", + " variant_data = chrom_variants.select(variant_columns).to_dicts()\n", + " chrom_sorted_variants[chrom] = (positions, variant_data)\n", + "\n", + " # Process genes\n", + " for gene_row in genes_df.iter_rows(named=True):\n", + " id = gene_row[\"id\"]\n", + " chrom = gene_row[\"chrom\"]\n", + " strand = gene_row[\"strand\"]\n", + " cds_seq = gene_row[\"cds_sequence\"]\n", + "\n", + " gene_variant_mapping[id] = {\"variants\": []}\n", + "\n", + " if chrom not in chrom_sorted_variants:\n", + " continue\n", + "\n", + " positions, variant_data = chrom_sorted_variants[chrom]\n", + "\n", + " # Parse exon coordinates\n", + " exon_starts = gene_row[\"exonStarts\"]\n", + " exon_ends = gene_row[\"exonEnds\"]\n", + " cds_start = gene_row[\"cdsStart\"]\n", + " cds_end = gene_row[\"cdsEnd\"]\n", + "\n", + " if exon_starts and exon_ends:\n", + " if isinstance(exon_starts, str):\n", + " starts = [int(x.strip()) for x in exon_starts.split(\",\") if x.strip()]\n", + " else:\n", + " starts = [int(exon_starts)]\n", + "\n", + " if strand == \"-\":\n", + " assert starts[0] == cds_start, f\"{cds_start}, {starts[0]}, {gene_row['id']}, {gene_row['gene_id']}\"\n", + " starts[0] = cds_start\n", + "\n", + " if isinstance(exon_ends, str):\n", + " ends = [int(x.strip()) for x in exon_ends.split(\",\") if x.strip()]\n", + " else:\n", + " ends 
= [int(exon_ends)]\n", + " if strand == \"+\":\n", + " assert ends[-1] == cds_end, f\"{cds_end}, {ends[-1]}, {gene_row['id']}, {gene_row['gene_id']}\"\n", + " ends[-1] = cds_end\n", + " else:\n", + " raise ValueError(f\"No exon coordinates found for gene {id}\")\n", + " assert starts[0] == cds_start and ends[-1] == cds_end\n", + " # Use binary search to find variants in each exon range\n", + " cum_left = 0\n", + " for start, end in zip(starts, ends):\n", + " # Find range of variants within [start, end] using binary search\n", + " left_idx = bisect.bisect_left(positions, start)\n", + " right_idx = bisect.bisect_left(positions, end)\n", + "\n", + " # Extract variants in this range\n", + " for i in range(left_idx, right_idx):\n", + " variant = variant_data[i].copy()\n", + " variant[\"chrom\"] = chrom\n", + " variant[\"exon_start\"] = start\n", + " variant[\"exon_end\"] = end\n", + " dist_left = cum_left + variant[\"pos\"] - 1 - start\n", + " dist_in_cds = dist_left if strand == \"+\" else len(cds_seq) - dist_left - 1\n", + " variant[\"dist_left\"] = dist_left\n", + " variant[\"dist_in_cds\"] = dist_in_cds\n", + " codon_start = dist_in_cds // 3 * 3\n", + " variant[\"ref_codon\"] = cds_seq[codon_start : codon_start + 3]\n", + " alt_codon = []\n", + " for j in range(3):\n", + " if j + codon_start != dist_in_cds:\n", + " alt_codon.append(variant[\"ref_codon\"][j])\n", + " else:\n", + " if strand == \"+\":\n", + " assert variant[\"ref\"].upper() == cds_seq[j + codon_start].upper()\n", + " alt_codon.append(variant[\"alt\"].upper())\n", + " else:\n", + " assert variant[\"ref\"].upper() == reverse_complement_dna(cds_seq[j + codon_start]).upper()\n", + " alt_codon.append(reverse_complement_dna(variant[\"alt\"].upper()))\n", + " variant[\"alt_codon\"] = \"\".join(alt_codon)\n", + " gene_variant_mapping[id][\"variants\"].append(variant)\n", + " cum_left += end - start\n", + "\n", + " return gene_variant_mapping\n", + "\n", + "\n", + "def get_alt_seq(row):\n", + " ref_seq, 
ref_codon, alt_codon, codon_pos = (\n", + " row[\"ref_seq\"],\n", + " row[\"ref_codon\"],\n", + " row[\"alt_codon\"],\n", + " row[\"codon_position\"],\n", + " )\n", + " assert codon_pos >= 0 and codon_pos < len(ref_seq) / 3\n", + " assert ref_seq[codon_pos * 3 : (codon_pos + 1) * 3] == ref_codon\n", + " alt_seq = ref_seq[: codon_pos * 3] + alt_codon + ref_seq[(codon_pos + 1) * 3 :]\n", + " return alt_seq\n", + "\n", + "\n", + "def convert_gene_variant_mapping_to_df(gene_variant_mapping, genes, extra_cols=[]):\n", + " # Flatten gene_variant_mapping into a list of variant dicts, each with gene id\n", + " variant_rows = []\n", + " for row_id, info in gene_variant_mapping.items():\n", + " for variant in info[\"variants\"]:\n", + " row = variant.copy()\n", + " row[\"row_id\"] = row_id\n", + " variant_rows.append(row)\n", + "\n", + " gene_variant_df = pd.DataFrame(variant_rows)\n", + " gene_variant_df[\"codon_pos\"] = gene_variant_df[\"dist_in_cds\"] // 3\n", + "\n", + " # Compute ref_aa and alt_aa columns\n", + " gene_variant_df[\"ref_aa\"] = gene_variant_df[\"ref_codon\"].apply(lambda c: codon_to_aa(c) if pd.notnull(c) else None)\n", + " gene_variant_df[\"alt_aa\"] = gene_variant_df[\"alt_codon\"].apply(lambda c: codon_to_aa(c) if pd.notnull(c) else None)\n", + "\n", + " # Compute is_synonymous column\n", + " gene_variant_df[\"is_synonymous\"] = gene_variant_df.apply(\n", + " lambda row: (\n", + " row[\"ref_aa\"] == row[\"alt_aa\"]\n", + " if pd.notnull(row[\"ref_aa\"]) and pd.notnull(row[\"alt_aa\"]) and (row[\"ref_aa\"] != \"*\")\n", + " else False\n", + " ),\n", + " axis=1,\n", + " )\n", + "\n", + " gene_variant_df = pl.from_pandas(gene_variant_df)\n", + "\n", + " temp = gene_variant_df.with_columns(pl.col(\"row_id\").cast(pl.Int64)).join(\n", + " genes.select([\"id\", \"gene_name\", \"name\", \"gene_id\", \"cds_sequence\", \"strand\"]), left_on=\"row_id\", right_on=\"id\"\n", + " )\n", + " cols_to_select = [\n", + " \"chrom\",\n", + " \"pos\",\n", + " \"ref\",\n", 
+ " \"alt\",\n", + " \"ref_codon\",\n", + " \"alt_codon\",\n", + " \"gene_name\",\n", + " \"gene_id\",\n", + " \"cds_sequence\",\n", + " \"strand\",\n", + " \"codon_pos\",\n", + " \"dist_in_cds\",\n", + " ]\n", + " if extra_cols:\n", + " cols_to_select += [x for x in extra_cols if x not in [\"chrom\", \"pos\", \"ref\", \"alt\"]]\n", + " temp = temp.select(cols_to_select)\n", + " result = (\n", + " temp.sort(\"chrom\", \"pos\")\n", + " .with_row_index(\"id\")\n", + " .rename({\"cds_sequence\": \"ref_seq\", \"codon_pos\": \"codon_position\", \"dist_in_cds\": \"var_rel_dist_in_cds\"})\n", + " )\n", + "\n", + " result = result.with_columns(\n", + " pl.struct(pl.col(\"ref_seq\"), pl.col(\"ref_codon\"), pl.col(\"alt_codon\"), pl.col(\"codon_position\"))\n", + " .map_elements(get_alt_seq, return_dtype=pl.Utf8)\n", + " .alias(\"alt_seq\")\n", + " )\n", + "\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "1426ad03", "metadata": {}, "outputs": [ { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAnYAAAHDCAYAAACpu1eiAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjUsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvWftoOwAAAAlwSFlzAAAPYQAAD2EBqD+naQAAU7FJREFUeJzt3XlcFWX///E3IAdwQUAFxFBRy31J3DBTSwSVNMtcysrMNAsrs6xsUUS7NS233LJSW/ROzbRSM0ktS8ktzTUzb72tu8A7FXAFhOv3h78zt0fABVBwvq/n48FDz8w1M9fnzJwz7zMzZ46bMcYIAAAANzz3ou4AAAAACgfBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBzqZWrlypRo0aydvbW25ubkpJSSnqLiEXVatW1SOPPFLU3cA1dKWvxbi4OLm5uenvv/++vh0sJIW9LS9cuFABAQE6efJkoc3Tbtzc3BQXF1fU3bistm3bql69ekXdjRtKr1691KNHj3xNWyjBbsaMGerevbsqV64sNze3PF/c69atU5cuXRQaGipvb28FBwerQ4cOWr9+fa7tN2zYoFatWqlkyZIKDg7W008/neNFvnnzZg0aNEh169ZVqVKlVLlyZfXo0UO//vprjvm5ubnl+de+ffsCPw9Xa8OGDYqLiyv00HX06FH16NFDPj4+mjZtmj766COVKlUq17YnT57UiBEj1KFDBwUEBMjNzU1z587N0S47O1tz58611l+pUqVUr149jR49WmfPns3RPjk5WX379lVgYKB8fHzUuHFjLVq06LJ9b9++vdzc3DRo0KAc4/Jad2PHjnVpt2TJEkVHRyskJEReXl666aabdN9992nXrl2XXb4dzZ8/X5MmTSrqbhRrxeG1iP/JysrSiBEj9NRTT6l06dJF2pc///xTcXFx2r59+zVf1vTp03N9/70RXM/nKb+udj8mSe+//75q164tb29v3XzzzXr77bcvu5xL7cek8/vHxx9/XJUqVZK3t7eqVq2qfv36ubR58cUXtXjxYv38889XXWeJq54iF2+88YZOnDihZs2a6a+//sqz3a+//ip3d3cNHDhQwcHBOn78uD7++GO1bt1ay5cvV4cOHay227dvV7t27VS7dm1NmDBBf/zxh958803t379fX331lcuy169fr+7du6tBgwZKSkrS1KlT1bhxY/34448unxI++uijHH3asmWLJk+erKioqMJ4Kq7Khg0bNHLkSD3yyCPy8/MrtPlu3rxZJ06c0KhRoxQZGXnJtn///bfi4+NVuXJlNWzYUN9++22u7U6fPq2+ffuqRYsWGjhwoAIDA5WYmKgRI0Zo9erVWrNmjdzc3CRJaWlpatWqlZKTk/XMM88oODhYCxcuVI8ePTRv3jw98MADuS7js88+U2Ji4iX72759ez388MMuw2699VaXxzt37pS/v7+eeeYZlS9fXklJSZo9e7aaNWumxMRENWzY8JLLuJ727dsnd/dre+B8/vz52rVrlwYPHnxNl3MjKw6vRfzPl19+qX379mnAgAFF3RX9+eefGjlypKpWrapGjRpd02VNnz5d5cuXvyGP4l/P5ym/rmY/JknvvPOOBg4cqG7dumnIkCH6/vvv9fTTT+v06dN68cUXc13G5fZjv//+u2677TZJ0sCBA1WpUiX9+eef2rRpk0u7W2+9VU2aNNFbb72lDz/88OoKNYXg0KFDJjs72xhjTKlSpUyfPn2ueNpTp06ZoKAgEx0d7TK8Y8eOpmLFiiY1NdUa9u677xpJ5uuvv7aGrV+/3qSnp7tM++uvvxovLy/Tu3fvyy6/X79+xs3Nzfz+++9X3OfCMn78eCPJHDx4sFDn+8EHHxhJZvPmzZdte/bsWfPXX38ZY4zZvHmzkWTmzJmTo116erpZv359juEjR440kkxCQoI1bNy4cUaSWb16tTUsKyvLNG3a1AQ
HB+dYX8YYc+bMGVO1alUTHx9vJJnY2NgcbfIafiWSkpJMiRIlzOOPP56v6QtTdna2OX369HVbXkxMjKlSpcp1W96lZGZm5rr+i1pxeC2OGDHCSDL//e9/C7UP10uVKlWu6r3/Urp06WJatWpVKPMqqEu9Lxa2unXrmjZt2lxxe0lmxIgR16w/V+NSz1ObNm1M3bp1r3+nLnI1+7HTp0+bcuXKmZiYGJe2vXv3NqVKlTLHjh3LMZ8r2Y917NjRhIWFmb///vuy/X3zzTdNqVKlzIkTJ66kPEuhBLsLXW2wM8aYevXqmebNm1uPU1NTTYkSJczQoUNd2qWnp5vSpUubfv36XXaejRs3No0bN75km7Nnzxo/Pz/Ttm3bK+rnyZMnzZAhQ8xNN91kHA6HueWWW8z48eOtUGuMMQcPHsxz477wReh8E7/473I7loULF5rGjRsbb29vU65cOdO7d2/zxx9/WOPbtGmTY55Xuj7y8wa2Y8cOI8lMmTLFGta5c2dToUKFHG2dO89Vq1blGDdy5EhTuXJlc/r06csGu9OnT5szZ85ccR+NOR+mfH19Tc+ePS/ZLiYmxoSFheU6rkWLFiY8PNx6PHv2bHPHHXeYChUqGIfDYWrXrm2mT5+eY7oqVaqYmJgYs3LlShMeHm68vLzMxIkTrXEXrp+jR4+a5557ztSrV8+UKlXKlClTxnTo0MFs377dZZ5r1641ksyCBQvM6NGjTaVKlYyXl5e58847zf79+612uW0PF4a8KVOmmDp16hgfHx/j5+dnwsPDzbx58y75HKWnp5vXXnvNNG7c2Pj6+pqSJUuaVq1amTVr1ri0c74Wxo8fbyZOnGiqVatm3N3dzbZt24wxxuzdu9d069bN+Pv7Gy8vLxMeHm4+//zzSy7byY6vRWc/9u7da7p3727KlCljAgICzNNPP51je8/MzDTx8fGmWrVqxuFwmCpVqphhw4aZs2fP5lnnhS7e7ubMmWMkmR9++ME8++yzpnz58qZkyZKma9eu5siRIy7TZmdnm1GjRplKlSoZHx8f07ZtW7Nr164c88zIyDBxcXGmRo0axsvLywQEBJjbbrst19f/hc6cOWMcDoeJi4vLdfxHH31kmjZtam2zt99+u8uHfWOMmTZtmqlTp45xOBymYsWK5sknnzTHjx93aeMMG7t37zZt27Y1Pj4+JiQkxLzxxhtWG+fr7OK/C7epH3/80URHRxtfX1/j4+NjWrdubX744Qdr/J49e4y3t7d56KGHXJb//fffG3d3d/PCCy8YY86vk4uXc7mQl9v6/eOPP0zfvn1NYGCgcTgcpk6dOub99993aXOl7x9OU6dONWFhYcbb29s0bdrUrFu3zrRp08bq3+Wepyt5rotSbvux5cuXG0lm+fLlLm03bNhgJJmPPvoox3wutx/bu3evkWTtJ86cOWMyMjLy7NfPP/9sJJnPPvvsquoplFOxVystLU0ZGRn6+++/9eGHH2rXrl16+eWXrfE7d+7UuXPn1KRJE5fpHA6HGjVqpG3btl1y/sYYJScnq27dupdst2LFCqWkpKh3796X7bMxRl26dNHatWvVr18/NWrUSF9//bWGDh2q//znP5o4ceJl53Ghe++9V7/++qv++c9/auLEiSpfvrwkqUKFCnlOM3fuXPXt21dNmzbVmDFjlJycrMmTJ2v9+vXatm2b/Pz89Morr6hmzZqaNWuW4uPjFRYWpurVq19V365GUlKSJFn9l6T09HT5+PjkaFuyZElJ0tatW12uaTx8+LDGjh2r2bNn5zrdhebOnavp06fLGKPatWvr1VdfzfPUbkpKijIzM5WUlKRJkyYpLS1N7dq1u+T8e/bsqYcfflibN29W06ZNreH//ve/9eOPP2r8+PHWsBkzZqhu3brq0qWLSpQooS+//FJPPvmksrOzFRsb6zLfffv26f7779fjjz+u/v37q2bNmrku/1//+peWLl2q7t27KywsTMnJyXrnnXfUpk0
b7dmzRyEhIS7tx44dK3d3dz3//PNKTU3VuHHj1Lt3b23cuFGS9Morryg1NVV//PGHtY06r1l699139fTTT+u+++7TM888o7Nnz2rHjh3auHFjns+pdP71+9577+n+++9X//79deLECb3//vuKjo7Wpk2bcpyGmTNnjs6ePasBAwbIy8tLAQEB2r17t2677TZVqlRJL730kkqVKqWFCxeqa9euWrx4se655548l2/312KPHj1UtWpVjRkzRj/++KOmTJmi48ePu5yOeeyxx/TBBx/ovvvu03PPPaeNGzdqzJgx2rt3r5YsWXJV9V/oqaeekr+/v0aMGKFDhw5p0qRJGjRokBYsWGC1GT58uEaPHq1OnTqpU6dO+umnnxQVFaWMjAyXecXFxWnMmDF67LHH1KxZM6WlpWnLli366aefLnlN89atW5WRkaHGjRvnGDdy5EjFxcWpZcuWio+Pl8Ph0MaNG7VmzRrrcpq4uDiNHDlSkZGReuKJJ7Rv3z7NmDFDmzdv1vr16+Xp6WnN7/jx4+rQoYPuvfde9ejRQ59++qlefPFF1a9fXx07dlTt2rUVHx+v4cOHa8CAAbr99tslSS1btpQkrVmzRh07dlR4eLhGjBghd3d3zZkzR3feeae+//57NWvWTLVr19aoUaM0dOhQ3XffferSpYtOnTqlRx55RLVq1VJ8fLwkadKkSdY1ha+88ookKSgo6KrWX3Jyslq0aGFd31WhQgV99dVX6tevn9LS0nJcjnG59w/p/PvcoEGDdPvtt+vZZ5/VoUOH1LVrV/n7++umm26SpMs+T1fyXF9KamqqMjMzL1u/t7d3vq7JzG0/5swZF+eQ8PBwubu7a9u2bXrwwQet4VeyH/vmm28knV+v7dq105o1a+Th4aH27dtrxowZqlq1qkv7OnXqyMfHR+vXr7/ke2IOVxUDr8CVHLGLjo62Er3D4TCPP/64yyfSRYsWGUlm3bp1Oabt3r27CQ4OvuT8P/roIyMpx6eUi3Xr1s14eXnl+CSXm6VLlxpJZvTo0S7D77vvPuPm5mZ+++03Y8yVHyUw5upO/2RkZJjAwEBTr149l+dq2bJlRpIZPny4Ncz56ftKTv9cKD9H7CIjI42vr6/Lc/jUU08Zd3d3c+jQIZe2vXr1MpLMoEGDXIbfd999pmXLltZj5XHErmXLlmbSpEnm888/NzNmzDD16tVz+fRzsZo1a1rbWenSpc2rr75qsrKyLllPamqq8fLyMs8995zL8HHjxhk3Nzfz73//2xqW2+nU6OhoU61aNZdhzk/iK1euzNH+4qMcZ8+ezdHHgwcPGi8vLxMfH28Nc35Crl27tsupzcmTJxtJZufOndawvE7F3n333fk6PXLu3Lkcp1OPHz9ugoKCzKOPPurSb0nG19c3x1Gfdu3amfr167scYcrOzjYtW7Y0N9988yWXb9fXovOIXZcuXVyGP/nkk0aS+fnnn40xxmzfvt1IMo899phLu+eff95IcjlyenGdTnkdsYuMjHQ56vnss88aDw8Pk5KSYowx5siRI8bhcJiYmBiXdi+//HKOI5INGzbMcRrrSrz33ns5tmFjjNm/f79xd3c399xzT47XiLMvzv5FRUW5tJk6daqRZGbPnm0Ncx5R/fDDD61h6enpJjg42HTr1s0altf7YnZ2trn55ptNdHS0y3Nx+vRpExYWZtq3b28Ny8rKMq1atTJBQUHm77//NrGxsaZEiRI5touCnort16+fqVixYo7TfL169TJly5a13rOu9P0jPT3dlCtXzjRt2tRkZmZa7ebOnZvjiOLlTsVeyXOdl9yOfuf2l99LAXLbj8XGxhoPD49c21eoUMH06tXLZdiV7MeefvppI8mUK1fOdOjQwSxYsMCMHz/elC5d2lSvXt2cOnUqx7JuueUW07Fjx6uqp0hudzJ27FitWrVK77//vlq0aKGMjAydO3fOGn/mzBlJkpeXV45pvb29rfG5+eWXXxQbG6uIiAj
16dMnz3ZpaWlavny5OnXqdEUXS69YsUIeHh56+umnXYY/99xzMsa4fKHjWtiyZYuOHDmiJ598Ut7e3tbwmJgY1apVS8uXL7+my8/NP/7xD33zzTcaO3asy3P42GOPycPDQz169NCGDRt04MABjRkzxjqScOH6W7t2rRYvXnxF39pcv369nnnmGXXp0kUDBw7U1q1bVa9ePb388su5bhNz5szRypUrNX36dNWuXVtnzpxRVlbWJZfh6+urjh07auHChTr/2jxvwYIFatGihSpXrmwNu/BTWWpqqv7++2+1adNG//rXv5Samuoy37CwMEVHR1+2Ri8vL+vLFFlZWTp69KhKly6tmjVr6qeffsrRvm/fvnI4HNZj5yflf/3rX5ddlp+fn/744w9t3rz5sm0v5OHhYS0zOztbx44ds46w59bHbt26uRz9OnbsmNasWaMePXroxIkT+vvvv/X333/r6NGjio6O1v79+/Wf//wnz+Xb/bV48dHep556StL5ui/8d8iQIS7tnnvuOUkq0PIHDBjgcvH47bffrqysLP373/+WdP6IQ0ZGhp566imXdrl9McfPz0+7d+/W/v37r6oPR48elST5+/u7DF+6dKmys7M1fPjwHF84cvbF2b/Bgwe7tOnfv798fX1zPDelS5d2OericDjUrFmzK3r9bN++Xfv379cDDzygo0ePWtvxqVOn1K5dO61bt07Z2dmSJHd3d82dO1cnT55Ux44dNX36dA0bNizH0aCCMMZo8eLF6ty5s4wxVn/+/vtvRUdHKzU1Ncfr83LvH1u2bNHRo0fVv39/lSjxvxN8vXv3zrF+Lqcgz/Vbb72lhISEy/698MILV9UnKe/92JkzZ1yemwtdnEOudD/mvKtHcHCwli9frh49euj555/Xu+++qwMHDmj+/Pk5pvH397/qWyAVyanYC0/VPPjgg2rcuLEeeeQRffrpp5L+t8NMT0/PMe3Zs2fzPMyZlJSkmJgYlS1bVp9++qk8PDzy7MPixYt19uzZKzoNK50/FRcSEqIyZcq4DK9du7Y1/lpyzj+3U3i1atXSDz/8cE2Xf7EFCxbo1VdfVb9+/fTEE0+4jGvQoIHmz5+vgQMHWt/+CQ4O1qRJk/TEE09Yh8rPnTunp59+Wg899JDLac8r5XA4NGjQICvktWrVymV8RESE9f9evXpZ6+rNN9+85Hx79uyppUuXKjExUS1bttSBAwe0devWHC/a9evXa8SIEUpMTNTp06ddxqWmpqps2bLW47CwsCuqKTs7W5MnT9b06dN18OBBlyBarly5HO0vDJrS/3aGx48fv+yyXnzxRX3zzTdq1qyZatSooaioKD3wwAPWOruUDz74QG+99ZZ++eUXl1MkudV58bDffvtNxhi99tpreu2113Kd/5EjR1SpUqVcx9n9tXjzzTe7PK5evbrc3d116NAha/nu7u6qUaOGS7vg4GD5+fkVqP7LbU/OeV/cxwoVKuTY0cfHx+vuu+/WLbfconr16qlDhw566KGH1KBBgyvqy4UfrCTpwIEDcnd3V506dfKcJq9143A4VK1atRzPzU033eQSUKXzNe/YseOy/XMG1ksdQEhNTbWel+rVqysuLk5Dhw5VvXr18tz28+u///2vUlJSNGvWLM2aNSvXNkeOHHF5fKXr++JtrUSJEjlOG15OQZ7r8PDwq1rWlbrUfszHxyfH5QVOF+aQq9mPOafp0aOHyweP7t2766GHHtKGDRv02GOPuUxjjMnxvF1OkQS7CzkcDnXp0kVjx47VmTNn5OPjo4oVK0pSrrdO+euvv3JcZySdfwF17NhRKSkp+v7773Ntc6F58+apbNmyuuuuuwqnkP8vrxVwuSNFN5KEhAQ9/PDDiomJ0cyZM3Nt47yW5Oeff1ZWVpYaN25s3UrllltukSR9+OGH2rdvn9555x1rp+V04sQJHTp0SIGBgda1ebkJDQ2VdP4o0KX4+/vrzjvv1Lx58y4b7Dp37qySJUt
q4cKFatmypRYuXCh3d3d1797danPgwAG1a9dOtWrV0oQJExQaGiqHw6EVK1Zo4sSJ1id1p8tdO+j0j3/8Q6+99poeffRRjRo1SgEBAXJ3d9fgwYNzzFNSnh9eLt4p5qZ27drat2+fli1bppUrV2rx4sWaPn26hg8frpEjR+Y53ccff6xHHnlEXbt21dChQxUYGCgPDw+NGTNGBw4cyNH+4tqddTz//PN5HsW8eEeSH3Z5LeZVx9W+2V8or+egINvTxVq3bq0DBw7o888/16pVq/Tee+9p4sSJmjlzZo6d14WcH2COHz9uXcN1rRSkXud2PH78+Dxv73Hx9V6rVq2SdP7WIEePHlVwcPBV9PbK+vPggw/mGTYvDtWFub4vpyDLOnbsWJ4h60I+Pj4uH6gv5XL7sYoVKyorK0tHjhxRYGCgNTwjI0NHjx61MsbV7Mec01x87aSHh4fKlSuX6wfy48eP5/ggdTlFHuyk84c8jTE6ceKEfHx8VK9ePZUoUUJbtmxxufNyRkaGtm/fnuNuzGfPnlXnzp3166+/6ptvvrnkJzrpfDhcu3atHnnkkVxP9+amSpUq+uabb3TixAmXIwW//PKLNV763yeei290mtun6Kt5Y3bOf9++fbrzzjtdxu3bt88af61t3LhR99xzj5o0aaKFCxe6HJ6/mMPhcPkE47xw1Hk/r8OHDyszMzPXI0QffvihPvzwQy1ZskRdu3bNcxnOw/iXutDd6cyZMzlOkeamVKlSuuuuu7Ro0SJNmDBBCxYs0O233+7yYeHLL79Uenq6vvjiC5dPvWvXrr3s/C/l008/1R133KH333/fZXhKSorLhb1X41LbWalSpdSzZ0/17NlTGRkZuvfee/X6669r2LBhLqcZL+5jtWrV9Nlnn7nMe8SIEVfUn2rVqkmSPD0983VvN7u/Fvfv3+9ylPO3335Tdna2dYSkSpUqys7O1v79+62jlNL5C+dTUlJclu/v75+j/oyMjEveb/RSnPPev3+/tR6l80eLctspBQQEqG/fvurbt69Onjyp1q1bKy4u7pLBrlatWpKkgwcPqn79+tbw6tWrKzs7W3v27MkzSF24bi7sX0ZGhg4ePJiv7S2vbcP5RRhfX98rmu/MmTOVkJCg119/XWPGjNHjjz+uzz///IqWdSUqVKigMmXKKCsrq9Dumeh8Pn/77Tfdcccd1vBz587p0KFDLkGxIH2/nHvvvVfffffdZdv16dPnim7wfCX7Mec2tmXLFnXq1MkavmXLFmVnZ1vjr2Y/5jzyePGlJs4vk168Hzt37px+//13denS5bI1Xei6XmN38WFg6fyb7uLFixUaGmql4rJlyyoyMlIff/yxTpw4YbX96KOPdPLkSZcjJ1lZWerZs6cSExO1aNEil9Nvefnkk0+UnZ19xadhJalTp07KysrS1KlTXYZPnDhRbm5u1rd6fH19Vb58ea1bt86l3fTp03PM03kH+iu5232TJk0UGBiomTNnupyi/uqrr7R3717FxMRccS355VxO1apVtWzZsis+CiWd3xHMnDlTd911l3XErlevXlqyZEmOP+n8871kyRI1b95c0vkdx8VOnDihSZMmqXz58i6H6nPbzg4dOqTVq1df8TUtPXv21J9//qn33ntPP//8s3r27Oky3vnp88JPm6mpqZozZ84VzT8vHh4eOT7BLlq06JLXnF1OqVKlcg20zmuZnBwOh+rUqSNjzCW/gZZb7Rs3brzszaWdAgMD1bZtW73zzju5Bozc1vWF7P5anDZtmstj553unXU5dzIXXxowYcIESXJZfvXq1XPUP2vWrHwftYyMjJSnp6fefvttl/Wf27VFF29fpUuXVo0aNXK9xOZC4eHhcjgc2rJli8vwrl27yt3dXfHx8TmOXjv7EhkZKYfDoSlTprj07/3331dqamq+1k1e20Z4eLiqV6+uN998M9efPbtwOz548KCGDh2qbt266eWXX9abb76pL774IseNZ0u
VKpXvXz/x8PBQt27dtHjx4lx/Zedyr6vcNGnSROXKldO7777rch38vHnzcgT5q3kNXa3CvMbuSvdjd955pwICAjRjxgyX4TNmzFDJkiWtbelq9mNt27ZVYGCg5s2b5/JLF3PnzlVWVlaOb4vv2bNHZ8+edfl28ZUolCN2X375pfWzF5mZmdqxY4dGjx4tSerSpYuV6jt27KibbrpJzZs3V2BgoA4fPqw5c+bozz//dPk6vSS9/vrratmypdq0aaMBAwbojz/+0FtvvaWoqCiXX6h47rnn9MUXX6hz5846duyYPv74Y5f5XHixptO8efMUEhKitm3bXnGNnTt31h133KFXXnlFhw4dUsOGDbVq1Sp9/vnnGjx4sMttDB577DGNHTtWjz32mJo0aaJ169bl+hNnzjDyyiuvqFevXvL09FTnzp1z/ckhT09PvfHGG+rbt6/atGmj+++/37rFQtWqVfXss89ecS0Xmzp1qlJSUvTnn39KOr8+//jjD0nnL9wuW7asTpw4oejoaB0/flxDhw7NcRFy9erVXUJ1nTp1rJ+ZO3jwoGbMmKGAgACXQ961atWyPp1fLCwszOVI3bRp07R06VJ17txZlStX1l9//aXZs2fr8OHD+uijj1wucq1fv77atWunRo0ayd/fX/v379f777+vzMzMHD8/lpdOnTqpTJkyev755603zAtFRUXJ4XCoc+fOevzxx3Xy5Em9++67CgwMzPfREEm66667FB8fr759+6ply5bauXOn5s2b53L04WqFh4drwYIFGjJkiJo2barSpUurc+fOioqKUnBwsG677TYFBQVp7969mjp1qmJiYnJcv3ZxHz/77DPdc889iomJ0cGDBzVz5kzVqVPnin/Xc9q0aWrVqpXq16+v/v37q1q1akpOTlZiYqL++OOPS/6Mjp1fi9L5ENClSxd16NBBiYmJ+vjjj/XAAw9Yv5jSsGFD9enTR7NmzVJKSoratGmjTZs26YMPPlDXrl1djqw89thj1p3z27dvr59//llff/11vo/+VqhQQc8//7zGjBmju+66S506ddK2bdv01Vdf5ZhnnTp11LZtW4WHhysgIEBbtmzRp59+mufPLDl5e3srKipK33zzjXUrEOn86flXXnlFo0aN0u233657771XXl5e2rx5s0JCQjRmzBhVqFBBw4YN08iRI9WhQwd16dJF+/bt0/Tp09W0adNc9weXU716dfn5+WnmzJkqU6aMSpUqpebNmyssLEzvvfeeOnbsqLp166pv376qVKmS/vOf/2jt2rXy9fXVl19+KWOMHn30Ufn4+Fgh4fHHH9fixYv1zDPPKDIy0jobEB4erhkzZmj06NGqUaOGAgMDcxwVvpSxY8dq7dq1at68ufr37686dero2LFj+umnn/TNN99c9pKVizkcDsXFxempp57SnXfeqR49eujQoUOaO3euqlev7nKU7lLPU0EV1jV2V7Mf8/Hx0ahRoxQbG6vu3bsrOjpa33//vT7++GO9/vrrCggIkHR1+zEvLy+NHz9effr0UevWrfXQQw/p8OHDmjx5srVNXyghIUElS5a8+p88varv0OahT58+eX79+MKvPk+dOtW0atXKlC9f3pQoUcJUqFDBdO7cOdfbmhhz/gaOLVu2NN7e3qZChQomNjbWpKWlubS53NegL/bLL78YSWbIkCFXXeeJEyfMs88+a0JCQoynp6e5+eabc9wU1ZjzX3fv16+fKVu2rClTpozp0aOHOXLkSK63HnDe6NPd3f2KbrewYMECc+utt1o3/Lz4pqjGXP3tTnK7Mabzz9kf560j8vq7+GvmvXr1MqGhocbhcJiQkBAzcOBAk5ycfEX9US5fE1+1apVp3769CQ4ONp6ensbPz89ERUW5/LqF04gRI0yTJk2Mv7+/KVGihAkJCTG9evUyO3bsuKLlO/Xu3du6BURuvvjiC9OgQQPj7e1tqlatat544w0ze/bsHOvReYPi3OR2u5Pnnnv
OVKxY0fj4+JjbbrvNJCYmutwM1Jj/3a5g0aJFLvPL7RYfJ0+eNA888IDx8/Mz0v9uUPzOO++Y1q1bm3LlyhkvLy9TvXp1M3ToUJdfe8lNdna2+cc//mGqVKlivLy8zK233mqWLVtm+vTp43JblQtvUJybAwcOmIcffthap5UqVTJ33XWX+fTTTy+5fGPs+Vp03u5kz5495r777jNlypQx/v7+ZtCgQbneoHjkyJEmLCzMeHp6mtDQ0FxvUJyVlWVefPFF64bD0dHR5rfffsvzdicX99O5na1du9ZlniNHjrS20bxuUDx69GjTrFkz4+fnZ3x8fEytWrXM66+/fskbsjp99tlnxs3NzRw+fDjHuNmzZ1vPu7+/v2nTpo3LLwYYc35fU6tWLePp6WmCgoLME088kecNii928XZsjDGff/65qVOnjilRokSO19e2bdvMvffea72OqlSpYnr06GG9NzlvIbJ48WKXeR4+fNj4+vqaTp06WcOSkpJMTEyMKVOmTI7bieQmt+04OTnZxMbGmtDQUOPp6WmCg4NNu3btzKxZs6w2V/P+Ycz5G5k7X+/NmjUz69evN+Hh4aZDhw5X9DxdzXN9LV3tfswYY2bNmmVq1qxpHA6HqV69upk4cWKO95nc5LYfc/rnP/9pGjZsaLy8vExQUJAZNGhQjmxjjDHNmzc3Dz744FXX6fb/OwAAQLGQlZWlOnXqqEePHho1alRRdwcXyc7OVoUKFXTvvffq3XffLeru2NL27dvVuHFj/fTTT1f927tFch87AADy4uHhofj4eE2bNu2KT+/j2jh79myO634//PBDHTt27KouZ8LVGTt2rO67776rDnWSxBE7AACQq2+//VbPPvusunfvrnLlyumnn37S+++/r9q1a2vr1q153sQXRadY3O4EAAAUP1WrVlVoaKimTJmiY8eOKSAgQA8//LDGjh1LqCumOGIHAABgE1xjBwAAYBMEOwAAAJvgGjuby87O1p9//qkyZcpc0598AQAUb+b//3RnSEiIy4/Qw14Idjb3559/KjQ0tKi7AQAoJn7//XfddNNNRd0NXCMEO5tz/jTU77//Ll9f3yLuTf5kZmZq1apVioqKkqenZ1F3p8DsVI+dapGop7ijnoJJS0tTaGjoJX8yEDc+gp3NOU+/+vr63tDBrmTJkvL19bXNm7ld6rFTLRL1FHfUUzi4LMfeOMkOAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsokRRdwA3jqovLS+S5Xp5GI1rJtWL+1rpWW6SpENjY4qkLwAAFGccsQMAALAJgl0u1q1bp86dOyskJERubm5aunSpy3hjjIYPH66KFSvKx8dHkZGR2r9/v0ubY8eOqXfv3vL19ZWfn5/69eunkydPurTZsWOHbr/9dnl7eys0NFTjxo3L0ZdFixapVq1a8vb2Vv369bVixYpCrxcAANgDwS4Xp06dUsOGDTVt2rRcx48bN05TpkzRzJkztXHjRpUqVUrR0dE6e/as1aZ3797avXu3EhIStGzZMq1bt04DBgywxqelpSkqKkpVqlTR1q1bNX78eMXFxWnWrFlWmw0bNuj+++9Xv379tG3bNnXt2lVdu3bVrl27rl3xAADghsU1drno2LGjOnbsmOs4Y4wmTZqkV199VXfffbck6cMPP1RQUJCWLl2qXr16ae/evVq5cqU2b96sJk2aSJLefvttderUSW+++aZCQkI0b948ZWRkaPbs2XI4HKpbt662b9+uCRMmWAFw8uTJ6tChg4YOHSpJGjVqlBISEjR16lTNnDnzOjwTAADgRkKwu0oHDx5UUlKSIiMjrWFly5ZV8+bNlZiYqF69eikxMVF+fn5WqJOkyMhIubu7a+PGjbrnnnuUmJio1q1by+FwWG2
io6P1xhtv6Pjx4/L391diYqKGDBnisvzo6Ogcp4YvlJ6ervT0dOtxWlqaJCkzM1OZmZkFqt3LwxRo+nwv1924/CupwLUUJWffb+QanOxUi0Q9xR31FM7yYG8Eu6uUlJQkSQoKCnIZHhQUZI1LSkpSYGCgy/gSJUooICDApU1YWFiOeTjH+fv7Kykp6ZLLyc2YMWM0cuTIHMNXrVqlkiVLXkmJeRrXrECTF9ioJtnW/+1wrWFCQkJRd6HQ2KkWiXqKO+rJn9OnT1+X5aBoEexsZtiwYS5H+dLS0hQaGqqoqCj5+voWaN714r4uaPfyxcvdaFSTbL22xV3p2edvd7IrLrpI+lIYMjMzlZCQoPbt28vT07Oou1MgdqpFop7ijnoKxnkGB/ZGsLtKwcHBkqTk5GRVrFjRGp6cnKxGjRpZbY4cOeIy3blz53Ts2DFr+uDgYCUnJ7u0cT6+XBvn+Nx4eXnJy8srx3BPT88Cv3E47yFXVNKz3aw+2OFNvTDWSXFhp1ok6inuqCf/y4H98a3YqxQWFqbg4GCtXr3aGpaWlqaNGzcqIiJCkhQREaGUlBRt3brVarNmzRplZ2erefPmVpt169a5XPOQkJCgmjVryt/f32pz4XKcbZzLAQAAuBDBLhcnT57U9u3btX37dknnvzCxfft2HT58WG5ubho8eLBGjx6tL774Qjt37tTDDz+skJAQde3aVZJUu3ZtdejQQf3799emTZu0fv16DRo0SL169VJISIgk6YEHHpDD4VC/fv20e/duLViwQJMnT3Y5jfrMM89o5cqVeuutt/TLL78oLi5OW7Zs0aBBg673UwIAAG4AnIrNxZYtW3THHXdYj51hq0+fPpo7d65eeOEFnTp1SgMGDFBKSopatWqllStXytvb25pm3rx5GjRokNq1ayd3d3d169ZNU6ZMscaXLVtWq1atUmxsrMLDw1W+fHkNHz7c5V53LVu21Pz58/Xqq6/q5Zdf1s0336ylS5eqXr161+FZAAAANxqCXS7atm0rY/K+tYebm5vi4+MVHx+fZ5uAgADNnz//kstp0KCBvv/++0u26d69u7p3737pDgMAAIhTsQAAALZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwS7fMrKytJrr72msLAw+fj4qHr16ho1apSMMVYbY4yGDx+uihUrysfHR5GRkdq/f7/LfI4dO6bevXvL19dXfn5+6tevn06ePOnSZseOHbr99tvl7e2t0NBQjRs37rrUCAAAbiwEu3x64403NGPGDE2dOlV79+7VG2+8oXHjxuntt9+22owbN05TpkzRzJkztXHjRpUqVUrR0dE6e/as1aZ3797avXu3EhIStGzZMq1bt04DBgywxqelpSkqKkpVqlTR1q1bNX78eMXFxWnWrFnXtV4AAFD8lSjqDtyoNmzYoLvvvlsxMTGSpKpVq+qf//ynNm3aJOn80bpJkybp1Vdf1d133y1J+vDDDxUUFKSlS5eqV69e2rt3r1auXKnNmzerSZM
mkqS3335bnTp10ptvvqmQkBDNmzdPGRkZmj17thwOh+rWravt27drwoQJLgEQAACAYJdPLVu21KxZs/Trr7/qlltu0c8//6wffvhBEyZMkCQdPHhQSUlJioyMtKYpW7asmjdvrsTERPXq1UuJiYny8/OzQp0kRUZGyt3dXRs3btQ999yjxMREtW7dWg6Hw2oTHR2tN954Q8ePH5e/v79Lv9LT05Wenm49TktLkyRlZmYqMzOzQDV7eZjLN7oGvNyNy7+SClxLUXL2/UauwclOtUjUU9xRT+EsD/ZGsMunl156SWlpaapVq5Y8PDyUlZWl119/Xb1795YkJSUlSZKCgoJcpgsKCrLGJSUlKTAw0GV8iRIlFBAQ4NImLCwsxzyc4y4OdmPGjNHIkSNz9HfVqlUqWbJkfsuVJI1rVqDJC2xUk2zr/ytWrCjCnhSOhISEou5CobFTLRL1FHfUkz+nT5++LstB0SLY5dPChQs1b948zZ8/3zo9OnjwYIWEhKhPnz5F1q9hw4ZpyJAh1uO0tDSFhoYqKipKvr6+BZp3vbivC9q9fPFyNxrVJFuvbXFXerabJGlXXHSR9KUwZGZmKiEhQe3bt5enp2dRd6dA7FSLRD3FHfUUjPMMDuyNYJdPQ4cO1UsvvaRevXpJkurXr69///vfGjNmjPr06aPg4GBJUnJysipWrGhNl5ycrEaNGkmSgoODdeTIEZf5njt3TseOHbOmDw4OVnJysksb52Nnmwt5eXnJy8srx3BPT88Cv3GkZ7kVaPqCSs92s/pghzf1wlgnxYWdapGop7ijnvwvB/bHt2Lz6fTp03J3d336PDw8lJ19/nRhWFiYgoODtXr1amt8WlqaNm7cqIiICElSRESEUlJStHXrVqvNmjVrlJ2drebNm1tt1q1b53JtREJCgmrWrJnjNCwAAPi/jWCXT507d9brr7+u5cuX69ChQ1qyZIkmTJige+65R5Lk5uamwYMHa/To0friiy+0c+dOPfzwwwoJCVHXrl0lSbVr11aHDh3Uv39/bdq0SevXr9egQYPUq1cvhYSESJIeeOABORwO9evXT7t379aCBQs0efJkl9OtAAAAEqdi8+3tt9/Wa6+9pieffFJHjhxRSEiIHn/8cQ0fPtxq88ILL+jUqVMaMGCAUlJS1KpVK61cuVLe3t5Wm3nz5mnQoEFq166d3N3d1a1bN02ZMsUaX7ZsWa1atUqxsbEKDw9X+fLlNXz4cG51AgAAciDY5VOZMmU0adIkTZo0Kc82bm5uio+PV3x8fJ5tAgICNH/+/Esuq0GDBvr+++/z21UAAPB/BKdiAQAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYAAAA2QbADAACwCYIdAACATRDsAAAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYAAAA2QbADAACwCYIdAACATRDsAAAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYAAAA2QbADAACwCYIdAACATRDsAAAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYAAAA2QbADAACwCYIdAACATRDsAAAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYAAAA2QbADAACwCYIdAACATRDsAAAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYAAAA2QbADAACwCYIdAACATRDsAAAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYAAAA2QbADAACwCYIdAACATRDsAAAAbIJgBwAAYBMEOwAAAJsg2AEAANgEwQ4AAMAmCHYF8J///EcPPvigypUrJx8fH9WvX19btmyxxhtjNHz4cFWsWFE+Pj6KjIzU/v37XeZx7Ngx9e7dW76+vvLz81O/fv108uRJlzY7duzQ7bffLm9vb4WGhmrcuHHXpT4AAHBjIdjl0/Hjx3XbbbfJ09NTX331lfb
s2aO33npL/v7+Vptx48ZpypQpmjlzpjZu3KhSpUopOjpaZ8+etdr07t1bu3fvVkJCgpYtW6Z169ZpwIAB1vi0tDRFRUWpSpUq2rp1q8aPH6+4uDjNmjXrutYLAACKvxJF3YEb1RtvvKHQ0FDNmTPHGhYWFmb93xijSZMm6dVXX9Xdd98tSfrwww8VFBSkpUuXqlevXtq7d69WrlypzZs3q0mTJpKkt99+W506ddKbb76pkJAQzZs3TxkZGZo9e7YcDofq1q2r7du3a8KECS4BEAAAgGCXT1988YWio6PVvXt3fffdd6pUqZKefPJJ9e/fX5J08OBBJSUlKTIy0pqmbNmyat68uRITE9WrVy8lJibKz8/PCnWSFBkZKXd3d23cuFH33HOPEhMT1bp1azkcDqtNdHS03njjDR0/ftzlCKEkpaenKz093XqclpYmScrMzFRmZmaBavbyMAWaPt/LdTcu/0oqcC1Fydn3G7kGJzvVIlFPcUc9hbM82BvBLp/+9a9/acaMGRoyZIhefvllbd68WU8//bQcDof69OmjpKQkSVJQUJDLdEFBQda4pKQkBQYGuowvUaKEAgICXNpceCTwwnkmJSXlCHZjxozRyJEjc/R31apVKlmyZAEqlsY1K9DkBTaqSbb1/xUrVhRhTwpHQkJCUXeh0NipFol6ijvqyZ/Tp09fl+WgaBHs8ik7O1tNmjTRP/7xD0nSrbfeql27dmnmzJnq06dPkfVr2LBhGjJkiPU4LS1NoaGhioqKkq+vb4HmXS/u64J2L1+83I1GNcnWa1vclZ7tJknaFRddJH0pDJmZmUpISFD79u3l6elZ1N0pEDvVIlFPcUc9BeM8gwN7I9jlU8WKFVWnTh2XYbVr19bixYslScHBwZKk5ORkVaxY0WqTnJysRo0aWW2OHDniMo9z587p2LFj1vTBwcFKTk52aeN87GxzIS8vL3l5eeUY7unpWeA3jvQstwJNX1Dp2W5WH+zwpl4Y66S4sFMtEvUUd9ST/+XA/vhWbD7ddttt2rdvn8uwX3/9VVWqVJF0/osUwcHBWr16tTU+LS1NGzduVEREhCQpIiJCKSkp2rp1q9VmzZo1ys7OVvPmza0269atc7k2IiEhQTVr1sxxGhYAAPzfRrDLp2effVY//vij/vGPf+i3337T/PnzNWvWLMXGxkqS3NzcNHjwYI0ePVpffPGFdu7cqYcfflghISHq2rWrpPNH+Dp06KD+/ftr06ZNWr9+vQYNGqRevXopJCREkvTAAw/I4XCoX79+2r17txYsWKDJkye7nG4FAACQOBWbb02bNtWSJUs0bNgwxcfHKywsTJMmTVLv3r2tNi+88IJOnTqlAQMGKCUlRa1atdLKlSvl7e1ttZk3b54GDRqkdu3ayd3dXd26ddOUKVOs8WXLltWqVasUGxur8PBwlS9fXsOHD+dWJwAAIAeCXQHcdddduuuuu/Ic7+bmpvj4eMXHx+fZJiAgQPPnz7/kcho0aKDvv/8+3/0EAAD/N3AqFgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2AT
BDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmBXCMaOHSs3NzcNHjzYGnb27FnFxsaqXLlyKl26tLp166bk5GSX6Q4fPqyYmBiVLFlSgYGBGjp0qM6dO+fS5ttvv1Xjxo3l5eWlGjVqaO7cudehIgAAcCMi2BXQ5s2b9c4776hBgwYuw5999ll9+eWXWrRokb777jv9+eefuvfee63xWVlZiomJUUZGhjZs2KAPPvhAc+fO1fDhw602Bw8eVExMjO644w5t375dgwcP1mOPPaavv/76utUHAABuHAS7Ajh58qR69+6td999V/7+/tbw1NRUvf/++5owYYLuvPNOhYeHa86cOdqwYYN+/PFHSdKqVau0Z88effzxx2rUqJE6duyoUaNGadq0acrIyJAkzZw5U2FhYXrrrbdUu3ZtDRo0SPfdd58mTpxYJPUCAIDirURRd+BGFhsbq5iYGEVGRmr06NHW8K1btyozM1ORkZHWsFq1aqly5cpKTExUixYtlJiYqPr16ysoKMhqEx0drSeeeEK7d+/WrbfeqsTERJd5ONtceMr3Yunp6UpPT7cep6WlSZIyMzOVmZlZoHq9PEyBps/3ct2Ny7+SClxLUXL2/UauwclOtUjUU9xRT+EsD/ZGsMunTz75RD/99JM2b96cY1xSUpIcDof8/PxchgcFBSkpKclqc2Goc453jrtUm7S0NJ05c0Y+Pj45lj1mzBiNHDkyx/BVq1apZMmSV15gLsY1K9DkBTaqSbb1/xUrVhRhTwpHQkJCUXeh0NipFol6ijvqyZ/Tp09fl+WgaBHs8uH333/XM888o4SEBHl7exd1d1wMGzZMQ4YMsR6npaUpNDRUUVFR8vX1LdC868UVzbV9Xu5Go5pk67Ut7krPdpMk7YqLLpK+FIbMzEwlJCSoffv28vT0LOruFIidapGop7ijnoJxnsGBvRHs8mHr1q06cuSIGjdubA3LysrSunXrNHXqVH399dfKyMhQSkqKy1G75ORkBQcHS5KCg4O1adMml/k6vzV7YZuLv0mbnJwsX1/fXI/WSZKXl5e8vLxyDPf09CzwG0d6lluBpi+o9Gw3qw92eFMvjHVSXNipFol6ijvqyf9yYH98eSIf2rVrp507d2r79u3WX5MmTdS7d2/r/56enlq9erU1zb59+3T48GFFRERIkiIiIrRz504dOXLEapOQkCBfX1/VqVPHanPhPJxtnPMAAAC4EEfs8qFMmTKqV6+ey7BSpUqpXLly1vB+/fppyJAhCggIkK+vr5566ilFRESoRYsWkqSoqCjVqVNHDz30kMaNG6ekpCS9+uqrio2NtY64DRw4UFOnTtULL7ygRx99VGvWrNHChQu1fPny61swAAC4IRDsrpGJEyfK3d1d3bp1U3p6uqKjozV9+nRrvIeHh5YtW6YnnnhCERERKlWqlPr06aP4+HirTVhYmJYvX65nn31WkydP1k033aT33ntP0dE37vVlAADg2iHYFZJvv/3W5bG3t7emTZumadOm5TlNlSpVLvvtzrZt22rbtm2F0UUAAGBzXGMHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgA
AwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsMunMWPGqGnTpipTpowCAwPVtWtX7du3z6XN2bNnFRsbq3Llyql06dLq1q2bkpOTXdocPnxYMTExKlmypAIDAzV06FCdO3fOpc23336rxo0by8vLSzVq1NDcuXOvdXkAAOAGRLDLp++++06xsbH68ccflZCQoMzMTEVFRenUqVNWm2effVZffvmlFi1apO+++05//vmn7r33Xmt8VlaWYmJilJGRoQ0bNuiDDz7Q3LlzNXz4cKvNwYMHFRMTozvuuEPbt2/X4MGD9dhjj+nrr7++rvUCAIDir0RRd+BGtXLlSpfHc+fOVWBgoLZu3arWrVsrNTVV77//vubPn68777xTkjRnzhzVrl1bP/74o1q0aKFVq1Zpz549+uabbxQUFKRGjRpp1KhRevHFFxUXFyeHw6GZM2cqLCxMb731liSpdu3a+uGHHzRx4kRFR0df97oBAEDxRbArJKmpqZKkgIAASdLWrVuVmZmpyMhIq02tWrVUuXJlJSYmqkWLFkpMTFT9+vUVFBRktYmOjtYTTzyh3bt369Zbb1ViYqLLPJxtBg8enGs/0tPTlZ6ebj1OS0uTJGVmZiozM7NANXp5mAJNn+/luhuXfyUVuJai5Oz7jVyDk51qkainuKOewlke7I1gVwiys7M1ePBg3XbbbapXr54kKSkpSQ6HQ35+fi5tg4KClJSUZLW5MNQ5xzvHXapNWlqazpw5Ix8fH5dxY8aM0ciRI3P0cdWqVSpZsmT+i5Q0rlmBJi+wUU2yrf+vWLGiCHtSOBISEoq6C4XGTrVI1FPcUU/+nD59+rosB0WLYFcIYmNjtWvXLv3www9F3RUNGzZMQ4YMsR6npaUpNDRUUVFR8vX1LdC868UVzXV9Xu5Go5pk67Ut7krPdpMk7Yq7cU9DZ2ZmKiEhQe3bt5enp2dRd6dA7FSLRD3FHfUUjPMMDuyNYFdAgwYN0rJly7Ru3TrddNNN1vDg4GBlZGQoJSXF5ahdcnKygoODrTabNm1ymZ/zW7MXtrn4m7TJycny9fXNcbROkry8vOTl5ZVjuKenZ4HfONKz3Ao0fUGlZ7tZfbDDm3phrJPiwk61SNRT3FFP/pcD++NbsflkjNGgQYO0ZMkSrVmzRmFhYS7jw8PD5enpqdWrV1vD9u3bp8OHDysiIkKSFBERoZ07d+rIkSNWm4SEBPn6+qpOnTpWmwvn4WzjnAcAAIATR+zyKTY2VvPnz9fnn3+uMmXKWNfElS1bVj4+Pipbtqz69eunIUOGKCAgQL6+vnrqqacUERGhFi1aSJKioqJUp04dPfTQQxo3bpySkpL06quvKjY21jrqNnDgQE2dOlUvvPCCHn30Ua1Zs0YLFy7U8uXLi6x2AABQPHHELp9mzJih1NRUtW3bVhUrVrT+FixYYLWZOHGi7rrrLnXr1k2tW7dWcHCwPvvsM2u8h4eHli1bJg8PD0VEROjBBx/Uww8/rPj4eKtNWFiYli9froSEBDVs2FBvvfWW3nvvPW51AgAAcuCIXT4Zc/lbf3h7e2vatGmaNm1anm2qVKly2W94tm3bVtu2bbvqPgIAgP9bOGIHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAAB
gEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyhR1B0A8qPqS8uLuguWQ2NjiroLAABI4ogdAACAbXDEDgBQbFzqaLyXh9G4ZlK9uK+VnuV2zfvC0XjciAh2AP5PuJLT99c7OFxrdqsHwOUR7ABcM8XpWkjgal3r7fdqgjdHD3GluMYOAADAJjhiB9jM9TxKxqk+ACheOGIHAABgEwQ7AAAAmyDYAQAA2ATX2AEFdLXXtHFdGgDgWuGI3Q1i2rRpqlq1qry9vdW8eXNt2rSpqLsEAACKGYLdDWDBggUaMmSIRowYoZ9++kkNGzZUdHS0jhw5UtRdAwAAxQjB7gYwYcIE9e/fX3379lWdOnU0c+ZMlSxZUrNnzy7qrgEAgGKEa+yKuYyMDG3dulXDhg2zhrm7uysyMlKJiYk52qenpys9Pd16nJqaKkk6duyYMjMzC9SXEudOFWj6fC832+j06WyVyHRXVvaNf02aneqxUy0S9RR3/5frOXr0aIGXd+LECUmSMabA80LxRbAr5v7++29lZWUpKCjIZXhQUJB++eWXHO3HjBmjkSNH5hgeFhZ2zfp4PTxQ1B0oZHaqx061SNRT3P1fraf8W4W3zBMnTqhs2bKFN0MUKwQ7mxk2bJiGDBliPc7OztaxY8dUrlw5ubndmJ9w09LSFBoaqt9//12+vr5F3Z0Cs1M9dqpFop7ijnoKxhijEydOKCQk5JovC0WHYFfMlS9fXh4eHkpOTnYZnpycrODg4Bztvby85OXl5TLMz8/vWnbxuvH19bXFm7mTneqxUy0S9RR31JN/HKmzP748Ucw5HA6Fh4dr9erV1rDs7GytXr1aERERRdgzAABQ3HDE7gYwZMgQ9enTR02aNFGzZs00adIknTp1Sn379i3qrgEAgGKEYHcD6Nmzp/773/9q+PDhSkpKUqNGjbRy5cocX6iwKy8vL40YMSLHKeYblZ3qsVMtEvUUd9QDXJ6b4XvPAAAAtsA1dgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmCHay4uLk5ubm4uf7Vq1bLGnz17VrGxsSpXrpxKly6tbt265bgh8+HDhxUTE6OSJUsqMDBQQ4cO1blz51zafPvtt2rcuLG8vLxUo0YNzZ07t1D6v27dOnXu3FkhISFyc3PT0qVLXcYbYzR8+HBVrFhRPj4+ioyM1P79+13aHDt2TL1795avr6/8/PzUr18/nTx50qXNjh07dPvtt8vb21uhoaEaN25cjr4sWrRItWrVkre3t+rXr68VK1YUej2PPPJIjvXVoUOHYlnPmDFj1LRpU5UpU0aBgYHq2rWr9u3b59Lmem5f06ZNU9WqVeXt7a3mzZtr06ZNhV5P27Ztc6yfgQMHFst6ZsyYoQYNGlg34I2IiNBXX31ljb+R1s2V1HMjrRvYmAGusREjRpi6deuav/76y/r773//a40fOHCgCQ0NNatXrzZbtmwxLVq0MC1btrTGnzt3ztSrV89ERkaabdu2mRUrVpjy5cubYcOGWW3+9a9/mZIlS5ohQ4aYPXv2mLffftt4eHiYlStXFrj/K1asMK+88or57LPPjCSzZMkSl/Fjx441ZcuWNUuXLjU///yz6dKliwkLCzNnzpyx2nTo0ME0bNjQ/Pjjj+b77783NWrUMPfff781PjU11QQFBZnevXubXbt2mX/+85/Gx8fHvPPOO1ab9evXGw8PDzNu3DizZ88e8+qrrxpPT0+zc+fOQq2nT58+pkOHDi7r69ixYy5tiks90dHRZs6cOWbXrl1m+/btplOnTqZy5crm5MmTVpvrtX198sknxuFwmNmzZ5vdu3eb/v37Gz8/P5OcnFyo9bRp08b079/fZf2kpqYWy3q++OILs3z5cvPrr7+affv2mZdfftl4enqaXbt2GWNurHVzJfXcSOsG9kWwwzU3YsQI07Bhw1zHpaSkGE9PT7No0SJ
r2N69e40kk5iYaIw5H0Tc3d1NUlKS1WbGjBnG19fXpKenG2OMeeGFF0zdunVd5t2zZ08THR1dqLVcHISys7NNcHCwGT9+vEtNXl5e5p///Kcxxpg9e/YYSWbz5s1Wm6+++sq4ubmZ//znP8YYY6ZPn278/f2teowx5sUXXzQ1a9a0Hvfo0cPExMS49Kd58+bm8ccfL7R6jDkf7O6+++48pynO9Rw5csRIMt99950x5vpuX82aNTOxsbHW46ysLBMSEmLGjBlTaPUYcz48PPPMM3lOU5zrMcYYf39/8957793w6+bieoy58dcN7IFTsbgu9u/fr5CQEFWrVk29e/fW4cOHJUlbt25VZmamIiMjrba1atVS5cqVlZiYKElKTExU/fr1XW7IHB0drbS0NO3evdtqc+E8nG2c87hWDh48qKSkJJdlly1bVs2bN3fpv5+fn5o0aWK1iYyMlLu7uzZu3Gi1ad26tRwOh0v/9+3bp+PHj1ttrleN3377rQIDA1WzZk098cQTOnr0qDWuONeTmpoqSQoICJB0/bavjIwMbd261aWNu7u7IiMjC7Uep3nz5ql8+fKqV6+ehg0bptOnT1vjims9WVlZ+uSTT3Tq1ClFRETc8Ovm4nqcbsR1A3vhlydwzTVv3lxz585VzZo19ddff2nkyJG6/fbbtWvXLiUlJcnhcMjPz89lmqCgICUlJUmSkpKScvzKhvPx5dqkpaXpzJkz8vHxuSa1OZef27Iv7FtgYKDL+BIlSiggIMClTVhYWI55OMf5+/vnWaNzHoWlQ4cOuvfeexUWFqYDBw7o5ZdfVseOHZWYmCgPD49iW092drYGDx6s2267TfXq1bOWdT22r+PHjysrKyvXNr/88kuh1SNJDzzwgKpUqaKQkBDt2LFDL774ovbt26fPPvusWNazc+dORURE6OzZsypdurSWLFmiOnXqaPv27TfkusmrHunGWzewJ4IdrrmOHTta/2/QoIGaN2+uKlWqaOHChdcscCH/evXqZf2/fv36atCggapXr65vv/1W7dq1K8KeXVpsbKx27dqlH374oai7UijyqmfAgAHW/+vXr6+KFSuqXbt2OnDggKpXr369u3lZNWvW1Pbt25WamqpPP/1Uffr00XfffVfU3cq3vOqpU6fODbduYE+cisV15+fnp1tuuUW//fabgoODlZGRoZSUFJc2ycnJCg4OliQFBwfn+Kac8/Hl2vj6+l7T8Ohcfm7LvrBvR44ccRl/7tw5HTt2rFBqdI6/VqpVq6by5cvrt99+s/pR3OoZNGiQli1bprVr1+qmm26yhl+v7at8+fLy8PC45vXkpnnz5pLksn6KUz0Oh0M1atRQeHi4xowZo4YNG2ry5Mk37LrJq57cFPd1A3si2OG6O3nypA4cOKCKFSsqPDxcnp6eWr16tTV+3759Onz4sHXdSkREhHbu3OkSJhISEuTr62udAomIiHCZh7PNhde+XAthYWEKDg52WXZaWpo2btzo0v+UlBRt3brVarNmzRplZ2dbb/wRERFat26dMjMzXfpfs2ZN+fv7W22KosY//vhDR48eVcWKFYtdPcYYDRo0SEuWLNGaNWtynP69XtuXw+FQeHi4S5vs7GytXr26UOvJzfbt2yXJZf0Ul3pyk52drfT09Btu3VyuntzcaOsGNlHU396A/T333HPm22+/NQcPHjTr1683kZGRpnz58ubIkSPGmPO3PKhcubJZs2aN2bJli4mIiDARERHW9M5bBERFRZnt27eblStXmgoVKuR6i4ChQ4eavXv3mmnTphXa7U5OnDhhtm3bZrZt22YkmQkTJpht27aZf//738aY87c78fPzM59//rnZsWOHufvuu3O93cmtt95qNm7caH744Qdz8803u9weJCUlxQQFBZmHHnrI7Nq1y3zyySemZMmSOW4PUqJECfPmm2+avXv3mhEjRuTrdieXqufEiRPm+eefN4mJiebgwYPmm2++MY0bNzY333y
zOXv2bLGr54knnjBly5Y13377rcstJk6fPm21uV7b1yeffGK8vLzM3LlzzZ49e8yAAQOMn5+fyzcgC1rPb7/9ZuLj482WLVvMwYMHzeeff26qVatmWrduXSzreemll8x3331nDh48aHbs2GFeeukl4+bmZlatWnXDrZvL1XOjrRvYF8EO11zPnj1NxYoVjcPhMJUqVTI9e/Y0v/32mzX+zJkz5sknnzT+/v6mZMmS5p577jF//fWXyzwOHTpkOnbsaHx8fEz58uXNc889ZzIzM13arF271jRq1Mg4HA5TrVo1M2fOnELp/9q1a42kHH99+vQxxpy/5clrr71mgoKCjJeXl2nXrp3Zt2+fyzyOHj1q7r//flO6dGnj6+tr+vbta06cOOHS5ueffzatWrUyXl5eplKlSmbs2LE5+rJw4UJzyy23GIfDYerWrWuWL19eqPWcPn3aREVFmQoVKhhPT09TpUoV079//xw7jOJST251SHJZ99dz+3r77bdN5cqVjcPhMM2aNTM//vhjodZz+PBh07p1axMQEGC8vLxMjRo1zNChQ13ulVac6nn00UdNlSpVjMPhMBUqVDDt2rWzQp0xN9a6uVw9N9q6gX25GWPM9Ts+CAAAgGuFa+wAAABsgmAHAABgEwQ7AAAAmyDYAQAA2ATBDgAAwCYIdgAAADZBsAMAALAJgh0AAIBNEOwAAABsgmAHAABgEwQ7AAAAmyDYAQAA2MT/AxfuxmKOSwz/AAAAAElFTkSuQmCC", "text/plain": [ - "
" + "19310" ] }, + "execution_count": 84, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ - "context_to_check = 2046\n", - "checks = check_mutation_positions(result_df.to_pandas(), context_to_check)\n", - "checks[checks[\"out_of_bounds\"]].codon_position.hist(figsize=(5, 5))\n", - "plt.title(\n", - " f\" {checks['out_of_bounds'].sum()} out of {len(checks)} variants are out of bounds (context length = {context_to_check})\"\n", + "genes = pl.read_csv(gencode_v47_file, separator=\"\\t\")\n", + "genes.head()\n", + "genes = (\n", + " genes.filter(pl.col(\"is_canonical\"))\n", + " .filter(pl.col(\"length_divisible_by_3\"))\n", + " .filter(pl.col(\"has_start_codon\"))\n", + " .filter(pl.col(\"has_stop_codon\"))\n", ")\n", - "plt.show()" + "genes = genes.with_row_index(\"id\")\n", + "genes.head()\n", + "genes[\"is_canonical\"].sum()" + ] + }, + { + "cell_type": "markdown", + "id": "69b84700", + "metadata": {}, + "source": [ + "### COSMIC" ] }, { "cell_type": "code", - "execution_count": 20, - "id": "6b85ad06", + "execution_count": 86, + "id": "bd6453ba", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "49970" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Keep only WGS and WXS samples\n", + "cosmic_samples = pl.read_csv(cosmic_files[\"cosmic_samples\"], separator=\"\\t\")\n", + "cosmic_wxs_samples = (\n", + " cosmic_samples.filter((pl.col(\"WHOLE_GENOME_SCREEN\") == \"y\") | (pl.col(\"WHOLE_EXOME_SCREEN\") == \"y\"))[\n", + " \"COSMIC_SAMPLE_ID\"\n", + " ]\n", + " .unique()\n", + " .to_list()\n", + ")\n", + "len(cosmic_wxs_samples)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "625222f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 2004598\n", + "2 953793\n", + "3 923499\n", + "4 923499\n" + ] + } + ], + "source": [ + "columns = [\n", + " 
\"MUTATION_ID\",\n", + " \"GENE_SYMBOL\",\n", + " \"TRANSCRIPT_ACCESSION\",\n", + " \"MUTATION_CDS\",\n", + " \"MUTATION_AA\",\n", + " \"MUTATION_DESCRIPTION\",\n", + " \"CHROMOSOME\",\n", + " \"GENOME_START\",\n", + " \"GENOME_STOP\",\n", + " \"STRAND\",\n", + " \"HGVSP\",\n", + " \"HGVSC\",\n", + " \"HGVSG\",\n", + " \"GENOMIC_WT_ALLELE\",\n", + " \"GENOMIC_MUT_ALLELE\",\n", + "]\n", + "\n", + "\n", + "# %%\n", + "data = pl.read_csv(cosmic_files[\"cosmic_mutant_census\"], infer_schema_length=100000, separator=\"\\t\")\n", + "data = data.with_columns(\n", + " pl.col(\"CHROMOSOME\").map_elements(\n", + " lambda x: str(int(float(x))) if x not in [\"X\", \"Y\"] else x, return_dtype=pl.String\n", + " )\n", + ")\n", + "data = data.with_columns(pl.col(\"GENOME_START\").cast(pl.Int64))\n", + "data = data.with_columns(pl.col(\"GENOME_STOP\").cast(pl.Int64))\n", + "print(0, data.height)\n", + "data = data.filter(pl.col(\"COSMIC_SAMPLE_ID\").is_in(cosmic_wxs_samples))\n", + "\n", + "## Each row is a variant in a sample, so need to group by variant across samples.\n", + "num_samples = data.group_by(\"HGVSC\").len(\"num_samples\")\n", + "somatic = data.filter(pl.col(\"MUTATION_SOMATIC_STATUS\") == \"Confirmed somatic variant\")[\"HGVSC\"].unique().to_list()\n", + "data_grouped = data.group_by(columns).first().select(columns)\n", + "data_grouped = data_grouped.with_columns(pl.col(\"HGVSC\").is_in(somatic).alias(\"somatic\"))\n", + "data_grouped = data_grouped.join(num_samples, on=\"HGVSC\", how=\"left\")\n", + "data_grouped = data_grouped.with_columns(\n", + " pl.col(\"GENOMIC_WT_ALLELE\").alias(\"ref\"), pl.col(\"GENOMIC_MUT_ALLELE\").alias(\"alt\")\n", + ")\n", + "data_grouped = data_grouped.filter((pl.col(\"ref\").str.len_chars() == 1) & (pl.col(\"alt\").str.len_chars() == 1))\n", + "data_grouped = data_grouped.filter(pl.col(\"ref\") != pl.col(\"alt\"))\n", + "print(2, data_grouped.height)\n", + "data_grouped = data_grouped.with_columns(\n", + " (\"chr\" + 
pl.col(\"CHROMOSOME\")).alias(\"chrom\"), (pl.col(\"GENOME_START\")).cast(pl.Int64).alias(\"pos\")\n", + ")\n", + "data_grouped = data_grouped.filter(pl.col(\"chrom\").is_in(valid_chroms))\n", + "print(3, data_grouped.height)\n", + "data_grouped = data_grouped.unique()\n", + "print(4, data_grouped.height)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "ac6c0ad1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing 19310 genes and 923499 variants...\n" + ] + } + ], + "source": [ + "cols = [x for x in data_grouped.columns if x != \"chrom\"]\n", + "gene_variant_mapping = map_variants_to_genes_by_exons_efficient(genes, data_grouped, variant_columns=cols)\n", + "result = convert_gene_variant_mapping_to_df(gene_variant_mapping, genes, cols)\n", + "\n", + "result = result.filter(pl.col(\"GENE_SYMBOL\") == pl.col(\"gene_name\"))\n", + "result = result.filter(pl.col(\"ref_seq\").str.len_chars() % 3 == 0)\n", + "result.write_csv(COSMIC_OUTPUT_FILE)" + ] + }, + { + "cell_type": "markdown", + "id": "7c30abbc", + "metadata": {}, + "source": [ + "### gnomAD common variants" + ] + }, + { + "cell_type": "markdown", + "id": "378a80d1", + "metadata": {}, + "source": [ + "#### The gnomad data need to be downloaded from https://gnomad.broadinstitute.org/\n", + "To convert the vcf files to tsv files, run the following command with `bcftools`.\n", + "```\n", + "bcftools query -f '\\''%CHROM\\t%POS\\t%REF\\t%ALT\\t%AF\\t%AC\\t%AN\\n'\\'' \\\n", + " -i '\\''TYPE=\"snp\" & FILTER=\"PASS\"'\\'' \\\n", + " \"gnomad..v4.1.sites..vcf.bgz\" | \\\n", + " gzip > \".tsv.gz\"\n", + "```\n", + "\n", + "The `` and `` need to be replaced by the actual names." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "67283569", "metadata": {}, "outputs": [], "source": [ - "# Save processed results, dset, and refseq tables\n", - "dset.write_csv(f\"{OUTPUT_DIR}/clinvar_synom.csv\")" + "# get all variants from gnomAD, including exome and genome\n", + "exome_variants = []\n", + "for chrom in [f\"chr{i}\" for i in range(1, 23)] + [\"chrX\", \"chrY\"]:\n", + " variants = pl.read_csv(\n", + " os.path.join(gnomad_files[\"gnomad_exomes\"], f\"{chrom}.tsv.gz\"), separator=\"\\t\", has_header=False\n", + " )\n", + " exome_variants.append(variants)\n", + "exome_variants = pl.concat(exome_variants)\n", + "exome_variants.columns = [\"chrom\", \"pos\", \"ref\", \"alt\", \"af\", \"ac\", \"an\"]\n", + "exome_variants = exome_variants.filter(pl.col(\"an\") > 100000)\n", + "\n", + "genome_variants = []\n", + "for chrom in [f\"chr{i}\" for i in range(1, 23)] + [\"chrX\", \"chrY\"]:\n", + " variants = pl.read_csv(\n", + " os.path.join(gnomad_files[\"gnomad_genomes\"], f\"{chrom}.tsv.gz\"), separator=\"\\t\", has_header=False\n", + " )\n", + " genome_variants.append(variants)\n", + "genome_variants = pl.concat(genome_variants)\n", + "genome_variants.columns = [\"chrom\", \"pos\", \"ref\", \"alt\", \"af\", \"ac\", \"an\"]\n", + "genome_variants = genome_variants.filter(pl.col(\"an\") > 25000)\n", + "\n", + "all_variants = (\n", + " pl.concat([exome_variants, genome_variants])\n", + " .sort(\"af\", descending=True)\n", + " .unique(subset=[\"chrom\", \"pos\", \"ref\", \"alt\"], keep=\"first\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "b47f7a09", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing 19310 genes and 12910000 variants...\n" + ] + } + ], + "source": [ + "common_variants = all_variants.filter(pl.col(\"af\") > 0.01).sort([\"chrom\", \"pos\"])\n", + "cols = [\"pos\", \"ref\", \"alt\", \"af\", \"ac\", \"an\"]\n", + 
"gene_variant_mapping = map_variants_to_genes_by_exons_efficient(genes, common_variants, variant_columns=cols)\n", + "\n", + "result = convert_gene_variant_mapping_to_df(gene_variant_mapping, genes, cols)\n", + "result = result.filter(pl.col(\"ref_seq\").str.len_chars() % 3 == 0)\n", + "result.write_csv(GNOMAD_OUTPUT_FILE)" ] }, { "cell_type": "code", "execution_count": null, - "id": "8e555940-85c7-430c-8d41-e8fae2de5df6", + "id": "5a1d4872", "metadata": {}, "outputs": [], "source": [] @@ -3521,7 +6509,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -3535,7 +6523,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/requirements.txt b/bionemo-recipes/recipes/codonfm_ptl_te/requirements.txt index 9e28afa182..2bdfb2ed09 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/requirements.txt +++ b/bionemo-recipes/recipes/codonfm_ptl_te/requirements.txt @@ -158,6 +158,7 @@ numpy==1.26.4 nvidia-dali-cuda130==1.51.2 nvidia-resiliency-ext==0.4.1 omegaconf==2.3.0 +openpyxl==3.1.5 opt_einsum==3.4.0 optree==0.17.0 optuna==2.10.1 From f35dd901748eaa4fa12b47df76b92cd110c9f4ef Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Wed, 25 Feb 2026 03:19:26 +0000 Subject: [PATCH 04/13] updated nptebook --- .../00-Mutation-Datasets-Preprocessing.ipynb | 470 ++++-------------- 1 file changed, 96 insertions(+), 374 deletions(-) diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb index a22bede07c..9169379569 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb +++ 
b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb @@ -25,23 +25,12 @@ "\n", "## Required Pre-processing Steps\n", "\n", - "Before generation the mutation sequences for zero-shot benchmarks, ensure that the following files are downloaded/processed.\n", + "Before generation the mutation sequences for zero-shot benchmarks, ensure that the following files are downloaded/processed and saved at `/data/ncbi`\n", "\n", - "### 1. Open-source Data Download\n", + "#### 1. Open-source Data Download\n", "\n", - "There are two ways to obtain the data used by this notebook:\n", "\n", - "a. **Manual:**\n", - " - Use the links provided above to download each file individually.\n", - " - Use the [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) to export the required tables as TSV.\n", - " - Save them into the corresponding subdirectories under `DATA_DIR` (matching the filenames in the directory structure section above).\n", - "\n", - "b. **Automatic (recommended):**\n", - " - Create a UCSC account: [hgLogin](https://genome.ucsc.edu/cgi-bin/hgLogin)\n", - " - Generate an API key: [hgHubConnect](https://genome.ucsc.edu/cgi-bin/hgHubConnect) → click **\"generate key\"**\n", - " - Paste the key into `UCSC_API_KEY` in the download cell below, then run the cell.\n", - "\n", - "#### 1.a. 
Manual Download - Reference Files\n", + "##### Reference Files\n", "| File | Origin |\n", "|----------------|-------- |\n", "| `hg19.fa` | [Download](https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz) |\n", @@ -80,319 +69,74 @@ "| `hg19.100way.phyloP100way.bw` | UCSC Genome Browser | [Download](https://hgdownload.soe.ucsc.edu/goldenPath/hg19/phyloP100way/hg19.100way.phyloP100way.bw) |\n", "| `ucsc_pliByGene_hg38.tsv` | UCSC Genome Browser → Table Browser | [Download](https://genome.ucsc.edu/cgi-bin/hgTables) (table: `pliByGene`) |\n", "| `gnomad.v2.1.1.lof_metrics.by_transcript.txt` | gnomAD | [Download](https://gnomad.broadinstitute.org/downloads) |\n", - "| `variant_summary.txt.gz` | NCBI ClinVar (FTP) | [Download](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz) |\n" - ] - }, - { - "cell_type": "markdown", - "id": "0ffb5238", - "metadata": {}, - "source": [ - "### 1.b. Automatic Download\n", - "\n", - "If you choose **Automatic**:\n", - " 1. Set the `DATA_DIR` where the files should be saved.\n", - " 2. Set the `UCSC_API_KEY` to download the tables form the UCSC table browser.\n", - " 3. Run the next cell to download the required datasets into `DATA_DIR`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d6d796a4", - "metadata": {}, - "outputs": [], - "source": [ - "import gzip\n", - "import os\n", - "import shutil\n", - "import urllib.request\n", - "\n", - "import pandas as pd\n", - "import requests\n", - "\n", + "| `variant_summary.txt.gz` | NCBI ClinVar (FTP) | [Download](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz) |\n", "\n", - "# ── Set data directory ───────────────────────────────────────\n", - "DATA_DIR = \"/data/ncbi\" # <-- change this to your preferred data root\n", - "OUTPUT_DIR = \"/data/for_paper/mutation_datasets\" # output directory where all processed datasets will be saved\n", - "UCSC_API_KEY = \"\" # <-- set your UCSC API key for Table Browser downloads\n", - "# ─────────────────────────────────────────────────────────────\n", - "\n", - "# Create output directory\n", - "os.makedirs(OUTPUT_DIR, exist_ok=True)\n", - "\n", - "for subdir in [\n", - " \"reference/hg19\",\n", - " \"reference/hg38\",\n", - " \"alphamissense_data\",\n", - " \"ddd_asd_zhouetal\",\n", - " \"clinvar_syn\",\n", - "]:\n", - " os.makedirs(os.path.join(DATA_DIR, subdir), exist_ok=True)\n", - "\n", - "\n", - "def download_file(url, dest, decompress_gz=False):\n", - " \"\"\"Download *url* → *dest*, optionally gunzipping in place. Skips if target already exists.\"\"\"\n", - " final = dest[:-3] if decompress_gz and dest.endswith(\".gz\") else dest\n", - " if os.path.exists(final):\n", - " print(f\" [skip] {os.path.relpath(final, DATA_DIR)}\")\n", - " return\n", - " print(f\" Downloading → {os.path.relpath(final, DATA_DIR)} ...\")\n", - " urllib.request.urlretrieve(url, dest)\n", - " if decompress_gz and dest.endswith(\".gz\"):\n", - " with gzip.open(dest, \"rb\") as f_in, open(final, \"wb\") as f_out:\n", - " shutil.copyfileobj(f_in, f_out)\n", - " os.remove(dest)\n", - "\n", - "\n", - "# ── 1. 
Reference genomes ────────────────────────────────────\n", - "print(\"Reference genomes\")\n", - "download_file(\n", - " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz\",\n", - " os.path.join(DATA_DIR, \"reference/hg19/hg19.fa.gz\"),\n", - " decompress_gz=True,\n", - ")\n", - "download_file(\n", - " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz\",\n", - " os.path.join(DATA_DIR, \"reference/hg38/hg38.fa.gz\"),\n", - " decompress_gz=True,\n", - ")\n", - "\n", - "# ── 2. GENCODE annotation (GTF) ─────────────────────────────\n", - "print(\"GENCODE annotation\")\n", - "download_file(\n", - " \"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/GRCh37_mapping/gencode.v47lift37.basic.annotation.gtf.gz\",\n", - " os.path.join(DATA_DIR, \"gencode.v47lift37.basic.annotation.gtf.gz\"),\n", - " decompress_gz=True,\n", - ")\n", - "\n", - "# ── 3. DDD / ASD variant files (Zhou et al. 2022, xlsx → csv)\n", - "print(\"DDD / ASD variant files\")\n", - "xlsx_sources = {\n", - " \"asd_discov\": \"https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01148-2/MediaObjects/41588_2022_1148_MOESM5_ESM.xlsx\",\n", - " \"asd_rep\": \"https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01148-2/MediaObjects/41588_2022_1148_MOESM6_ESM.xlsx\",\n", - " \"ddd_other\": \"https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01148-2/MediaObjects/41588_2022_1148_MOESM7_ESM.xlsx\",\n", - "}\n", - "\n", - "for name, url in xlsx_sources.items():\n", - " csv_path = os.path.join(DATA_DIR, \"ddd_asd_zhouetal\", f\"{name}.csv\")\n", - " if os.path.exists(csv_path):\n", - " print(f\" [skip] ddd_asd_zhouetal/{name}.csv\")\n", - " continue\n", - " xlsx_path = csv_path.replace(\".csv\", \".xlsx\")\n", - " download_file(url, xlsx_path)\n", - " print(f\" Converting {name}.xlsx → csv ...\")\n", - " pd.read_excel(xlsx_path).to_csv(csv_path, index=False)\n", - "\n", - "# ── 4. 
ClinVar variant summary ──────────────────────────────\n", - "print(\"ClinVar variant summary\")\n", - "download_file(\n", - " \"https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz\",\n", - " os.path.join(DATA_DIR, \"clinvar_syn/variant_summary.txt.gz\"),\n", - ")\n", - "\n", - "# ── 5. phyloP conservation scores ───────────────────────────\n", - "print(\"phyloP447way conservation scores\")\n", - "download_file(\n", - " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/phyloP447way/hg38.phyloP447way.bw\",\n", - " os.path.join(DATA_DIR, \"hg38.phyloP447way.bw\"),\n", - ")\n", - "\n", - "# ── 6. UCSC Table Browser downloads ─────────────────────────\n", - "UCSC_URL = \"https://genome.ucsc.edu/cgi-bin/hgTables\"\n", - "UCSC_TABLES = {\n", - " \"wgEncodeGencodeCompV32\": {\n", - " \"filename\": \"ucsc_gencodev32_hg38.tsv\",\n", - " \"subdir\": \"\",\n", - " \"form\": {\n", - " \"hgsid\": \"3727160771_KywqrMbVutzoVUyr47py53TcxZMg\", # pragma: allowlist secret\n", - " \"clade\": \"mammal\",\n", - " \"org\": \"Human\",\n", - " \"db\": \"hg38\",\n", - " \"hgta_group\": \"allTables\",\n", - " \"hgta_track\": \"hg38\",\n", - " \"hgta_table\": \"wgEncodeGencodeCompV32\",\n", - " \"hgta_regionType\": \"genome\",\n", - " \"position\": \"chr7:155,799,529-155,812,871\",\n", - " \"hgta_outSep\": \"tab\",\n", - " \"hgta_doTopSubmit\": \"Get output\",\n", - " },\n", - " },\n", - " \"ncbiRefSeq\": {\n", - " \"filename\": \"ucsc_refseq_hg38.tsv\",\n", - " \"subdir\": \"clinvar_syn\",\n", - " \"form\": {\n", - " \"hgsid\": \"3727549177_A4TjXykIK1JRVnpjZ0HKtMVnKWw0\", # pragma: allowlist secret\n", - " \"clade\": \"mammal\",\n", - " \"org\": \"Human\",\n", - " \"db\": \"hg38\",\n", - " \"hgta_group\": \"allTables\",\n", - " \"hgta_track\": \"hg38\",\n", - " \"hgta_table\": \"ncbiRefSeq\",\n", - " \"hgta_regionType\": \"genome\",\n", - " \"position\": \"chr7:155,799,529-155,812,871\",\n", - " \"hgta_outSep\": \"tab\",\n", - " \"hgta_doTopSubmit\": \"Get 
output\",\n", - " },\n", - " },\n", - " \"ncbiRefSeqHistorical\": {\n", - " \"filename\": \"ucsc_refseq_hist_hg38.tsv\",\n", - " \"subdir\": \"clinvar_syn\",\n", - " \"form\": {\n", - " \"hgsid\": \"3727803393_8Oali1duOyVJT7DtAateRwtkg7Y0\", # pragma: allowlist secret\n", - " \"clade\": \"mammal\",\n", - " \"org\": \"Human\",\n", - " \"db\": \"hg38\",\n", - " \"hgta_group\": \"allTables\",\n", - " \"hgta_track\": \"hg38\",\n", - " \"hgta_table\": \"ncbiRefSeqHistorical\",\n", - " \"hgta_regionType\": \"genome\",\n", - " \"position\": \"chr7:155,799,529-155,812,871\",\n", - " \"hgta_outSep\": \"tab\",\n", - " \"hgta_doTopSubmit\": \"Get output\",\n", - " },\n", - " },\n", - " \"pliByGene\": {\n", - " \"filename\": \"ucsc_pliByGene_hg38.tsv\",\n", - " \"subdir\": \"\",\n", - " \"form\": {\n", - " \"hgsid\": \"3727823409_x06fwXO5XFeWrbFjKlSQTfU3I6F3\", # pragma: allowlist secret\n", - " \"clade\": \"mammal\",\n", - " \"org\": \"Human\",\n", - " \"db\": \"hg38\",\n", - " \"hgta_group\": \"varRep\",\n", - " \"hgta_track\": \"gnomadPLI\",\n", - " \"hgta_table\": \"pliByGene\",\n", - " \"hgta_regionType\": \"genome\",\n", - " \"position\": \"chr7:155,799,529-155,812,871\",\n", - " \"hgta_outSep\": \"tab\",\n", - " \"hgta_doTopSubmit\": \"Get output\",\n", - " },\n", - " },\n", - "}\n", - "\n", - "print(\"UCSC Table Browser downloads\")\n", - "if not UCSC_API_KEY:\n", - " print(\" UCSC_API_KEY is not set — skipping automatic download.\")\n", - " print(\" Download these tables manually from https://genome.ucsc.edu/cgi-bin/hgTables:\")\n", - " for tbl_name, tbl_cfg in UCSC_TABLES.items():\n", - " dest_dir = os.path.join(DATA_DIR, tbl_cfg[\"subdir\"]) if tbl_cfg[\"subdir\"] else DATA_DIR\n", - " dest = os.path.join(dest_dir, tbl_cfg[\"filename\"])\n", - " status = \"found\" if os.path.exists(dest) else \"MISSING\"\n", - " rel = os.path.join(tbl_cfg[\"subdir\"], tbl_cfg[\"filename\"]) if tbl_cfg[\"subdir\"] else tbl_cfg[\"filename\"]\n", - " print(f\" [{status}] {rel} (table: 
{tbl_name})\")\n", - "else:\n", - " for tbl_name, tbl_cfg in UCSC_TABLES.items():\n", - " dest_dir = os.path.join(DATA_DIR, tbl_cfg[\"subdir\"]) if tbl_cfg[\"subdir\"] else DATA_DIR\n", - " os.makedirs(dest_dir, exist_ok=True)\n", - " dest = os.path.join(dest_dir, tbl_cfg[\"filename\"])\n", - "\n", - " if os.path.exists(dest):\n", - " print(f\" [skip] {os.path.relpath(dest, DATA_DIR)}\")\n", - " continue\n", + "#### 2. Data Scripts\n", "\n", - " print(f\" Downloading {tbl_name} → {os.path.relpath(dest, DATA_DIR)} ...\")\n", - " form = {**tbl_cfg[\"form\"], \"apiKey\": UCSC_API_KEY}\n", - " resp = requests.post(UCSC_URL, data=form, timeout=300)\n", - " resp.raise_for_status()\n", - "\n", - " if \"\" in resp.text:\n", - " raise RuntimeError(f\"UCSC returned an error for {tbl_name}. Re-run the cell to retry.\")\n", - "\n", - " lines = resp.text.splitlines(keepends=True)\n", - " while lines:\n", - " tail = lines[-1].strip()\n", - " if not tail or tail.startswith(\"---\") or \"cookie\" in tail.lower():\n", - " lines.pop()\n", - " else:\n", - " break\n", + "Before running this notebook, ensure the following preprocessing scripts have been executed:\n", "\n", - " with open(dest, \"w\") as f:\n", - " f.writelines(lines)\n", - " print(f\" [done] {os.path.relpath(dest, DATA_DIR)} ({len(lines):,} lines)\")\n", + "| File | Purpose | How to Generate |\n", + "|------|---------|-----------------| \n", + "| `codon_counts_nopathogen.json` | Codon counts by taxonomic group (used for codon frequency features) | Run `python data_scripts/check_codon_frequency.py` after completing NCBI preprocessing in `data_scripts/data_curation/`. Place or symlink the produced file at `/data/ncbi/codon_counts_nopathogen.json`. |\n", + "| `gencode.v47lift37.basic.annotation.processed.tsv` | Processed GTF annotation with CDS coordinates | Run `000-Annotation-File-Processing.ipynb` on the downloaded GENCODE GTF file `gencode.v47lift37.basic.annotation.gtf`. 
|\n", + "| `gencode.v47.basic.annotation.processed.filtered.tsv` | Filtered transcripts with CDS sequences (hg38) | Run `000-Annotation-File-Processing.ipynb` Part 1 on the GENCODE v47 GTF file. |\n", "\n", - "print(\"\\nDone.\")" + "---\n" ] }, { "cell_type": "markdown", - "id": "59e18758", + "id": "8d094b99", "metadata": {}, "source": [ - "### 2. Download AlphaMissense Data\n", - "\n", - "The **AlphaMissense** data can only be downloaded manually due to the webiste's bot protection. [Download the zip file](https://www.science.org/doi/suppl/10.1126/science.adg7492/suppl_file/science.adg7492_data_s1_to_s9.zip) in the `DATA_DIR/alphamissense_data` and run the next cell:" + "# Imports and Paths setup" ] }, { "cell_type": "code", - "execution_count": null, - "id": "40b6713c", + "execution_count": 1, + "id": "8dfcfbe1", "metadata": {}, "outputs": [], "source": [ - "import zipfile\n", - "\n", - "\n", - "print(\"AlphaMissense data\")\n", - "\n", - "am_data_dir = os.path.join(DATA_DIR, \"alphamissense_data\")\n", - "am_zip_path = os.path.join(am_data_dir, \"science.adg7492_data_s1_to_s9.zip\")\n", - "am_clinvar_path = os.path.join(am_data_dir, \"alphamissense_clinvar.csv\")\n", - "am_hotspot_path = os.path.join(am_data_dir, \"alphamissense_cancer_hotspot.csv\")\n", - "\n", - "if not os.path.exists(am_zip_path):\n", - " raise FileNotFoundError(\n", - " f\"Required file not found: {am_zip_path}\\n\"\n", - " \"Please manually download science.adg7492_data_s1_to_s9.zip into DATA_DIR/alphamissense_data/.\"\n", - " )\n", - "\n", - "with zipfile.ZipFile(am_zip_path, \"r\") as zf:\n", - " print(f\" Extracting zip → {zf.namelist()}\")\n", - " zf.extractall(am_data_dir)\n", - "\n", - "rename_map = {\n", - " \"science.adg7492_data_s5.csv\": am_clinvar_path,\n", - " \"science.adg7492_data_s6.csv\": am_hotspot_path,\n", - "}\n", - "\n", - "for src_name, dst_path in rename_map.items():\n", - " src_path = os.path.join(am_data_dir, src_name)\n", - " if os.path.exists(src_path):\n", - " 
os.replace(src_path, dst_path)\n", - " print(f\" Renamed {src_name} -> {os.path.basename(dst_path)}\")\n", - " elif os.path.exists(dst_path):\n", - " print(f\" [skip] {os.path.basename(dst_path)} already present\")\n", - " else:\n", - " raise FileNotFoundError(f\"Expected file not found after extraction: {src_path}\")" + "# Uncomment to install PyBigWig\n", + "# !pip install pyBigWig" ] }, { - "cell_type": "markdown", - "id": "5dd4178e", + "cell_type": "code", + "execution_count": 3, + "id": "5dae6998", "metadata": {}, + "outputs": [], "source": [ - "### 3. Data Scripts\n", + "import ast\n", + "import json\n", + "import os\n", + "import warnings\n", "\n", - "Before running this notebook, ensure the following preprocessing scripts have been executed:\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import polars as pl\n", + "import pyBigWig\n", + "import pyfaidx\n", + "import seaborn as sns\n", + "from Bio.Data import CodonTable\n", + "from Bio.Seq import Seq\n", + "from matplotlib.ticker import LogLocator\n", + "from tqdm import tqdm\n", "\n", - "| File | Purpose | How to Generate |\n", - "|------|---------|-----------------| \n", - "| `codon_counts_nopathogen.json` | Codon counts by taxonomic group (used for codon frequency features) | Run `python data_scripts/check_codon_frequency.py` after completing NCBI preprocessing in `data_scripts/data_curation/`. Place or symlink the produced file at `/data/ncbi/codon_counts_nopathogen.json`. |\n", - "| `gencode.v47lift37.basic.annotation.processed.tsv` | Processed GTF annotation with CDS coordinates | Run `000-Annotation-File-Processing.ipynb` on the downloaded GENCODE GTF file `gencode.v47lift37.basic.annotation.gtf`. |\n", - "| `gencode.v47.basic.annotation.processed.filtered.tsv` | Filtered transcripts with CDS sequences (hg38) | Run `000-Annotation-File-Processing.ipynb` Part 1 on the GENCODE v47 GTF file. 
|\n", "\n", - "---" + "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "markdown", - "id": "21c15660", + "id": "01a546f6", "metadata": {}, "source": [ - "### 4. Downloaded Data Integrity Check\n", - "\n", - "Run the following cell to ensure that the `DATA_DIR` path structure (containing the files from the required pre-processing step) is in place:\n", + "Before setting the `DATA_DIR` path, ensure the following directory structure (containing the files from the required pre-processing steo) is in place:\n", "\n", "```\n", "📁 DATA_DIR/\n", @@ -430,92 +174,20 @@ " │ └── {chrom}.tsv.gz (chr1-22, chrX, chrY)\n", " └── 📁 gnomad.genomes.v4.1/\n", " └── {chrom}.tsv.gz (chr1-22, chrX, chrY)\n", - "```" + "```\n" ] }, { "cell_type": "code", - "execution_count": null, - "id": "be4365d5", + "execution_count": 74, + "id": "48d31ff4", "metadata": {}, "outputs": [], "source": [ - "expected_files = [\n", - " \"alphamissense_data/AlphaMissense_hg19.tsv.gz\",\n", - " \"alphamissense_data/alphamissense_cancer_hotspot.csv\",\n", - " \"alphamissense_data/alphamissense_clinvar.csv\",\n", - " \"ddd_asd_zhouetal/asd_discov.csv\",\n", - " \"ddd_asd_zhouetal/asd_rep.csv\",\n", - " \"ddd_asd_zhouetal/ddd_other.csv\",\n", - " \"clinvar_syn/variant_summary.txt.gz\",\n", - " \"clinvar_syn/ucsc_refseq_hg38.tsv\",\n", - " \"clinvar_syn/ucsc_refseq_hist_hg38.tsv\",\n", - " \"reference/hg19/hg19.fa\",\n", - " \"reference/hg19/hg19.fa.fai\",\n", - " \"reference/hg38/hg38.fa\",\n", - " \"reference/hg38/hg38.fa.fai\",\n", - " \"codon_counts_nopathogen.json\",\n", - " \"gencode.v47lift37.basic.annotation.processed.tsv\",\n", - " \"ucsc_gencodev32_hg38.tsv\",\n", - " \"ucsc_pliByGene_hg38.tsv\",\n", - " \"hg38.phyloP447way.bw\",\n", - "]\n", + "DATA_DIR = \"/data/for_paper/data\" # set this to the path of your data directory\n", "\n", - "missing = [f for f in expected_files if not os.path.exists(os.path.join(DATA_DIR, f))]\n", - "if missing:\n", - " print(f\"{len(missing)} file(s) missing from 
{DATA_DIR}:\")\n", - " for f in missing:\n", - " print(f\" ✗ {f}\")\n", - " raise FileNotFoundError(f\"{len(missing)} required file(s) missing — see list above.\")\n", - "else:\n", - " print(f\"All {len(expected_files)} required files found in {DATA_DIR}.\")" - ] - }, - { - "cell_type": "markdown", - "id": "8d094b99", - "metadata": {}, - "source": [ - "# Imports and Paths setup" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "8dfcfbe1", - "metadata": {}, - "outputs": [], - "source": [ - "# Uncomment to install PyBigWig\n", - "# !pip install pyBigWig" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "5dae6998", - "metadata": {}, - "outputs": [], - "source": [ - "import ast\n", - "import json\n", - "import os\n", - "import warnings\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import polars as pl\n", - "import pyBigWig\n", - "import pyfaidx\n", - "import seaborn as sns\n", - "from Bio.Data import CodonTable\n", - "from Bio.Seq import Seq\n", - "from matplotlib.ticker import LogLocator\n", - "from tqdm import tqdm\n", - "\n", - "\n", - "warnings.filterwarnings(\"ignore\")" + "OUTPUT_DIR = \"/data/for_paper/mutation_datasets\" # output directory where all processed datasets will be saved\n", + "os.makedirs(OUTPUT_DIR, exist_ok=True)" ] }, { @@ -6282,6 +5954,56 @@ "genes[\"is_canonical\"].sum()" ] }, + { + "cell_type": "code", + "execution_count": 85, + "id": "f9aefcc3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "shape: (19_310, 22)\n", + "┌───────┬─────────────┬────────────┬───────┬───┬────────────┬────────────┬────────────┬────────────┐\n", + "│ id ┆ gene_id ┆ name ┆ chrom ┆ … ┆ has_stop_c ┆ length_div ┆ has_intern ┆ cds_length │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ odon ┆ isible_by_ ┆ al_stop_co ┆ --- │\n", + "│ u64 ┆ str ┆ str ┆ str ┆ ┆ --- ┆ 3 ┆ dons ┆ i64 │\n", + "│ ┆ ┆ ┆ ┆ ┆ bool ┆ --- ┆ --- ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ bool ┆ bool ┆ │\n", + 
"╞═══════╪═════════════╪════════════╪═══════╪═══╪════════════╪════════════╪════════════╪════════════╡\n", + "│ 0 ┆ ENSG0000018 ┆ ENST000006 ┆ chr1 ┆ … ┆ true ┆ true ┆ false ┆ 981 │\n", + "│ ┆ 6092.7 ┆ 41515.2 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 1 ┆ ENSG0000028 ┆ ENST000003 ┆ chr1 ┆ … ┆ true ┆ true ┆ false ┆ 939 │\n", + "│ ┆ 4662.2 ┆ 32831.5 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2 ┆ ENSG0000018 ┆ ENST000006 ┆ chr1 ┆ … ┆ true ┆ true ┆ false ┆ 2535 │\n", + "│ ┆ 7634.13 ┆ 16016.5 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 3 ┆ ENSG0000018 ┆ ENST000003 ┆ chr1 ┆ … ┆ true ┆ true ┆ false ┆ 2250 │\n", + "│ ┆ 8976.11 ┆ 27044.7 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 4 ┆ ENSG0000018 ┆ ENST000003 ┆ chr1 ┆ … ┆ true ┆ true ┆ false ┆ 1929 │\n", + "│ ┆ 7961.15 ┆ 38591.8 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 19305 ┆ ENSG0000018 ┆ ENST000003 ┆ chrX ┆ … ┆ true ┆ true ┆ false ┆ 1266 │\n", + "│ ┆ 5973.12 ┆ 34398.8 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 19306 ┆ ENSG0000016 ┆ ENST000006 ┆ chrX ┆ … ┆ true ┆ true ┆ false ┆ 867 │\n", + "│ ┆ 8939.13 ┆ 95325.1 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 19307 ┆ ENSG0000012 ┆ ENST000002 ┆ chrX ┆ … ┆ true ┆ true ┆ false ┆ 663 │\n", + "│ ┆ 4333.16 ┆ 86448.12 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 19308 ┆ ENSG0000012 ┆ ENST000002 ┆ chrX ┆ … ┆ true ┆ true ┆ false ┆ 1566 │\n", + "│ ┆ 4334.19 ┆ 44174.11 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 19309 ┆ ENSG0000018 ┆ ENST000003 ┆ chrX ┆ … ┆ true ┆ true ┆ false ┆ 1437 │\n", + "│ ┆ 2484.15 ┆ 59512.8 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└───────┴─────────────┴────────────┴───────┴───┴────────────┴────────────┴────────────┴────────────┘" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "genes" + ] + }, { "cell_type": "markdown", "id": "69b84700", From 276866b1953ab05893b18164fdf8d1a9cbbb8838 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 24 Feb 2026 19:44:24 -0800 Subject: [PATCH 05/13] added download scripts --- .../00-Mutation-Datasets-Preprocessing.ipynb | 420 ++++++++++++++++-- 1 file changed, 374 insertions(+), 46 deletions(-) diff --git 
a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb index 9169379569..08b3b0d85a 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb +++ b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb @@ -25,12 +25,23 @@ "\n", "## Required Pre-processing Steps\n", "\n", - "Before generation the mutation sequences for zero-shot benchmarks, ensure that the following files are downloaded/processed and saved at `/data/ncbi`\n", + "Before generating the mutation sequences for zero-shot benchmarks, ensure that the following files are downloaded/processed.\n", "\n", - "#### 1. Open-source Data Download\n", + "### 1. Open-source Data Download\n", "\n", + "There are two ways to obtain the data used by this notebook:\n", "\n", - "##### Reference Files\n", + "a. **Manual:**\n", + " - Use the links provided below to download each file individually.\n", + " - Use the [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) to export the required tables as TSV.\n", + " - Save them into the corresponding subdirectories under `DATA_DIR` (matching the filenames in the directory structure section below).\n", + "\n", + "b. **Automatic (recommended):**\n", + " - Create a UCSC account: [hgLogin](https://genome.ucsc.edu/cgi-bin/hgLogin)\n", + " - Generate an API key: [hgHubConnect](https://genome.ucsc.edu/cgi-bin/hgHubConnect) → click **\"generate key\"**\n", + " - Paste the key into `UCSC_API_KEY` in the download cell below, then run the cell.\n", + "\n", + "#### 1.a. 
Manual Download - Reference Files\n", "| File | Origin |\n", "|----------------|-------- |\n", "| `hg19.fa` | [Download](https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz) |\n", @@ -69,74 +80,319 @@ "| `hg19.100way.phyloP100way.bw` | UCSC Genome Browser | [Download](https://hgdownload.soe.ucsc.edu/goldenPath/hg19/phyloP100way/hg19.100way.phyloP100way.bw) |\n", "| `ucsc_pliByGene_hg38.tsv` | UCSC Genome Browser → Table Browser | [Download](https://genome.ucsc.edu/cgi-bin/hgTables) (table: `pliByGene`) |\n", "| `gnomad.v2.1.1.lof_metrics.by_transcript.txt` | gnomAD | [Download](https://gnomad.broadinstitute.org/downloads) |\n", - "| `variant_summary.txt.gz` | NCBI ClinVar (FTP) | [Download](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz) |\n", + "| `variant_summary.txt.gz` | NCBI ClinVar (FTP) | [Download](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz) |\n" + ] + }, + { + "cell_type": "markdown", + "id": "74ade3be", + "metadata": {}, + "source": [ + "### 1.b. Automatic Download\n", "\n", - "#### 2. Data Scripts\n", + "If you choose **Automatic**:\n", + " 1. Set the `DATA_DIR` where the files should be saved.\n", + " 2. Set the `UCSC_API_KEY` to download the tables from the UCSC table browser.\n", + " 3. Run the next cell to download the required datasets into `DATA_DIR`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "713c7737", + "metadata": {}, + "outputs": [], + "source": [ + "import gzip\n", + "import os\n", + "import shutil\n", + "import urllib.request\n", "\n", - "Before running this notebook, ensure the following preprocessing scripts have been executed:\n", + "import pandas as pd\n", + "import requests\n", "\n", - "| File | Purpose | How to Generate |\n", - "|------|---------|-----------------| \n", - "| `codon_counts_nopathogen.json` | Codon counts by taxonomic group (used for codon frequency features) | Run `python data_scripts/check_codon_frequency.py` after completing NCBI preprocessing in `data_scripts/data_curation/`. Place or symlink the produced file at `/data/ncbi/codon_counts_nopathogen.json`. |\n", - "| `gencode.v47lift37.basic.annotation.processed.tsv` | Processed GTF annotation with CDS coordinates | Run `000-Annotation-File-Processing.ipynb` on the downloaded GENCODE GTF file `gencode.v47lift37.basic.annotation.gtf`. |\n", - "| `gencode.v47.basic.annotation.processed.filtered.tsv` | Filtered transcripts with CDS sequences (hg38) | Run `000-Annotation-File-Processing.ipynb` Part 1 on the GENCODE v47 GTF file. 
|\n", "\n", - "---\n" + "# ── Set data directory ───────────────────────────────────────\n", + "DATA_DIR = \"/data/ncbi\" # <-- change this to your preferred data root\n", + "OUTPUT_DIR = \"/data/for_paper/mutation_datasets\" # output directory where all processed datasets will be saved\n", + "UCSC_API_KEY = \"\" # <-- set your UCSC API key for Table Browser downloads\n", + "# ─────────────────────────────────────────────────────────────\n", + "\n", + "# Create output directory\n", + "os.makedirs(OUTPUT_DIR, exist_ok=True)\n", + "\n", + "for subdir in [\n", + " \"reference/hg19\",\n", + " \"reference/hg38\",\n", + " \"alphamissense_data\",\n", + " \"ddd_asd_zhouetal\",\n", + " \"clinvar_syn\",\n", + "]:\n", + " os.makedirs(os.path.join(DATA_DIR, subdir), exist_ok=True)\n", + "\n", + "\n", + "def download_file(url, dest, decompress_gz=False):\n", + " \"\"\"Download *url* → *dest*, optionally gunzipping in place. Skips if target already exists.\"\"\"\n", + " final = dest[:-3] if decompress_gz and dest.endswith(\".gz\") else dest\n", + " if os.path.exists(final):\n", + " print(f\" [skip] {os.path.relpath(final, DATA_DIR)}\")\n", + " return\n", + " print(f\" Downloading → {os.path.relpath(final, DATA_DIR)} ...\")\n", + " urllib.request.urlretrieve(url, dest)\n", + " if decompress_gz and dest.endswith(\".gz\"):\n", + " with gzip.open(dest, \"rb\") as f_in, open(final, \"wb\") as f_out:\n", + " shutil.copyfileobj(f_in, f_out)\n", + " os.remove(dest)\n", + "\n", + "\n", + "# ── 1. 
Reference genomes ────────────────────────────────────\n", + "print(\"Reference genomes\")\n", + "download_file(\n", + " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz\",\n", + " os.path.join(DATA_DIR, \"reference/hg19/hg19.fa.gz\"),\n", + " decompress_gz=True,\n", + ")\n", + "download_file(\n", + " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz\",\n", + " os.path.join(DATA_DIR, \"reference/hg38/hg38.fa.gz\"),\n", + " decompress_gz=True,\n", + ")\n", + "\n", + "# ── 2. GENCODE annotation (GTF) ─────────────────────────────\n", + "print(\"GENCODE annotation\")\n", + "download_file(\n", + " \"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/GRCh37_mapping/gencode.v47lift37.basic.annotation.gtf.gz\",\n", + " os.path.join(DATA_DIR, \"gencode.v47lift37.basic.annotation.gtf.gz\"),\n", + " decompress_gz=True,\n", + ")\n", + "\n", + "# ── 3. DDD / ASD variant files (Zhou et al. 2022, xlsx → csv)\n", + "print(\"DDD / ASD variant files\")\n", + "xlsx_sources = {\n", + " \"asd_discov\": \"https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01148-2/MediaObjects/41588_2022_1148_MOESM5_ESM.xlsx\",\n", + " \"asd_rep\": \"https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01148-2/MediaObjects/41588_2022_1148_MOESM6_ESM.xlsx\",\n", + " \"ddd_other\": \"https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01148-2/MediaObjects/41588_2022_1148_MOESM7_ESM.xlsx\",\n", + "}\n", + "\n", + "for name, url in xlsx_sources.items():\n", + " csv_path = os.path.join(DATA_DIR, \"ddd_asd_zhouetal\", f\"{name}.csv\")\n", + " if os.path.exists(csv_path):\n", + " print(f\" [skip] ddd_asd_zhouetal/{name}.csv\")\n", + " continue\n", + " xlsx_path = csv_path.replace(\".csv\", \".xlsx\")\n", + " download_file(url, xlsx_path)\n", + " print(f\" Converting {name}.xlsx → csv ...\")\n", + " pd.read_excel(xlsx_path).to_csv(csv_path, index=False)\n", + "\n", + "# ── 4. 
ClinVar variant summary ──────────────────────────────\n", + "print(\"ClinVar variant summary\")\n", + "download_file(\n", + " \"https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz\",\n", + " os.path.join(DATA_DIR, \"clinvar_syn/variant_summary.txt.gz\"),\n", + ")\n", + "\n", + "# ── 5. phyloP conservation scores ───────────────────────────\n", + "print(\"phyloP447way conservation scores\")\n", + "download_file(\n", + " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/phyloP447way/hg38.phyloP447way.bw\",\n", + " os.path.join(DATA_DIR, \"hg38.phyloP447way.bw\"),\n", + ")\n", + "\n", + "# ── 6. UCSC Table Browser downloads ─────────────────────────\n", + "UCSC_URL = \"https://genome.ucsc.edu/cgi-bin/hgTables\"\n", + "UCSC_TABLES = {\n", + " \"wgEncodeGencodeCompV32\": {\n", + " \"filename\": \"ucsc_gencodev32_hg38.tsv\",\n", + " \"subdir\": \"\",\n", + " \"form\": {\n", + " \"hgsid\": \"3727160771_KywqrMbVutzoVUyr47py53TcxZMg\", # pragma: allowlist secret\n", + " \"clade\": \"mammal\",\n", + " \"org\": \"Human\",\n", + " \"db\": \"hg38\",\n", + " \"hgta_group\": \"allTables\",\n", + " \"hgta_track\": \"hg38\",\n", + " \"hgta_table\": \"wgEncodeGencodeCompV32\",\n", + " \"hgta_regionType\": \"genome\",\n", + " \"position\": \"chr7:155,799,529-155,812,871\",\n", + " \"hgta_outSep\": \"tab\",\n", + " \"hgta_doTopSubmit\": \"Get output\",\n", + " },\n", + " },\n", + " \"ncbiRefSeq\": {\n", + " \"filename\": \"ucsc_refseq_hg38.tsv\",\n", + " \"subdir\": \"clinvar_syn\",\n", + " \"form\": {\n", + " \"hgsid\": \"3727549177_A4TjXykIK1JRVnpjZ0HKtMVnKWw0\", # pragma: allowlist secret\n", + " \"clade\": \"mammal\",\n", + " \"org\": \"Human\",\n", + " \"db\": \"hg38\",\n", + " \"hgta_group\": \"allTables\",\n", + " \"hgta_track\": \"hg38\",\n", + " \"hgta_table\": \"ncbiRefSeq\",\n", + " \"hgta_regionType\": \"genome\",\n", + " \"position\": \"chr7:155,799,529-155,812,871\",\n", + " \"hgta_outSep\": \"tab\",\n", + " \"hgta_doTopSubmit\": \"Get 
output\",\n", + " },\n", + " },\n", + " \"ncbiRefSeqHistorical\": {\n", + " \"filename\": \"ucsc_refseq_hist_hg38.tsv\",\n", + " \"subdir\": \"clinvar_syn\",\n", + " \"form\": {\n", + " \"hgsid\": \"3727803393_8Oali1duOyVJT7DtAateRwtkg7Y0\", # pragma: allowlist secret\n", + " \"clade\": \"mammal\",\n", + " \"org\": \"Human\",\n", + " \"db\": \"hg38\",\n", + " \"hgta_group\": \"allTables\",\n", + " \"hgta_track\": \"hg38\",\n", + " \"hgta_table\": \"ncbiRefSeqHistorical\",\n", + " \"hgta_regionType\": \"genome\",\n", + " \"position\": \"chr7:155,799,529-155,812,871\",\n", + " \"hgta_outSep\": \"tab\",\n", + " \"hgta_doTopSubmit\": \"Get output\",\n", + " },\n", + " },\n", + " \"pliByGene\": {\n", + " \"filename\": \"ucsc_pliByGene_hg38.tsv\",\n", + " \"subdir\": \"\",\n", + " \"form\": {\n", + " \"hgsid\": \"3727823409_x06fwXO5XFeWrbFjKlSQTfU3I6F3\", # pragma: allowlist secret\n", + " \"clade\": \"mammal\",\n", + " \"org\": \"Human\",\n", + " \"db\": \"hg38\",\n", + " \"hgta_group\": \"varRep\",\n", + " \"hgta_track\": \"gnomadPLI\",\n", + " \"hgta_table\": \"pliByGene\",\n", + " \"hgta_regionType\": \"genome\",\n", + " \"position\": \"chr7:155,799,529-155,812,871\",\n", + " \"hgta_outSep\": \"tab\",\n", + " \"hgta_doTopSubmit\": \"Get output\",\n", + " },\n", + " },\n", + "}\n", + "\n", + "print(\"UCSC Table Browser downloads\")\n", + "if not UCSC_API_KEY:\n", + " print(\" UCSC_API_KEY is not set — skipping automatic download.\")\n", + " print(\" Download these tables manually from https://genome.ucsc.edu/cgi-bin/hgTables:\")\n", + " for tbl_name, tbl_cfg in UCSC_TABLES.items():\n", + " dest_dir = os.path.join(DATA_DIR, tbl_cfg[\"subdir\"]) if tbl_cfg[\"subdir\"] else DATA_DIR\n", + " dest = os.path.join(dest_dir, tbl_cfg[\"filename\"])\n", + " status = \"found\" if os.path.exists(dest) else \"MISSING\"\n", + " rel = os.path.join(tbl_cfg[\"subdir\"], tbl_cfg[\"filename\"]) if tbl_cfg[\"subdir\"] else tbl_cfg[\"filename\"]\n", + " print(f\" [{status}] {rel} (table: 
{tbl_name})\")\n", + "else:\n", + " for tbl_name, tbl_cfg in UCSC_TABLES.items():\n", + " dest_dir = os.path.join(DATA_DIR, tbl_cfg[\"subdir\"]) if tbl_cfg[\"subdir\"] else DATA_DIR\n", + " os.makedirs(dest_dir, exist_ok=True)\n", + " dest = os.path.join(dest_dir, tbl_cfg[\"filename\"])\n", + "\n", + " if os.path.exists(dest):\n", + " print(f\" [skip] {os.path.relpath(dest, DATA_DIR)}\")\n", + " continue\n", + "\n", + " print(f\" Downloading {tbl_name} → {os.path.relpath(dest, DATA_DIR)} ...\")\n", + " form = {**tbl_cfg[\"form\"], \"apiKey\": UCSC_API_KEY}\n", + " resp = requests.post(UCSC_URL, data=form, timeout=300)\n", + " resp.raise_for_status()\n", + "\n", + " if \"\" in resp.text:\n", + " raise RuntimeError(f\"UCSC returned an error for {tbl_name}. Re-run the cell to retry.\")\n", + "\n", + " lines = resp.text.splitlines(keepends=True)\n", + " while lines:\n", + " tail = lines[-1].strip()\n", + " if not tail or tail.startswith(\"---\") or \"cookie\" in tail.lower():\n", + " lines.pop()\n", + " else:\n", + " break\n", + "\n", + " with open(dest, \"w\") as f:\n", + " f.writelines(lines)\n", + " print(f\" [done] {os.path.relpath(dest, DATA_DIR)} ({len(lines):,} lines)\")\n", + "\n", + "print(\"\\nDone.\")" ] }, { "cell_type": "markdown", - "id": "8d094b99", + "id": "3ba6d77d", "metadata": {}, "source": [ - "# Imports and Paths setup" + "### 2. Download AlphaMissense Data\n", + "\n", + "The **AlphaMissense** data can only be downloaded manually due to the website's bot protection. 
[Download the zip file](https://www.science.org/doi/suppl/10.1126/science.adg7492/suppl_file/science.adg7492_data_s1_to_s9.zip) in the `DATA_DIR/alphamissense_data` and run the next cell:" ] }, { "cell_type": "code", - "execution_count": 1, - "id": "8dfcfbe1", + "execution_count": null, + "id": "8741cb10", "metadata": {}, "outputs": [], "source": [ - "# Uncomment to install PyBigWig\n", - "# !pip install pyBigWig" + "import zipfile\n", + "\n", + "\n", + "print(\"AlphaMissense data\")\n", + "\n", + "am_data_dir = os.path.join(DATA_DIR, \"alphamissense_data\")\n", + "am_zip_path = os.path.join(am_data_dir, \"science.adg7492_data_s1_to_s9.zip\")\n", + "am_clinvar_path = os.path.join(am_data_dir, \"alphamissense_clinvar.csv\")\n", + "am_hotspot_path = os.path.join(am_data_dir, \"alphamissense_cancer_hotspot.csv\")\n", + "\n", + "if not os.path.exists(am_zip_path):\n", + " raise FileNotFoundError(\n", + " f\"Required file not found: {am_zip_path}\\n\"\n", + " \"Please manually download science.adg7492_data_s1_to_s9.zip into DATA_DIR/alphamissense_data/.\"\n", + " )\n", + "\n", + "with zipfile.ZipFile(am_zip_path, \"r\") as zf:\n", + " print(f\" Extracting zip → {zf.namelist()}\")\n", + " zf.extractall(am_data_dir)\n", + "\n", + "rename_map = {\n", + " \"science.adg7492_data_s5.csv\": am_clinvar_path,\n", + " \"science.adg7492_data_s6.csv\": am_hotspot_path,\n", + "}\n", + "\n", + "for src_name, dst_path in rename_map.items():\n", + " src_path = os.path.join(am_data_dir, src_name)\n", + " if os.path.exists(src_path):\n", + " os.replace(src_path, dst_path)\n", + " print(f\" Renamed {src_name} -> {os.path.basename(dst_path)}\")\n", + " elif os.path.exists(dst_path):\n", + " print(f\" [skip] {os.path.basename(dst_path)} already present\")\n", + " else:\n", + " raise FileNotFoundError(f\"Expected file not found after extraction: {src_path}\")" ] }, { - "cell_type": "code", - "execution_count": 3, - "id": "5dae6998", + "cell_type": "markdown", + "id": "b95805e7", "metadata": 
{}, - "outputs": [], "source": [ - "import ast\n", - "import json\n", - "import os\n", - "import warnings\n", + "### 3. Data Scripts\n", "\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import polars as pl\n", - "import pyBigWig\n", - "import pyfaidx\n", - "import seaborn as sns\n", - "from Bio.Data import CodonTable\n", - "from Bio.Seq import Seq\n", - "from matplotlib.ticker import LogLocator\n", - "from tqdm import tqdm\n", + "Before running this notebook, ensure the following preprocessing scripts have been executed:\n", "\n", + "| File | Purpose | How to Generate |\n", + "|------|---------|-----------------| \n", + "| `codon_counts_nopathogen.json` | Codon counts by taxonomic group (used for codon frequency features) | Run `python data_scripts/check_codon_frequency.py` after completing NCBI preprocessing in `data_scripts/data_curation/`. Place or symlink the produced file at `/data/ncbi/codon_counts_nopathogen.json`. |\n", + "| `gencode.v47lift37.basic.annotation.processed.tsv` | Processed GTF annotation with CDS coordinates | Run `000-Annotation-File-Processing.ipynb` on the downloaded GENCODE GTF file `gencode.v47lift37.basic.annotation.gtf`. |\n", + "| `gencode.v47.basic.annotation.processed.filtered.tsv` | Filtered transcripts with CDS sequences (hg38) | Run `000-Annotation-File-Processing.ipynb` Part 1 on the GENCODE v47 GTF file. |\n", "\n", - "warnings.filterwarnings(\"ignore\")" + "---" ] }, { "cell_type": "markdown", - "id": "01a546f6", + "id": "ffb9ba7a", "metadata": {}, "source": [ - "Before setting the `DATA_DIR` path, ensure the following directory structure (containing the files from the required pre-processing steo) is in place:\n", + "### 4. 
Downloaded Data Integrity Check\n", + "\n", + "Run the following cell to ensure that the `DATA_DIR` path structure (containing the files from the required pre-processing step) is in place:\n", "\n", "```\n", "📁 DATA_DIR/\n", @@ -174,20 +430,92 @@ " │ └── {chrom}.tsv.gz (chr1-22, chrX, chrY)\n", " └── 📁 gnomad.genomes.v4.1/\n", " └── {chrom}.tsv.gz (chr1-22, chrX, chrY)\n", - "```\n" + "```" ] }, { "cell_type": "code", - "execution_count": 74, - "id": "48d31ff4", + "execution_count": null, + "id": "b28b4e2d", "metadata": {}, "outputs": [], "source": [ - "DATA_DIR = \"/data/for_paper/data\" # set this to the path of your data directory\n", + "expected_files = [\n", + " \"alphamissense_data/AlphaMissense_hg19.tsv.gz\",\n", + " \"alphamissense_data/alphamissense_cancer_hotspot.csv\",\n", + " \"alphamissense_data/alphamissense_clinvar.csv\",\n", + " \"ddd_asd_zhouetal/asd_discov.csv\",\n", + " \"ddd_asd_zhouetal/asd_rep.csv\",\n", + " \"ddd_asd_zhouetal/ddd_other.csv\",\n", + " \"clinvar_syn/variant_summary.txt.gz\",\n", + " \"clinvar_syn/ucsc_refseq_hg38.tsv\",\n", + " \"clinvar_syn/ucsc_refseq_hist_hg38.tsv\",\n", + " \"reference/hg19/hg19.fa\",\n", + " \"reference/hg19/hg19.fa.fai\",\n", + " \"reference/hg38/hg38.fa\",\n", + " \"reference/hg38/hg38.fa.fai\",\n", + " \"codon_counts_nopathogen.json\",\n", + " \"gencode.v47lift37.basic.annotation.processed.tsv\",\n", + " \"ucsc_gencodev32_hg38.tsv\",\n", + " \"ucsc_pliByGene_hg38.tsv\",\n", + " \"hg38.phyloP447way.bw\",\n", + "]\n", "\n", - "OUTPUT_DIR = \"/data/for_paper/mutation_datasets\" # output directory where all processed datasets will be saved\n", - "os.makedirs(OUTPUT_DIR, exist_ok=True)" + "missing = [f for f in expected_files if not os.path.exists(os.path.join(DATA_DIR, f))]\n", + "if missing:\n", + " print(f\"{len(missing)} file(s) missing from {DATA_DIR}:\")\n", + " for f in missing:\n", + " print(f\" ✗ {f}\")\n", + " raise FileNotFoundError(f\"{len(missing)} required file(s) missing — see list 
above.\")\n", + "else:\n", + " print(f\"All {len(expected_files)} required files found in {DATA_DIR}.\")" + ] + }, + { + "cell_type": "markdown", + "id": "8d094b99", + "metadata": {}, + "source": [ + "# Imports and Paths setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "8dfcfbe1", + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment to install PyBigWig\n", + "# !pip install pyBigWig" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5dae6998", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import json\n", + "import os\n", + "import warnings\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import polars as pl\n", + "import pyBigWig\n", + "import pyfaidx\n", + "import seaborn as sns\n", + "from Bio.Data import CodonTable\n", + "from Bio.Seq import Seq\n", + "from matplotlib.ticker import LogLocator\n", + "from tqdm import tqdm\n", + "\n", + "\n", + "warnings.filterwarnings(\"ignore\")" ] }, { From d7c7bd1cab794a57d3f96854d5a16aa22d5d8349 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Wed, 25 Feb 2026 04:22:16 -0800 Subject: [PATCH 06/13] Improve check_codon_frequency --- .../data_scripts/check_codon_frequency.py | 73 ++++++++++--------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py index 5c130a4f96..567fcc73d5 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py @@ -15,6 +15,7 @@ # %% +import argparse import json import sys from pathlib import Path @@ -26,38 +27,44 @@ sys.path.append("/workspace/codonfm") from src.tokenizer import Tokenizer +def main(data_dir: Path): + data_path = data_dir / Path("processed_unfiltered") + tax_ids_to_remove = 
json.load(open(data_dir / Path("taxids_to_remove.json"))) + metadata = json.load(open(data_path / "metadata.json")) + tokenizer = Tokenizer() -data_path = Path("/data/ncbi/processed_unfiltered") -tax_ids_to_remove = json.load(open("/data/ncbi/taxids_to_remove.json")) -metadata = json.load(open(data_path / "metadata.json")) -tokenizer = Tokenizer() - - -groups = set([x["file_name"][:-4] for x in metadata["file_metadata"]]) # noqa: C403 -counts = {g: np.zeros(tokenizer.vocab_size) for g in groups} -for fm, cm in tqdm(zip(metadata["file_metadata"], metadata["chunks"]), total=len(metadata["file_metadata"])): - group = fm["file_name"][:-4] - if group in tax_ids_to_remove: - curr_taxids_to_remove = set(tax_ids_to_remove[group]) - else: - curr_taxids_to_remove = set() - mmap = np.memmap( - data_path / cm["sequences"]["path"], - dtype=cm["sequences"]["dtype"], - mode="r", - shape=tuple(cm["sequences"]["shape"]), - ) - idx_mmap = np.memmap( - data_path / cm["index"]["path"], dtype=cm["index"]["dtype"], mode="r", shape=tuple(cm["index"]["shape"]) - ) - for start, end, taxid in idx_mmap: - if taxid in curr_taxids_to_remove: - continue - seq = mmap[start:end] - idx, count = np.unique(seq, return_counts=True) - counts[group][idx] += count + groups = set([x["file_name"][:-4] for x in metadata["file_metadata"]]) # noqa: C403 + counts = {g: np.zeros(tokenizer.vocab_size) for g in groups} + for fm, cm in tqdm(zip(metadata["file_metadata"], metadata["chunks"]), total=len(metadata["file_metadata"])): + group = fm["file_name"][:-4] + if group in tax_ids_to_remove: + curr_taxids_to_remove = set(tax_ids_to_remove[group]) + else: + curr_taxids_to_remove = set() + mmap = np.memmap( + data_path / cm["sequences"]["path"], + dtype=cm["sequences"]["dtype"], + mode="r", + shape=tuple(cm["sequences"]["shape"]), + ) + idx_mmap = np.memmap( + data_path / cm["index"]["path"], dtype=cm["index"]["dtype"], mode="r", shape=tuple(cm["index"]["shape"]) + ) + for start, end, taxid in idx_mmap: + if taxid 
in curr_taxids_to_remove: + continue + seq = mmap[start:end] + idx, count = np.unique(seq, return_counts=True) + counts[group][idx] += count -# %% -for g in counts: - counts[g] = counts[g].tolist() -json.dump(counts, open("/data/ncbi/codon_counts_nopathogen.json", "w")) + # %% + for g in counts: + counts[g] = counts[g].tolist() + json.dump(counts, open("/data/ncbi/codon_counts_nopathogen.json", "w")) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Check codon frequency") + parser.add_argument("--data_dir", type=str, required=True) + args = parser.parse_args() + main(Path(args.data_dir)) \ No newline at end of file From edf20bbf8d308edeff4e73972f9595eb0a6b5c20 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Wed, 25 Feb 2026 07:00:24 -0800 Subject: [PATCH 07/13] fix memmap creator --- .../codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py index e7f4b8b1bb..183aaee376 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py @@ -17,6 +17,7 @@ import argparse import json import os +import sys from multiprocessing import Pool, cpu_count import numpy as np @@ -24,6 +25,7 @@ import pyarrow.parquet as pq from tqdm import tqdm +sys.path.append("/workspace/codonfm") from src.tokenizer import Tokenizer From 617e82fa30a0216b1c4d2d0d2992a4829f9d6689 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Wed, 25 Feb 2026 12:31:44 -0800 Subject: [PATCH 08/13] added new notebook --- .../000-Annotation-File-Processing.ipynb | 589 ++++++++++++++++++ 1 file changed, 589 insertions(+) create mode 100644 bionemo-recipes/recipes/codonfm_ptl_te/notebooks/000-Annotation-File-Processing.ipynb diff --git 
a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/000-Annotation-File-Processing.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/000-Annotation-File-Processing.ipynb new file mode 100644 index 0000000000..2602b47b02 --- /dev/null +++ b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/000-Annotation-File-Processing.ipynb @@ -0,0 +1,589 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 38, + "id": "92fb3d66", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GENCODE version: v47\n", + "Reference directory: /data/for_paper/data/reference\n" + ] + } + ], + "source": [ + "# Annotation File Processing\n", + "# This notebook processes GENCODE GTF annotation files to extract protein-coding\n", + "# transcript information and exports it in a tabular format suitable for downstream analysis.\n", + "\n", + "import polars as pl\n", + "\n", + "# =============================================================================\n", + "# Configuration\n", + "# =============================================================================\n", + "REFERENCE_DIR = \"/data/for_paper/data/reference\"\n", + "GENCODE_VERSION = \"v47\"\n", + "\n", + "# Input: GENCODE GTF files (downloaded from https://www.gencodegenes.org/)\n", + "GTF_FILES = {\n", + " \"hg38\": f\"{REFERENCE_DIR}/gencode.{GENCODE_VERSION}.basic.annotation.gtf.gz\",\n", + " \"hg19\": f\"{REFERENCE_DIR}/gencode.{GENCODE_VERSION}lift37.basic.annotation.gtf.gz\",\n", + "}\n", + "\n", + "# Output: Processed annotation TSV files\n", + "OUTPUT_FILES = {\n", + " \"hg38\": f\"{REFERENCE_DIR}/gencode.{GENCODE_VERSION}.basic.annotation.processed.tsv\",\n", + " \"hg19\": f\"{REFERENCE_DIR}/gencode.{GENCODE_VERSION}lift37.basic.annotation.processed.tsv\",\n", + "}\n", + "\n", + "print(f\"GENCODE version: {GENCODE_VERSION}\")\n", + "print(f\"Reference directory: {REFERENCE_DIR}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "562e1104", + 
"metadata": {}, + "outputs": [], + "source": [ + "# Helper functions to validate and adjust CDS (coding sequence) boundaries\n", + "# These functions ensure that the exon coordinates properly include the stop codon (3 bp)\n", + "\n", + "def check_start_alignment(row):\n", + " \"\"\"\n", + " Validates that CDS start aligns with exon start and adjusts for stop codon on minus strand.\n", + " For minus strand genes, the stop codon is at the 3' end (lowest genomic position), so we subtract 3 bp.\n", + " \"\"\"\n", + " cds_start = row['cds_start']\n", + " exon_starts = list(map(int, row['exon_starts'].strip(',').split(',')))\n", + " if row['strand'] == '-':\n", + " # Extend first exon by 3 bp to include stop codon (on minus strand)\n", + " exon_starts[0] -= 3\n", + " assert cds_start == exon_starts[0], f\"{cds_start} != {exon_starts[0]} {row['transcript_id']}\"\n", + "\n", + " exon_starts = ','.join(map(str, exon_starts)) + ','\n", + " return exon_starts\n", + "\n", + "def check_end_alignment(row):\n", + " \"\"\"\n", + " Validates that CDS end aligns with exon end and adjusts for stop codon on plus strand.\n", + " For plus strand genes, the stop codon is at the 3' end (highest genomic position), so we add 3 bp.\n", + " \"\"\"\n", + " cds_end = row['cds_end']\n", + " exon_ends = list(map(int, row['exon_ends'].strip(',').split(',')))\n", + " if row['strand'] == '+':\n", + " # Extend last exon by 3 bp to include stop codon (on plus strand)\n", + " exon_ends[-1] += 3\n", + " assert cds_end == exon_ends[-1], f\"{cds_end} != {exon_ends[-1]} {row['transcript_id']}\"\n", + "\n", + " exon_ends = ','.join(map(str, exon_ends)) + ','\n", + " return exon_ends" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "af9b630f", + "metadata": {}, + "outputs": [], + "source": [ + "def process_gtf_file(gtf_file, output_file):\n", + " \"\"\"\n", + " Process a GENCODE GTF file and extract protein-coding transcript annotations.\n", + " \n", + " This function:\n", + " 1. 
Parses the GTF file and extracts relevant attributes\n", + " 2. Filters for protein-coding genes only\n", + " 3. Aggregates exon/CDS coordinates per transcript\n", + " 4. Adjusts coordinates to include stop codons\n", + " 5. Outputs a tab-separated file with transcript annotations\n", + " \n", + " Args:\n", + " gtf_file: Path to input GENCODE GTF file (can be gzipped)\n", + " output_file: Path for output TSV file\n", + " \"\"\"\n", + " # Read GTF file (standard 9-column format)\n", + " gtf = pl.read_csv(gtf_file, comment_prefix='#', separator='\\t', has_header=False)\n", + " gtf.columns = ['chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']\n", + " \n", + " # Parse the attribute column (column 9) to extract key-value pairs\n", + " # GTF attributes are semicolon-separated with format: key \"value\"\n", + " gtf = gtf.with_columns([\n", + " pl.col('attribute').str.extract('gene_id \"(.*?)\"', 1).alias('gene_id'),\n", + " pl.col('attribute').str.extract('transcript_id \"(.*?)\"', 1).alias('transcript_id'),\n", + " pl.col('attribute').str.extract('gene_name \"(.*?)\"', 1).alias('gene_name'),\n", + " pl.col('attribute').str.extract('gene_type \"(.*?)\"', 1).alias('gene_type'),\n", + " pl.col('attribute').str.extract('transcript_type \"(.*?)\"', 1).alias('transcript_type'),\n", + " pl.col('attribute').str.extract('exon_number (.*?);', 1).alias('exon_number')\n", + " ])\n", + " \n", + " # Flag canonical transcripts (Ensembl canonical and MANE Select)\n", + " gtf = gtf.with_columns(pl.col('attribute').str.contains('Ensembl_canonical').alias('is_canonical'))\n", + " gtf = gtf.with_columns(pl.col('attribute').str.contains('MANE_Select').alias('is_mane_select'))\n", + " \n", + " # Filter to protein-coding genes only, exclude gene-level features\n", + " protein_coding_gtf = gtf.filter((pl.col('gene_type') == 'protein_coding') & (pl.col('feature') != 'gene'))\n", + " # Filter to protein-coding transcripts only\n", + " protein_coding_gtf = 
protein_coding_gtf.filter(pl.col('transcript_type') == 'protein_coding')\n", + " \n", + " # Convert from 1-based (GTF) to 0-based coordinates (BED-like format)\n", + " protein_coding_gtf = protein_coding_gtf.with_columns(pl.col('start') - 1)\n", + " \n", + " # Aggregate CDS exon coordinates per transcript\n", + " # Creates comma-separated lists of exon start/end positions (sorted by genomic position)\n", + " exon_starts = protein_coding_gtf.filter(pl.col('feature') == 'CDS').group_by('transcript_id').agg(\n", + " (pl.col('start').sort().cast(str).str.join(',') + ',').alias('exon_starts'),\n", + " (pl.col('end').sort().cast(str).str.join(',') + ',').alias('exon_ends'),\n", + " pl.col('exon_number').max().alias('exon_count')\n", + " )\n", + " \n", + " # Calculate CDS boundaries with stop codon adjustment\n", + " # GENCODE GTF excludes stop codon from CDS, but we want to include it\n", + " # For + strand: stop codon is after the last CDS position (add 3 to max end)\n", + " # For - strand: stop codon is before the first CDS position (subtract 3 from min start)\n", + " # Note: Using min()/max() instead of first()/last() to avoid dependency on row order\n", + " cds_starts = protein_coding_gtf.filter(pl.col('feature') == 'CDS').group_by('transcript_id').agg(\n", + " pl.when(pl.col('strand').first() == '-')\n", + " .then(pl.col('start').min() - 3) # Include stop codon at 3' end (lowest genomic position)\n", + " .otherwise(pl.col('start').min()) # 5' end, no adjustment needed\n", + " .alias('cds_start'),\n", + " pl.when(pl.col('strand').first() == '-')\n", + " .then(pl.col('end').max()) # 5' end (highest genomic position), no adjustment\n", + " .otherwise(pl.col('end').max() + 3) # Include stop codon at 3' end\n", + " .alias('cds_end'),\n", + " )\n", + " \n", + " # Get transcript-level metadata (gene info, coordinates, canonical status)\n", + " tx_starts = protein_coding_gtf.filter(pl.col('feature') == 'transcript').group_by('transcript_id').agg(\n", + " 
pl.col('gene_id').first().alias('gene_id'),\n", + " pl.col('gene_name').first().alias('gene_name'),\n", + " pl.col('chrom').first().alias('chrom'),\n", + " pl.col('strand').first().alias('strand'),\n", + " pl.col('start').min().alias('tx_start'),\n", + " pl.col('end').max().alias('tx_end'),\n", + " pl.col('transcript_type').first().alias('transcript_type'),\n", + " pl.col('is_canonical').first().alias('is_canonical'),\n", + " pl.col('is_mane_select').first().alias('is_mane_select'),\n", + " )\n", + " \n", + " # Join all transcript information together\n", + " joined_df = tx_starts.join(cds_starts, on='transcript_id', how='inner')\\\n", + " .join(exon_starts, on='transcript_id', how='inner')\n", + " \n", + " # Validate and adjust exon coordinates to include stop codon\n", + " joined_df = joined_df.with_columns(\n", + " pl.struct(['cds_start', 'exon_starts', 'strand', 'transcript_id']).map_elements(check_start_alignment, return_dtype=pl.Utf8).alias('exon_starts'),\n", + " pl.struct(['cds_end', 'exon_ends', 'strand', 'transcript_id']).map_elements(check_end_alignment, return_dtype=pl.Utf8).alias('exon_ends')\n", + " )\n", + " \n", + " # Sort by chromosome and position, then select and rename columns for output\n", + " joined_df = joined_df.sort(['chrom', 'tx_start'])\n", + " joined_df = joined_df.select([\n", + " 'gene_id',\n", + " 'transcript_id',\n", + " 'chrom',\n", + " 'strand',\n", + " 'tx_start',\n", + " 'tx_end', \n", + " 'cds_start',\n", + " 'cds_end',\n", + " 'exon_count',\n", + " 'exon_starts',\n", + " 'exon_ends',\n", + " 'gene_name',\n", + " 'transcript_type',\n", + " 'is_canonical',\n", + " 'is_mane_select'\n", + " ]).rename({\n", + " 'transcript_id': 'name', # Transcript ID becomes the 'name' field\n", + " 'tx_start': 'txStart', # Transcript start position\n", + " 'tx_end': 'txEnd', # Transcript end position\n", + " 'cds_start': 'cdsStart', # CDS start (including stop codon adjustment)\n", + " 'cds_end': 'cdsEnd', # CDS end (including stop codon 
adjustment)\n", + " 'exon_starts': 'exonStarts', # Comma-separated exon start positions\n", + " 'exon_ends': 'exonEnds' # Comma-separated exon end positions\n", + " })\n", + "\n", + " # Write output as tab-separated file\n", + " joined_df.write_csv(output_file, separator='\\t')\n", + "\n", + " return joined_df" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "62147898", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing hg38: /data/for_paper/data/reference/gencode.v47.basic.annotation.gtf.gz\n", + "Output saved to: /data/for_paper/data/reference/gencode.v47.basic.annotation.processed.tsv\n", + "\n", + "Processing hg19: /data/for_paper/data/reference/gencode.v47lift37.basic.annotation.gtf.gz\n", + "Output saved to: /data/for_paper/data/reference/gencode.v47lift37.basic.annotation.processed.tsv\n", + "\n" + ] + } + ], + "source": [ + "# Process GENCODE annotation files for both GRCh38 (hg38) and GRCh37 (hg19) assemblies\n", + "\n", + "for assembly in [\"hg38\", \"hg19\"]:\n", + " gtf_file = GTF_FILES[assembly]\n", + " output_file = OUTPUT_FILES[assembly]\n", + " \n", + " print(f\"Processing {assembly}: {gtf_file}\")\n", + " _ = process_gtf_file(gtf_file, output_file)\n", + " print(f\"Output saved to: {output_file}\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "10eb865d", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Part 1: Quality Control of CDS and Extract CDS Sequence\n", + "\n", + "This section extracts CDS sequences from the reference genome and validates them for downstream variant analysis.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "1c02b365", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reference genome: /data/for_paper/data/reference/hg38/hg38.fa\n", + "Annotation file: /data/for_paper/data/reference/gencode.v47.basic.annotation.processed.tsv\n", + "Output file: 
/data/for_paper/data/reference/gencode.v47.basic.annotation.processed.filtered.tsv\n" + ] + } + ], + "source": [ + "# =============================================================================\n", + "# Additional Configuration for CDS Extraction\n", + "# =============================================================================\n", + "\n", + "# Reference genome (GRCh38/hg38)\n", + "REFERENCE_GENOME = f\"{REFERENCE_DIR}/hg38/hg38.fa\"\n", + "\n", + "# Input: Processed annotation from above\n", + "ANNOTATION_FILE = OUTPUT_FILES[\"hg38\"]\n", + "\n", + "# Output: Filtered transcripts with CDS sequences\n", + "FILTERED_TRANSCRIPTS_FILE = f\"{REFERENCE_DIR}/gencode.{GENCODE_VERSION}.basic.annotation.processed.filtered.tsv\"\n", + "\n", + "# Valid chromosomes for analysis\n", + "VALID_CHROMS = [f'chr{i}' for i in range(1, 23)] + ['chrX']\n", + "\n", + "# DNA complement mapping for reverse complement operations\n", + "COMPLEMENT = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}\n", + "\n", + "print(f\"Reference genome: {REFERENCE_GENOME}\")\n", + "print(f\"Annotation file: {ANNOTATION_FILE}\")\n", + "print(f\"Output file: {FILTERED_TRANSCRIPTS_FILE}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "486819ee", + "metadata": {}, + "outputs": [], + "source": [ + "# =============================================================================\n", + "# CDS Extraction and Quality Control Functions\n", + "# =============================================================================\n", + "\n", + "def extract_cds_sequence(row, fasta):\n", + " \"\"\"\n", + " Extract the coding sequence (CDS) for a transcript from the reference genome.\n", + " \n", + " This function:\n", + " 1. Iterates through exons and extracts only the CDS-overlapping portions\n", + " 2. Concatenates exon sequences in genomic order\n", + " 3. 
Reverse complements for minus strand genes\n", + " \n", + " Args:\n", + " row: DataFrame row containing transcript annotation (chrom, strand, cdsStart, cdsEnd, exonStarts, exonEnds)\n", + " fasta: Dictionary mapping chromosome names to their sequences\n", + " \n", + " Returns:\n", + " str: The complete CDS sequence in 5' to 3' orientation (transcript strand)\n", + " \"\"\"\n", + " chrom = row['chrom']\n", + " strand = row['strand']\n", + " cds_start = row['cdsStart']\n", + " cds_end = row['cdsEnd']\n", + " \n", + " # Parse comma-separated exon coordinates from annotation file\n", + " exon_starts = [int(x) for x in row['exonStarts'].rstrip(',').split(',')]\n", + " exon_ends = [int(x) for x in row['exonEnds'].rstrip(',').split(',')]\n", + "\n", + " # Ensure exon boundaries encompass the full CDS (handles edge cases)\n", + " if exon_starts[0] > cds_start:\n", + " exon_starts[0] = cds_start\n", + " if exon_ends[-1] < cds_end:\n", + " exon_ends[-1] = cds_end\n", + "\n", + " # Extract CDS sequence by iterating through exons\n", + " cds_sequence = \"\"\n", + " \n", + " for start, end in zip(exon_starts, exon_ends):\n", + " # Find overlap between this exon and the CDS region\n", + " overlap_start = max(start, cds_start)\n", + " overlap_end = min(end, cds_end)\n", + " \n", + " if overlap_start < overlap_end:\n", + " # Extract sequence from this exon segment (0-based coordinates)\n", + " seq = str(fasta[chrom][overlap_start:overlap_end]).upper()\n", + " cds_sequence += seq\n", + " \n", + " # For minus strand genes, reverse complement to get 5' to 3' orientation\n", + " if strand == '-':\n", + " cds_sequence = ''.join(COMPLEMENT[base] for base in cds_sequence[::-1])\n", + " \n", + " return cds_sequence\n", + "\n", + "\n", + "def check_cds_quality(sequence):\n", + " \"\"\"\n", + " Validate CDS sequence quality for downstream variant analysis.\n", + " \n", + " Quality criteria checked:\n", + " 1. Starts with ATG (methionine start codon)\n", + " 2. 
Ends with a stop codon (TAA, TAG, or TGA)\n", + " 3. Length is divisible by 3 (complete codons)\n", + " 4. No premature stop codons within the coding region\n", + " \n", + " Args:\n", + " sequence: CDS nucleotide sequence string\n", + " \n", + " Returns:\n", + " dict: Quality metrics including boolean flags and sequence length\n", + " \"\"\"\n", + " if not sequence or len(sequence) < 3:\n", + " return {\n", + " 'has_start_codon': False,\n", + " 'has_stop_codon': False,\n", + " 'length_divisible_by_3': False,\n", + " 'has_internal_stop_codons': False,\n", + " 'length': len(sequence) if sequence else 0\n", + " }\n", + " \n", + " # Check for canonical start codon (ATG = Methionine)\n", + " has_start_codon = sequence[:3] == 'ATG'\n", + " \n", + " # Check for stop codon at the end\n", + " has_stop_codon = sequence[-3:] in ['TAA', 'TAG', 'TGA']\n", + " \n", + " # CDS should be in-frame (length divisible by 3)\n", + " length_divisible_by_3 = len(sequence) % 3 == 0\n", + " \n", + " # Check for internal stop codons (premature termination)\n", + " # These indicate potential annotation errors or pseudogenes\n", + " has_internal_stop_codons = False\n", + " if len(sequence) >= 6: # Need at least 2 codons to check for internal stops\n", + " # Check all codons except the last one (which should be a stop)\n", + " for i in range(0, len(sequence) - 3, 3):\n", + " codon = sequence[i:i+3]\n", + " if codon in ['TAA', 'TAG', 'TGA']:\n", + " has_internal_stop_codons = True\n", + " break\n", + " \n", + " return {\n", + " 'has_start_codon': has_start_codon,\n", + " 'has_stop_codon': has_stop_codon,\n", + " 'length_divisible_by_3': length_divisible_by_3,\n", + " 'has_internal_stop_codons': has_internal_stop_codons,\n", + " 'length': len(sequence)\n", + " }\n" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "7a083e5e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 23 chromosomes from 
/data/for_paper/data/reference/hg38/hg38.fa\n" + ] + } + ], + "source": [ + "# =============================================================================\n", + "# Load Reference Genome (GRCh38/hg38)\n", + "# =============================================================================\n", + "# Pre-load chromosome sequences into memory for faster access during variant generation.\n", + "# Only loading standard chromosomes (1-22, X, Y) - excluding patches and alternate contigs.\n", + "\n", + "import pyfaidx\n", + "fasta = {}\n", + "\n", + "with pyfaidx.Fasta(REFERENCE_GENOME) as f:\n", + " for chrom in VALID_CHROMS:\n", + " fasta[chrom] = f[chrom][:].seq\n", + "print(f\"Loaded {len(fasta)} chromosomes from {REFERENCE_GENOME}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "9f350338", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 64,488 transcripts from /data/for_paper/data/reference/gencode.v47.basic.annotation.processed.tsv\n", + "After deduplicating by genomic structure: 51,650 transcripts\n", + "Extracting CDS sequences...\n", + "Running quality checks...\n", + "\n", + "============================================================\n", + "CDS Quality Summary (before filtering):\n", + "============================================================\n", + "Total transcripts: 51,650\n", + "Has start codon (ATG): 51,419 (99.6%)\n", + "Has stop codon (TAA/TAG/TGA): 51,341 (99.4%)\n", + "Length divisible by 3: 51,573 (99.9%)\n", + "Has internal stop codons: 113 (0.2%)\n", + "All quality criteria met: 51,061\n", + "\n", + "After quality filtering: 51,061 transcripts\n", + "After filtering to canonical transcripts: 19,407 transcripts\n", + "After canonical filter + CDS deduplication: 19,310 unique transcripts\n", + " (Removed 31,751 transcripts)\n", + "Saved 19,310 unique transcripts to /data/for_paper/data/reference/gencode.v47.basic.annotation.processed.filtered.tsv\n" + ] + } + ], + 
"source": [ + "# =============================================================================\n", + "# Load Annotations, Extract CDS Sequences, and Apply Quality Filters\n", + "# =============================================================================\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# Step 1: Load processed GENCODE annotation\n", + "# -----------------------------------------------------------------------------\n", + "# Input: TSV file from the processing above containing\n", + "# transcript coordinates, exon boundaries, and canonical status flags\n", + "ann = pl.read_csv(ANNOTATION_FILE, separator='\\t')\n", + "ann = ann.filter(pl.col('chrom').is_in(VALID_CHROMS))\n", + "print(f\"Loaded {len(ann):,} transcripts from {ANNOTATION_FILE}\")\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# Step 2: Deduplicate transcripts with identical genomic structure\n", + "# -----------------------------------------------------------------------------\n", + "# Multiple transcript IDs can map to the same CDS coordinates (e.g., RefSeq vs Ensembl)\n", + "# Keep MANE Select > Ensembl Canonical when duplicates exist\n", + "ann = ann.sort(['is_mane_select', 'is_canonical'], descending=True)\n", + "ann = ann.unique(subset=['chrom', 'strand', 'cdsStart', 'cdsEnd', 'exonStarts', 'exonEnds'])\n", + "print(f\"After deduplicating by genomic structure: {len(ann):,} transcripts\")\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# Step 3: Extract CDS sequences from reference genome\n", + "# -----------------------------------------------------------------------------\n", + "print(\"Extracting CDS sequences...\")\n", + "sequences = [extract_cds_sequence(row, fasta) for row in ann.iter_rows(named=True)]\n", + "ann = ann.with_columns(pl.Series(\"cds_sequence\", sequences))\n", + "\n", + "# 
-----------------------------------------------------------------------------\n", + "# Step 4: Quality control - validate CDS sequences\n", + "# -----------------------------------------------------------------------------\n", + "print(\"Running quality checks...\")\n", + "quality_checks = [check_cds_quality(row['cds_sequence']) for row in ann.iter_rows(named=True)]\n", + "\n", + "ann = ann.with_columns([\n", + " pl.Series(\"has_start_codon\", [q['has_start_codon'] for q in quality_checks]),\n", + " pl.Series(\"has_stop_codon\", [q['has_stop_codon'] for q in quality_checks]),\n", + " pl.Series(\"length_divisible_by_3\", [q['length_divisible_by_3'] for q in quality_checks]),\n", + " pl.Series(\"has_internal_stop_codons\", [q['has_internal_stop_codons'] for q in quality_checks]),\n", + " pl.Series(\"cds_length\", [q['length'] for q in quality_checks])\n", + "])\n", + "\n", + "# Print quality summary\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"CDS Quality Summary (before filtering):\")\n", + "print(\"=\"*60)\n", + "print(f\"Total transcripts: {len(ann):,}\")\n", + "print(f\"Has start codon (ATG): {ann['has_start_codon'].sum():,} ({ann['has_start_codon'].mean()*100:.1f}%)\")\n", + "print(f\"Has stop codon (TAA/TAG/TGA): {ann['has_stop_codon'].sum():,} ({ann['has_stop_codon'].mean()*100:.1f}%)\")\n", + "print(f\"Length divisible by 3: {ann['length_divisible_by_3'].sum():,} ({ann['length_divisible_by_3'].mean()*100:.1f}%)\")\n", + "print(f\"Has internal stop codons: {ann['has_internal_stop_codons'].sum():,} ({ann['has_internal_stop_codons'].mean()*100:.1f}%)\")\n", + "print(f\"All quality criteria met: {(ann['has_start_codon'] & ann['has_stop_codon'] & ann['length_divisible_by_3'] & ~ann['has_internal_stop_codons']).sum():,}\")\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# Step 5: Apply quality filters\n", + "# -----------------------------------------------------------------------------\n", + "# Keep only 
transcripts that pass all quality checks\n", + "ann = ann.filter(\n", + " pl.col('has_start_codon') & \n", + " pl.col('has_stop_codon') & \n", + " pl.col('length_divisible_by_3') & \n", + " ~pl.col('has_internal_stop_codons')\n", + ")\n", + "print(f\"\\nAfter quality filtering: {len(ann):,} transcripts\")\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# Step 6: Filter to canonical transcripts and deduplicate by CDS sequence\n", + "# -----------------------------------------------------------------------------\n", + "# Keep only MANE Select or Ensembl Canonical transcripts\n", + "# Then deduplicate by CDS sequence (different transcripts can encode identical proteins)\n", + "initial_count = len(ann)\n", + "ann = ann.filter(pl.col('is_mane_select') | pl.col('is_canonical'))\n", + "print(f\"After filtering to canonical transcripts: {len(ann):,} transcripts\")\n", + "\n", + "ann = ann.sort(['is_mane_select', 'is_canonical'], descending=True)\n", + "ann = ann.unique(subset=['cds_sequence'], keep='first')\n", + "ann = ann.sort(['chrom', 'txStart'])\n", + "\n", + "print(f\"After canonical filter + CDS deduplication: {len(ann):,} unique transcripts\")\n", + "print(f\" (Removed {initial_count - len(ann):,} transcripts)\")\n", + "ann.write_csv(FILTERED_TRANSCRIPTS_FILE, separator='\\t')\n", + "print(f\"Saved {len(ann):,} unique transcripts to {FILTERED_TRANSCRIPTS_FILE}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 60c642235b82173c526a9060db72d778edc81c02 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Thu, 26 Feb 2026 11:35:29 
-0800 Subject: [PATCH 09/13] updated nb --- .../data_scripts/check_codon_frequency.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py index 567fcc73d5..6e3d6d6634 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py @@ -27,10 +27,9 @@ sys.path.append("/workspace/codonfm") from src.tokenizer import Tokenizer -def main(data_dir: Path): - data_path = data_dir / Path("processed_unfiltered") +def main(pretraining_processed_data_dir: Path, data_dir: Path): tax_ids_to_remove = json.load(open(data_dir / Path("taxids_to_remove.json"))) - metadata = json.load(open(data_path / "metadata.json")) + metadata = json.load(open(pretraining_processed_data_dir / "metadata.json")) tokenizer = Tokenizer() groups = set([x["file_name"][:-4] for x in metadata["file_metadata"]]) # noqa: C403 @@ -42,13 +41,13 @@ def main(data_dir: Path): else: curr_taxids_to_remove = set() mmap = np.memmap( - data_path / cm["sequences"]["path"], + pretraining_processed_data_dir / cm["sequences"]["path"], dtype=cm["sequences"]["dtype"], mode="r", shape=tuple(cm["sequences"]["shape"]), ) idx_mmap = np.memmap( - data_path / cm["index"]["path"], dtype=cm["index"]["dtype"], mode="r", shape=tuple(cm["index"]["shape"]) + pretraining_processed_data_dir / cm["index"]["path"], dtype=cm["index"]["dtype"], mode="r", shape=tuple(cm["index"]["shape"]) ) for start, end, taxid in idx_mmap: if taxid in curr_taxids_to_remove: @@ -60,11 +59,12 @@ def main(data_dir: Path): # %% for g in counts: counts[g] = counts[g].tolist() - json.dump(counts, open("/data/ncbi/codon_counts_nopathogen.json", "w")) + json.dump(counts, open(data_dir / "codon_counts_nopathogen.json", "w")) if __name__ == "__main__": parser = 
argparse.ArgumentParser(description="Check codon frequency") + parser.add_argument("--pretraining_processed_data_dir", type=str, required=True) parser.add_argument("--data_dir", type=str, required=True) args = parser.parse_args() - main(Path(args.data_dir)) \ No newline at end of file + main(Path(args.pretraining_processed_data_dir), Path(args.data_dir)) \ No newline at end of file From 6ed39b73ff6783459e80c9340961adb2720d59b5 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Thu, 26 Feb 2026 11:39:33 -0800 Subject: [PATCH 10/13] updated notebook --- .../data_scripts/check_codon_frequency.py | 9 +- .../ncbi_memmap_dataset_creator.py | 1 + .../00-Mutation-Datasets-Preprocessing.ipynb | 171 +++++++-- .../000-Annotation-File-Processing.ipynb | 363 ++++++++++-------- 4 files changed, 356 insertions(+), 188 deletions(-) diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py index 6e3d6d6634..1354b2aa8f 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py @@ -27,7 +27,9 @@ sys.path.append("/workspace/codonfm") from src.tokenizer import Tokenizer + def main(pretraining_processed_data_dir: Path, data_dir: Path): + """Check codon frequency.""" tax_ids_to_remove = json.load(open(data_dir / Path("taxids_to_remove.json"))) metadata = json.load(open(pretraining_processed_data_dir / "metadata.json")) tokenizer = Tokenizer() @@ -47,7 +49,10 @@ def main(pretraining_processed_data_dir: Path, data_dir: Path): shape=tuple(cm["sequences"]["shape"]), ) idx_mmap = np.memmap( - pretraining_processed_data_dir / cm["index"]["path"], dtype=cm["index"]["dtype"], mode="r", shape=tuple(cm["index"]["shape"]) + pretraining_processed_data_dir / cm["index"]["path"], + dtype=cm["index"]["dtype"], + mode="r", + shape=tuple(cm["index"]["shape"]), ) for start, end, taxid 
in idx_mmap: if taxid in curr_taxids_to_remove: @@ -67,4 +72,4 @@ def main(pretraining_processed_data_dir: Path, data_dir: Path): parser.add_argument("--pretraining_processed_data_dir", type=str, required=True) parser.add_argument("--data_dir", type=str, required=True) args = parser.parse_args() - main(Path(args.pretraining_processed_data_dir), Path(args.data_dir)) \ No newline at end of file + main(Path(args.pretraining_processed_data_dir), Path(args.data_dir)) diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py index 183aaee376..9a34b66a80 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py @@ -25,6 +25,7 @@ import pyarrow.parquet as pq from tqdm import tqdm + sys.path.append("/workspace/codonfm") from src.tokenizer import Tokenizer diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb index 08b3b0d85a..65503c1939 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb +++ b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb @@ -52,6 +52,7 @@ "| Annotation File | Origin | Table |\n", "|----------------|--------|-------|\n", "| `gencode.v47lift37.basic.annotation.gtf` | [GENCODE Release 47lift37](https://www.gencodegenes.org/human/release_47lift37.html) | - |\n", + "| `gencode.v47.basic.annotation.gtf.gz` | [GENCODE Release 47](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/gencode.v47.basic.annotation.gtf.gz) | - |\n", "| `ucsc_gencodev32_hg38.tsv` | [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) | `wgEncodeGencodeCompV32` |\n", "| 
`ucsc_refseq_hg38.tsv` | [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) | `ncbiRefSeq` |\n", "| `ucsc_refseq_hist_hg38.tsv` | [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) | `ncbiRefSeqHistorical` |\n", @@ -69,8 +70,9 @@ "| `chd_mutation_ctrl.csv` | [Jin et al. 2017](https://pmc.ncbi.nlm.nih.gov/articles/PMC5675000/) | [Download](https://pmc.ncbi.nlm.nih.gov/articles/instance/5675000/bin/NIHMS906719-supplement-supp_datasets.xlsx) | Table S10 |\n", "| `Cosmic_Sample_v102_GRCh38.tsv.gz` | [COSMIC](https://cancer.sanger.ac.uk/cosmic) | [Download](https://cancer.sanger.ac.uk/cosmic/download) | Requires registration |\n", "| `Cosmic_MutantCensus_v102_GRCh38.tsv.gz` | [COSMIC](https://cancer.sanger.ac.uk/cosmic) | [Download](https://cancer.sanger.ac.uk/cosmic/download) | Requires registration |\n", - "| `gnomad.exomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [Download](https://gnomad.broadinstitute.org/downloads#v4) | Per-chromosome TSV files |\n", - "| `gnomad.genomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [Download](https://gnomad.broadinstitute.org/downloads#v4) | Per-chromosome TSV files |\n", + "| `gnomad.exomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [S3 VCFs](https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/) | Per-chromosome VCFs converted to TSV via `bcftools` (see gnomAD section below) |\n", + "| `gnomad.genomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [S3 VCFs](https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/) | Per-chromosome VCFs converted to TSV via `bcftools` (see gnomAD section below) |\n", + "\n", "\n", "##### ClinVar Synonymous Matching Features\n", "\n", @@ -101,7 +103,38 @@ "execution_count": null, "id": "713c7737", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"Reference genomes\n", + " [skip] reference/hg19/hg19.fa\n", + " [skip] reference/hg38/hg38.fa\n", + "GENCODE annotation\n", + " [skip] reference/gencode.v47lift37.basic.annotation.gtf.gz\n", + " [skip] reference/gencode.v47.basic.annotation.gtf.gz\n", + "DDD / ASD variant files\n", + " [skip] ddd_asd_zhouetal/asd_discov.csv\n", + " [skip] ddd_asd_zhouetal/asd_rep.csv\n", + " [skip] ddd_asd_zhouetal/ddd_other.csv\n", + "ClinVar variant summary\n", + " [skip] clinvar_syn/variant_summary.txt.gz\n", + " Downloading → reference/gnomad.v2.1.1.lof_metrics.by_gene.txt.txt ...\n", + "phyloP447way conservation scores\n", + " [skip] reference/hg38.phyloP447way.bw\n", + "hg19.100way.phyloP100way.bw conservation scores\n", + " Downloading → reference/hg19.100way.phyloP100way.bw ...\n", + "UCSC Table Browser downloads\n", + " [skip] reference/ucsc_gencodev32_hg38.tsv\n", + " [skip] reference/ucsc_refseq_hg38.tsv\n", + " [skip] reference/ucsc_refseq_hist_hg38.tsv\n", + " [skip] reference/ucsc_pliByGene_hg38.tsv\n", + "\n", + "Done.\n" + ] + } + ], "source": [ "import gzip\n", "import os\n", @@ -114,7 +147,7 @@ "\n", "# ── Set data directory ───────────────────────────────────────\n", "DATA_DIR = \"/data/ncbi\" # <-- change this to your preferred data root\n", - "OUTPUT_DIR = \"/data/for_paper/mutation_datasets\" # output directory where all processed datasets will be saved\n", + "OUTPUT_DIR = \"/data/ncbi/mutation_datasets\" # output directory where all processed datasets will be saved\n", "UCSC_API_KEY = \"\" # <-- set your UCSC API key for Table Browser downloads\n", "# ─────────────────────────────────────────────────────────────\n", "\n", @@ -162,10 +195,17 @@ "print(\"GENCODE annotation\")\n", "download_file(\n", " \"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/GRCh37_mapping/gencode.v47lift37.basic.annotation.gtf.gz\",\n", - " os.path.join(DATA_DIR, \"gencode.v47lift37.basic.annotation.gtf.gz\"),\n", - " decompress_gz=True,\n", + " 
os.path.join(DATA_DIR, \"reference/gencode.v47lift37.basic.annotation.gtf.gz\"),\n", + " decompress_gz=False,\n", + ")\n", + "\n", + "download_file(\n", + " \"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/gencode.v47.basic.annotation.gtf.gz\",\n", + " os.path.join(DATA_DIR, \"reference/gencode.v47.basic.annotation.gtf.gz\"),\n", + " decompress_gz=False,\n", ")\n", "\n", + "\n", "# ── 3. DDD / ASD variant files (Zhou et al. 2022, xlsx → csv)\n", "print(\"DDD / ASD variant files\")\n", "xlsx_sources = {\n", @@ -191,19 +231,33 @@ " os.path.join(DATA_DIR, \"clinvar_syn/variant_summary.txt.gz\"),\n", ")\n", "\n", - "# ── 5. phyloP conservation scores ───────────────────────────\n", + "# ── 5. ClinVar gnomAD ──────────────────────────────\n", + "download_file(\n", + " \"https://gnomad-public-us-east-1.s3.amazonaws.com/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz\",\n", + " os.path.join(DATA_DIR, \"reference/gnomad.v2.1.1.lof_metrics.by_gene.txt.txt\"),\n", + " decompress_gz=True,\n", + ")\n", + "\n", + "\n", + "# ── 6. phyloP conservation scores ───────────────────────────\n", "print(\"phyloP447way conservation scores\")\n", "download_file(\n", " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/phyloP447way/hg38.phyloP447way.bw\",\n", - " os.path.join(DATA_DIR, \"hg38.phyloP447way.bw\"),\n", + " os.path.join(DATA_DIR, \"reference/hg38.phyloP447way.bw\"),\n", ")\n", "\n", - "# ── 6. UCSC Table Browser downloads ─────────────────────────\n", + "print(\"hg19.100way.phyloP100way.bw conservation scores\")\n", + "download_file(\n", + " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/phyloP100way/hg19.100way.phyloP100way.bw\",\n", + " os.path.join(DATA_DIR, \"reference/hg19.100way.phyloP100way.bw\"),\n", + ")\n", + "\n", + "# ── 7. 
UCSC Table Browser downloads ─────────────────────────\n", "UCSC_URL = \"https://genome.ucsc.edu/cgi-bin/hgTables\"\n", "UCSC_TABLES = {\n", " \"wgEncodeGencodeCompV32\": {\n", " \"filename\": \"ucsc_gencodev32_hg38.tsv\",\n", - " \"subdir\": \"\",\n", + " \"subdir\": \"reference\",\n", " \"form\": {\n", " \"hgsid\": \"3727160771_KywqrMbVutzoVUyr47py53TcxZMg\", # pragma: allowlist secret\n", " \"clade\": \"mammal\",\n", @@ -220,7 +274,7 @@ " },\n", " \"ncbiRefSeq\": {\n", " \"filename\": \"ucsc_refseq_hg38.tsv\",\n", - " \"subdir\": \"clinvar_syn\",\n", + " \"subdir\": \"reference\",\n", " \"form\": {\n", " \"hgsid\": \"3727549177_A4TjXykIK1JRVnpjZ0HKtMVnKWw0\", # pragma: allowlist secret\n", " \"clade\": \"mammal\",\n", @@ -237,7 +291,7 @@ " },\n", " \"ncbiRefSeqHistorical\": {\n", " \"filename\": \"ucsc_refseq_hist_hg38.tsv\",\n", - " \"subdir\": \"clinvar_syn\",\n", + " \"subdir\": \"reference\",\n", " \"form\": {\n", " \"hgsid\": \"3727803393_8Oali1duOyVJT7DtAateRwtkg7Y0\", # pragma: allowlist secret\n", " \"clade\": \"mammal\",\n", @@ -254,7 +308,7 @@ " },\n", " \"pliByGene\": {\n", " \"filename\": \"ucsc_pliByGene_hg38.tsv\",\n", - " \"subdir\": \"\",\n", + " \"subdir\": \"reference\",\n", " \"form\": {\n", " \"hgsid\": \"3727823409_x06fwXO5XFeWrbFjKlSQTfU3I6F3\", # pragma: allowlist secret\n", " \"clade\": \"mammal\",\n", @@ -311,6 +365,24 @@ " f.writelines(lines)\n", " print(f\" [done] {os.path.relpath(dest, DATA_DIR)} ({len(lines):,} lines)\")\n", "\n", + "# ── 8. 
gnomAD v4.1 VCF files (exomes + genomes, chr1-22, X, Y) ──\n", + "GNOMAD_S3 = \"https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf\"\n", + "GNOMAD_CHROMS = [f\"chr{i}\" for i in range(1, 23)] + [\"chrX\", \"chrY\"]\n", + "gnomad_datasets = {\n", + " \"exomes\": os.path.join(DATA_DIR, \"gnomad\", \"gnomad.exomes.v4.1\"),\n", + " \"genomes\": os.path.join(DATA_DIR, \"gnomad\", \"gnomad.genomes.v4.1\"),\n", + "}\n", + "\n", + "for ds_type, out_dir in gnomad_datasets.items():\n", + " os.makedirs(out_dir, exist_ok=True)\n", + " print(f\"gnomAD {ds_type} VCFs\")\n", + " for chrom in GNOMAD_CHROMS:\n", + " vcf_name = f\"gnomad.{ds_type}.v4.1.sites.{chrom}.vcf.bgz\"\n", + " download_file(\n", + " f\"{GNOMAD_S3}/{ds_type}/{vcf_name}\",\n", + " os.path.join(out_dir, vcf_name),\n", + " )\n", + "\n", "print(\"\\nDone.\")" ] }, @@ -326,10 +398,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "8741cb10", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AlphaMissense data\n", + " Extracting zip → ['science.adg7492_data_captions.pdf', 'science.adg7492_data_s1_to_s4_and_s9.xlsx', 'science.adg7492_data_s5.csv', 'science.adg7492_data_s6.csv', 'science.adg7492_data_s7.csv', 'science.adg7492_data_s8.zip']\n", + " Renamed science.adg7492_data_s5.csv -> alphamissense_clinvar.csv\n", + " Renamed science.adg7492_data_s6.csv -> alphamissense_cancer_hotspot.csv\n" + ] + } + ], "source": [ "import zipfile\n", "\n", @@ -385,6 +468,24 @@ "---" ] }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e544a031-fec7-4765-8a33-2f26c415b5ac", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 68/68 [1:09:44<00:00, 61.53s/it]\n" + ] + } + ], + "source": [ + "%run ../data_scripts/check_codon_frequency.py --pretraining_processed_data_dir $DATA_DIR/pretraining/postprocessed/ --data_dir $DATA_DIR" + ] + }, { 
"cell_type": "markdown", "id": "ffb9ba7a", @@ -412,11 +513,12 @@ "│ ├── 📄 ucsc_gencodev32_hg38.tsv\n", "│ ├── 📄 ucsc_pliByGene_hg38.tsv\n", "│ ├── 📄 hg38.phyloP447way.bw\n", + "| |── 📄 hg19.100way.phyloP100way.bw\n", + "| |── 📄 gnomad.v2.1.1.lof_metrics.by_transcript.txt\n", "│ ├── ucsc_refseq_hg38.tsv\n", "│ ├── ucsc_refseq_hist_hg38.tsv\n", "│ ├── hg19/\n", "│ │ ├── hg19.fa\n", - "│ │ └── hg19.fa.fai\n", "│ └── hg38/\n", "│ ├── hg38.fa\n", "│ └── hg38.fa.fai\n", @@ -435,10 +537,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "b28b4e2d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5 file(s) missing from /data/balvisio/ncbi:\n", + " ✗ alphamissense_data/AlphaMissense_hg19.tsv.gz\n", + " ✗ reference/hg19/hg19.fa.fai\n", + " ✗ reference/hg38/hg38.fa.fai\n", + " ✗ codon_counts_nopathogen.json\n", + " ✗ gencode.v47lift37.basic.annotation.processed.tsv\n" + ] + }, + { + "ename": "FileNotFoundError", + "evalue": "5 required file(s) missing — see list above.", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 27\u001b[39m\n\u001b[32m 25\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m missing:\n\u001b[32m 26\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m ✗ \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mf\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m27\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(missing)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m required file(s) missing — see list above.\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 28\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 29\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mAll \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(expected_files)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m required files found in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mDATA_DIR\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m)\n", + "\u001b[31mFileNotFoundError\u001b[39m: 5 required file(s) missing — see list above." + ] + } + ], "source": [ "expected_files = [\n", " \"alphamissense_data/AlphaMissense_hg19.tsv.gz\",\n", @@ -448,10 +574,9 @@ " \"ddd_asd_zhouetal/asd_rep.csv\",\n", " \"ddd_asd_zhouetal/ddd_other.csv\",\n", " \"clinvar_syn/variant_summary.txt.gz\",\n", - " \"clinvar_syn/ucsc_refseq_hg38.tsv\",\n", - " \"clinvar_syn/ucsc_refseq_hist_hg38.tsv\",\n", + " \"reference/ucsc_refseq_hg38.tsv\",\n", + " \"reference/ucsc_refseq_hist_hg38.tsv\",\n", " \"reference/hg19/hg19.fa\",\n", - " \"reference/hg19/hg19.fa.fai\",\n", " \"reference/hg38/hg38.fa\",\n", " \"reference/hg38/hg38.fa.fai\",\n", " \"codon_counts_nopathogen.json\",\n", @@ -6559,7 +6684,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -6573,7 +6698,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.3" } }, "nbformat": 4, diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/000-Annotation-File-Processing.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/000-Annotation-File-Processing.ipynb index 
2602b47b02..70216b8057 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/000-Annotation-File-Processing.ipynb +++ b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/000-Annotation-File-Processing.ipynb @@ -22,6 +22,7 @@ "\n", "import polars as pl\n", "\n", + "\n", "# =============================================================================\n", "# Configuration\n", "# =============================================================================\n", @@ -54,34 +55,36 @@ "# Helper functions to validate and adjust CDS (coding sequence) boundaries\n", "# These functions ensure that the exon coordinates properly include the stop codon (3 bp)\n", "\n", + "\n", "def check_start_alignment(row):\n", " \"\"\"\n", " Validates that CDS start aligns with exon start and adjusts for stop codon on minus strand.\n", " For minus strand genes, the stop codon is at the 3' end (lowest genomic position), so we subtract 3 bp.\n", " \"\"\"\n", - " cds_start = row['cds_start']\n", - " exon_starts = list(map(int, row['exon_starts'].strip(',').split(',')))\n", - " if row['strand'] == '-':\n", + " cds_start = row[\"cds_start\"]\n", + " exon_starts = list(map(int, row[\"exon_starts\"].strip(\",\").split(\",\")))\n", + " if row[\"strand\"] == \"-\":\n", " # Extend first exon by 3 bp to include stop codon (on minus strand)\n", " exon_starts[0] -= 3\n", " assert cds_start == exon_starts[0], f\"{cds_start} != {exon_starts[0]} {row['transcript_id']}\"\n", "\n", - " exon_starts = ','.join(map(str, exon_starts)) + ','\n", + " exon_starts = \",\".join(map(str, exon_starts)) + \",\"\n", " return exon_starts\n", "\n", + "\n", "def check_end_alignment(row):\n", " \"\"\"\n", " Validates that CDS end aligns with exon end and adjusts for stop codon on plus strand.\n", " For plus strand genes, the stop codon is at the 3' end (highest genomic position), so we add 3 bp.\n", " \"\"\"\n", - " cds_end = row['cds_end']\n", - " exon_ends = list(map(int, row['exon_ends'].strip(',').split(',')))\n", - " 
if row['strand'] == '+':\n", + " cds_end = row[\"cds_end\"]\n", + " exon_ends = list(map(int, row[\"exon_ends\"].strip(\",\").split(\",\")))\n", + " if row[\"strand\"] == \"+\":\n", " # Extend last exon by 3 bp to include stop codon (on plus strand)\n", " exon_ends[-1] += 3\n", " assert cds_end == exon_ends[-1], f\"{cds_end} != {exon_ends[-1]} {row['transcript_id']}\"\n", "\n", - " exon_ends = ','.join(map(str, exon_ends)) + ','\n", + " exon_ends = \",\".join(map(str, exon_ends)) + \",\"\n", " return exon_ends" ] }, @@ -95,122 +98,145 @@ "def process_gtf_file(gtf_file, output_file):\n", " \"\"\"\n", " Process a GENCODE GTF file and extract protein-coding transcript annotations.\n", - " \n", + "\n", " This function:\n", " 1. Parses the GTF file and extracts relevant attributes\n", " 2. Filters for protein-coding genes only\n", " 3. Aggregates exon/CDS coordinates per transcript\n", " 4. Adjusts coordinates to include stop codons\n", " 5. Outputs a tab-separated file with transcript annotations\n", - " \n", + "\n", " Args:\n", " gtf_file: Path to input GENCODE GTF file (can be gzipped)\n", " output_file: Path for output TSV file\n", " \"\"\"\n", " # Read GTF file (standard 9-column format)\n", - " gtf = pl.read_csv(gtf_file, comment_prefix='#', separator='\\t', has_header=False)\n", - " gtf.columns = ['chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']\n", - " \n", + " gtf = pl.read_csv(gtf_file, comment_prefix=\"#\", separator=\"\\t\", has_header=False)\n", + " gtf.columns = [\"chrom\", \"source\", \"feature\", \"start\", \"end\", \"score\", \"strand\", \"frame\", \"attribute\"]\n", + "\n", " # Parse the attribute column (column 9) to extract key-value pairs\n", " # GTF attributes are semicolon-separated with format: key \"value\"\n", - " gtf = gtf.with_columns([\n", - " pl.col('attribute').str.extract('gene_id \"(.*?)\"', 1).alias('gene_id'),\n", - " pl.col('attribute').str.extract('transcript_id \"(.*?)\"', 
1).alias('transcript_id'),\n", - " pl.col('attribute').str.extract('gene_name \"(.*?)\"', 1).alias('gene_name'),\n", - " pl.col('attribute').str.extract('gene_type \"(.*?)\"', 1).alias('gene_type'),\n", - " pl.col('attribute').str.extract('transcript_type \"(.*?)\"', 1).alias('transcript_type'),\n", - " pl.col('attribute').str.extract('exon_number (.*?);', 1).alias('exon_number')\n", - " ])\n", - " \n", + " gtf = gtf.with_columns(\n", + " [\n", + " pl.col(\"attribute\").str.extract('gene_id \"(.*?)\"', 1).alias(\"gene_id\"),\n", + " pl.col(\"attribute\").str.extract('transcript_id \"(.*?)\"', 1).alias(\"transcript_id\"),\n", + " pl.col(\"attribute\").str.extract('gene_name \"(.*?)\"', 1).alias(\"gene_name\"),\n", + " pl.col(\"attribute\").str.extract('gene_type \"(.*?)\"', 1).alias(\"gene_type\"),\n", + " pl.col(\"attribute\").str.extract('transcript_type \"(.*?)\"', 1).alias(\"transcript_type\"),\n", + " pl.col(\"attribute\").str.extract(\"exon_number (.*?);\", 1).alias(\"exon_number\"),\n", + " ]\n", + " )\n", + "\n", " # Flag canonical transcripts (Ensembl canonical and MANE Select)\n", - " gtf = gtf.with_columns(pl.col('attribute').str.contains('Ensembl_canonical').alias('is_canonical'))\n", - " gtf = gtf.with_columns(pl.col('attribute').str.contains('MANE_Select').alias('is_mane_select'))\n", - " \n", + " gtf = gtf.with_columns(pl.col(\"attribute\").str.contains(\"Ensembl_canonical\").alias(\"is_canonical\"))\n", + " gtf = gtf.with_columns(pl.col(\"attribute\").str.contains(\"MANE_Select\").alias(\"is_mane_select\"))\n", + "\n", " # Filter to protein-coding genes only, exclude gene-level features\n", - " protein_coding_gtf = gtf.filter((pl.col('gene_type') == 'protein_coding') & (pl.col('feature') != 'gene'))\n", + " protein_coding_gtf = gtf.filter((pl.col(\"gene_type\") == \"protein_coding\") & (pl.col(\"feature\") != \"gene\"))\n", " # Filter to protein-coding transcripts only\n", - " protein_coding_gtf = protein_coding_gtf.filter(pl.col('transcript_type') 
== 'protein_coding')\n", - " \n", + " protein_coding_gtf = protein_coding_gtf.filter(pl.col(\"transcript_type\") == \"protein_coding\")\n", + "\n", " # Convert from 1-based (GTF) to 0-based coordinates (BED-like format)\n", - " protein_coding_gtf = protein_coding_gtf.with_columns(pl.col('start') - 1)\n", - " \n", + " protein_coding_gtf = protein_coding_gtf.with_columns(pl.col(\"start\") - 1)\n", + "\n", " # Aggregate CDS exon coordinates per transcript\n", " # Creates comma-separated lists of exon start/end positions (sorted by genomic position)\n", - " exon_starts = protein_coding_gtf.filter(pl.col('feature') == 'CDS').group_by('transcript_id').agg(\n", - " (pl.col('start').sort().cast(str).str.join(',') + ',').alias('exon_starts'),\n", - " (pl.col('end').sort().cast(str).str.join(',') + ',').alias('exon_ends'),\n", - " pl.col('exon_number').max().alias('exon_count')\n", + " exon_starts = (\n", + " protein_coding_gtf.filter(pl.col(\"feature\") == \"CDS\")\n", + " .group_by(\"transcript_id\")\n", + " .agg(\n", + " (pl.col(\"start\").sort().cast(str).str.join(\",\") + \",\").alias(\"exon_starts\"),\n", + " (pl.col(\"end\").sort().cast(str).str.join(\",\") + \",\").alias(\"exon_ends\"),\n", + " pl.col(\"exon_number\").max().alias(\"exon_count\"),\n", + " )\n", " )\n", - " \n", + "\n", " # Calculate CDS boundaries with stop codon adjustment\n", " # GENCODE GTF excludes stop codon from CDS, but we want to include it\n", " # For + strand: stop codon is after the last CDS position (add 3 to max end)\n", " # For - strand: stop codon is before the first CDS position (subtract 3 from min start)\n", " # Note: Using min()/max() instead of first()/last() to avoid dependency on row order\n", - " cds_starts = protein_coding_gtf.filter(pl.col('feature') == 'CDS').group_by('transcript_id').agg(\n", - " pl.when(pl.col('strand').first() == '-')\n", - " .then(pl.col('start').min() - 3) # Include stop codon at 3' end (lowest genomic position)\n", - " .otherwise(pl.col('start').min()) 
# 5' end, no adjustment needed\n", - " .alias('cds_start'),\n", - " pl.when(pl.col('strand').first() == '-')\n", - " .then(pl.col('end').max()) # 5' end (highest genomic position), no adjustment\n", - " .otherwise(pl.col('end').max() + 3) # Include stop codon at 3' end\n", - " .alias('cds_end'),\n", + " cds_starts = (\n", + " protein_coding_gtf.filter(pl.col(\"feature\") == \"CDS\")\n", + " .group_by(\"transcript_id\")\n", + " .agg(\n", + " pl.when(pl.col(\"strand\").first() == \"-\")\n", + " .then(pl.col(\"start\").min() - 3) # Include stop codon at 3' end (lowest genomic position)\n", + " .otherwise(pl.col(\"start\").min()) # 5' end, no adjustment needed\n", + " .alias(\"cds_start\"),\n", + " pl.when(pl.col(\"strand\").first() == \"-\")\n", + " .then(pl.col(\"end\").max()) # 5' end (highest genomic position), no adjustment\n", + " .otherwise(pl.col(\"end\").max() + 3) # Include stop codon at 3' end\n", + " .alias(\"cds_end\"),\n", + " )\n", " )\n", - " \n", + "\n", " # Get transcript-level metadata (gene info, coordinates, canonical status)\n", - " tx_starts = protein_coding_gtf.filter(pl.col('feature') == 'transcript').group_by('transcript_id').agg(\n", - " pl.col('gene_id').first().alias('gene_id'),\n", - " pl.col('gene_name').first().alias('gene_name'),\n", - " pl.col('chrom').first().alias('chrom'),\n", - " pl.col('strand').first().alias('strand'),\n", - " pl.col('start').min().alias('tx_start'),\n", - " pl.col('end').max().alias('tx_end'),\n", - " pl.col('transcript_type').first().alias('transcript_type'),\n", - " pl.col('is_canonical').first().alias('is_canonical'),\n", - " pl.col('is_mane_select').first().alias('is_mane_select'),\n", + " tx_starts = (\n", + " protein_coding_gtf.filter(pl.col(\"feature\") == \"transcript\")\n", + " .group_by(\"transcript_id\")\n", + " .agg(\n", + " pl.col(\"gene_id\").first().alias(\"gene_id\"),\n", + " pl.col(\"gene_name\").first().alias(\"gene_name\"),\n", + " pl.col(\"chrom\").first().alias(\"chrom\"),\n", + " 
pl.col(\"strand\").first().alias(\"strand\"),\n", + " pl.col(\"start\").min().alias(\"tx_start\"),\n", + " pl.col(\"end\").max().alias(\"tx_end\"),\n", + " pl.col(\"transcript_type\").first().alias(\"transcript_type\"),\n", + " pl.col(\"is_canonical\").first().alias(\"is_canonical\"),\n", + " pl.col(\"is_mane_select\").first().alias(\"is_mane_select\"),\n", + " )\n", " )\n", - " \n", + "\n", " # Join all transcript information together\n", - " joined_df = tx_starts.join(cds_starts, on='transcript_id', how='inner')\\\n", - " .join(exon_starts, on='transcript_id', how='inner')\n", - " \n", + " joined_df = tx_starts.join(cds_starts, on=\"transcript_id\", how=\"inner\").join(\n", + " exon_starts, on=\"transcript_id\", how=\"inner\"\n", + " )\n", + "\n", " # Validate and adjust exon coordinates to include stop codon\n", " joined_df = joined_df.with_columns(\n", - " pl.struct(['cds_start', 'exon_starts', 'strand', 'transcript_id']).map_elements(check_start_alignment, return_dtype=pl.Utf8).alias('exon_starts'),\n", - " pl.struct(['cds_end', 'exon_ends', 'strand', 'transcript_id']).map_elements(check_end_alignment, return_dtype=pl.Utf8).alias('exon_ends')\n", + " pl.struct([\"cds_start\", \"exon_starts\", \"strand\", \"transcript_id\"])\n", + " .map_elements(check_start_alignment, return_dtype=pl.Utf8)\n", + " .alias(\"exon_starts\"),\n", + " pl.struct([\"cds_end\", \"exon_ends\", \"strand\", \"transcript_id\"])\n", + " .map_elements(check_end_alignment, return_dtype=pl.Utf8)\n", + " .alias(\"exon_ends\"),\n", " )\n", - " \n", + "\n", " # Sort by chromosome and position, then select and rename columns for output\n", - " joined_df = joined_df.sort(['chrom', 'tx_start'])\n", - " joined_df = joined_df.select([\n", - " 'gene_id',\n", - " 'transcript_id',\n", - " 'chrom',\n", - " 'strand',\n", - " 'tx_start',\n", - " 'tx_end', \n", - " 'cds_start',\n", - " 'cds_end',\n", - " 'exon_count',\n", - " 'exon_starts',\n", - " 'exon_ends',\n", - " 'gene_name',\n", - " 
'transcript_type',\n", - " 'is_canonical',\n", - " 'is_mane_select'\n", - " ]).rename({\n", - " 'transcript_id': 'name', # Transcript ID becomes the 'name' field\n", - " 'tx_start': 'txStart', # Transcript start position\n", - " 'tx_end': 'txEnd', # Transcript end position\n", - " 'cds_start': 'cdsStart', # CDS start (including stop codon adjustment)\n", - " 'cds_end': 'cdsEnd', # CDS end (including stop codon adjustment)\n", - " 'exon_starts': 'exonStarts', # Comma-separated exon start positions\n", - " 'exon_ends': 'exonEnds' # Comma-separated exon end positions\n", - " })\n", + " joined_df = joined_df.sort([\"chrom\", \"tx_start\"])\n", + " joined_df = joined_df.select(\n", + " [\n", + " \"gene_id\",\n", + " \"transcript_id\",\n", + " \"chrom\",\n", + " \"strand\",\n", + " \"tx_start\",\n", + " \"tx_end\",\n", + " \"cds_start\",\n", + " \"cds_end\",\n", + " \"exon_count\",\n", + " \"exon_starts\",\n", + " \"exon_ends\",\n", + " \"gene_name\",\n", + " \"transcript_type\",\n", + " \"is_canonical\",\n", + " \"is_mane_select\",\n", + " ]\n", + " ).rename(\n", + " {\n", + " \"transcript_id\": \"name\", # Transcript ID becomes the 'name' field\n", + " \"tx_start\": \"txStart\", # Transcript start position\n", + " \"tx_end\": \"txEnd\", # Transcript end position\n", + " \"cds_start\": \"cdsStart\", # CDS start (including stop codon adjustment)\n", + " \"cds_end\": \"cdsEnd\", # CDS end (including stop codon adjustment)\n", + " \"exon_starts\": \"exonStarts\", # Comma-separated exon start positions\n", + " \"exon_ends\": \"exonEnds\", # Comma-separated exon end positions\n", + " }\n", + " )\n", "\n", " # Write output as tab-separated file\n", - " joined_df.write_csv(output_file, separator='\\t')\n", + " joined_df.write_csv(output_file, separator=\"\\t\")\n", "\n", " return joined_df" ] @@ -240,7 +266,7 @@ "for assembly in [\"hg38\", \"hg19\"]:\n", " gtf_file = GTF_FILES[assembly]\n", " output_file = OUTPUT_FILES[assembly]\n", - " \n", + "\n", " print(f\"Processing 
{assembly}: {gtf_file}\")\n", " _ = process_gtf_file(gtf_file, output_file)\n", " print(f\"Output saved to: {output_file}\\n\")" @@ -289,14 +315,14 @@ "FILTERED_TRANSCRIPTS_FILE = f\"{REFERENCE_DIR}/gencode.{GENCODE_VERSION}.basic.annotation.processed.filtered.tsv\"\n", "\n", "# Valid chromosomes for analysis\n", - "VALID_CHROMS = [f'chr{i}' for i in range(1, 23)] + ['chrX']\n", + "VALID_CHROMS = [f\"chr{i}\" for i in range(1, 23)] + [\"chrX\"]\n", "\n", "# DNA complement mapping for reverse complement operations\n", - "COMPLEMENT = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}\n", + "COMPLEMENT = {\"A\": \"T\", \"T\": \"A\", \"G\": \"C\", \"C\": \"G\", \"N\": \"N\"}\n", "\n", "print(f\"Reference genome: {REFERENCE_GENOME}\")\n", "print(f\"Annotation file: {ANNOTATION_FILE}\")\n", - "print(f\"Output file: {FILTERED_TRANSCRIPTS_FILE}\")\n" + "print(f\"Output file: {FILTERED_TRANSCRIPTS_FILE}\")" ] }, { @@ -310,30 +336,31 @@ "# CDS Extraction and Quality Control Functions\n", "# =============================================================================\n", "\n", + "\n", "def extract_cds_sequence(row, fasta):\n", " \"\"\"\n", " Extract the coding sequence (CDS) for a transcript from the reference genome.\n", - " \n", + "\n", " This function:\n", " 1. Iterates through exons and extracts only the CDS-overlapping portions\n", " 2. Concatenates exon sequences in genomic order\n", " 3. 
Reverse complements for minus strand genes\n", - " \n", + "\n", " Args:\n", " row: DataFrame row containing transcript annotation (chrom, strand, cdsStart, cdsEnd, exonStarts, exonEnds)\n", " fasta: Dictionary mapping chromosome names to their sequences\n", - " \n", + "\n", " Returns:\n", " str: The complete CDS sequence in 5' to 3' orientation (transcript strand)\n", " \"\"\"\n", - " chrom = row['chrom']\n", - " strand = row['strand']\n", - " cds_start = row['cdsStart']\n", - " cds_end = row['cdsEnd']\n", - " \n", + " chrom = row[\"chrom\"]\n", + " strand = row[\"strand\"]\n", + " cds_start = row[\"cdsStart\"]\n", + " cds_end = row[\"cdsEnd\"]\n", + "\n", " # Parse comma-separated exon coordinates from annotation file\n", - " exon_starts = [int(x) for x in row['exonStarts'].rstrip(',').split(',')]\n", - " exon_ends = [int(x) for x in row['exonEnds'].rstrip(',').split(',')]\n", + " exon_starts = [int(x) for x in row[\"exonStarts\"].rstrip(\",\").split(\",\")]\n", + " exon_ends = [int(x) for x in row[\"exonEnds\"].rstrip(\",\").split(\",\")]\n", "\n", " # Ensure exon boundaries encompass the full CDS (handles edge cases)\n", " if exon_starts[0] > cds_start:\n", @@ -343,76 +370,76 @@ "\n", " # Extract CDS sequence by iterating through exons\n", " cds_sequence = \"\"\n", - " \n", + "\n", " for start, end in zip(exon_starts, exon_ends):\n", " # Find overlap between this exon and the CDS region\n", " overlap_start = max(start, cds_start)\n", " overlap_end = min(end, cds_end)\n", - " \n", + "\n", " if overlap_start < overlap_end:\n", " # Extract sequence from this exon segment (0-based coordinates)\n", " seq = str(fasta[chrom][overlap_start:overlap_end]).upper()\n", " cds_sequence += seq\n", - " \n", + "\n", " # For minus strand genes, reverse complement to get 5' to 3' orientation\n", - " if strand == '-':\n", - " cds_sequence = ''.join(COMPLEMENT[base] for base in cds_sequence[::-1])\n", - " \n", + " if strand == \"-\":\n", + " cds_sequence = \"\".join(COMPLEMENT[base] 
for base in cds_sequence[::-1])\n", + "\n", " return cds_sequence\n", "\n", "\n", "def check_cds_quality(sequence):\n", " \"\"\"\n", " Validate CDS sequence quality for downstream variant analysis.\n", - " \n", + "\n", " Quality criteria checked:\n", " 1. Starts with ATG (methionine start codon)\n", " 2. Ends with a stop codon (TAA, TAG, or TGA)\n", " 3. Length is divisible by 3 (complete codons)\n", " 4. No premature stop codons within the coding region\n", - " \n", + "\n", " Args:\n", " sequence: CDS nucleotide sequence string\n", - " \n", + "\n", " Returns:\n", " dict: Quality metrics including boolean flags and sequence length\n", " \"\"\"\n", " if not sequence or len(sequence) < 3:\n", " return {\n", - " 'has_start_codon': False,\n", - " 'has_stop_codon': False,\n", - " 'length_divisible_by_3': False,\n", - " 'has_internal_stop_codons': False,\n", - " 'length': len(sequence) if sequence else 0\n", + " \"has_start_codon\": False,\n", + " \"has_stop_codon\": False,\n", + " \"length_divisible_by_3\": False,\n", + " \"has_internal_stop_codons\": False,\n", + " \"length\": len(sequence) if sequence else 0,\n", " }\n", - " \n", + "\n", " # Check for canonical start codon (ATG = Methionine)\n", - " has_start_codon = sequence[:3] == 'ATG'\n", - " \n", + " has_start_codon = sequence[:3] == \"ATG\"\n", + "\n", " # Check for stop codon at the end\n", - " has_stop_codon = sequence[-3:] in ['TAA', 'TAG', 'TGA']\n", - " \n", + " has_stop_codon = sequence[-3:] in [\"TAA\", \"TAG\", \"TGA\"]\n", + "\n", " # CDS should be in-frame (length divisible by 3)\n", " length_divisible_by_3 = len(sequence) % 3 == 0\n", - " \n", + "\n", " # Check for internal stop codons (premature termination)\n", " # These indicate potential annotation errors or pseudogenes\n", " has_internal_stop_codons = False\n", " if len(sequence) >= 6: # Need at least 2 codons to check for internal stops\n", " # Check all codons except the last one (which should be a stop)\n", " for i in range(0, len(sequence) - 
3, 3):\n", - " codon = sequence[i:i+3]\n", - " if codon in ['TAA', 'TAG', 'TGA']:\n", + " codon = sequence[i : i + 3]\n", + " if codon in [\"TAA\", \"TAG\", \"TGA\"]:\n", " has_internal_stop_codons = True\n", " break\n", - " \n", + "\n", " return {\n", - " 'has_start_codon': has_start_codon,\n", - " 'has_stop_codon': has_stop_codon,\n", - " 'length_divisible_by_3': length_divisible_by_3,\n", - " 'has_internal_stop_codons': has_internal_stop_codons,\n", - " 'length': len(sequence)\n", - " }\n" + " \"has_start_codon\": has_start_codon,\n", + " \"has_stop_codon\": has_stop_codon,\n", + " \"length_divisible_by_3\": length_divisible_by_3,\n", + " \"has_internal_stop_codons\": has_internal_stop_codons,\n", + " \"length\": len(sequence),\n", + " }" ] }, { @@ -437,12 +464,14 @@ "# Only loading standard chromosomes (1-22, X, Y) - excluding patches and alternate contigs.\n", "\n", "import pyfaidx\n", + "\n", + "\n", "fasta = {}\n", "\n", "with pyfaidx.Fasta(REFERENCE_GENOME) as f:\n", " for chrom in VALID_CHROMS:\n", " fasta[chrom] = f[chrom][:].seq\n", - "print(f\"Loaded {len(fasta)} chromosomes from {REFERENCE_GENOME}\")\n" + "print(f\"Loaded {len(fasta)} chromosomes from {REFERENCE_GENOME}\")" ] }, { @@ -488,8 +517,8 @@ "# -----------------------------------------------------------------------------\n", "# Input: TSV file from the processing above containing\n", "# transcript coordinates, exon boundaries, and canonical status flags\n", - "ann = pl.read_csv(ANNOTATION_FILE, separator='\\t')\n", - "ann = ann.filter(pl.col('chrom').is_in(VALID_CHROMS))\n", + "ann = pl.read_csv(ANNOTATION_FILE, separator=\"\\t\")\n", + "ann = ann.filter(pl.col(\"chrom\").is_in(VALID_CHROMS))\n", "print(f\"Loaded {len(ann):,} transcripts from {ANNOTATION_FILE}\")\n", "\n", "# -----------------------------------------------------------------------------\n", @@ -497,8 +526,8 @@ "# -----------------------------------------------------------------------------\n", "# Multiple transcript IDs can map 
to the same CDS coordinates (e.g., RefSeq vs Ensembl)\n", "# Keep MANE Select > Ensembl Canonical when duplicates exist\n", - "ann = ann.sort(['is_mane_select', 'is_canonical'], descending=True)\n", - "ann = ann.unique(subset=['chrom', 'strand', 'cdsStart', 'cdsEnd', 'exonStarts', 'exonEnds'])\n", + "ann = ann.sort([\"is_mane_select\", \"is_canonical\"], descending=True)\n", + "ann = ann.unique(subset=[\"chrom\", \"strand\", \"cdsStart\", \"cdsEnd\", \"exonStarts\", \"exonEnds\"])\n", "print(f\"After deduplicating by genomic structure: {len(ann):,} transcripts\")\n", "\n", "# -----------------------------------------------------------------------------\n", @@ -512,36 +541,44 @@ "# Step 4: Quality control - validate CDS sequences\n", "# -----------------------------------------------------------------------------\n", "print(\"Running quality checks...\")\n", - "quality_checks = [check_cds_quality(row['cds_sequence']) for row in ann.iter_rows(named=True)]\n", - "\n", - "ann = ann.with_columns([\n", - " pl.Series(\"has_start_codon\", [q['has_start_codon'] for q in quality_checks]),\n", - " pl.Series(\"has_stop_codon\", [q['has_stop_codon'] for q in quality_checks]),\n", - " pl.Series(\"length_divisible_by_3\", [q['length_divisible_by_3'] for q in quality_checks]),\n", - " pl.Series(\"has_internal_stop_codons\", [q['has_internal_stop_codons'] for q in quality_checks]),\n", - " pl.Series(\"cds_length\", [q['length'] for q in quality_checks])\n", - "])\n", + "quality_checks = [check_cds_quality(row[\"cds_sequence\"]) for row in ann.iter_rows(named=True)]\n", + "\n", + "ann = ann.with_columns(\n", + " [\n", + " pl.Series(\"has_start_codon\", [q[\"has_start_codon\"] for q in quality_checks]),\n", + " pl.Series(\"has_stop_codon\", [q[\"has_stop_codon\"] for q in quality_checks]),\n", + " pl.Series(\"length_divisible_by_3\", [q[\"length_divisible_by_3\"] for q in quality_checks]),\n", + " pl.Series(\"has_internal_stop_codons\", [q[\"has_internal_stop_codons\"] for q in 
quality_checks]),\n", + " pl.Series(\"cds_length\", [q[\"length\"] for q in quality_checks]),\n", + " ]\n", + ")\n", "\n", "# Print quality summary\n", - "print(\"\\n\" + \"=\"*60)\n", + "print(\"\\n\" + \"=\" * 60)\n", "print(\"CDS Quality Summary (before filtering):\")\n", - "print(\"=\"*60)\n", + "print(\"=\" * 60)\n", "print(f\"Total transcripts: {len(ann):,}\")\n", - "print(f\"Has start codon (ATG): {ann['has_start_codon'].sum():,} ({ann['has_start_codon'].mean()*100:.1f}%)\")\n", - "print(f\"Has stop codon (TAA/TAG/TGA): {ann['has_stop_codon'].sum():,} ({ann['has_stop_codon'].mean()*100:.1f}%)\")\n", - "print(f\"Length divisible by 3: {ann['length_divisible_by_3'].sum():,} ({ann['length_divisible_by_3'].mean()*100:.1f}%)\")\n", - "print(f\"Has internal stop codons: {ann['has_internal_stop_codons'].sum():,} ({ann['has_internal_stop_codons'].mean()*100:.1f}%)\")\n", - "print(f\"All quality criteria met: {(ann['has_start_codon'] & ann['has_stop_codon'] & ann['length_divisible_by_3'] & ~ann['has_internal_stop_codons']).sum():,}\")\n", + "print(f\"Has start codon (ATG): {ann['has_start_codon'].sum():,} ({ann['has_start_codon'].mean() * 100:.1f}%)\")\n", + "print(f\"Has stop codon (TAA/TAG/TGA): {ann['has_stop_codon'].sum():,} ({ann['has_stop_codon'].mean() * 100:.1f}%)\")\n", + "print(\n", + " f\"Length divisible by 3: {ann['length_divisible_by_3'].sum():,} ({ann['length_divisible_by_3'].mean() * 100:.1f}%)\"\n", + ")\n", + "print(\n", + " f\"Has internal stop codons: {ann['has_internal_stop_codons'].sum():,} ({ann['has_internal_stop_codons'].mean() * 100:.1f}%)\"\n", + ")\n", + "print(\n", + " f\"All quality criteria met: {(ann['has_start_codon'] & ann['has_stop_codon'] & ann['length_divisible_by_3'] & ~ann['has_internal_stop_codons']).sum():,}\"\n", + ")\n", "\n", "# -----------------------------------------------------------------------------\n", "# Step 5: Apply quality filters\n", "# 
-----------------------------------------------------------------------------\n", "# Keep only transcripts that pass all quality checks\n", "ann = ann.filter(\n", - " pl.col('has_start_codon') & \n", - " pl.col('has_stop_codon') & \n", - " pl.col('length_divisible_by_3') & \n", - " ~pl.col('has_internal_stop_codons')\n", + " pl.col(\"has_start_codon\")\n", + " & pl.col(\"has_stop_codon\")\n", + " & pl.col(\"length_divisible_by_3\")\n", + " & ~pl.col(\"has_internal_stop_codons\")\n", ")\n", "print(f\"\\nAfter quality filtering: {len(ann):,} transcripts\")\n", "\n", @@ -551,17 +588,17 @@ "# Keep only MANE Select or Ensembl Canonical transcripts\n", "# Then deduplicate by CDS sequence (different transcripts can encode identical proteins)\n", "initial_count = len(ann)\n", - "ann = ann.filter(pl.col('is_mane_select') | pl.col('is_canonical'))\n", + "ann = ann.filter(pl.col(\"is_mane_select\") | pl.col(\"is_canonical\"))\n", "print(f\"After filtering to canonical transcripts: {len(ann):,} transcripts\")\n", "\n", - "ann = ann.sort(['is_mane_select', 'is_canonical'], descending=True)\n", - "ann = ann.unique(subset=['cds_sequence'], keep='first')\n", - "ann = ann.sort(['chrom', 'txStart'])\n", + "ann = ann.sort([\"is_mane_select\", \"is_canonical\"], descending=True)\n", + "ann = ann.unique(subset=[\"cds_sequence\"], keep=\"first\")\n", + "ann = ann.sort([\"chrom\", \"txStart\"])\n", "\n", "print(f\"After canonical filter + CDS deduplication: {len(ann):,} unique transcripts\")\n", "print(f\" (Removed {initial_count - len(ann):,} transcripts)\")\n", - "ann.write_csv(FILTERED_TRANSCRIPTS_FILE, separator='\\t')\n", - "print(f\"Saved {len(ann):,} unique transcripts to {FILTERED_TRANSCRIPTS_FILE}\")\n" + "ann.write_csv(FILTERED_TRANSCRIPTS_FILE, separator=\"\\t\")\n", + "print(f\"Saved {len(ann):,} unique transcripts to {FILTERED_TRANSCRIPTS_FILE}\")" ] } ], From ef722b60bc6448fcc31498b01fe7f458d5f4a25c Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Thu, 26 Feb 2026 
19:56:57 -0800 Subject: [PATCH 11/13] fix small config bug --- bionemo-recipes/recipes/codonfm_ptl_te/src/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/src/config.py b/bionemo-recipes/recipes/codonfm_ptl_te/src/config.py index effa1a455c..21354fd815 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/src/config.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/src/config.py @@ -133,8 +133,8 @@ def get_callbacks_config(args: Any) -> Dict[str, fdl.Config]: ), "model_summary": fdl.Config(ModelSummary, max_depth=-1), "lr_monitor": fdl.Config(LearningRateMonitor, logging_interval="step", log_weight_decay=True), - "grad_norm_callback": fdl.Config(GradientNormLogger, log_every_n_steps=100), - "timer_callback": fdl.Config(StepTimingCallback, log_every_n_steps=100, mode="train"), + "grad_norm_callback": fdl.Config(GradientNormLogger, log_every_n_steps=args.log_every_n_steps), + "timer_callback": fdl.Config(StepTimingCallback, log_every_n_steps=args.log_every_n_steps, mode="train"), } if args.mode == "eval": callbacks["pred_writer"] = fdl.Config( From a3142e2622a7eac064bde6523ae58a5bc04dfc9f Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Fri, 27 Feb 2026 18:06:14 -0800 Subject: [PATCH 12/13] fix runner missing param --- .../notebooks/download_ucsc_tables.py | 149 ------------------ .../recipes/codonfm_ptl_te/src/runner.py | 1 + 2 files changed, 1 insertion(+), 149 deletions(-) delete mode 100644 bionemo-recipes/recipes/codonfm_ptl_te/notebooks/download_ucsc_tables.py diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/download_ucsc_tables.py b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/download_ucsc_tables.py deleted file mode 100644 index 9e191b9035..0000000000 --- a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/download_ucsc_tables.py +++ /dev/null @@ -1,149 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-Apache2 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Download TSV files from the UCSC Table Browser via POST requests. - -Usage: - python download_ucsc_tables.py # download all four tables - python download_ucsc_tables.py --table ncbiRefSeq # download a single table - python download_ucsc_tables.py --output-dir /data/ncbi # custom output directory -""" - -import argparse -import os - -import requests - - -UCSC_URL = "https://genome.ucsc.edu/cgi-bin/hgTables" - -TABLES = { - "wgEncodeGencodeCompV32": { - "hgsid": "3727160771_KywqrMbVutzoVUyr47py53TcxZMg", # pragma: allowlist secret - "clade": "mammal", - "org": "Human", - "db": "hg38", - "hgta_group": "allTables", - "hgta_track": "hg38", - "hgta_table": "wgEncodeGencodeCompV32", - "hgta_regionType": "genome", - "position": "chr7:155,799,529-155,812,871", - "hgta_outSep": "tab", - "hgta_doTopSubmit": "Get output", - "filename": "ucsc_gencodev32_hg38.tsv", - }, - "ncbiRefSeq": { - "hgsid": "3727549177_A4TjXykIK1JRVnpjZ0HKtMVnKWw0", # pragma: allowlist secret - "clade": "mammal", - "org": "Human", - "db": "hg38", - "hgta_group": "allTables", - "hgta_track": "hg38", - "hgta_table": "ncbiRefSeq", - "hgta_regionType": "genome", - "position": "chr7:155,799,529-155,812,871", - "hgta_outSep": "tab", - "hgta_doTopSubmit": "Get output", - "subdir": "clinvar_syn", - "filename": "ucsc_refseq_hg38.tsv", - }, - "ncbiRefSeqHistorical": { - "hgsid": 
"3727803393_8Oali1duOyVJT7DtAateRwtkg7Y0", # pragma: allowlist secret - "clade": "mammal", - "org": "Human", - "db": "hg38", - "hgta_group": "allTables", - "hgta_track": "hg38", - "hgta_table": "ncbiRefSeqHistorical", - "hgta_regionType": "genome", - "position": "chr7:155,799,529-155,812,871", - "hgta_outSep": "tab", - "hgta_doTopSubmit": "Get output", - "subdir": "clinvar_syn", - "filename": "ucsc_refseq_hist_hg38.tsv", - }, - "pliByGene": { - "hgsid": "3727823409_x06fwXO5XFeWrbFjKlSQTfU3I6F3", # pragma: allowlist secret - "clade": "mammal", - "org": "Human", - "db": "hg38", - "hgta_group": "varRep", - "hgta_track": "gnomadPLI", - "hgta_table": "pliByGene", - "hgta_regionType": "genome", - "position": "chr7:155,799,529-155,812,871", - "hgta_outSep": "tab", - "hgta_doTopSubmit": "Get output", - "filename": "ucsc_pliByGene_hg38.tsv", - }, -} - - -def download_table(table_name: str, output_dir: str, api_key: str) -> str: - """POST to the UCSC Table Browser and save the result as a TSV.""" - cfg = TABLES[table_name] - cfg["apiKey"] = api_key - dest_dir = os.path.join(output_dir, cfg.get("subdir", "")) if cfg.get("subdir") else output_dir - os.makedirs(dest_dir, exist_ok=True) - dest = os.path.join(dest_dir, cfg["filename"]) - - if os.path.exists(dest): - print(f" [skip] {dest}") - return dest - - print(f" Downloading {table_name} → {dest} ...") - - resp = requests.post(UCSC_URL, timeout=300, data=cfg) - resp.raise_for_status() - - if "" in resp.text: - error_start = resp.text.index("") - error_end = ( - resp.text.index("") if "" in resp.text else error_start + 500 - ) - raise RuntimeError(f"UCSC returned an error:\n{resp.text[error_start:error_end]}") - - lines = resp.text.splitlines(keepends=True) - while lines: - tail = lines[-1].strip() - if not tail or tail.startswith("---") or "cookie" in tail.lower(): - lines.pop() - else: - break - - with open(dest, "w") as f: - f.writelines(lines) - - print(f" [done] {dest} ({len(lines):,} lines)") - return dest - - -def 
main(): - """Download UCSC Table Browser tables as TSV.""" - parser = argparse.ArgumentParser(description="Download UCSC Table Browser tables as TSV") - parser.add_argument("--table", choices=list(TABLES.keys()), help="Single table to download (default: all)") - parser.add_argument("--output-dir", default=".", help="Base output directory (default: cwd)") - parser.add_argument("--api-key", required=True, help="API key for UCSC Table Browser") - args = parser.parse_args() - - tables = [args.table] if args.table else list(TABLES.keys()) - - for t in tables: - print(f"=== {t} ===") - download_table(t, args.output_dir, args.api_key) - - -if __name__ == "__main__": - main() diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/src/runner.py b/bionemo-recipes/recipes/codonfm_ptl_te/src/runner.py index 16a5eecba0..bef7727b7e 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/src/runner.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/src/runner.py @@ -151,6 +151,7 @@ def get_parser(): # noqa: D103 default=None, help="For evaluation, the directory to write predictions to.", ) + parser.add_argument("--task_type", type=str, default=None, help="For evaluation, the task type to run.") # Finetune specific parser.add_argument( From a661b6121f90b43b800a57c0d738af7254f99280 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 2 Mar 2026 12:03:48 -0800 Subject: [PATCH 13/13] added benchmarking target --- .../recipes/codonfm_ptl_te/Dockerfile | 5 ++ .../src/models/components/mha.py | 70 +++++++++++++------ 2 files changed, 53 insertions(+), 22 deletions(-) diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/Dockerfile b/bionemo-recipes/recipes/codonfm_ptl_te/Dockerfile index 068cfa9757..fef2844ae8 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/Dockerfile +++ b/bionemo-recipes/recipes/codonfm_ptl_te/Dockerfile @@ -64,3 +64,8 @@ RUN chown -R ${USERNAME:-vscode}:${USERNAME:-vscode} /workspace/codonfm # Switch to the non-root user USER $USERNAME + +# ----------------- For
benchmarking only ----------------- +FROM production AS benchmarking + +RUN pip install -v --no-build-isolation -U git+https://github.com/facebookresearch/xformers.git@v0.0.32.post2#egg=xformers --no-deps diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/src/models/components/mha.py b/bionemo-recipes/recipes/codonfm_ptl_te/src/models/components/mha.py index af69dd2988..264f0eb262 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/src/models/components/mha.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/src/models/components/mha.py @@ -14,6 +14,10 @@ # limitations under the License. +import importlib.util +import os +from functools import lru_cache + import torch from einops import rearrange from torch import nn @@ -21,13 +25,14 @@ from src.models.components.rotary_embedding import RotaryEmbedding, apply_rotary_pos_emb -try: - import xformers.ops as xops +@lru_cache +def is_xformers_available(): + """Check whether xformers is installed.""" + return importlib.util.find_spec("xformers") is not None - HAVE_XFORMERS = True -except ImportError: - xops = None - HAVE_XFORMERS = False + +if is_xformers_available(): + import xformers.ops as xops class MultiHeadAttention(nn.Module): @@ -58,6 +63,10 @@ def __init__( # noqa: D107 self.key = nn.Linear(embed_dim, embed_dim) self.value = nn.Linear(embed_dim, embed_dim) + self.use_xformers = os.environ.get("USE_XFORMERS", "0").lower() in ("1", "true") + if self.use_xformers: + assert is_xformers_available(), "USE_XFORMERS=1 but xformers is not installed" + self.rotary_emb = RotaryEmbedding( dim=embed_dim // num_heads, theta=rotary_theta, @@ -95,21 +104,38 @@ def forward( q = apply_rotary_pos_emb(q, cos, sin) k = apply_rotary_pos_emb(k, cos, sin) - # - q = rearrange(q, "b s h d -> b h s d") - k = rearrange(k, "b s h d -> b h s d") - v = rearrange(v, "b s h d -> b h s d") - - # torch native sdpa is numerically equivalent to Memory-efficient attention from xformers. 
we replaced xformers with torch native sdpa to be able to build the Transformer Engine model in the same container. - x = torch.nn.functional.scaled_dot_product_attention( - query=q, - key=k, - value=v, - attn_mask=attention_mask, - dropout_p=self.dropout_rate if self.training else 0.0, - ) - - # x: (batch_size, query_seq_len, n_head, head_dim) - x = rearrange(x, "b h q d -> b q (h d)", h=self.num_heads) + if self.use_xformers: + padding_bias = attention_mask.repeat(1, 1, attention_mask.size(-1), 1) + padding_bias = padding_bias.to(q.dtype) + padding_bias = padding_bias.repeat(1, self.num_heads, 1, 1) + + attn_bias = padding_bias + + x = xops.memory_efficient_attention( + query=q, + key=k, + value=v, + op=None, + attn_bias=attn_bias, + p=self.dropout_rate if self.training else 0.0, + ) + + # x: (batch_size, query_seq_len, n_head, head_dim) + x = rearrange(x, "b q h d -> b q (h d)", h=self.num_heads) + else: + q = rearrange(q, "b s h d -> b h s d") + k = rearrange(k, "b s h d -> b h s d") + v = rearrange(v, "b s h d -> b h s d") + + x = torch.nn.functional.scaled_dot_product_attention( + query=q, + key=k, + value=v, + attn_mask=attention_mask, + dropout_p=self.dropout_rate if self.training else 0.0, + ) + + # x: (batch_size, n_head, query_seq_len, head_dim) + x = rearrange(x, "b h q d -> b q (h d)", h=self.num_heads) return x