WGLab · jonperdomo · May 11, 2026 · Mar 4, 2025 · Mar 14, 2025 · Mar 16, 2025
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -0,0 +1,43 @@
+# Runs the pytest suite inside the project's conda environment.
+
+name: unit tests
+
+# Controls when the workflow will run
+on:
+  # Run on pushes to, and pull requests targeting, the "main" branch
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+jobs:
+  # Single job; the id "build" is kept so existing status-check names stay stable
+  build:
+    # The type of runner that the job will run on
+    runs-on: ubuntu-latest
+
+    steps:
+      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+      - uses: actions/checkout@v4
+
+      - name: Set up conda environment
+        uses: conda-incubator/setup-miniconda@v3
+        with:
+          miniforge-variant: Miniforge3   # Miniforge installer; mamba is enabled by use-mamba below
+          activate-environment: contextscore
+          environment-file: environment.yml
+          auto-activate-base: false
+          use-mamba: true
+          # NOTE: cache-environment / cache-downloads were removed. They are
+          # inputs of mamba-org/setup-micromamba, not of setup-miniconda, so
+          # they only produced "unexpected input" warnings and cached nothing.
+          # Caching with setup-miniconda requires a separate actions/cache step.
+
+      - name: Run tests
+        # Login shell so the conda profile hooks activate the environment
+        shell: bash --login {0}
+        # mkdir -p: do not fail when tests/output already exists
+        run: |
+          mkdir -p tests/output
+          python -m pytest
diff --git a/.gitignore b/.gitignore
@@ -169,3 +169,22 @@ cython_debug/
 
 # PyPI configuration file
 .pypirc
+
+# Ignore the output/ and scripts/ folders
+output/
+scripts/
+
+# VS Code settings
+.vscode/launch.json
+
+# Testing scripts
+linktoscripts
+truvari_results_Simulated_*/
+conda/contextscore-models/
+tests/fixtures/output.vcf.avinput
+tests/fixtures/output.vcf.bed
+tests/fixtures/annotations/features.tsv
+tests/fixtures/annotations/regions.hg38_multianno.txt
+
+# Database files
+data/
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+    "python.testing.pytestArgs": [
+        "tests"
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true
+}
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,3 @@
+include README.md
+include LICENSE
+recursive-include data *
diff --git a/README.md b/README.md
@@ -1,2 +1,39 @@
+[![unit tests](https://github.com/WGLab/ContextScore/actions/workflows/unit-tests.yml/badge.svg)](https://github.com/WGLab/ContextScore/actions/workflows/unit-tests.yml)
+
 # ContextScore
-Assign confidence scores to SV datasets based on coverage, genomic context, and other important alignment features
+<p>
+<img src="https://github.com/user-attachments/assets/03603ad1-df9d-438d-911c-81af0cf612e3" alt="ContextSV" align="left" style="width:100px;"/>
+ContextScore is the filtering step for the <a href="https://github.com/WGLab/ContextSV">ContextSV</a> long-read structural variant (SV) caller. It uses a Random Forest model trained on SV validation features to assign confidence scores to SV datasets based on coverage, genomic context, and other important alignment features, then filters low-confidence SVs to increase the precision of the final callset. Genomic context is determined from annotations using ANNOVAR and UCSC databases.
+</p>
+<br clear="left"/>
+
+## Installation
+```bash
+conda install -c wglab -c bioconda -c conda-forge contextscore
+
+# Or using mamba (faster dependency resolution), with the same channels:
+mamba install -c wglab -c bioconda -c conda-forge contextscore
+```
+
+## ANNOVAR setup
+[ANNOVAR](https://annovar.openbioinformatics.org/en/latest/user-guide/download/) is required for annotations and must be installed separately.
+
+These are the required ANNOVAR components for ContextScore:
+- `--annovar`: directory containing `annotate_variation.pl` and `table_annovar.pl`
+- `--annovar-db`: ANNOVAR database directory
+
+## User Workflow
+```bash
+contextscore --input input.vcf --output scored.vcf --sample-coverage 30 --buildver {hg38,hg19} --threshold 0.2 \
+	--annovar /path/to/annovar --annovar-db /path/to/humandb
+```
+
+## Sources for additional annotations (in the `data/` directory)
+| File | Source | Description | Link |
+| --- | --- | --- | --- |
+| `cytobands_hg{19,38}.txt` | UCSC Genome Browser | Cytoband annotations for human genome builds hg19 and hg38 | [UCSC hg19](https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz) / [UCSC hg38](https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz) |
+| `hg{19,38}_segmental_duplications.bed` | UCSC Genome Browser | Segmental duplication annotations for human genome builds hg19 and hg38 | [UCSC hg19](https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/segmentalDuplications.txt.gz) / [UCSC hg38](https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/segmentalDuplications.txt.gz) |
+| `phastcons100way_hg{19,38}.bed` | UCSC Genome Browser | PhastCons conservation scores for human genome builds hg19 and hg38 | [UCSC hg19](https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/phastCons100way.txt.gz) / [UCSC hg38](https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/phastCons100way.txt.gz) |
+| `simple_repeats_hg{19,38}.bed` | UCSC Genome Browser | Simple repeat annotations for human genome builds hg19 and hg38 | [UCSC hg19](https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/simpleRepeat.txt.gz) / [UCSC hg38](https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/simpleRepeat.txt.gz) |
+| `fragile_sites_hg38.bed` / `fragile_sites_hg19_liftover.bed` | [HumCFS](https://webs.iiitd.edu.in/raghava/humcfs/download.html) | Fragile site annotations for human genome builds hg38 and hg19 (liftover) | [HumCFS](https://webs.iiitd.edu.in/raghava/humcfs/fragile_site_bed.zip) |
+
diff --git a/conda/meta.yaml b/conda/meta.yaml
@@ -0,0 +1,42 @@
+{% set name = "contextscore" %}
+{% set version = "0.1.0" %}
+
+package:  # name and version come from the Jinja variables set above
+  name: {{ name|lower }}
+  version: {{ version }}
+
+source:
+  path: ..  # build from the local checkout one directory above this recipe
+
+build:  # the win selector on "skip" below excludes Windows builds
+  number: 0
+  skip: true  # [win]
+  script: "{{ PYTHON }} -m pip install . --no-deps -vv"
+
+requirements:
+  host:  # packages needed to run the pip install in build.script
+    - python >=3.10,<3.11
+    - pip
+    - setuptools
+  run:  # runtime dependencies; pip runs with --no-deps, so they must be listed here
+    - python >=3.10,<3.11
+    - numpy
+    - pandas
+    - scikit-learn =1.6.1  # For consistency with model training environment
+    - joblib
+    - bedtools
+    - contextscore-models
+
+about:
+  home: https://github.com/WGLab/ContextScore
+  summary: Assign confidence scores to structural variant datasets.
+  description: |
+    ContextScore prediction package. Model weights are distributed separately
+    (for example via contextscore-models) and can be provided via --model or
+    CONTEXTSCORE_MODEL_PATH.
+  license: MIT
+  license_file: LICENSE
+
+extra:
+  recipe-maintainers:
+    - WGLab
diff --git a/contextscore/TrainingAnnotationsSummary.tsv b/contextscore/TrainingAnnotationsSummary.tsv
@@ -0,0 +1,2 @@
+True Positives
+Total	Fragile Sites	Telomeres	Centromeres	Segmental Duplications	Conserved Regions
diff --git a/contextscore/__init__.py b/contextscore/__init__.py
diff --git a/contextscore/__main__.py b/contextscore/__main__.py
@@ -0,0 +1,5 @@
+from .predict import main
+
+
+if __name__ == '__main__':  # true when this module is executed as a script, e.g. `python -m contextscore`
+    main()  # hand off to main imported from .predict
diff --git a/contextscore/download_tables.py b/contextscore/download_tables.py
@@ -0,0 +1,61 @@
+import pandas as pd
+import pymysql
+from pathlib import Path
+
+def download_ucsc(table_name: str, 
+                 genome_version: str = "hg38",
+                 output_file: str = "ucsc_table.bed") -> None:
+    """
+    Downloads the UCSC Simple Repeats table and saves it as a BED file for use with BEDTools.
+    Note: This function requires access to the UCSC MySQL database.
+    """
+    print("Downloading UCSC " + table_name + " table for " + genome_version + "...")
+
+    # Connect to UCSC MySQL database
+    conn = pymysql.connect(host="genome-mysql.soe.ucsc.edu",
+                        user="genome",
+                        password="",
+                        database="hg38")  # Change to the desired genome version (e.g., hg19, mm10)
+
+    query = f"""
+    SELECT
+        chrom AS chr, 
+        chromStart AS start, 
+        chromEnd AS end, 
+        name
+    FROM
+        {table_name}
+    WHERE
+        chrom IS NOT NULL AND
+        chromStart IS NOT NULL AND
+        chromEnd IS NOT NULL
+    AND
+        chromStart >= 0 AND
+        chromEnd > chromStart
+    AND
+        chromStart < chromEnd;
+    """
+    df = pd.read_sql(query, conn)
+
+    # Close connection
+    conn.close()
+
+    # Save as BED file for BEDTools
+    df.to_csv(output_file, sep="\t", index=False, header=False)
+    print("Downloaded UCSC " + table_name + " table for " + genome_version + " and saved as " + output_file)
+
+if __name__ == "__main__":
+    data_dir = Path(__file__).resolve().parents[1] / "data"
+    data_dir.mkdir(parents=True, exist_ok=True)
+
+    # Download the UCSC Simple Repeats table for hg38
+    simple_repeat_file = str(data_dir / "simple_repeats_hg38.bed")
+    download_ucsc(table_name="simpleRepeat",
+                 genome_version="hg38",
+                 output_file=simple_repeat_file)
+
+    # Download the UCSC phastCons100way table for hg38
+    phastcons_file = str(data_dir / "phastcons100way_hg38.bed")
+    download_ucsc(table_name="phastCons100way",
+                 genome_version="hg38",
+                 output_file=phastcons_file)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		True Positives
		Total Fragile Sites Telomeres Centromeres Segmental Duplications Conserved Regions