diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 00000000..8bb8e28f --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1 @@ +FROM mcr.microsoft.com/devcontainers/anaconda:1-3 diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000..4dda9426 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,23 @@ +// Small devcontainer which loads anaconda. All postinstallation steps have to be done manually. +// This comes with snakemake and docker-in-docker. + +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/anaconda +{ + "name": "Anaconda (Python 3)", + "build": { + "context": "..", + "dockerfile": "Dockerfile" + }, + "features": { + "ghcr.io/devcontainers/features/docker-in-docker:2": {}, + // For yamlfmt + "ghcr.io/devcontainers/features/go:1": {}, + // For web display + "ghcr.io/devcontainers/features/node:1": {}, + // For scripting + "ghcr.io/va-h/devcontainers-features/uv:1": {}, + // For paxtools + "ghcr.io/devcontainers/features/java:1": {} + } +} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 00000000..a6da8ded --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,111 @@ +name: Test SPRAS + +on: + pull_request: + branches: [main] + push: + branches: [main] + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +jobs: + pre-commit: + name: Run pre-commit checks + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Run pre-commit checks + uses: pre-commit/action@v3.0.0 + checks: + name: Run workflow + runs-on: ubuntu-latest + steps: + - name: Checkout repository + 
uses: actions/checkout@v4 + with: + submodules: true + - name: Install uv for scripting + uses: astral-sh/setup-uv@v6.1.0 + with: + version: "0.7.13" + - name: Setup conda + uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: spras + environment-file: spras/environment.yml + auto-activate-base: false + miniconda-version: 'latest' + # Install spras in the environment using pip + - name: Install spras in conda env + shell: bash --login {0} + run: pip install ./spras + # Log conda environment contents + - name: Log conda environment + shell: bash --login {0} + run: conda list + - name: Fetch Artifact Cache + id: fetch-cache + uses: actions/cache/restore@v4 + with: + path: cache/artifacts + key: cache-artifacts + - name: Process raw data through Snakemake + run: sh run_snakemake.sh + - name: Cache Artifact Cache + id: cache-cache + uses: actions/cache/save@v4 + with: + path: cache/artifacts + key: cache-artifacts + - name: Run Snakemake workflow for DMMMs + shell: bash --login {0} + run: snakemake --cores 4 --configfile configs/dmmm.yaml --show-failed-logs -s spras/Snakefile + - name: Setup PNPM + uses: pnpm/action-setup@v4 + with: + version: 10 + - name: Install web dependencies + working-directory: ./web + run: pnpm install + - name: Run web builder + working-directory: ./web + run: pnpm build + - name: Upload built website distribution folder + uses: actions/upload-artifact@v4 + with: + name: build + path: web/dist + pages: + needs: checks + if: github.event_name != 'pull_request' + runs-on: ubuntu-latest + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + concurrency: + group: 'pages' + cancel-in-progress: true + steps: + - name: Download Artifacts + uses: actions/download-artifact@v4 + with: + name: build + path: dist + - name: Setup Pages + uses: actions/configure-pages@v2 + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: dist + - name: Deploy to GitHub Pages + id: deployment 
+ uses: actions/deploy-pages@v4 diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..196f12a8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,169 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +.idea/ + +# Snakemake +.snakemake + +# Output +/output +/web/output + +# pnpm +.pnpm-store + +# mac +.DS_Store diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..11f80da4 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "spras"] + path = spras + url = https://github.com/Reed-CompBio/spras diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..f687b4c2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,33 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +default_language_version: + # Match this to the version specified in environment.yml + python: python3.11 +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 # Use the ref you want to point at + hooks: + 
# Attempts to load all yaml files to verify syntax. + - id: check-yaml + # Attempts to load all TOML files to verify syntax. + - id: check-toml + # Trims trailing whitespace. + - id: trailing-whitespace + # Preserves Markdown hard linebreaks. + args: [--markdown-linebreak-ext=md] + # Do not trim whitespace from all files, input files may need trailing whitespace for empty values in columns. + types_or: [markdown, python, yaml] + # Skip this Markdown file, which has an example of an input text file within it. + exclude: input/README.md + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: 'v0.15.4' + hooks: + - id: ruff + - repo: https://github.com/google/yamlfmt + rev: v0.17.0 + hooks: + - id: yamlfmt + - repo: https://github.com/crate-ci/typos + rev: v1.34.0 + hooks: + - id: typos diff --git a/.python-version b/.python-version new file mode 100644 index 00000000..24ee5b1b --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.13 diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 00000000..22a15055 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,4 @@ +{ + "recommendations": ["astro-build.astro-vscode"], + "unwantedRecommendations": [] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..bebd33fc --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "editor.rulers": [ + 150 + ] +} \ No newline at end of file diff --git a/.yamlfmt.yaml b/.yamlfmt.yaml new file mode 100644 index 00000000..9d3236aa --- /dev/null +++ b/.yamlfmt.yaml @@ -0,0 +1,2 @@ +formatter: + retain_line_breaks_single: true diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..bb33c0d5 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,133 @@ +# Contributing + +## Helping Out + +There are `TODOs` that better enhance the reproducibility and accuracy of datasets or analysis of algorithm outputs, as well as +[open resolvable 
issues](https://github.com/Reed-CompBio/spras-benchmarking/). + +## Adding a dataset + +**Check that your data provider isn't already a dataset in `datasets`.** There are some datasets that are able to serve more data, and only use +a subset of it: these datasets can be extended for your needs. + +The goal of a dataset is to take raw data and produce data to be fed to SPRAS. +We'll follow along with `datasets/contributing`. This mini-tutorial assumes that you already have familiarity with SPRAS +[as per its contributing guide](https://spras.readthedocs.io/en/latest/contributing/index.html). + +### Uploading raw data + +This is a fake dataset: the data can be generated by running `datasets/contributing/raw_generation.py`, where the following artifacts will output: +- `sources.txt` +- `targets.txt` +- `gold-standard.tsv` +- `interactome.tsv` + +Unlike in this example, the data used in other datasets comes from other sources (whether that's supplementary info in a paper, or out of +biological databases like UniProt.) These artifacts can be large, and occasionally update, so we store them in Google Drive for caching and download +them when we want to reconstruct a dataset. + +Note that the four artifacts above change every time `raw_generation.py` is run. Upload those artifacts to Google Drive in a folder of your choice. +Share the file and allow for _Anyone with the link_ to _View_ the file. + +Once shared, copying the URL should look something like: + +> https://drive.google.com/file/d/1Agte0Aezext-8jLhGP4GmaF3tS7gHX-h/view?usp=sharing + +We always drop the entire `/view?...` suffix, and replace `/file/d/` with `/uc?id=`, which turns the URL to a direct download link, which is internally +downloaded with [gdown](https://github.com/wkentaro/gdown). Those post-processing steps should make the URL now look as so: + +> https://drive.google.com/uc?id=1Agte0Aezext-8jLhGP4GmaF3tS7gHX-h + +Now, add a directive to `cache/directory.py` under `Contributing`. 
Since this doesn't have an online URL, this should use `CacheItem.cache_only`, to +indicate that no other online database serves this URL. + +Your new directive under the `directory` dictionary should look something as so, with one entry for every artifact: + +```python +..., +"Contributing": { + "interactome.tsv": CacheItem.cache_only( + name="Randomly-generated contributing interactome", + cached="https://drive.google.com/uc?id=..." + ), + ... +} +``` + +### Setting up a workflow + +Now, we need to make these files SPRAS-compatible. To do this, we'll set up a `Snakefile`, which will handle: +- Artifact downloading +- Script running. + +`sources.txt` and `targets.txt` are already in a SPRAS-ready format, but we need to process `gold-standard.tsv` and `interactome.tsv`. + +Create a `Snakefile` under your dataset with the top-level directives: + +```python +# This provides the `produce_fetch_rules` util to allows us to automatically fetch the Google Drive data. +include: "../../cache/Snakefile" + +rule all: + input: + # The two files we will be passing to SPRAS + "raw/sources.txt", + "raw/targets.txt", + # The two files we will be processing + "processed/gold-standard.tsv", + "processed/interactome.tsv" +``` + +We'll generate four `fetch` rules, or rules that tell Snakemake to download the data we uploaded to Google Drive earlier. + +```python +produce_fetch_rules({ + # The value array is a path into the dictionary from `cache/directory.py`. + "raw/sources.txt": ["Contributing", "sources.txt"], + # and so on for targets, gold-standard, and interactome: + # note that excluding these three stops the Snakemake file from working by design! + ... +}) +``` + +Create two scripts that make `gold-standard.tsv` and `interactome.tsv` SPRAS-ready, consulting +the [SPRAS file format documentation](https://spras.readthedocs.io/en/latest/output.html). 
You can use any dependencies inside the top-level +`pyproject.toml`, and you can test out your scripts with `uv run diff --git a/web/src/components/Visualization.astro b/web/src/components/Visualization.astro new file mode 100644 index 00000000..a9b2dae6 --- /dev/null +++ b/web/src/components/Visualization.astro @@ -0,0 +1,47 @@ +--- +import VisualizationScript from "./VisualizationScript.astro"; + +interface Props { + interactome: string; +} + +const { interactome } = Astro.props; + +const edgeLimit = 300; + +const noHeaderInteractomeArray = interactome.trim().split("\n").slice(1); +const noHeaderInteractome = noHeaderInteractomeArray.length > edgeLimit ? "BIG" : noHeaderInteractomeArray.join("\n"); +--- + + + +{ + noHeaderInteractome === "" ? ( +
There is nothing to visualize.
+ ) : ( ++ There are {noHeaderInteractomeArray.length} edges, which is over the {edgeLimit} edge maximum. Visualizing it + may lag your machine, and may also not be visually meaningful. If you do want to see this data visualized, using + the local Cytoscape analyzer SPRAS has may be a better option. +
+
+
+ For information about the algorithm parameters, see the associated documentation page. For information about the dataset itself, go to the respective dataset page.
+
+ type, the (dataset) category, the dataset, the
+algorithm, and the parameters [hash](https://en.wikipedia.org/wiki/Hash_function).
+
+There are also pages related to different categories of these runs:
+
+- type-category-dataset
+- type-category
+
+
+The type classifies a dataset and the algorithms it runs on. In this case, PRA datasets run on all algorithms, while disease module datasets run only on DMMM algorithms.
+
+The category classifies which provider a dataset comes from.
diff --git a/web/src/pages/index.astro b/web/src/pages/index.astro
new file mode 100644
index 00000000..35a8be75
--- /dev/null
+++ b/web/src/pages/index.astro
@@ -0,0 +1,76 @@
+---
+import Colors from "../components/Colors.astro";
+import BaseLayout from "../layouts/BaseLayout.astro";
+import { parseOutputString } from "../lib/outputStyle";
+import { getStaticPaths } from "./[uid]/index.astro";
+
+import Description from "./description.md";
+---
+
+
+
+
+
+This contains analysis associated with datasets running on a particular algorithm type under a particular category.
+ +This contains analysis associated with datasets running on a particular algorithm type.
+ +