From 2db8890b3aedb85684210d57c63d25eb3a5be5f8 Mon Sep 17 00:00:00 2001
From: BenjaminIsaac0111 <12176376+BenjaminIsaac0111@users.noreply.github.com>
Date: Thu, 26 Feb 2026 19:45:02 +0000
Subject: [PATCH] refactor: improve setup UX, HF authentication, and download docs

- Enhanced setup scripts with strict error handling and Conda prerequisite checks.
- Added explicit PyTorch and CUDA installation logic to automated setup.
- Integrated Hugging Face authentication status checks into setup flow.
- Updated README with environment activation and dataset authentication requirements.
- Expanded download documentation with examples for full dataset and filtered subsets.
---
 README.md | 35 +++++++++++++++++++++++++++++++----
 setup.ps1 | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 setup.sh  | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 130 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index f7884ff..e57d9d2 100644
--- a/README.md
+++ b/README.md
@@ -3,13 +3,14 @@
 > [!WARNING]
 > **Work in Progress**: This project is under active development. Core architectures, CLI flags, and data formats are subject to major changes.
 
-A transformer-based model for spatial transcriptomics that bridges histology and biological pathways.
+**SpatialTranscriptFormer** bridges histology and biological pathways through a high-performance transformer architecture. By modeling the dense interplay between morphological features and gene expression signatures, it provides an interpretable and spatially-coherent mapping of the tissue microenvironment.
 
-## Key Features
+## Key Technical Pillars
 
 - **Quad-Flow Interaction**: Configurable attention between Pathways and Histology patches (`p2p`, `p2h`, `h2p`, `h2h`).
 - **Pathway Bottleneck**: Interpretable gene expression prediction via 50 MSigDB Hallmark tokens.
 - **Spatial Pattern Coherence**: Optimized using a composite **MSE + PCC (Pearson Correlation) loss** to prevent spatial collapse and ensure accurate morphology-expression mapping.
+- **Foundation Model Ready**: Native support for **CTransPath**, **Phikon**, **Hibou**, and **GigaPath**.
 - **Biologically Informed Initialization**: Gene reconstruction weights derived from known hallmark memberships.
 
 ## License
@@ -30,18 +31,44 @@ This project requires [Conda](https://docs.conda.io/en/latest/).
 
 1. Clone the repository.
 2. Run the automated setup script:
    - On Windows: `.\setup.ps1`
    - On Linux/HPC: `bash setup.sh`
 
 ## Usage
 
+**Before running any commands**, you must activate the conda environment:
+
+```bash
+conda activate SpatialTranscriptFormer
+```
+
 ### Download HEST Data
 
-Download specific subsets using filters or patterns:
+> [!CAUTION]
+> **Authentication Required**: The HEST dataset is gated. You must accept the terms of use at [MahmoodLab/hest](https://huggingface.co/datasets/MahmoodLab/hest) and authenticate with your Hugging Face account to download the data.
+
+Please provide your token using ONE of the following methods before running the download tool:
+
+1. **Persistent Login**: Run `huggingface-cli login` and paste your access token when prompted.
+2. **Environment Variable**: Set the `HF_TOKEN` environment variable in your active terminal session.
+
+Once authenticated, download specific subsets using filters or the entire dataset:
 
 ```bash
-# Download only the Bowel Cancer subset (including ST data and WSIs)
+# Option 1: Download the ENTIRE HEST dataset (requires confirmation)
+stf-download --local_dir hest_data
+
+# Option 2: Download a specific subset (e.g., Bowel Cancer)
 stf-download --organ Bowel --disease Cancer --local_dir hest_data
+
+# Option 3: Filter by technology (e.g., Visium)
+stf-download --tech Visium --local_dir hest_data
+```
+
+To see all available organs in the metadata:
+
+```bash
+stf-download --list_organs
 ```
 
 ### Train Models
diff --git a/setup.ps1 b/setup.ps1
index 9c1bd04..544e83c 100644
--- a/setup.ps1
+++ b/setup.ps1
@@ -1,9 +1,20 @@
 # setup.ps1 - Automated environment setup for SpatialTranscriptFormer
 
+$ErrorActionPreference = 'Stop'
+
 Write-Host "--- SpatialTranscriptFormer Setup ---" -ForegroundColor Cyan
 
 $EnvName = "SpatialTranscriptFormer"
 
+# Check if conda exists
+try {
+    conda --version | Out-Null
+}
+catch {
+    Write-Error "Conda was not found. Please ensure Conda is installed and added to your PATH." -ErrorAction Continue # keep non-terminating so 'exit 1' below is reached
+    exit 1
+}
+
 # Check if conda environment exists
 $CondaEnv = conda env list | Select-String $EnvName
 if ($null -eq $CondaEnv) {
@@ -14,12 +25,52 @@ else {
     Write-Host "Conda environment '$EnvName' already exists." -ForegroundColor Green
 }
 
+Write-Host "Installing PyTorch (CUDA 11.8)..." -ForegroundColor Yellow
+conda run -n $EnvName pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
+if ($LASTEXITCODE -ne 0) {
+    Write-Error "Failed to install PyTorch." -ErrorAction Continue # non-terminating so pip's exit code is propagated below
+    exit $LASTEXITCODE
+}
+
 Write-Host "Installing/Updating package in editable mode..." -ForegroundColor Yellow
 conda run -n $EnvName pip install -e .[dev]
+if ($LASTEXITCODE -ne 0) {
+    Write-Error "Failed to install SpatialTranscriptFormer." -ErrorAction Continue # non-terminating so pip's exit code is propagated below
+    exit $LASTEXITCODE
+}
+
+Write-Host "Checking Hugging Face authentication..." -ForegroundColor Yellow
+$HFLoginStatus = & { $ErrorActionPreference = 'Continue'; conda run -n $EnvName huggingface-cli whoami 2>&1 } # child-scope override: under 'Stop', redirected native stderr can abort the script before the check below
+if ($LASTEXITCODE -ne 0 -or $HFLoginStatus -match "Not logged in") {
+    $HFNeedLogin = $true
+}
+else {
+    $HFNeedLogin = $false
+    Write-Host "Hugging Face authentication found: $HFLoginStatus" -ForegroundColor Green
+}
 
 Write-Host ""
-Write-Host "Setup Complete!" -ForegroundColor Green
-Write-Host "You can now use the following commands:"
+Write-Host "=========================================" -ForegroundColor Green
+Write-Host " SETUP COMPLETE! " -ForegroundColor Green
+Write-Host "=========================================" -ForegroundColor Green
+Write-Host ""
+Write-Host "IMPORTANT: You must activate the environment before using the tools:" -ForegroundColor Yellow
+Write-Host " conda activate $EnvName" -ForegroundColor Cyan
+Write-Host ""
+
+if ($HFNeedLogin) {
+    Write-Host "------------------------------------------------------------" -ForegroundColor DarkYellow
+    Write-Host "DATASET ACCESS REQUIRES AUTHENTICATION" -ForegroundColor Red
+    Write-Host "The HEST-1k dataset on Hugging Face is gated. You must provide an access token." -ForegroundColor DarkYellow
+    Write-Host "Please do ONE of the following before downloading data:"
+    Write-Host " Option A (Persistent): Run 'huggingface-cli login' in the activated environment and paste your token."
+    Write-Host " Option B (Temporary): Set the 'HF_TOKEN' environment variable."
+    Write-Host "Get your token from: https://huggingface.co/settings/tokens" -ForegroundColor DarkCyan
+    Write-Host "------------------------------------------------------------" -ForegroundColor DarkYellow
+    Write-Host ""
+}
+
+Write-Host "You can then use the following commands:"
 Write-Host " stf-download --help"
 Write-Host " stf-split --help"
 Write-Host " stf-build-vocab --help"
diff --git a/setup.sh b/setup.sh
index a16c23e..e37fdd1 100644
--- a/setup.sh
+++ b/setup.sh
@@ -1,10 +1,18 @@
 #!/bin/bash
 # setup.sh - Automated environment setup for SpatialTranscriptFormer (Linux/HPC)
 
+set -e
+
 echo "--- SpatialTranscriptFormer Setup ---"
 
 ENV_NAME="SpatialTranscriptFormer"
 
+# Check if conda exists
+if ! command -v conda &> /dev/null; then
+    echo "Error: conda was not found. Please ensure Conda is installed and in your PATH." >&2
+    exit 1
+fi
+
 # Check if conda environment exists
 if ! conda env list | grep -q "$ENV_NAME"; then
     echo "Creating conda environment '$ENV_NAME' with Python 3.9..."
@@ -13,12 +21,48 @@ else
     echo "Conda environment '$ENV_NAME' already exists."
 fi
 
+echo "Installing PyTorch (CUDA 11.8)..."
+conda run -n "$ENV_NAME" pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
+
 echo "Installing/Updating package in editable mode..."
 conda run -n $ENV_NAME pip install -e .[dev]
 
+echo "Checking Hugging Face authentication..."
+# A failing 'whoami' is treated as "login needed", not as a setup failure: suspend 'set -e' so its status is recorded
+set +e
+HF_STATUS=$(conda run -n "$ENV_NAME" huggingface-cli whoami 2>&1)
+HF_EXIT=$?
+set -e
+
+if [ "$HF_EXIT" -ne 0 ] || [[ "$HF_STATUS" == *"Not logged in"* ]]; then
+    HF_NEED_LOGIN=true
+else
+    HF_NEED_LOGIN=false
+    echo "Hugging Face authentication found: $HF_STATUS"
+fi
+
+echo ""
+echo "========================================="
+echo " SETUP COMPLETE! "
+echo "========================================="
+echo ""
+echo "IMPORTANT: You must activate the environment before using the tools:"
+echo " conda activate $ENV_NAME"
 echo ""
-echo "Setup Complete!"
-echo "You can now use the following commands (after activating the environment):"
+
+if [ "$HF_NEED_LOGIN" = true ]; then
+    echo "------------------------------------------------------------"
+    echo "DATASET ACCESS REQUIRES AUTHENTICATION"
+    echo "The HEST-1k dataset on Hugging Face is gated. You must provide an access token."
+    echo "Please do ONE of the following before downloading data:"
+    echo " Option A (Persistent): Run 'huggingface-cli login' in the activated environment and paste your token."
+    echo " Option B (Temporary): Run 'export HF_TOKEN=your_token_here'"
+    echo "Get your token from: https://huggingface.co/settings/tokens"
+    echo "------------------------------------------------------------"
+    echo ""
+fi
+
+echo "You can then use the following commands:"
 echo " stf-download --help"
 echo " stf-split --help"
 echo " stf-build-vocab --help"