From ab187585f379bd2f2d18bf4e46cab78322618a34 Mon Sep 17 00:00:00 2001 From: Abdulhamit Celik Date: Fri, 10 Apr 2026 14:27:04 +0800 Subject: [PATCH 01/12] Add Modal L4 GPU CI/CD pipeline - GitHub Actions workflow with manual trigger (workflow_dispatch) - Modal runner using L4 GPU with model weight caching via Volume Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/test.yml | 27 ++++++++++++ ci/modal_runner.py | 85 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 .github/workflows/test.yml create mode 100644 ci/modal_runner.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..741ebdc --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,27 @@ +name: GPU Tests (L4) + +on: + workflow_dispatch: + +jobs: + test: + name: Run tests on L4 GPU via Modal + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install Modal + run: pip install modal + + - name: Run tests on L4 + env: + MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} + run: modal run ci/modal_runner.py diff --git a/ci/modal_runner.py b/ci/modal_runner.py new file mode 100644 index 0000000..c5e2903 --- /dev/null +++ b/ci/modal_runner.py @@ -0,0 +1,85 @@ +import modal + +app = modal.App("esm-efficient-tests") + +image = ( + modal.Image.from_registry( + "nvidia/cuda:12.8.0-devel-ubuntu22.04", + add_python="3.11", + ) + .pip_install( + "torch>=2.7.0", + extra_index_url="https://download.pytorch.org/whl/cu128", + ) + .run_commands( + "pip install flash-attn --no-build-isolation", + gpu="L4", + ) + .pip_install( + "einops", + "accelerate", + "pandas", + "numpy", + "polars", + "torchmetrics", + "lightning", + "scikit-learn", + "huggingface_hub", + "safetensors", + "pytest", + "pytest-runner", + "pooch", + "esm", + ) + 
.run_commands( + "pip install git+https://github.com/MuhammedHasan/fair-esm.git", + ) +) + +# Cache downloaded model weights across runs (~800MB total) +model_cache = modal.Volume.from_name("esm-model-cache", create_if_missing=True) + + +@app.function( + gpu="L4", + image=image, + mounts=[modal.Mount.from_local_dir(".", remote_path="/app")], + volumes={"/model-cache": model_cache}, + timeout=3600, +) +def run_tests(): + import os + import shutil + import subprocess + + os.chdir("/app") + + # Restore cached model weights to avoid re-downloading each run + cache_dir = "/model-cache/test-data" + os.makedirs(cache_dir, exist_ok=True) + for fname in os.listdir(cache_dir): + dst = f"/app/tests/data/{fname}" + if not os.path.exists(dst): + shutil.copy2(f"{cache_dir}/{fname}", dst) + + # Install the package itself + subprocess.run(["pip", "install", "-e", "."], check=True) + + # Run tests + result = subprocess.run(["pytest", "tests/", "-v", "--tb=short"]) + + # Save newly downloaded models to cache + for fname in os.listdir("/app/tests/data"): + if fname.endswith((".pt", ".pth")): + dst = f"{cache_dir}/{fname}" + if not os.path.exists(dst): + shutil.copy2(f"/app/tests/data/{fname}", dst) + model_cache.commit() + + if result.returncode != 0: + raise SystemExit(result.returncode) + + +@app.local_entrypoint() +def main(): + run_tests.remote() From 8226a3ca08e895e685f381e81bb91f75d685e9ef Mon Sep 17 00:00:00 2001 From: Abdulhamit Celik Date: Fri, 10 Apr 2026 14:31:01 +0800 Subject: [PATCH 02/12] Fix Modal API: replace Mount with copy_local_dir modal.Mount removed in newer Modal versions. 
Co-Authored-By: Claude Sonnet 4.6 --- ci/modal_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/modal_runner.py b/ci/modal_runner.py index c5e2903..8e02269 100644 --- a/ci/modal_runner.py +++ b/ci/modal_runner.py @@ -34,6 +34,7 @@ .run_commands( "pip install git+https://github.com/MuhammedHasan/fair-esm.git", ) + .copy_local_dir(".", "/app") ) # Cache downloaded model weights across runs (~800MB total) @@ -43,7 +44,6 @@ @app.function( gpu="L4", image=image, - mounts=[modal.Mount.from_local_dir(".", remote_path="/app")], volumes={"/model-cache": model_cache}, timeout=3600, ) From 8776bb0923172cc358ed33976546a111284a956d Mon Sep 17 00:00:00 2001 From: Abdulhamit Celik Date: Fri, 10 Apr 2026 14:33:57 +0800 Subject: [PATCH 03/12] Fix Modal: clone repo inside container instead of local dir copy Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/test.yml | 2 +- ci/modal_runner.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 741ebdc..749b004 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,4 +24,4 @@ jobs: env: MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} - run: modal run ci/modal_runner.py + run: modal run ci/modal_runner.py --ref ${{ github.sha }} diff --git a/ci/modal_runner.py b/ci/modal_runner.py index 8e02269..bb5b32f 100644 --- a/ci/modal_runner.py +++ b/ci/modal_runner.py @@ -7,6 +7,7 @@ "nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.11", ) + .apt_install("git") .pip_install( "torch>=2.7.0", extra_index_url="https://download.pytorch.org/whl/cu128", @@ -34,7 +35,6 @@ .run_commands( "pip install git+https://github.com/MuhammedHasan/fair-esm.git", ) - .copy_local_dir(".", "/app") ) # Cache downloaded model weights across runs (~800MB total) @@ -47,12 +47,14 @@ volumes={"/model-cache": model_cache}, timeout=3600, ) -def run_tests(): +def 
run_tests(repo_url: str, ref: str): import os import shutil import subprocess - os.chdir("/app") + # Clone the repo at the specific commit + subprocess.run(["git", "clone", repo_url, "/app"], check=True) + subprocess.run(["git", "checkout", ref], cwd="/app", check=True) # Restore cached model weights to avoid re-downloading each run cache_dir = "/model-cache/test-data" @@ -63,10 +65,10 @@ def run_tests(): shutil.copy2(f"{cache_dir}/{fname}", dst) # Install the package itself - subprocess.run(["pip", "install", "-e", "."], check=True) + subprocess.run(["pip", "install", "-e", "."], cwd="/app", check=True) # Run tests - result = subprocess.run(["pytest", "tests/", "-v", "--tb=short"]) + result = subprocess.run(["pytest", "tests/", "-v", "--tb=short"], cwd="/app") # Save newly downloaded models to cache for fname in os.listdir("/app/tests/data"): @@ -81,5 +83,8 @@ def run_tests(): @app.local_entrypoint() -def main(): - run_tests.remote() +def main( + repo_url: str = "https://github.com/hmtcelik/esm-efficient.git", + ref: str = "master", +): + run_tests.remote(repo_url, ref) From 5c1954354778bc7ea20370dc5cfde155548b0f53 Mon Sep 17 00:00:00 2001 From: Abdulhamit Celik Date: Fri, 10 Apr 2026 14:46:13 +0800 Subject: [PATCH 04/12] Fix flash-attn build: install packaging dependency first Co-Authored-By: Claude Sonnet 4.6 --- ci/modal_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/modal_runner.py b/ci/modal_runner.py index bb5b32f..9d0e66a 100644 --- a/ci/modal_runner.py +++ b/ci/modal_runner.py @@ -13,6 +13,7 @@ extra_index_url="https://download.pytorch.org/whl/cu128", ) .run_commands( + "pip install packaging", "pip install flash-attn --no-build-isolation", gpu="L4", ) From 177620466abf141ebbb6236882e588b65c05d5c7 Mon Sep 17 00:00:00 2001 From: Abdulhamit Celik Date: Fri, 10 Apr 2026 14:48:23 +0800 Subject: [PATCH 05/12] Add ninja for faster flash-attn compilation, move packaging to pip layer Co-Authored-By: Claude Sonnet 4.6 --- ci/modal_runner.py | 3 ++- 
1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/modal_runner.py b/ci/modal_runner.py index 9d0e66a..2969ff1 100644 --- a/ci/modal_runner.py +++ b/ci/modal_runner.py @@ -9,11 +9,12 @@ ) .apt_install("git") .pip_install( + "packaging", + "ninja", "torch>=2.7.0", extra_index_url="https://download.pytorch.org/whl/cu128", ) .run_commands( - "pip install packaging", "pip install flash-attn --no-build-isolation", gpu="L4", ) From de40f3fe519fd3733f285e0c134d66941b570c0f Mon Sep 17 00:00:00 2001 From: Abdulhamit Celik Date: Sat, 11 Apr 2026 05:35:17 +0800 Subject: [PATCH 06/12] Add wheel and setuptools for flash-attn build deps Co-Authored-By: Claude Sonnet 4.6 --- ci/modal_runner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/modal_runner.py b/ci/modal_runner.py index 2969ff1..77e738e 100644 --- a/ci/modal_runner.py +++ b/ci/modal_runner.py @@ -11,6 +11,8 @@ .pip_install( "packaging", "ninja", + "wheel", + "setuptools", "torch>=2.7.0", extra_index_url="https://download.pytorch.org/whl/cu128", ) From 40edf039e7144e6204dfb0a3422f153c8efd957c Mon Sep 17 00:00:00 2001 From: Abdulhamit Celik Date: Sat, 11 Apr 2026 06:16:08 +0800 Subject: [PATCH 07/12] Add build-essential for g++ compiler needed by flash-attn Co-Authored-By: Claude Sonnet 4.6 --- ci/modal_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/modal_runner.py b/ci/modal_runner.py index 77e738e..9fcdcfe 100644 --- a/ci/modal_runner.py +++ b/ci/modal_runner.py @@ -7,7 +7,7 @@ "nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.11", ) - .apt_install("git") + .apt_install("git", "build-essential") .pip_install( "packaging", "ninja", From 460cca1222d255307223592af97ecf18adadd463 Mon Sep 17 00:00:00 2001 From: Abdulhamit Celik Date: Sat, 11 Apr 2026 06:46:55 +0800 Subject: [PATCH 08/12] Force g++ compiler for flash-attn build via CXX env var Co-Authored-By: Claude Sonnet 4.6 --- ci/modal_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ci/modal_runner.py b/ci/modal_runner.py index 9fcdcfe..1e74454 100644 --- a/ci/modal_runner.py +++ b/ci/modal_runner.py @@ -17,7 +17,7 @@ extra_index_url="https://download.pytorch.org/whl/cu128", ) .run_commands( - "pip install flash-attn --no-build-isolation", + "CXX=g++ CC=gcc pip install flash-attn --no-build-isolation", gpu="L4", ) .pip_install( From 0e1776361e8a49aad931226cac5c80e3000cb4ac Mon Sep 17 00:00:00 2001 From: Abdulhamit Celik Date: Sat, 11 Apr 2026 07:03:19 +0800 Subject: [PATCH 09/12] Limit flash-attn parallel compilation to 4 jobs to avoid OOM Co-Authored-By: Claude Sonnet 4.6 --- ci/modal_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/modal_runner.py b/ci/modal_runner.py index 1e74454..1e1f742 100644 --- a/ci/modal_runner.py +++ b/ci/modal_runner.py @@ -17,7 +17,7 @@ extra_index_url="https://download.pytorch.org/whl/cu128", ) .run_commands( - "CXX=g++ CC=gcc pip install flash-attn --no-build-isolation", + "MAX_JOBS=4 CXX=g++ CC=gcc pip install flash-attn --no-build-isolation", gpu="L4", ) .pip_install( From 9aedd0af4948eda304e63748457e0b4d57ea4e04 Mon Sep 17 00:00:00 2001 From: Abdulhamit Celik Date: Sat, 11 Apr 2026 08:52:52 +0800 Subject: [PATCH 10/12] Add httpx dependency required by esm package Co-Authored-By: Claude Sonnet 4.6 --- ci/modal_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/modal_runner.py b/ci/modal_runner.py index 1e1f742..251a1c0 100644 --- a/ci/modal_runner.py +++ b/ci/modal_runner.py @@ -35,6 +35,7 @@ "pytest-runner", "pooch", "esm", + "httpx", ) .run_commands( "pip install git+https://github.com/MuhammedHasan/fair-esm.git", From c83db92ad1581aeae13711569c047c84ff9bcce1 Mon Sep 17 00:00:00 2001 From: Abdulhamit Celik Date: Sat, 11 Apr 2026 09:13:02 +0800 Subject: [PATCH 11/12] Install torchvision from PyTorch index to match torch version Co-Authored-By: Claude Sonnet 4.6 --- ci/modal_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/modal_runner.py 
b/ci/modal_runner.py index 251a1c0..d91246f 100644 --- a/ci/modal_runner.py +++ b/ci/modal_runner.py @@ -14,6 +14,7 @@ "wheel", "setuptools", "torch>=2.7.0", + "torchvision", extra_index_url="https://download.pytorch.org/whl/cu128", ) .run_commands( From 8c5dcd961153834339fa35b0b947c81acf12c0ec Mon Sep 17 00:00:00 2001 From: Abdulhamit Celik Date: Sat, 11 Apr 2026 10:29:51 +0800 Subject: [PATCH 12/12] Add bitsandbytes, patch fair-esm for PyTorch 2.6+ torch.load compatibility Co-Authored-By: Claude Sonnet 4.6 --- ci/modal_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ci/modal_runner.py b/ci/modal_runner.py index d91246f..5262917 100644 --- a/ci/modal_runner.py +++ b/ci/modal_runner.py @@ -37,9 +37,12 @@ "pooch", "esm", "httpx", + "bitsandbytes", ) .run_commands( "pip install git+https://github.com/MuhammedHasan/fair-esm.git", + # Patch fair-esm for PyTorch 2.6+ compatibility (weights_only default changed) + "sed -i 's/torch.load(str(model_location), map_location=\"cpu\")/torch.load(str(model_location), map_location=\"cpu\", weights_only=False)/' /usr/local/lib/python3.11/site-packages/fair_esm/pretrained.py", ) )