From 6ff7e6e218111e5d43d48a194eb8bb1db99922dc Mon Sep 17 00:00:00 2001 From: rdk Date: Wed, 25 Feb 2026 19:14:15 +0100 Subject: [PATCH 1/2] Add FastAPI conservation server, CLI tools, and batch processing - Conservation server exposing HMM pipeline via HTTP (port 8030) - run_conservation.py for single FASTA file processing - Batch script for parallel conservation computation - Input validation for HMM sequence database file - Docker Compose service and Dockerfile updates --- .../hmm_based/conservation_hmm_based.py | 11 ++ docker-compose.yml | 15 ++ executor-p2rank/Dockerfile | 3 + executor-p2rank/conservation/README.md | 179 ++++++++++++++++++ executor-p2rank/conservation/__init__.py | 0 .../calculate_conservations_batch.sh | 22 +++ .../conservation_server/__init__.py | 0 .../conservation/conservation_server/main.py | 92 +++++++++ .../conservation_server/requirements.txt | 2 + .../conservation/run_conservation.py | 117 ++++++++++++ executor-p2rank/conservation_cache.py | 3 +- executor-p2rank/executor.py | 3 + 12 files changed, 446 insertions(+), 1 deletion(-) create mode 100644 executor-p2rank/conservation/README.md create mode 100644 executor-p2rank/conservation/__init__.py create mode 100644 executor-p2rank/conservation/calculate_conservations_batch.sh create mode 100644 executor-p2rank/conservation/conservation_server/__init__.py create mode 100644 executor-p2rank/conservation/conservation_server/main.py create mode 100644 executor-p2rank/conservation/conservation_server/requirements.txt create mode 100644 executor-p2rank/conservation/run_conservation.py diff --git a/conservation/hmm_based/conservation_hmm_based.py b/conservation/hmm_based/conservation_hmm_based.py index 5a26d6e4..81ab90d3 100644 --- a/conservation/hmm_based/conservation_hmm_based.py +++ b/conservation/hmm_based/conservation_hmm_based.py @@ -68,6 +68,17 @@ def compute_conservation( mask_output: bool, max_seqs: int, ): + if database_file is None: + raise RuntimeError( + "HMM sequence database 
file is not set." + " Set HMM_SEQUENCE_FILE environment variable.") + if not os.path.isfile(database_file): + raise RuntimeError( + f"HMM sequence database file not found: {database_file}") + if not os.access(database_file, os.R_OK): + raise RuntimeError( + f"HMM sequence database file is not readable: {database_file}") + unweighted_msa_file = _generate_msa( fasta_file, database_file, working_directory, execute_command) diff --git a/docker-compose.yml b/docker-compose.yml index e6b37671..85a8544d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -72,6 +72,21 @@ services: - predictions:/data/prankweb/predictions - docking:/data/prankweb/docking - tunnels:/data/prankweb/tunnels + conservation-server: + # TODO: Mount the conservation volume so the HMM database is available. + # Use `docker compose run` with `-v` as described in conservation/README.md + # for now. To make `docker compose up` work, add: + # volumes: + # - conservation:/data/conservation + build: + context: ./ + dockerfile: ./executor-p2rank/Dockerfile + args: + UID: ${UID:-5988} + GID: ${GID:-5988} + command: ["uvicorn", "conservation.conservation_server.main:app", "--host", "0.0.0.0", "--port", "8030"] + ports: + - "8030:8030" executor-docking: build: context: ./ diff --git a/executor-p2rank/Dockerfile b/executor-p2rank/Dockerfile index 3b3b6b12..c8d8f27f 100644 --- a/executor-p2rank/Dockerfile +++ b/executor-p2rank/Dockerfile @@ -192,6 +192,9 @@ RUN chmod a+x ./p2rank.sh \ && chmod a+x ./run_p2rank.py \ && chmod a+x ./run_p2rank_task.py +COPY --chown=user:user ./executor-p2rank/conservation/conservation_server/requirements.txt ./conservation/conservation_server/ +RUN pip3 install -r ./conservation/conservation_server/requirements.txt + # # administration tools WORKDIR /opt/administration diff --git a/executor-p2rank/conservation/README.md b/executor-p2rank/conservation/README.md new file mode 100644 index 00000000..bd8f1052 --- /dev/null +++ b/executor-p2rank/conservation/README.md @@ -0,0 
+1,179 @@ +# Conservation + +Tools for computing HMM-based conservation scores for protein sequences (without running P2Rank predictions). +All tools are designed to run inside the `executor-p2rank` Docker image for now. + +## Conservation server + +A FastAPI web server that exposes conservation computation as an HTTP endpoint. + +### Running the server using Docker + +```bash +# to avoid running docker as root, add current user to docker group (you may need to log out and back in for this to take effect) +sudo usermod -aG docker $USER + +# prepare data/cache directory on the host +mkdir -p /ssd/p2rank-conservation-docker-data/hmm-based + +# download latest uniref database +cd /ssd/p2rank-conservation-docker-data/hmm-based +wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref50/uniref50.fasta.gz && gunzip uniref50.fasta.gz + +# alternatively you can download the database inside the container using download_database.py script from prankweb +# but this downloads old version of uniref50 (from 2023-11-01). Might be useful for reproducibility of old results. +docker compose run -it --rm -p 8030:8030 \ + --user "$(id -u):$(id -g)" \ + -v /ssd/p2rank-conservation-docker-data:/data/conservation \ + conservation-server bash +/opt/hmm-based-conservation/download_database.py +# exit container + +# build/rebuild docker image (only necessary after changes in this repo) +docker compose build conservation-server + +# run server with mounted data directory +docker compose run --rm -p 8030:8030 \ + --user "$(id -u):$(id -g)" \ + -v /ssd/p2rank-conservation-docker-data:/data/conservation \ + conservation-server +``` + +The server starts on port 8030. +Check API documentation at http://localhost:8030/docs . 
+ +To run with more workers for concurrent requests (add the same `--user` and `-v` options as in the command above so the database is mounted): + +```bash +docker compose run --rm -p 8030:8030 conservation-server \ + uvicorn conservation.conservation_server.main:app \ + --host 0.0.0.0 --port 8030 --workers 8 +``` + +Set `--workers` to `n_cpu * 2` for CPU-bound workloads. + +### Endpoints + +#### GET /health + +Returns `{"status": "ok"}` when the server is running. + +```bash +curl http://localhost:8030/health +``` + +#### POST /conservation + +Accepts FASTA content, runs the HMM conservation pipeline, returns the raw +`.hom` file as plain text. + +Request body (JSON): +```json +{ + "fasta_content": ">sp|P00520|ABL1_MOUSE\nMLPPGPLLLLLLLSGTARLFS" +} +``` + +Response (text/plain): +``` +0 0.583 M +1 1.247 L +2 0.891 P +... +``` + +Example: + +```bash +curl -X POST http://localhost:8030/conservation \ + -H "Content-Type: application/json" \ + -d '{"fasta_content": ">test\nGSMSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHLKKLKESYCQRQGVPMNSLRFLFEGQRIADNHTPKELGMEEEDVIEVYQEQTGGHSTV"}' +``` + +### Interactive API docs + +FastAPI auto-generates interactive API documentation at: + +- Swagger UI: http://localhost:8030/docs +- ReDoc: http://localhost:8030/redoc + +## Contents + +- `run_conservation.py` -- CLI script that computes conservation for a single FASTA file. +- `calculate_conservations_batch.sh` -- Batch script that runs `run_conservation.py` in parallel over a directory of FASTA files. +- `conservation_server/` -- FastAPI web server that exposes conservation computation via HTTP. + +## Data directory + +The conservation pipeline requires the HMM sequence database and optionally +a cache directory. 
Inside the Docker container these are configured via +environment variables baked into the image: + +| Variable | Default in image | Description | +|---|---|---| +| `HMM_SEQUENCE_FILE` | `/data/conservation/hmm-based/uniref50.fasta` | UniRef50 FASTA database used by HMMER | +| `HMM_CONSERVATION_CACHE` | `/data/conservation/hmm-based-cache/` | Directory for caching computed scores | +| `HMMER_DIR` | `/opt/hmm-based-conservation-dependencies/hmmer-3.3.2/bin/` | Path to HMMER binaries (phmmer, esl-weight, esl-alistat) | + +The database files are expected in the `prankweb_conservation` Docker volume, +mounted at `/data/conservation` inside the container. The `conservation-server` service does not mount this volume yet (see the TODO in `docker-compose.yml`), so pass `-v /path/on/host:/data/conservation` as shown above. + +## Building the Docker image + +The image is built from the project root (not from this directory) because +the Dockerfile references multiple project directories: + +```bash +cd /path/to/prankweb # project root containing docker-compose.yml +docker compose build conservation-server +``` + +This uses the same multi-stage `executor-p2rank/Dockerfile` that builds +HMMER, BLAST, and other dependencies from source. + +## run_conservation.py + +Computes HMM-based conservation for a single FASTA file. + +``` +python3 run_conservation.py --file /path/to/input.fasta --output /path/to/output --working /tmp/work +``` + +Arguments: +- `--file` -- Path to a FASTA file containing a single protein sequence. +- `--output` -- Directory where the `.hom` output file will be written (default: `./`). +- `--working` -- Temporary working directory for intermediate HMMER files (default: `./working`). + +Output is a `.hom` file (TSV with columns: position index, conservation score, amino acid). +The file is named after the input: `input.fasta` produces `input.hom`. 
+
+Example inside Docker:
+
+```bash
+docker compose run --rm conservation-server \
+    python conservation/run_conservation.py \
+    --file /data/conservation/example.fasta \
+    --output /tmp/out \
+    --working /tmp/work
+```
+
+## calculate_conservations_batch.sh
+
+Batch wrapper that finds all `.fasta` files in a directory and processes them
+in parallel using `xargs -P`.
+
+```
+bash calculate_conservations_batch.sh N_PROC INPUT_DIR OUTPUT_DIR
+```
+
+Arguments:
+1. Number of parallel processes.
+2. Input directory containing `.fasta` files.
+3. Output directory for `.hom` files.
+
+Example inside Docker:
+
+```bash
+docker compose run --rm conservation-server \
+    bash conservation/calculate_conservations_batch.sh 4 /data/input /data/output
+```
diff --git a/executor-p2rank/conservation/__init__.py b/executor-p2rank/conservation/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/executor-p2rank/conservation/calculate_conservations_batch.sh b/executor-p2rank/conservation/calculate_conservations_batch.sh
new file mode 100644
index 00000000..b6300f9d
--- /dev/null
+++ b/executor-p2rank/conservation/calculate_conservations_batch.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+#
+# Compute HMM-based conservation for every *.fasta file in a directory.
+#
+# Usage: calculate_conservations_batch.sh N_PROC INPUT_DIR OUTPUT_DIR
+#   N_PROC      number of run_conservation.py processes run in parallel
+#   INPUT_DIR   directory searched (recursively) for *.fasta files
+#   OUTPUT_DIR  directory that receives the .hom files
+
+if [ "$#" -ne 3 ]; then
+    echo "Usage: $0 N_PROC INPUT_DIR OUTPUT_DIR" >&2
+    exit 2
+fi
+
+n_proc=$1
+input_dir=$2
+output_dir=$3
+
+if [ ! -d "$input_dir" ]; then
+    echo "Input directory not found: $input_dir" >&2
+    exit 1
+fi
+
+work_dir="${output_dir}/.work_dir"
+
+n=$(find "$input_dir" -name "*.fasta" | wc -l)
+
+echo "processing $n fasta files"
+
+mkdir -p "$output_dir" "$work_dir" || exit 1
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+
+# NUL-delimited names keep paths containing spaces or quotes intact.
+# Run through python3 so the call does not depend on run_conservation.py
+# having the executable bit (it is added with mode 100644).
+find "$input_dir" -name "*.fasta" -print0 \
+    | xargs -0 -P "${n_proc}" -I{} python3 "$SCRIPT_DIR/run_conservation.py" \
+        --file {} --output "$output_dir" --working "$work_dir"
+# xargs exits non-zero when any run_conservation.py invocation failed.
+status=$?
+
+echo "processed $n fasta files"
+echo "output_dir: $output_dir"
+if [ "$status" -ne 0 ]; then
+    echo "Some files failed (xargs exit status $status)." >&2
+    exit "$status"
+fi
+echo Done.
diff --git a/executor-p2rank/conservation/conservation_server/__init__.py b/executor-p2rank/conservation/conservation_server/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/executor-p2rank/conservation/conservation_server/main.py b/executor-p2rank/conservation/conservation_server/main.py
new file mode 100644
index 00000000..113c3766
--- /dev/null
+++ b/executor-p2rank/conservation/conservation_server/main.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+"""
+Minimal FastAPI server wrapping the HMM-based conservation computation.
+"""
+
+import logging
+import os
+import sys
+import tempfile
+import shutil
+import traceback
+
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import PlainTextResponse
+from pydantic import BaseModel, Field
+
+# Add executor-p2rank root to path (same pattern as run_conservation.py).
+_executor_root = os.path.dirname(os.path.dirname(os.path.dirname(
+    os.path.abspath(__file__))))
+if _executor_root not in sys.path:
+    sys.path.insert(0, _executor_root)
+
+from conservation.run_conservation import main as run_conservation_main
+
+logger = logging.getLogger("conservation_server")
+
+app = FastAPI(title="Conservation Server")
+
+
+class ConservationRequest(BaseModel):
+    """JSON body of POST /conservation."""
+
+    fasta_content: str = Field(
+        ...,
+        description="Raw FASTA content (header line + sequence).",
+    )
+
+
+@app.get("/health")
+def health():
+    """Report that the server process is up."""
+    return {"status": "ok"}
+
+
+@app.post("/conservation", response_class=PlainTextResponse)
+def compute_conservation(request: ConservationRequest):
+    """
+    Accept FASTA content, run HMM conservation pipeline, return raw .hom file.
+
+    Sync endpoint — FastAPI runs it in a thread pool automatically,
+    so the subprocess-heavy computation does not block the event loop.
+
+    Responds with HTTP 400 when fasta_content is blank and with HTTP 500
+    when the pipeline raises or does not produce the .hom file. All files
+    live in a per-request temporary directory that is removed (best
+    effort) once the request has been handled.
+    """
+    if not request.fasta_content.strip():
+        # Reject blank input up front instead of starting the pipeline on
+        # an empty FASTA file.
+        raise HTTPException(
+            status_code=400,
+            detail="fasta_content must not be empty.",
+        )
+
+    tmp_dir = tempfile.mkdtemp(prefix="conservation_")
+    try:
+        fasta_path = os.path.join(tmp_dir, "input.fasta")
+        # Explicit encoding so the result does not depend on the locale.
+        with open(fasta_path, "w", encoding="utf-8") as f:
+            f.write(request.fasta_content)
+            if not request.fasta_content.endswith("\n"):
+                f.write("\n")
+
+        working_dir = os.path.join(tmp_dir, "working")
+        output_dir = os.path.join(tmp_dir, "output")
+        os.makedirs(working_dir)
+        os.makedirs(output_dir)
+
+        run_conservation_main({
+            "file": fasta_path,
+            "working": working_dir,
+            "output": output_dir,
+        })
+
+        # run_conservation names its output after the input: input.hom.
+        hom_file = os.path.join(output_dir, "input.hom")
+        if not os.path.exists(hom_file):
+            raise HTTPException(
+                status_code=500,
+                detail="Conservation computation failed: output file not produced.",
+            )
+
+        with open(hom_file, encoding="utf-8") as f:
+            return f.read()
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.exception("Conservation computation failed.")
+        # TODO: Reconsider exposing full traceback in production.
+        # Consider returning only a generic error message and logging
+        # the traceback server-side only.
+        raise HTTPException(status_code=500, detail={
+            "error": type(e).__name__,
+            "message": str(e),
+            "traceback": traceback.format_exc().splitlines(),
+        })
+    finally:
+        shutil.rmtree(tmp_dir, ignore_errors=True)
diff --git a/executor-p2rank/conservation/conservation_server/requirements.txt b/executor-p2rank/conservation/conservation_server/requirements.txt
new file mode 100644
index 00000000..3fb50f07
--- /dev/null
+++ b/executor-p2rank/conservation/conservation_server/requirements.txt
@@ -0,0 +1,2 @@
+fastapi==0.115.6
+uvicorn[standard]==0.34.0
diff --git a/executor-p2rank/conservation/run_conservation.py b/executor-p2rank/conservation/run_conservation.py
new file mode 100644
index 00000000..3ebd95d2
--- /dev/null
+++ b/executor-p2rank/conservation/run_conservation.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+#
+# Calculate conservation (or get from cache) for given fasta
+# without running p2rank.
+#
+
+import os
+import argparse
+import logging
+import subprocess
+import sys
+import typing
+from dataclasses import dataclass
+
+# Add parent directory to path so we can import modules from executor-p2rank.
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+# Explicit import instead of `from model import *`, which only provided
+# `dataclass` and `typing` as long as `model` itself imported them.
+from model import ConservationType
+import conservation_wrapper
+
+logging.getLogger().setLevel(logging.DEBUG)
+logger = logging.getLogger("prankweb")
+
+
+@dataclass
+class ConservationExecution:
+    """Settings of one conservation computation."""
+
+    # Input fasta file
+    fasta_file: str
+    # Path to the working directory.
+    working_directory: str
+    # Path to the output directory.
+    output_directory: str
+    # Used for standard output
+    stdout: typing.TextIO
+    # Used for error output
+    stderr: typing.TextIO
+    # Selected configuration pipeline.
+    conservation: ConservationType = ConservationType.NONE
+    # Optional, shell execution function.
+    execute_command: typing.Optional[typing.Callable] = None
+    # If true and files produced by external command, the command is not
+    # executed.
+    lazy_execution: bool = False
+
+
+def _read_arguments() -> typing.Dict[str, str]:
+    """Parse --file, --working and --output from the command line."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--file",
+        required=True,
+        help="Absolute path to fasta file.")
+    parser.add_argument(
+        "--working",
+        default="./working",
+        help="Temporary working directory for intermediate files.")
+    parser.add_argument(
+        "--output",
+        default="./",
+        help="Output directory.")
+    return vars(parser.parse_args())
+
+
+def _create_execute_command(configuration: ConservationExecution):
+    """Set configuration.execute_command to a shell runner if it is unset."""
+    if configuration.execute_command is not None:
+        return
+
+    def execute_command(command: str, ignore_return_code: bool = True):
+        logger.debug(f"Executing '{command}' ...")
+        result = subprocess.run(
+            command,
+            shell=True,
+            env=os.environ.copy(),
+            stdout=configuration.stdout,
+            stderr=configuration.stderr,
+        )
+        # Throw for non-zero (failure) return code.
+        if not ignore_return_code:
+            result.check_returncode()
+        logger.debug(f"Executing '{command}' ... done")
+
+    configuration.execute_command = execute_command
+
+
+def main(arguments):
+    """Compute conservation for arguments["file"] and write the .hom file.
+
+    arguments must provide the keys "file", "working" and "output". The
+    result goes to the output directory, named after the FASTA file with
+    ".fasta" replaced by ".hom"; missing directories are created.
+    """
+    _setuplog_handler()
+
+    configuration = ConservationExecution(
+        fasta_file=arguments["file"],
+        working_directory=arguments["working"],
+        output_directory=arguments["output"],
+        stdout=sys.stdout,
+        stderr=sys.stderr,
+        conservation=ConservationType.HMM
+    )
+    _create_execute_command(configuration)
+
+    # The CLI defaults (./working, ./) are not guaranteed to exist.
+    os.makedirs(configuration.working_directory, exist_ok=True)
+    os.makedirs(configuration.output_directory, exist_ok=True)
+
+    fasta_name = os.path.basename(configuration.fasta_file)
+    out_file_name = os.path.join(
+        configuration.output_directory,
+        fasta_name.removesuffix(".fasta") + ".hom")
+
+    conservation_wrapper.compute_hmm_based_conservation(
+        configuration.fasta_file, configuration.working_directory,
+        out_file_name, configuration.execute_command)
+
+    logger.info("Done. Conservation out file: %s", out_file_name)
+
+
+def _setuplog_handler():
+    """Attach a stream handler to the root logger unless it already has one."""
+    root = logging.getLogger()
+    if root.handlers:
+        return
+    handler = logging.StreamHandler()
+    handler.setLevel(logging.DEBUG)
+    formatter = logging.Formatter(
+        "%(asctime)s [%(levelname)s] : %(message)s",
+        "%Y-%m-%dT%H:%M:%S")
+    handler.setFormatter(formatter)
+    root.addHandler(handler)
+
+
+if __name__ == "__main__":
+    main(_read_arguments())
diff --git a/executor-p2rank/conservation_cache.py b/executor-p2rank/conservation_cache.py index ee6d81a3..12edcc24 100644 --- a/executor-p2rank/conservation_cache.py +++ b/executor-p2rank/conservation_cache.py @@ -15,6 +15,7 @@ def create_hom_from_cache( fasta_file: str, output_file: str) -> bool: if cache_directory is None: return False + logging.info("Fasta file: " + fasta_file) sequences = _load_fasta_file(fasta_file) if len(sequences) != 1: logging.warning( @@ -24,7 +25,7 @@ def create_hom_from_cache( conservation = load_from_cache(cache_directory, sequence) if conservation is None: return False - logging.info("Using conservation from cache.") + logging.info("Using conservation from cache. 
Writing to file: " + output_file) _write_hom_file(output_file, conservation) return True diff --git a/executor-p2rank/executor.py b/executor-p2rank/executor.py index 27a37aaf..205f0a92 100644 --- a/executor-p2rank/executor.py +++ b/executor-p2rank/executor.py @@ -210,6 +210,9 @@ def _prepare_conservation( configuration) cache[fasta] = output_file result[chain] = output_file + + logger.info("Structure: " + str(structure)) + logger.info("Structure file: " + configuration.structure_file) return result From ea2ae537c3a7c6a590120ec83d7f0edee52e348a Mon Sep 17 00:00:00 2001 From: rdk Date: Wed, 25 Feb 2026 23:35:44 +0100 Subject: [PATCH 2/2] Add local/ to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 717bbe6a..417f003b 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ data/ dist/ node_modules/ venv/ +local/ # Administration stuff (symlinked files) executor_p2rank \ No newline at end of file