From 6ff7e6e218111e5d43d48a194eb8bb1db99922dc Mon Sep 17 00:00:00 2001
From: rdk <rdk@users.noreply.github.com>
Date: Wed, 25 Feb 2026 19:14:15 +0100
Subject: [PATCH 1/2] Add FastAPI conservation server, CLI tools, and batch
 processing

- Conservation server exposing HMM pipeline via HTTP (port 8030)
- run_conservation.py for single FASTA file processing
- Batch script for parallel conservation computation
- Input validation for HMM sequence database file
- Docker Compose service and Dockerfile updates
---
 .../hmm_based/conservation_hmm_based.py       |  11 ++
 docker-compose.yml                            |  15 ++
 executor-p2rank/Dockerfile                    |   3 +
 executor-p2rank/conservation/README.md        | 179 ++++++++++++++++++
 executor-p2rank/conservation/__init__.py      |   0
 .../calculate_conservations_batch.sh          |  22 +++
 .../conservation_server/__init__.py           |   0
 .../conservation/conservation_server/main.py  |  92 +++++++++
 .../conservation_server/requirements.txt      |   2 +
 .../conservation/run_conservation.py          | 117 ++++++++++++
 executor-p2rank/conservation_cache.py         |   3 +-
 executor-p2rank/executor.py                   |   3 +
 12 files changed, 446 insertions(+), 1 deletion(-)
 create mode 100644 executor-p2rank/conservation/README.md
 create mode 100644 executor-p2rank/conservation/__init__.py
 create mode 100644 executor-p2rank/conservation/calculate_conservations_batch.sh
 create mode 100644 executor-p2rank/conservation/conservation_server/__init__.py
 create mode 100644 executor-p2rank/conservation/conservation_server/main.py
 create mode 100644 executor-p2rank/conservation/conservation_server/requirements.txt
 create mode 100644 executor-p2rank/conservation/run_conservation.py

diff --git a/conservation/hmm_based/conservation_hmm_based.py b/conservation/hmm_based/conservation_hmm_based.py
index 5a26d6e4..81ab90d3 100644
--- a/conservation/hmm_based/conservation_hmm_based.py
+++ b/conservation/hmm_based/conservation_hmm_based.py
@@ -68,6 +68,17 @@ def compute_conservation(
         mask_output: bool,
         max_seqs: int,
 ):
+    if database_file is None:
+        raise RuntimeError(
+            "HMM sequence database file is not set."
+            " Set HMM_SEQUENCE_FILE environment variable.")
+    if not os.path.isfile(database_file):
+        raise RuntimeError(
+            f"HMM sequence database file not found: {database_file}")
+    if not os.access(database_file, os.R_OK):
+        raise RuntimeError(
+            f"HMM sequence database file is not readable: {database_file}")
+
     unweighted_msa_file = _generate_msa(
         fasta_file, database_file, working_directory, execute_command)
 
diff --git a/docker-compose.yml b/docker-compose.yml
index e6b37671..85a8544d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -72,6 +72,21 @@ services:
       - predictions:/data/prankweb/predictions
       - docking:/data/prankweb/docking
       - tunnels:/data/prankweb/tunnels
+  conservation-server:
+    # TODO: Mount the conservation volume so the HMM database is available.
+    # Use `docker compose run` with `-v` as described in conservation/README.md
+    # for now. To make `docker compose up` work, add:
+    #   volumes:
+    #     - conservation:/data/conservation
+    build:
+      context: ./
+      dockerfile: ./executor-p2rank/Dockerfile
+      args:
+        UID: ${UID:-5988}
+        GID: ${GID:-5988}
+    command: ["uvicorn", "conservation.conservation_server.main:app", "--host", "0.0.0.0", "--port", "8030"]
+    ports:
+      - "8030:8030"
   executor-docking:
     build:
       context: ./
diff --git a/executor-p2rank/Dockerfile b/executor-p2rank/Dockerfile
index 3b3b6b12..c8d8f27f 100644
--- a/executor-p2rank/Dockerfile
+++ b/executor-p2rank/Dockerfile
@@ -192,6 +192,9 @@ RUN chmod a+x ./p2rank.sh \
   && chmod a+x ./run_p2rank.py \
   && chmod a+x ./run_p2rank_task.py
 
+COPY --chown=user:user ./executor-p2rank/conservation/conservation_server/requirements.txt ./conservation/conservation_server/
+RUN pip3 install -r ./conservation/conservation_server/requirements.txt
+
 #
 # administration tools
 WORKDIR /opt/administration
diff --git a/executor-p2rank/conservation/README.md b/executor-p2rank/conservation/README.md
new file mode 100644
index 00000000..bd8f1052
--- /dev/null
+++ b/executor-p2rank/conservation/README.md
@@ -0,0 +1,179 @@
+# Conservation
+
+Tools for computing HMM-based conservation scores for protein sequences (without running P2Rank predictions).
+All tools are designed to run inside the `executor-p2rank` Docker image for now.
+
+## Conservation server
+
+A FastAPI web server that exposes conservation computation as an HTTP endpoint.
+
+### Running the server using Docker
+
+```bash
+# to avoid running docker as root, add current user to docker group (you may need to log out and back in for this to take effect)
+sudo usermod -aG docker $USER
+
+# prepare data/cache directory on the host
+mkdir -p /ssd/p2rank-conservation-docker-data/hmm-based
+
+# download latest uniref database
+cd /ssd/p2rank-conservation-docker-data/hmm-based
+wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref50/uniref50.fasta.gz && gunzip uniref50.fasta.gz
+
+# alternatively, you can download the database inside the container using the download_database.py script from prankweb,
+# but this downloads an old version of uniref50 (from 2023-11-01), which might be useful for reproducing old results.
+docker compose run -it --rm -p 8030:8030 \
+    --user "$(id -u):$(id -g)" \
+    -v /ssd/p2rank-conservation-docker-data:/data/conservation \
+    conservation-server bash
+/opt/hmm-based-conservation/download_database.py
+# exit container
+
+# build/rebuild docker image (only necessary after changes in this repo)
+docker compose build conservation-server
+
+# run server with mounted data directory
+docker compose run --rm -p 8030:8030 \
+    --user "$(id -u):$(id -g)" \
+    -v /ssd/p2rank-conservation-docker-data:/data/conservation \
+    conservation-server
+```
+
+The server starts on port 8030.
+Check API documentation at http://localhost:8030/docs .
+
+To run with more workers for concurrent requests:
+
+```bash
+docker compose run --rm -p 8030:8030 conservation-server \
+  uvicorn conservation.conservation_server.main:app \
+    --host 0.0.0.0 --port 8030 --workers 8
+```
+
+Set `--workers` to `n_cpu * 2` for CPU-bound workloads.
+
+### Endpoints
+
+#### GET /health
+
+Returns `{"status": "ok"}` when the server is running.
+
+```bash
+curl http://localhost:8030/health
+```
+
+#### POST /conservation
+
+Accepts FASTA content, runs the HMM conservation pipeline, returns the raw
+`.hom` file as plain text.
+
+Request body (JSON):
+```json
+{
+  "fasta_content": ">sp|P00520|ABL1_MOUSE\nMLPPGPLLLLLLLSGTARLFS"
+}
+```
+
+Response (text/plain):
+```
+0	0.583	M
+1	1.247	L
+2	0.891	P
+...
+```
+
+Example:
+
+```bash
+curl -X POST http://localhost:8030/conservation \
+  -H "Content-Type: application/json" \
+  -d '{"fasta_content": ">test\nGSMSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHLKKLKESYCQRQGVPMNSLRFLFEGQRIADNHTPKELGMEEEDVIEVYQEQTGGHSTV"}'
+```
+
+### Interactive API docs
+
+FastAPI auto-generates interactive API documentation at:
+
+- Swagger UI: http://localhost:8030/docs
+- ReDoc: http://localhost:8030/redoc
+
+## Contents
+
+- `run_conservation.py` -- CLI script that computes conservation for a single FASTA file.
+- `calculate_conservations_batch.sh` -- Batch script that runs `run_conservation.py` in parallel over a directory of FASTA files.
+- `conservation_server/` -- FastAPI web server that exposes conservation computation via HTTP.
+
+## Data directory
+
+The conservation pipeline requires the HMM sequence database and optionally
+a cache directory. Inside the Docker container these are configured via
+environment variables baked into the image:
+
+| Variable | Default in image | Description |
+|---|---|---|
+| `HMM_SEQUENCE_FILE` | `/data/conservation/hmm-based/uniref50.fasta` | UniRef50 FASTA database used by HMMER |
+| `HMM_CONSERVATION_CACHE` | `/data/conservation/hmm-based-cache/` | Directory for caching computed scores |
+| `HMMER_DIR` | `/opt/hmm-based-conservation-dependencies/hmmer-3.3.2/bin/` | Path to HMMER binaries (phmmer, esl-weight, esl-alistat) |
+
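+The database files are expected under `/data/conservation` inside the container. The `conservation-server` compose
+service does not mount the `prankweb_conservation` Docker volume yet, so bind-mount a host directory with `-v` as shown above.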
+
+## Building the Docker image
+
+The image is built from the project root (not from this directory) because
+the Dockerfile references multiple project directories:
+
+```bash
+cd /path/to/prankweb   # project root containing docker-compose.yml
+docker compose build conservation-server
+```
+
+This uses the same multi-stage `executor-p2rank/Dockerfile` that builds
+HMMER, BLAST, and other dependencies from source.
+
+## run_conservation.py
+
+Computes HMM-based conservation for a single FASTA file.
+
+```
+./run_conservation.py --file /path/to/input.fasta --output /path/to/output --working /tmp/work
+```
+
+Arguments:
+- `--file` -- Path to a FASTA file containing a single protein sequence.
+- `--output` -- Directory where the `.hom` output file will be written (default: `./`).
+- `--working` -- Temporary working directory for intermediate HMMER files (default: `./working`).
+
+Output is a `.hom` file (TSV with columns: position index, conservation score, amino acid).
+The file is named after the input: `input.fasta` produces `input.hom`.
+
+Example inside Docker:
+
+```bash
+docker compose run --rm conservation-server \
+  python conservation/run_conservation.py \
+    --file /data/conservation/example.fasta \
+    --output /tmp/out \
+    --working /tmp/work
+```
+
+## calculate_conservations_batch.sh
+
+Batch wrapper that finds all `.fasta` files in a directory and processes them
+in parallel using `xargs -P`.
+
+```
+./calculate_conservations_batch.sh <n_processes> <input_dir> <output_dir>
+```
+
+Arguments:
+1. Number of parallel processes.
+2. Input directory containing `.fasta` files.
+3. Output directory for `.hom` files.
+
+Example inside Docker:
+
+```bash
+docker compose run --rm conservation-server \
+  bash conservation/calculate_conservations_batch.sh 4 /data/input /data/output
+```
diff --git a/executor-p2rank/conservation/__init__.py b/executor-p2rank/conservation/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/executor-p2rank/conservation/calculate_conservations_batch.sh b/executor-p2rank/conservation/calculate_conservations_batch.sh
new file mode 100644
index 00000000..b6300f9d
--- /dev/null
+++ b/executor-p2rank/conservation/calculate_conservations_batch.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Usage: calculate_conservations_batch.sh <n_processes> <input_dir> <output_dir>
+n_proc=$1
+input_dir=$2
+output_dir=$3
+
+work_dir="${output_dir}/.work_dir"
+
+n=$(find "$input_dir" -name "*.fasta" | wc -l)
+
+echo "processing $n fasta files"
+
+mkdir -p "$output_dir"
+mkdir -p "$work_dir"
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+# NUL-delimited names keep paths with spaces or quotes intact for xargs.
+find "$input_dir" -name "*.fasta" -print0 | xargs -0 -P "${n_proc}" -I{} "$SCRIPT_DIR/run_conservation.py" --file {} --output "$output_dir" --working "$work_dir"
+
+echo "processed $n fasta files"
+echo "output_dir: $output_dir"
+echo Done.
diff --git a/executor-p2rank/conservation/conservation_server/__init__.py b/executor-p2rank/conservation/conservation_server/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/executor-p2rank/conservation/conservation_server/main.py b/executor-p2rank/conservation/conservation_server/main.py
new file mode 100644
index 00000000..113c3766
--- /dev/null
+++ b/executor-p2rank/conservation/conservation_server/main.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+"""
+Minimal FastAPI server wrapping the HMM-based conservation computation.
+"""
+
+import logging
+import os
+import sys
+import tempfile
+import shutil
+import traceback
+
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import PlainTextResponse
+from pydantic import BaseModel, Field
+
+# Add executor-p2rank root to path (same pattern as run_conservation.py).
+_executor_root = os.path.dirname(os.path.dirname(os.path.dirname(
+    os.path.abspath(__file__))))
+if _executor_root not in sys.path:
+    sys.path.insert(0, _executor_root)
+
+from conservation.run_conservation import main as run_conservation_main
+
+logger = logging.getLogger("conservation_server")
+
+app = FastAPI(title="Conservation Server")
+
+
+class ConservationRequest(BaseModel):
+    fasta_content: str = Field(
+        ...,
+        description="Raw FASTA content (header line + sequence).",
+    )
+
+
+@app.get("/health")
+def health():
+    return {"status": "ok"}
+
+
+@app.post("/conservation", response_class=PlainTextResponse)
+def compute_conservation(request: ConservationRequest):
+    """
+    Accept FASTA content, run HMM conservation pipeline, return raw .hom file.
+
+    Sync endpoint — FastAPI runs it in a thread pool automatically,
+    so the subprocess-heavy computation does not block the event loop.
+    Responds with 400 for blank input and 500 when the pipeline fails.
+    """
+    if not request.fasta_content.strip():
+        raise HTTPException(400, "fasta_content must not be empty.")
+    tmp_dir = tempfile.mkdtemp(prefix="conservation_")
+    try:
+        fasta_path = os.path.join(tmp_dir, "input.fasta")
+        # Explicit encoding keeps the write independent of the locale.
+        with open(fasta_path, "w", encoding="utf-8") as f:
+            f.write(request.fasta_content)
+            if not request.fasta_content.endswith("\n"):
+                f.write("\n")
+
+        working_dir = os.path.join(tmp_dir, "working")
+        output_dir = os.path.join(tmp_dir, "output")
+        os.makedirs(working_dir)
+        os.makedirs(output_dir)
+        run_conservation_main({
+            "file": fasta_path,
+            "working": working_dir,
+            "output": output_dir,
+        })
+
+        hom_file = os.path.join(output_dir, "input.hom")
+        if not os.path.exists(hom_file):
+            raise HTTPException(
+                status_code=500,
+                detail="Conservation computation failed: output file not produced.",
+            )
+        with open(hom_file) as f:
+            return f.read()
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.exception("Conservation computation failed.")
+        # TODO: Reconsider exposing the full traceback in production;
+        # consider a generic message and logging the traceback server-side.
+        raise HTTPException(status_code=500, detail={
+            "error": type(e).__name__,
+            "message": str(e),
+            "traceback": traceback.format_exc().splitlines(),
+        })
+    finally:
+        shutil.rmtree(tmp_dir, ignore_errors=True)
diff --git a/executor-p2rank/conservation/conservation_server/requirements.txt b/executor-p2rank/conservation/conservation_server/requirements.txt
new file mode 100644
index 00000000..3fb50f07
--- /dev/null
+++ b/executor-p2rank/conservation/conservation_server/requirements.txt
@@ -0,0 +1,2 @@
+fastapi==0.115.6
+uvicorn[standard]==0.34.0
diff --git a/executor-p2rank/conservation/run_conservation.py b/executor-p2rank/conservation/run_conservation.py
new file mode 100644
index 00000000..3ebd95d2
--- /dev/null
+++ b/executor-p2rank/conservation/run_conservation.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+#
+# Calculate conservation (or get it from cache) for a given fasta file
+# without running p2rank.
+#
+
+import os
+import argparse
+import logging
+import subprocess
+import sys
+
+# Add parent directory to path so we can import modules from executor-p2rank.
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from model import *
+import conservation_wrapper
+
+logging.getLogger().setLevel(logging.DEBUG)
+logger = logging.getLogger("prankweb")
+
+
+@dataclass
+class ConservationExecution:
+    # Path to the input fasta file.
+    fasta_file: str
+    # Path to the working directory (handed to the conservation pipeline).
+    working_directory: str
+    # Path to the directory where the resulting .hom file is written.
+    output_directory: str
+    # Stream that receives standard output of executed shell commands.
+    stdout: typing.TextIO
+    # Stream that receives error output of executed shell commands.
+    stderr: typing.TextIO
+    # Selected conservation pipeline.
+    conservation: ConservationType = ConservationType.NONE
+    # Optional shell execution function; a default is installed when None.
+    execute_command: typing.Optional[typing.Callable] = None
+    # NOTE(review): presumably skips an external command when its output
+    # files already exist; not read anywhere in this file - confirm.
+    lazy_execution: bool = False
+
+
+def _read_arguments() -> typing.Dict[str, str]:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--file",
+        required=True,
+        help="Absolute path to fasta file.")
+    parser.add_argument(
+        "--working",
+        default="./working",
+        help="Temporary working directory for intermediate files.")
+    parser.add_argument(
+        "--output",
+        default="./",
+        help="Output directory.")
+    return vars(parser.parse_args())
+
+
+def _create_execute_command(configuration: ConservationExecution):
+    """Install a default shell runner unless the caller supplied one."""
+    if configuration.execute_command is not None:
+        return
+
+    def execute_command(command: str, ignore_return_code: bool = True):
+        logger.debug(f"Executing '{command}' ...")
+        result = subprocess.run(
+            command,
+            shell=True,
+            env=os.environ.copy(),
+            stdout=configuration.stdout,
+            stderr=configuration.stderr,
+        )
+        # Non-zero exit codes are ignored by default; raise only on request.
+        if not ignore_return_code:
+            result.check_returncode()
+        logger.debug(f"Executing '{command}' ... done")
+    configuration.execute_command = execute_command
+
+
+def main(arguments):
+    """Compute HMM conservation for arguments["file"]; write <name>.hom."""
+    _setuplog_handler()
+    configuration = ConservationExecution(
+        fasta_file=arguments["file"],
+        working_directory=arguments["working"],
+        output_directory=arguments["output"],
+        stdout=sys.stdout,
+        stderr=sys.stderr,
+        conservation=ConservationType.HMM
+    )
+    _create_execute_command(configuration)
+    # Direct CLI runs may name directories that do not exist yet.
+    os.makedirs(configuration.working_directory, exist_ok=True)
+    os.makedirs(configuration.output_directory, exist_ok=True)
+    out_file_name = os.path.join(configuration.output_directory, os.path.basename(configuration.fasta_file).removesuffix('.fasta') + ".hom")
+    conservation_wrapper.compute_hmm_based_conservation(
+        configuration.fasta_file, configuration.working_directory, out_file_name, configuration.execute_command)
+    logger.info("Done. Conservation out file: " + out_file_name)
+
+
+def _setuplog_handler():
+    root = logging.getLogger()
+    if root.handlers:
+        return
+    handler = logging.StreamHandler()
+    handler.setLevel(logging.DEBUG)
+    formatter = logging.Formatter(
+        "%(asctime)s [%(levelname)s] : %(message)s",
+        "%Y-%m-%dT%H:%M:%S")
+    handler.setFormatter(formatter)
+    root.addHandler(handler)
+
+
+if __name__ == "__main__":
+    main(_read_arguments())
diff --git a/executor-p2rank/conservation_cache.py b/executor-p2rank/conservation_cache.py
index ee6d81a3..12edcc24 100644
--- a/executor-p2rank/conservation_cache.py
+++ b/executor-p2rank/conservation_cache.py
@@ -15,6 +15,7 @@ def create_hom_from_cache(
         fasta_file: str, output_file: str) -> bool:
     if cache_directory is None:
         return False
+    logging.info("Fasta file: " + fasta_file)
     sequences = _load_fasta_file(fasta_file)
     if len(sequences) != 1:
         logging.warning(
@@ -24,7 +25,7 @@ def create_hom_from_cache(
     conservation = load_from_cache(cache_directory, sequence)
     if conservation is None:
         return False
-    logging.info("Using conservation from cache.")
+    logging.info("Using conservation from cache. Writing to file: " + output_file)
     _write_hom_file(output_file, conservation)
     return True
 
diff --git a/executor-p2rank/executor.py b/executor-p2rank/executor.py
index 27a37aaf..205f0a92 100644
--- a/executor-p2rank/executor.py
+++ b/executor-p2rank/executor.py
@@ -210,6 +210,9 @@ def _prepare_conservation(
                 configuration)
             cache[fasta] = output_file
         result[chain] = output_file
+
+    logger.info("Structure: " + str(structure))
+    logger.info("Structure file: " + configuration.structure_file)
     return result
 
 

From ea2ae537c3a7c6a590120ec83d7f0edee52e348a Mon Sep 17 00:00:00 2001
From: rdk <rdk@users.noreply.github.com>
Date: Wed, 25 Feb 2026 23:35:44 +0100
Subject: [PATCH 2/2] Add local/ to .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 717bbe6a..417f003b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ data/
 dist/
 node_modules/
 venv/
+local/
 
 # Administration stuff (symlinked files)
 executor_p2rank
\ No newline at end of file