From 9f85be0af84e9054469f3e57b77c57c36503c8db Mon Sep 17 00:00:00 2001
From: chenzihong-gavin <chenzihong_gavin@foxmail.com>
Date: Thu, 5 Feb 2026 17:59:23 +0800
Subject: [PATCH 1/8] feat: add interpro searcher

---
 graphgen/models/__init__.py                   |   2 +
 .../models/searcher/db/interpro_searcher.py   | 456 ++++++++++++++++++
 graphgen/operators/search/search_service.py   |   5 +
 3 files changed, 463 insertions(+)
 create mode 100644 graphgen/models/searcher/db/interpro_searcher.py

diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py
index 6b75587c..95ccd1ae 100644
--- a/graphgen/models/__init__.py
+++ b/graphgen/models/__init__.py
@@ -42,6 +42,7 @@
         TXTReader,
     )
     from .rephraser import StyleControlledRephraser
+    from .searcher.db.interpro_searcher import InterProSearch
     from .searcher.db.ncbi_searcher import NCBISearch
     from .searcher.db.rnacentral_searcher import RNACentralSearch
     from .searcher.db.uniprot_searcher import UniProtSearch
@@ -95,6 +96,7 @@
     "TXTReader": ".reader",
     "HuggingFaceReader": ".reader",
     # Searcher
+    "InterProSearch": ".searcher.db.interpro_searcher",
     "NCBISearch": ".searcher.db.ncbi_searcher",
     "RNACentralSearch": ".searcher.db.rnacentral_searcher",
     "UniProtSearch": ".searcher.db.uniprot_searcher",
diff --git a/graphgen/models/searcher/db/interpro_searcher.py b/graphgen/models/searcher/db/interpro_searcher.py
new file mode 100644
index 00000000..b90d3899
--- /dev/null
+++ b/graphgen/models/searcher/db/interpro_searcher.py
@@ -0,0 +1,456 @@
+import re
+import time
+from typing import Dict, Optional
+
+import requests
+from requests.exceptions import RequestException
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
+
+from graphgen.bases import BaseSearcher
+from graphgen.utils import logger
+
+
+class InterProSearch(BaseSearcher):
+    """
+    InterPro Search client to search protein domains and functional annotations.
+    Supports:
+    1) Get protein domain information by UniProt accession number.
+    2) Search with protein sequence using EBI InterProScan API.
+    3) Parse domain matches and associated GO terms, pathways.
+
+    API Documentation: https://www.ebi.ac.uk/Tools/services/rest/iprscan5
+    """
+
+    def __init__(
+        self,
+        email: str = "graphgen@example.com",
+        api_timeout: int = 30,
+    ):
+        """
+        Initialize the InterPro Search client.
+
+        Args:
+            email (str): Email address for EBI API requests.
+            api_timeout (int): Request timeout in seconds.
+        """
+        self.base_url = "https://www.ebi.ac.uk/Tools/services/rest/iprscan5"
+        self.email = email
+        self.api_timeout = api_timeout
+        self.poll_interval = 5  # Fixed interval between status checks
+        self.max_polls = 120  # Maximum polling attempts (10 minutes with 5s interval)
+
+    @staticmethod
+    def _is_protein_sequence(text: str) -> bool:
+        """Check if text looks like a protein sequence."""
+        # Remove common FASTA header prefix
+        if text.startswith(">"):
+            text = "\n".join(text.split("\n")[1:])
+        # Check if contains mostly protein amino acids
+        text = text.strip().replace("\n", "").replace(" ", "")
+        # Protein sequences contain only A-Z letters (standard amino acids)
+        return bool(re.fullmatch(r"[A-Z]+", text, re.I)) and len(text) > 10
+
+    @staticmethod
+    def _is_uniprot_accession(text: str) -> bool:
+        """Check if text looks like a UniProt accession number."""
+        # UniProt: 6-10 chars starting with letter, e.g., P01308, Q96KN2
+        return bool(re.fullmatch(r"[A-Z][A-Z0-9]{5,9}", text.strip(), re.I))
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=2, max=5),
+        retry=retry_if_exception_type(RequestException),
+        reraise=True,
+    )
+    def _submit_job(self, sequence: str, title: str = "") -> Optional[str]:
+        """
+        Submit a protein sequence for InterProScan analysis.
+
+        Args:
+            sequence (str): Protein sequence (FASTA or raw).
+            title (str): Optional job title.
+
+        Returns:
+            Job ID if successful, None otherwise.
+        """
+        url = f"{self.base_url}/run"
+
+        # Parse sequence if FASTA format
+        if sequence.startswith(">"):
+            sequence = (
+                "\n".join(sequence.split("\n")[1:]).replace("\n", "").replace(" ", "")
+            )
+
+        params = {
+            "email": self.email,
+            "title": title or "GraphGen_Analysis",
+            "sequence": sequence,
+            "stype": "protein",
+            "appl": "Pfam,PANTHER,Gene3D,SMART",  # Multiple databases
+            "goterms": "true",
+            "pathways": "true",
+            "format": "json",
+        }
+
+        try:
+            response = requests.post(url, data=params, timeout=self.api_timeout)
+            if response.status_code == 200:
+                job_id = response.text.strip()
+                logger.debug("InterProScan job submitted: %s", job_id)
+                return job_id
+            logger.error(
+                "Failed to submit InterProScan job: %d - %s",
+                response.status_code,
+                response.text,
+            )
+            return None
+        except RequestException as e:
+            logger.error("Request error while submitting job: %s", e)
+            raise
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=2, max=5),
+        retry=retry_if_exception_type(RequestException),
+        reraise=True,
+    )
+    def _check_status(self, job_id: str) -> Optional[str]:
+        """Check the status of a submitted job."""
+        url = f"{self.base_url}/status/{job_id}"
+        try:
+            response = requests.get(url, timeout=self.api_timeout)
+            if response.status_code == 200:
+                return response.text.strip()
+            logger.warning(
+                "Failed to check job status for %s: %d",
+                job_id,
+                response.status_code,
+            )
+            return None
+        except RequestException as e:
+            logger.error("Request error while checking status: %s", e)
+            raise
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=2, max=5),
+        retry=retry_if_exception_type(RequestException),
+        reraise=True,
+    )
+    def _get_results(self, job_id: str) -> Optional[dict]:
+        """Retrieve the analysis results for a completed job."""
+        url = f"{self.base_url}/result/{job_id}/json"
+        try:
+            response = requests.get(url, timeout=self.api_timeout)
+            if response.status_code == 200:
+                return response.json()
+            logger.warning(
+                "Failed to retrieve results for job %s: %d",
+                job_id,
+                response.status_code,
+            )
+            return None
+        except RequestException as e:
+            logger.error("Request error while retrieving results: %s", e)
+            raise
+
+    def _poll_job(self, job_id: str) -> Optional[dict]:
+        """
+        Poll a job until completion and retrieve results.
+
+        Args:
+            job_id (str): The job ID to poll.
+
+        Returns:
+            Results dictionary if successful, None otherwise.
+        """
+        for attempt in range(self.max_polls):
+            status = self._check_status(job_id)
+
+            if status == "FINISHED":
+                logger.debug(
+                    "Job %s completed after %d polls",
+                    job_id,
+                    attempt + 1,
+                )
+                return self._get_results(job_id)
+
+            if status in ["FAILED", "NOT_FOUND"]:
+                logger.warning("Job %s has status: %s", job_id, status)
+                return None
+
+            if status == "RUNNING":
+                logger.debug(
+                    "Job %s still running (attempt %d/%d)",
+                    job_id,
+                    attempt + 1,
+                    self.max_polls,
+                )
+                time.sleep(self.poll_interval)
+            else:
+                logger.debug("Job %s status: %s", job_id, status)
+                time.sleep(self.poll_interval)
+
+        logger.warning(
+            "Job %s polling timed out after %d attempts", job_id, self.max_polls
+        )
+        return None
+
+    @staticmethod
+    def _parse_results(results: dict) -> Optional[dict]:
+        """
+        Parse InterProScan results into a structured format.
+
+        Args:
+            results (dict): Raw InterProScan JSON results.
+
+        Returns:
+            Parsed results with structured domain information.
+        """
+        if not results:
+            return None
+
+        domains = []
+        go_terms = set()
+        pathways = set()
+
+        # Extract matches from results
+        for result in results.get("results", []):
+            matches = result.get("matches", [])
+
+            for match in matches:
+                signature = match.get("signature", {})
+                ipr = match.get("ipr", {})
+
+                domain_info = {
+                    "signature_id": signature.get("accession"),
+                    "signature_name": signature.get("name"),
+                    "database": signature.get("database"),
+                    "interpro_id": ipr.get("id"),
+                    "interpro_name": ipr.get("name"),
+                    "start": match.get("start"),
+                    "end": match.get("end"),
+                    "score": match.get("score"),
+                    "evalue": match.get("evalue"),
+                }
+
+                # Collect GO terms
+                for go in ipr.get("go", []):
+                    go_id = go.get("id")
+                    if go_id:
+                        go_terms.add(go_id)
+
+                # Collect pathways
+                for pathway in ipr.get("pathways", []):
+                    pathway_id = pathway.get("id")
+                    if pathway_id:
+                        pathways.add(pathway_id)
+
+                domains.append(domain_info)
+
+        return {
+            "domains": domains,
+            "go_terms": sorted(list(go_terms)) if go_terms else [],
+            "pathways": sorted(list(pathways)) if pathways else [],
+            "domain_count": len(domains),
+        }
+
+    def search_by_sequence(self, sequence: str) -> Optional[Dict]:
+        """
+        Search for protein domains in a sequence using InterProScan API.
+
+        Args:
+            sequence (str): Protein sequence in FASTA or raw format.
+
+        Returns:
+            Dictionary with domain analysis results or None if failed.
+        """
+        if not sequence or not isinstance(sequence, str):
+            logger.error("Invalid sequence provided")
+            return None
+
+        sequence = sequence.strip()
+
+        if not self._is_protein_sequence(sequence):
+            logger.error("Invalid protein sequence format")
+            return None
+
+        # Submit job
+        job_id = self._submit_job(sequence)
+        if not job_id:
+            logger.error("Failed to submit InterProScan job")
+            return None
+
+        # Poll for results
+        results = self._poll_job(job_id)
+        if not results:
+            logger.error("Failed to retrieve InterProScan results for job %s", job_id)
+            return None
+
+        # Parse results
+        parsed = self._parse_results(results)
+        if parsed:
+            parsed["molecule_type"] = "protein"
+            parsed["database"] = "InterPro"
+            parsed["job_id"] = job_id
+            parsed["url"] = "https://www.ebi.ac.uk/interpro/"
+
+        return parsed
+
+    def _extract_domain_info(self, entry: dict, accession: str) -> list:
+        """Extract domain information for a specific accession from an entry."""
+        domains = []
+        proteins = entry.get("proteins", {})
+        protein_data = proteins.get(accession)
+        if protein_data:
+            entry_acc = entry.get("accession")
+            entry_name = entry.get("name")
+            entry_type = entry.get("type")
+            locations = protein_data.get("locations", [])
+            for location in locations:
+                domain_info = {
+                    "interpro_id": entry_acc,
+                    "interpro_name": entry_name,
+                    "type": entry_type,
+                    "start": location.get("start"),
+                    "end": location.get("end"),
+                }
+                domains.append(domain_info)
+        return domains
+
+    def _collect_annotation_terms(self, entry: dict) -> tuple:
+        """Collect GO terms and pathway annotations from entry."""
+        go_terms = set()
+        pathways = set()
+
+        go_list = entry.get("go_terms", [])
+        for go_item in go_list:
+            go_id = go_item.get("identifier") if isinstance(go_item, dict) else go_item
+            if go_id:
+                go_terms.add(go_id)
+
+        pathway_list = entry.get("pathways", [])
+        for pathway in pathway_list:
+            pathway_id = pathway.get("id") if isinstance(pathway, dict) else pathway
+            if pathway_id:
+                pathways.add(pathway_id)
+
+        return go_terms, pathways
+
+    def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
+        """
+        Search InterPro database by UniProt accession number.
+
+        This method queries the EBI API to get pre-computed domain information
+        for a known UniProt entry.
+
+        Args:
+            accession (str): UniProt accession number.
+
+        Returns:
+            Dictionary with domain information or None if not found.
+        """
+        if not accession or not isinstance(accession, str):
+            logger.error("Invalid accession provided")
+            return None
+
+        accession = accession.strip().upper()
+
+        # Query InterPro REST API for UniProt entry
+        url = f"https://www.ebi.ac.uk/interpro/api/entry/protein/uniprot/{accession}/"
+
+        response = requests.get(url, timeout=self.api_timeout)
+
+        if response.status_code == 404:
+            logger.info("UniProt accession %s not found in InterPro", accession)
+            return None
+        if response.status_code != 200:
+            logger.warning(
+                "Failed to search InterPro for accession %s: %d",
+                accession,
+                response.status_code,
+            )
+            return None
+
+        data = response.json()
+
+        domains = []
+        go_terms = set()
+        pathways = set()
+
+        # Parse entry information
+        for entry in data.get("results", []):
+            entry_domains = self._extract_domain_info(entry, accession)
+            domains.extend(entry_domains)
+
+            entry_go_terms, entry_pathways = self._collect_annotation_terms(entry)
+            go_terms.update(entry_go_terms)
+            pathways.update(entry_pathways)
+
+        result = {
+            "molecule_type": "protein",
+            "database": "InterPro",
+            "id": accession,
+            "domains": domains,
+            "go_terms": sorted(list(go_terms)) if go_terms else [],
+            "pathways": sorted(list(pathways)) if pathways else [],
+            "domain_count": len(domains),
+            "url": f"https://www.ebi.ac.uk/interpro/protein/uniprot/{accession}/",
+        }
+
+        return result
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=2, max=5),
+        retry=retry_if_exception_type(RequestException),
+        reraise=True,
+    )
+    def search(self, query: str, **kwargs) -> Optional[Dict]:
+        """
+        Search InterPro for protein domain information.
+
+        Automatically detects query type:
+        - UniProt accession number → lookup pre-computed domains
+        - Protein sequence (FASTA or raw) → submit for InterProScan analysis
+
+        Args:
+            query (str): Search query (UniProt ID or protein sequence).
+            **kwargs: Additional arguments (unused).
+
+        Returns:
+            Dictionary with domain information or None if not found.
+        """
+        if not query or not isinstance(query, str):
+            logger.error("Empty or non-string input")
+            return None
+
+        query = query.strip()
+        logger.debug("InterPro search query: %s", query[:100])
+
+        result = None
+
+        # Check if UniProt accession
+        if self._is_uniprot_accession(query):
+            logger.debug("Detected UniProt accession: %s", query)
+            result = self.search_by_uniprot_id(query)
+
+        # Check if protein sequence
+        elif self._is_protein_sequence(query):
+            logger.debug("Detected protein sequence (length: %d)", len(query))
+            result = self.search_by_sequence(query)
+
+        else:
+            # Try as UniProt ID first (in case format is non-standard)
+            logger.debug("Trying as UniProt accession: %s", query)
+            result = self.search_by_uniprot_id(query)
+
+        if result:
+            result["_search_query"] = query
+
+        return result
diff --git a/graphgen/operators/search/search_service.py b/graphgen/operators/search/search_service.py
index 1a599e25..220db049 100644
--- a/graphgen/operators/search/search_service.py
+++ b/graphgen/operators/search/search_service.py
@@ -58,6 +58,11 @@ def _init_searcher(self):
 
             params = self.kwargs.get("rnacentral_params", {})
             self.searcher = RNACentralSearch(**params)
+        elif self.data_source == "interpro":
+            from graphgen.models import InterProSearch
+
+            params = self.kwargs.get("interpro_params", {})
+            self.searcher = InterProSearch(**params)
         else:
             logger.error(f"Unknown data source: {self.data_source}")
 

From 7fbb19d9106aaf72d4fd000f276f53aa403be8c4 Mon Sep 17 00:00:00 2001
From: chenzihong-gavin <chenzihong_gavin@foxmail.com>
Date: Mon, 9 Feb 2026 14:07:30 +0800
Subject: [PATCH 2/8] refactor: refactor interpro_searcher

---
 .../models/searcher/db/interpro_searcher.py   | 142 ++----------------
 1 file changed, 11 insertions(+), 131 deletions(-)

diff --git a/graphgen/models/searcher/db/interpro_searcher.py b/graphgen/models/searcher/db/interpro_searcher.py
index b90d3899..9a6c539a 100644
--- a/graphgen/models/searcher/db/interpro_searcher.py
+++ b/graphgen/models/searcher/db/interpro_searcher.py
@@ -90,8 +90,8 @@ def _submit_job(self, sequence: str, title: str = "") -> Optional[str]:
             "email": self.email,
             "title": title or "GraphGen_Analysis",
             "sequence": sequence,
-            "stype": "protein",
-            "appl": "Pfam,PANTHER,Gene3D,SMART",  # Multiple databases
+            "stype": "p",
+            "appl": "NCBIfam,SMART,CDD,HAMAP",  # Multiple databases
             "goterms": "true",
             "pathways": "true",
             "format": "json",
@@ -201,65 +201,6 @@ def _poll_job(self, job_id: str) -> Optional[dict]:
         )
         return None
 
-    @staticmethod
-    def _parse_results(results: dict) -> Optional[dict]:
-        """
-        Parse InterProScan results into a structured format.
-
-        Args:
-            results (dict): Raw InterProScan JSON results.
-
-        Returns:
-            Parsed results with structured domain information.
-        """
-        if not results:
-            return None
-
-        domains = []
-        go_terms = set()
-        pathways = set()
-
-        # Extract matches from results
-        for result in results.get("results", []):
-            matches = result.get("matches", [])
-
-            for match in matches:
-                signature = match.get("signature", {})
-                ipr = match.get("ipr", {})
-
-                domain_info = {
-                    "signature_id": signature.get("accession"),
-                    "signature_name": signature.get("name"),
-                    "database": signature.get("database"),
-                    "interpro_id": ipr.get("id"),
-                    "interpro_name": ipr.get("name"),
-                    "start": match.get("start"),
-                    "end": match.get("end"),
-                    "score": match.get("score"),
-                    "evalue": match.get("evalue"),
-                }
-
-                # Collect GO terms
-                for go in ipr.get("go", []):
-                    go_id = go.get("id")
-                    if go_id:
-                        go_terms.add(go_id)
-
-                # Collect pathways
-                for pathway in ipr.get("pathways", []):
-                    pathway_id = pathway.get("id")
-                    if pathway_id:
-                        pathways.add(pathway_id)
-
-                domains.append(domain_info)
-
-        return {
-            "domains": domains,
-            "go_terms": sorted(list(go_terms)) if go_terms else [],
-            "pathways": sorted(list(pathways)) if pathways else [],
-            "domain_count": len(domains),
-        }
-
     def search_by_sequence(self, sequence: str) -> Optional[Dict]:
         """
         Search for protein domains in a sequence using InterProScan API.
@@ -292,55 +233,13 @@ def search_by_sequence(self, sequence: str) -> Optional[Dict]:
             logger.error("Failed to retrieve InterProScan results for job %s", job_id)
             return None
 
-        # Parse results
-        parsed = self._parse_results(results)
-        if parsed:
-            parsed["molecule_type"] = "protein"
-            parsed["database"] = "InterPro"
-            parsed["job_id"] = job_id
-            parsed["url"] = "https://www.ebi.ac.uk/interpro/"
-
-        return parsed
-
-    def _extract_domain_info(self, entry: dict, accession: str) -> list:
-        """Extract domain information for a specific accession from an entry."""
-        domains = []
-        proteins = entry.get("proteins", {})
-        protein_data = proteins.get(accession)
-        if protein_data:
-            entry_acc = entry.get("accession")
-            entry_name = entry.get("name")
-            entry_type = entry.get("type")
-            locations = protein_data.get("locations", [])
-            for location in locations:
-                domain_info = {
-                    "interpro_id": entry_acc,
-                    "interpro_name": entry_name,
-                    "type": entry_type,
-                    "start": location.get("start"),
-                    "end": location.get("end"),
-                }
-                domains.append(domain_info)
-        return domains
-
-    def _collect_annotation_terms(self, entry: dict) -> tuple:
-        """Collect GO terms and pathway annotations from entry."""
-        go_terms = set()
-        pathways = set()
-
-        go_list = entry.get("go_terms", [])
-        for go_item in go_list:
-            go_id = go_item.get("identifier") if isinstance(go_item, dict) else go_item
-            if go_id:
-                go_terms.add(go_id)
-
-        pathway_list = entry.get("pathways", [])
-        for pathway in pathway_list:
-            pathway_id = pathway.get("id") if isinstance(pathway, dict) else pathway
-            if pathway_id:
-                pathways.add(pathway_id)
-
-        return go_terms, pathways
+        return {
+            "molecule_type": "protein",
+            "database": "InterPro",
+            "job_id": job_id,
+            "content": results,
+            "url": f"https://www.ebi.ac.uk/interpro/result/{job_id}/",
+        }
 
     def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
         """
@@ -362,13 +261,10 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
         accession = accession.strip().upper()
 
         # Query InterPro REST API for UniProt entry
-        url = f"https://www.ebi.ac.uk/interpro/api/entry/protein/uniprot/{accession}/"
+        url = f"https://www.ebi.ac.uk/interpro/api/entry/interpro/protein/uniprot/{accession}/"
 
         response = requests.get(url, timeout=self.api_timeout)
 
-        if response.status_code == 404:
-            logger.info("UniProt accession %s not found in InterPro", accession)
-            return None
         if response.status_code != 200:
             logger.warning(
                 "Failed to search InterPro for accession %s: %d",
@@ -379,27 +275,11 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
 
         data = response.json()
 
-        domains = []
-        go_terms = set()
-        pathways = set()
-
-        # Parse entry information
-        for entry in data.get("results", []):
-            entry_domains = self._extract_domain_info(entry, accession)
-            domains.extend(entry_domains)
-
-            entry_go_terms, entry_pathways = self._collect_annotation_terms(entry)
-            go_terms.update(entry_go_terms)
-            pathways.update(entry_pathways)
-
         result = {
             "molecule_type": "protein",
             "database": "InterPro",
             "id": accession,
-            "domains": domains,
-            "go_terms": sorted(list(go_terms)) if go_terms else [],
-            "pathways": sorted(list(pathways)) if pathways else [],
-            "domain_count": len(domains),
+            "content": data.get("results", []),
             "url": f"https://www.ebi.ac.uk/interpro/protein/uniprot/{accession}/",
         }
 

From 36e504f0cb953499c6c792ebde455f3f8029cc56 Mon Sep 17 00:00:00 2001
From: chenzihong-gavin <chenzihong_gavin@foxmail.com>
Date: Mon, 9 Feb 2026 14:28:49 +0800
Subject: [PATCH 3/8] feat: add example for interpro_searcher

---
 .../input_examples/search_interpro_demo.jsonl |   3 +
 .../search_protein/search_interpro/README.md  | 108 ++++++++++++++++++
 .../search_interpro/search_interpro.sh        |   5 +
 .../search_interpro_config.yaml               |  28 +++++
 .../search/search_protein/search_uniprot.sh   |   2 -
 .../{ => search_uniprot}/README.md            |   0
 .../{ => search_uniprot}/build_db.sh          |   0
 .../search_protein_config.yaml                |   0
 .../search_uniprot/search_uniprot.sh          |   2 +
 9 files changed, 146 insertions(+), 2 deletions(-)
 create mode 100644 examples/input_examples/search_interpro_demo.jsonl
 create mode 100644 examples/search/search_protein/search_interpro/README.md
 create mode 100644 examples/search/search_protein/search_interpro/search_interpro.sh
 create mode 100644 examples/search/search_protein/search_interpro/search_interpro_config.yaml
 delete mode 100644 examples/search/search_protein/search_uniprot.sh
 rename examples/search/search_protein/{ => search_uniprot}/README.md (100%)
 rename examples/search/search_protein/{ => search_uniprot}/build_db.sh (100%)
 rename examples/search/search_protein/{ => search_uniprot}/search_protein_config.yaml (100%)
 create mode 100644 examples/search/search_protein/search_uniprot/search_uniprot.sh

diff --git a/examples/input_examples/search_interpro_demo.jsonl b/examples/input_examples/search_interpro_demo.jsonl
new file mode 100644
index 00000000..9fadb4b1
--- /dev/null
+++ b/examples/input_examples/search_interpro_demo.jsonl
@@ -0,0 +1,3 @@
+{"type": "protein", "content": "P01308"}
+{"type": "protein", "content": "Q96KN2"}
+{"type": "protein", "content": "MGHHHHHHHGSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"}
diff --git a/examples/search/search_protein/search_interpro/README.md b/examples/search/search_protein/search_interpro/README.md
new file mode 100644
index 00000000..1e051eb7
--- /dev/null
+++ b/examples/search/search_protein/search_interpro/README.md
@@ -0,0 +1,108 @@
+# Search Protein Domains with InterPro
+
+This example demonstrates how to search for protein domain information and functional annotations using the InterPro database.
+
+## Overview
+
+The InterPro search pipeline reads protein queries (UniProt accession numbers or protein sequences) and searches the InterPro database to find domain matches, functional annotations, GO terms, and pathways.
+
+InterPro supports two search modes:
+1. **UniProt Accession Lookup**: Fast lookup of pre-computed domain information for known UniProt entries
+2. **Protein Sequence Analysis**: Submit protein sequences for InterProScan analysis to discover domains
+
+## Quick Start
+
+### 1. Configure Search Parameters
+
+Edit `search_interpro_config.yaml` to set:
+
+- **Input file path**: Set the path to your protein sequence or UniProt ID queries
+- **InterPro parameters**:
+  - `email`: Your email address for EBI API requests (default: `graphgen@example.com`)
+  - `api_timeout`: Request timeout in seconds (default: 30)
+
+Example configuration:
+```yaml
+input_path:
+  - examples/input_examples/search_interpro_demo.jsonl
+
+data_sources: [interpro]
+interpro_params:
+  email: your_email@example.com
+  api_timeout: 30
+```
+
+### 2. Run the Search
+
+```bash
+./search_interpro.sh
+```
+
+Or run directly with Python:
+
+```bash
+python3 -m graphgen.run \
+  --config_file examples/search/search_protein/search_interpro/search_interpro_config.yaml \
+  --output_dir cache/
+```
+
+## Input Format
+
+The input file should be in JSONL format with protein queries:
+
+```jsonl
+{"type": "protein", "content": "P01308"}
+{"type": "protein", "content": "Q96KN2"}
+{"type": "protein", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"}
+```
+
+Or in FASTA format:
+```
+>P01308
+MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK
+
+>insulin_sequence
+MHHHHHHSSGVDLGTENLYFQS...
+```
+
+
+## Output
+
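+The search results are saved in the output directory. Each record has the following structure (`id` is set for UniProt accession lookups, `job_id` for sequence searches):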
+
+```json
+{
+  "molecule_type": "protein",
+  "database": "InterPro",
+  "id": "P01308",
+  "job_id": "iprscan5-R20240123-123456-xxxx-p1m",
+  "content": {
+    "results": [
+      {
+        "xref": [
+          {
+            "ref": "INTERPRO",
+            "id": "IPR000001",
+            "name": "Domain Name"
+          }
+        ],
+        "signature_acc": "PF00001",
+        "go_annotations": [
+          {
+            "id": "GO:0001234",
+            "description": "biological process"
+          }
+        ]
+      }
+    ]
+  },
+  "url": "https://www.ebi.ac.uk/interpro/protein/uniprot/P01308/",
+  "_search_query": "P01308"
+}
+```
+
+## References
+
+- **InterPro Database**: https://www.ebi.ac.uk/interpro/
+- **EBI InterProScan API**: https://www.ebi.ac.uk/Tools/services/rest/iprscan5
+- **UniProt Database**: https://www.uniprot.org/
diff --git a/examples/search/search_protein/search_interpro/search_interpro.sh b/examples/search/search_protein/search_interpro/search_interpro.sh
new file mode 100644
index 00000000..cfc0309e
--- /dev/null
+++ b/examples/search/search_protein/search_interpro/search_interpro.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Search InterPro for protein domain annotations (run from the repository root; the config path below is relative)
+
+python3 -m graphgen.run \
+  --config_file examples/search/search_protein/search_interpro/search_interpro_config.yaml
diff --git a/examples/search/search_protein/search_interpro/search_interpro_config.yaml b/examples/search/search_protein/search_interpro/search_interpro_config.yaml
new file mode 100644
index 00000000..c2ab2bfa
--- /dev/null
+++ b/examples/search/search_protein/search_interpro/search_interpro_config.yaml
@@ -0,0 +1,28 @@
+global_params:
+  working_dir: cache
+  kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv
+  graph_backend: kuzu # graph database backend, support: kuzu, networkx
+
+nodes:
+  - id: read_step
+    op_name: read
+    type: source
+    dependencies: []
+    params:
+      input_path:
+        - examples/input_examples/search_interpro_demo.jsonl # input file path; supports json, jsonl, txt, pdf. See examples/input_examples for examples
+
+  - id: search_step
+    op_name: search
+    type: map_batch
+    dependencies:
+      - read_step # search_step depends on read_step
+    execution_params:
+      replicas: 1
+      batch_size: 10
+    save_output: true
+    params:
+      data_source: interpro # data source for searcher, support: wikipedia, google, uniprot, ncbi, interpro
+      interpro_params:
+        email: test@example.com # Email address for EBI API requests
+        api_timeout: 30 # Request timeout in seconds
diff --git a/examples/search/search_protein/search_uniprot.sh b/examples/search/search_protein/search_uniprot.sh
deleted file mode 100644
index 627735a0..00000000
--- a/examples/search/search_protein/search_uniprot.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-python3 -m graphgen.run \
---config_file examples/search/search_protein/search_protein_config.yaml
diff --git a/examples/search/search_protein/README.md b/examples/search/search_protein/search_uniprot/README.md
similarity index 100%
rename from examples/search/search_protein/README.md
rename to examples/search/search_protein/search_uniprot/README.md
diff --git a/examples/search/search_protein/build_db.sh b/examples/search/search_protein/search_uniprot/build_db.sh
similarity index 100%
rename from examples/search/search_protein/build_db.sh
rename to examples/search/search_protein/search_uniprot/build_db.sh
diff --git a/examples/search/search_protein/search_protein_config.yaml b/examples/search/search_protein/search_uniprot/search_protein_config.yaml
similarity index 100%
rename from examples/search/search_protein/search_protein_config.yaml
rename to examples/search/search_protein/search_uniprot/search_protein_config.yaml
diff --git a/examples/search/search_protein/search_uniprot/search_uniprot.sh b/examples/search/search_protein/search_uniprot/search_uniprot.sh
new file mode 100644
index 00000000..e4862572
--- /dev/null
+++ b/examples/search/search_protein/search_uniprot/search_uniprot.sh
@@ -0,0 +1,2 @@
+python3 -m graphgen.run \
+--config_file examples/search/search_protein/search_uniprot/search_protein_config.yaml

From ae695d77a541619b5c338bea154317b87f70368a Mon Sep 17 00:00:00 2001
From: chenzihong-gavin <chenzihong_gavin@foxmail.com>
Date: Mon, 9 Feb 2026 18:44:09 +0800
Subject: [PATCH 4/8] feat: fetch detailed interpro result

---
 .../input_examples/search_interpro_demo.jsonl |   1 -
 .../search_interpro_config.yaml               |   1 -
 .../models/searcher/db/interpro_searcher.py   | 265 +++---------------
 3 files changed, 41 insertions(+), 226 deletions(-)

diff --git a/examples/input_examples/search_interpro_demo.jsonl b/examples/input_examples/search_interpro_demo.jsonl
index 9fadb4b1..2427fe0b 100644
--- a/examples/input_examples/search_interpro_demo.jsonl
+++ b/examples/input_examples/search_interpro_demo.jsonl
@@ -1,3 +1,2 @@
 {"type": "protein", "content": "P01308"}
 {"type": "protein", "content": "Q96KN2"}
-{"type": "protein", "content": "MGHHHHHHHGSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"}
diff --git a/examples/search/search_protein/search_interpro/search_interpro_config.yaml b/examples/search/search_protein/search_interpro/search_interpro_config.yaml
index c2ab2bfa..c1089af9 100644
--- a/examples/search/search_protein/search_interpro/search_interpro_config.yaml
+++ b/examples/search/search_protein/search_interpro/search_interpro_config.yaml
@@ -24,5 +24,4 @@ nodes:
     params:
       data_source: interpro # data source for searcher, support: wikipedia, google, uniprot, ncbi, interpro
       interpro_params:
-        email: test@example.com # Email address for EBI API requests
         api_timeout: 30 # Request timeout in seconds
diff --git a/graphgen/models/searcher/db/interpro_searcher.py b/graphgen/models/searcher/db/interpro_searcher.py
index 9a6c539a..d7b140ec 100644
--- a/graphgen/models/searcher/db/interpro_searcher.py
+++ b/graphgen/models/searcher/db/interpro_searcher.py
@@ -1,5 +1,4 @@
 import re
-import time
 from typing import Dict, Optional
 
 import requests
@@ -20,40 +19,22 @@ class InterProSearch(BaseSearcher):
     InterPro Search client to search protein domains and functional annotations.
     Supports:
     1) Get protein domain information by UniProt accession number.
-    2) Search with protein sequence using EBI InterProScan API.
-    3) Parse domain matches and associated GO terms, pathways.
 
-    API Documentation: https://www.ebi.ac.uk/Tools/services/rest/iprscan5
+    API Documentation: https://www.ebi.ac.uk/interpro/api/
     """
 
     def __init__(
         self,
-        email: str = "graphgen@example.com",
         api_timeout: int = 30,
     ):
         """
         Initialize the InterPro Search client.
 
         Args:
-            email (str): Email address for EBI API requests.
             api_timeout (int): Request timeout in seconds.
         """
-        self.base_url = "https://www.ebi.ac.uk/Tools/services/rest/iprscan5"
-        self.email = email
         self.api_timeout = api_timeout
-        self.poll_interval = 5  # Fixed interval between status checks
-        self.max_polls = 120  # Maximum polling attempts (10 minutes with 5s interval)
-
-    @staticmethod
-    def _is_protein_sequence(text: str) -> bool:
-        """Check if text looks like a protein sequence."""
-        # Remove common FASTA header prefix
-        if text.startswith(">"):
-            text = "\n".join(text.split("\n")[1:])
-        # Check if contains mostly protein amino acids
-        text = text.strip().replace("\n", "").replace(" ", "")
-        # Protein sequences contain only A-Z letters (standard amino acids)
-        return bool(re.fullmatch(r"[A-Z]+", text, re.I)) and len(text) > 10
+        self.BASE_URL = "https://www.ebi.ac.uk/interpro/api"
 
     @staticmethod
     def _is_uniprot_accession(text: str) -> bool:
@@ -61,186 +42,6 @@ def _is_uniprot_accession(text: str) -> bool:
         # UniProt: 6-10 chars starting with letter, e.g., P01308, Q96KN2
         return bool(re.fullmatch(r"[A-Z][A-Z0-9]{5,9}", text.strip(), re.I))
 
-    @retry(
-        stop=stop_after_attempt(3),
-        wait=wait_exponential(multiplier=1, min=2, max=5),
-        retry=retry_if_exception_type(RequestException),
-        reraise=True,
-    )
-    def _submit_job(self, sequence: str, title: str = "") -> Optional[str]:
-        """
-        Submit a protein sequence for InterProScan analysis.
-
-        Args:
-            sequence (str): Protein sequence (FASTA or raw).
-            title (str): Optional job title.
-
-        Returns:
-            Job ID if successful, None otherwise.
-        """
-        url = f"{self.base_url}/run"
-
-        # Parse sequence if FASTA format
-        if sequence.startswith(">"):
-            sequence = (
-                "\n".join(sequence.split("\n")[1:]).replace("\n", "").replace(" ", "")
-            )
-
-        params = {
-            "email": self.email,
-            "title": title or "GraphGen_Analysis",
-            "sequence": sequence,
-            "stype": "p",
-            "appl": "NCBIfam,SMART,CDD,HAMAP",  # Multiple databases
-            "goterms": "true",
-            "pathways": "true",
-            "format": "json",
-        }
-
-        try:
-            response = requests.post(url, data=params, timeout=self.api_timeout)
-            if response.status_code == 200:
-                job_id = response.text.strip()
-                logger.debug("InterProScan job submitted: %s", job_id)
-                return job_id
-            logger.error(
-                "Failed to submit InterProScan job: %d - %s",
-                response.status_code,
-                response.text,
-            )
-            return None
-        except RequestException as e:
-            logger.error("Request error while submitting job: %s", e)
-            raise
-
-    @retry(
-        stop=stop_after_attempt(3),
-        wait=wait_exponential(multiplier=1, min=2, max=5),
-        retry=retry_if_exception_type(RequestException),
-        reraise=True,
-    )
-    def _check_status(self, job_id: str) -> Optional[str]:
-        """Check the status of a submitted job."""
-        url = f"{self.base_url}/status/{job_id}"
-        try:
-            response = requests.get(url, timeout=self.api_timeout)
-            if response.status_code == 200:
-                return response.text.strip()
-            logger.warning(
-                "Failed to check job status for %s: %d",
-                job_id,
-                response.status_code,
-            )
-            return None
-        except RequestException as e:
-            logger.error("Request error while checking status: %s", e)
-            raise
-
-    @retry(
-        stop=stop_after_attempt(3),
-        wait=wait_exponential(multiplier=1, min=2, max=5),
-        retry=retry_if_exception_type(RequestException),
-        reraise=True,
-    )
-    def _get_results(self, job_id: str) -> Optional[dict]:
-        """Retrieve the analysis results for a completed job."""
-        url = f"{self.base_url}/result/{job_id}/json"
-        try:
-            response = requests.get(url, timeout=self.api_timeout)
-            if response.status_code == 200:
-                return response.json()
-            logger.warning(
-                "Failed to retrieve results for job %s: %d",
-                job_id,
-                response.status_code,
-            )
-            return None
-        except RequestException as e:
-            logger.error("Request error while retrieving results: %s", e)
-            raise
-
-    def _poll_job(self, job_id: str) -> Optional[dict]:
-        """
-        Poll a job until completion and retrieve results.
-
-        Args:
-            job_id (str): The job ID to poll.
-
-        Returns:
-            Results dictionary if successful, None otherwise.
-        """
-        for attempt in range(self.max_polls):
-            status = self._check_status(job_id)
-
-            if status == "FINISHED":
-                logger.debug(
-                    "Job %s completed after %d polls",
-                    job_id,
-                    attempt + 1,
-                )
-                return self._get_results(job_id)
-
-            if status in ["FAILED", "NOT_FOUND"]:
-                logger.warning("Job %s has status: %s", job_id, status)
-                return None
-
-            if status == "RUNNING":
-                logger.debug(
-                    "Job %s still running (attempt %d/%d)",
-                    job_id,
-                    attempt + 1,
-                    self.max_polls,
-                )
-                time.sleep(self.poll_interval)
-            else:
-                logger.debug("Job %s status: %s", job_id, status)
-                time.sleep(self.poll_interval)
-
-        logger.warning(
-            "Job %s polling timed out after %d attempts", job_id, self.max_polls
-        )
-        return None
-
-    def search_by_sequence(self, sequence: str) -> Optional[Dict]:
-        """
-        Search for protein domains in a sequence using InterProScan API.
-
-        Args:
-            sequence (str): Protein sequence in FASTA or raw format.
-
-        Returns:
-            Dictionary with domain analysis results or None if failed.
-        """
-        if not sequence or not isinstance(sequence, str):
-            logger.error("Invalid sequence provided")
-            return None
-
-        sequence = sequence.strip()
-
-        if not self._is_protein_sequence(sequence):
-            logger.error("Invalid protein sequence format")
-            return None
-
-        # Submit job
-        job_id = self._submit_job(sequence)
-        if not job_id:
-            logger.error("Failed to submit InterProScan job")
-            return None
-
-        # Poll for results
-        results = self._poll_job(job_id)
-        if not results:
-            logger.error("Failed to retrieve InterProScan results for job %s", job_id)
-            return None
-
-        return {
-            "molecule_type": "protein",
-            "database": "InterPro",
-            "job_id": job_id,
-            "content": results,
-            "url": f"https://www.ebi.ac.uk/interpro/result/{job_id}/",
-        }
-
     def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
         """
         Search InterPro database by UniProt accession number.
@@ -261,7 +62,7 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
         accession = accession.strip().upper()
 
         # Query InterPro REST API for UniProt entry
-        url = f"https://www.ebi.ac.uk/interpro/api/entry/interpro/protein/uniprot/{accession}/"
+        url = f"{self.BASE_URL}/entry/interpro/protein/uniprot/{accession}/"
 
         response = requests.get(url, timeout=self.api_timeout)
 
@@ -275,6 +76,14 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
 
         data = response.json()
 
+        # Get entry details for each InterPro entry found
+        for result in data.get("results", []):
+            interpro_acc = result.get("metadata", {}).get("accession")
+            if interpro_acc:
+                entry_details = self.get_entry_details(interpro_acc)
+                if entry_details:
+                    result["entry_details"] = entry_details
+
         result = {
             "molecule_type": "protein",
             "database": "InterPro",
@@ -285,6 +94,31 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
 
         return result
 
+    def get_entry_details(self, interpro_accession: str) -> Optional[Dict]:
+        """
+        Fetch the JSON record of a single InterPro entry from the REST API.
+
+        Args:
+            interpro_accession (str): InterPro accession number (e.g., IPR000001).
+        Returns:
+            Parsed JSON body; None if the accession is empty/not a str or the status is not 200.
+        """
+        if not interpro_accession or not isinstance(interpro_accession, str):
+            return None
+
+        url = f"{self.BASE_URL}/entry/interpro/{interpro_accession}/"
+        # No try/except here: exceptions raised by requests propagate to the caller.
+        response = requests.get(url, timeout=self.api_timeout)
+        if response.status_code != 200:
+            logger.warning(
+                "Failed to get InterPro entry %s: %d",
+                interpro_accession,
+                response.status_code,
+            )
+            return None
+
+        return response.json()
+
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_exponential(multiplier=1, min=2, max=5),
@@ -293,14 +127,10 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
     )
     def search(self, query: str, **kwargs) -> Optional[Dict]:
         """
-        Search InterPro for protein domain information.
-
-        Automatically detects query type:
-        - UniProt accession number → lookup pre-computed domains
-        - Protein sequence (FASTA or raw) → submit for InterProScan analysis
+        Search InterPro for protein domain information by UniProt accession.
 
         Args:
-            query (str): Search query (UniProt ID or protein sequence).
+            query (str): UniProt accession number (e.g., P01308, Q96KN2).
             **kwargs: Additional arguments (unused).
 
         Returns:
@@ -313,22 +143,9 @@ def search(self, query: str, **kwargs) -> Optional[Dict]:
         query = query.strip()
         logger.debug("InterPro search query: %s", query[:100])
 
-        result = None
-
-        # Check if UniProt accession
-        if self._is_uniprot_accession(query):
-            logger.debug("Detected UniProt accession: %s", query)
-            result = self.search_by_uniprot_id(query)
-
-        # Check if protein sequence
-        elif self._is_protein_sequence(query):
-            logger.debug("Detected protein sequence (length: %d)", len(query))
-            result = self.search_by_sequence(query)
-
-        else:
-            # Try as UniProt ID first (in case format is non-standard)
-            logger.debug("Trying as UniProt accession: %s", query)
-            result = self.search_by_uniprot_id(query)
+        # Search by UniProt ID
+        logger.debug("Searching for UniProt accession: %s", query)
+        result = self.search_by_uniprot_id(query)
 
         if result:
             result["_search_query"] = query

From d0b230c283195b62706e455fb0179c50ab20e524 Mon Sep 17 00:00:00 2001
From: chenzihong <58508660+ChenZiHong-Gavin@users.noreply.github.com>
Date: Mon, 9 Feb 2026 18:57:40 +0800
Subject: [PATCH 5/8] Update graphgen/models/searcher/db/interpro_searcher.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 graphgen/models/searcher/db/interpro_searcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/graphgen/models/searcher/db/interpro_searcher.py b/graphgen/models/searcher/db/interpro_searcher.py
index d7b140ec..9d3e7c06 100644
--- a/graphgen/models/searcher/db/interpro_searcher.py
+++ b/graphgen/models/searcher/db/interpro_searcher.py
@@ -55,7 +55,7 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
         Returns:
             Dictionary with domain information or None if not found.
         """
-        if not accession or not isinstance(accession, str):
+        if not accession or not isinstance(accession, str) or not self._is_uniprot_accession(accession):
             logger.error("Invalid accession provided")
             return None
 

From e445052cdab3a1b6fdf6e998cb617466f4220d59 Mon Sep 17 00:00:00 2001
From: chenzihong <58508660+ChenZiHong-Gavin@users.noreply.github.com>
Date: Mon, 9 Feb 2026 18:57:53 +0800
Subject: [PATCH 6/8] Update
 examples/search/search_protein/search_interpro/README.md

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 examples/search/search_protein/search_interpro/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/search/search_protein/search_interpro/README.md b/examples/search/search_protein/search_interpro/README.md
index 1e051eb7..5a848c05 100644
--- a/examples/search/search_protein/search_interpro/README.md
+++ b/examples/search/search_protein/search_interpro/README.md
@@ -26,7 +26,7 @@ Example configuration:
 input_path:
   - examples/input_examples/search_interpro_demo.jsonl
 
-data_sources: [interpro]
+data_source: interpro
 interpro_params:
   email: your_email@example.com
   api_timeout: 30

From 13326339cb5778ed1de6fae75dda9db3efc937e8 Mon Sep 17 00:00:00 2001
From: chenzihong-gavin <chenzihong_gavin@foxmail.com>
Date: Mon, 9 Feb 2026 18:59:30 +0800
Subject: [PATCH 7/8] docs: update README

---
 .../search_protein/search_interpro/README.md    | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/examples/search/search_protein/search_interpro/README.md b/examples/search/search_protein/search_interpro/README.md
index 5a848c05..052d1f6c 100644
--- a/examples/search/search_protein/search_interpro/README.md
+++ b/examples/search/search_protein/search_interpro/README.md
@@ -4,11 +4,7 @@ This example demonstrates how to search for protein domain information and funct
 
 ## Overview
 
-The InterPro search pipeline reads protein queries (UniProt accession numbers or protein sequences) and searches the InterPro database to find domain matches, functional annotations, GO terms, and pathways.
-
-InterPro supports two search modes:
-1. **UniProt Accession Lookup**: Fast lookup of pre-computed domain information for known UniProt entries
-2. **Protein Sequence Analysis**: Submit protein sequences for InterProScan analysis to discover domains
+The InterPro search pipeline reads protein queries (UniProt accession numbers) and searches the InterPro database to find domain matches, functional annotations, GO terms, and pathways.
 
 ## Quick Start
 
@@ -18,7 +14,6 @@ Edit `search_interpro_config.yaml` to set:
 
 - **Input file path**: Set the path to your protein sequence or UniProt ID queries
 - **InterPro parameters**:
-  - `email`: Your email address for EBI API requests (required)
   - `api_timeout`: Request timeout in seconds (default: 30)
 
 Example configuration:
@@ -53,16 +48,6 @@ The input file should be in JSONL format with protein queries:
 ```jsonl
 {"type": "protein", "content": "P01308"}
 {"type": "protein", "content": "Q96KN2"}
-{"type": "protein", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"}
-```
-
-Or in FASTA format:
-```
->P01308
-MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK
-
->insulin_sequence
-MHHHHHHSSGVDLGTENLYFQS...
 ```
 
 

From bf68f13e38d8f5f95e731ad035e27cf0df7b9978 Mon Sep 17 00:00:00 2001
From: chenzihong-gavin <chenzihong_gavin@foxmail.com>
Date: Mon, 9 Feb 2026 19:03:30 +0800
Subject: [PATCH 8/8] docs: update README

---
 examples/search/search_protein/search_interpro/README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/search/search_protein/search_interpro/README.md b/examples/search/search_protein/search_interpro/README.md
index 052d1f6c..2a394562 100644
--- a/examples/search/search_protein/search_interpro/README.md
+++ b/examples/search/search_protein/search_interpro/README.md
@@ -23,7 +23,6 @@ input_path:
 
 data_source: interpro
 interpro_params:
-  email: your_email@example.com
   api_timeout: 30
 ```