From 9f85be0af84e9054469f3e57b77c57c36503c8db Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Thu, 5 Feb 2026 17:59:23 +0800 Subject: [PATCH 1/8] feat: add interpro searcher --- graphgen/models/__init__.py | 2 + .../models/searcher/db/interpro_searcher.py | 456 ++++++++++++++++++ graphgen/operators/search/search_service.py | 5 + 3 files changed, 463 insertions(+) create mode 100644 graphgen/models/searcher/db/interpro_searcher.py diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py index 6b75587c..95ccd1ae 100644 --- a/graphgen/models/__init__.py +++ b/graphgen/models/__init__.py @@ -42,6 +42,7 @@ TXTReader, ) from .rephraser import StyleControlledRephraser + from .searcher.db.interpro_searcher import InterProSearch from .searcher.db.ncbi_searcher import NCBISearch from .searcher.db.rnacentral_searcher import RNACentralSearch from .searcher.db.uniprot_searcher import UniProtSearch @@ -95,6 +96,7 @@ "TXTReader": ".reader", "HuggingFaceReader": ".reader", # Searcher + "InterProSearch": ".searcher.db.interpro_searcher", "NCBISearch": ".searcher.db.ncbi_searcher", "RNACentralSearch": ".searcher.db.rnacentral_searcher", "UniProtSearch": ".searcher.db.uniprot_searcher", diff --git a/graphgen/models/searcher/db/interpro_searcher.py b/graphgen/models/searcher/db/interpro_searcher.py new file mode 100644 index 00000000..b90d3899 --- /dev/null +++ b/graphgen/models/searcher/db/interpro_searcher.py @@ -0,0 +1,456 @@ +import re +import time +from typing import Dict, Optional + +import requests +from requests.exceptions import RequestException +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases import BaseSearcher +from graphgen.utils import logger + + +class InterProSearch(BaseSearcher): + """ + InterPro Search client to search protein domains and functional annotations. + Supports: + 1) Get protein domain information by UniProt accession number. 
+ 2) Search with protein sequence using EBI InterProScan API. + 3) Parse domain matches and associated GO terms, pathways. + + API Documentation: https://www.ebi.ac.uk/Tools/services/rest/iprscan5 + """ + + def __init__( + self, + email: str = "graphgen@example.com", + api_timeout: int = 30, + ): + """ + Initialize the InterPro Search client. + + Args: + email (str): Email address for EBI API requests. + api_timeout (int): Request timeout in seconds. + """ + self.base_url = "https://www.ebi.ac.uk/Tools/services/rest/iprscan5" + self.email = email + self.api_timeout = api_timeout + self.poll_interval = 5 # Fixed interval between status checks + self.max_polls = 120 # Maximum polling attempts (10 minutes with 5s interval) + + @staticmethod + def _is_protein_sequence(text: str) -> bool: + """Check if text looks like a protein sequence.""" + # Remove common FASTA header prefix + if text.startswith(">"): + text = "\n".join(text.split("\n")[1:]) + # Check if contains mostly protein amino acids + text = text.strip().replace("\n", "").replace(" ", "") + # Protein sequences contain only A-Z letters (standard amino acids) + return bool(re.fullmatch(r"[A-Z]+", text, re.I)) and len(text) > 10 + + @staticmethod + def _is_uniprot_accession(text: str) -> bool: + """Check if text looks like a UniProt accession number.""" + # UniProt: 6-10 chars starting with letter, e.g., P01308, Q96KN2 + return bool(re.fullmatch(r"[A-Z][A-Z0-9]{5,9}", text.strip(), re.I)) + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=5), + retry=retry_if_exception_type(RequestException), + reraise=True, + ) + def _submit_job(self, sequence: str, title: str = "") -> Optional[str]: + """ + Submit a protein sequence for InterProScan analysis. + + Args: + sequence (str): Protein sequence (FASTA or raw). + title (str): Optional job title. + + Returns: + Job ID if successful, None otherwise. 
+ """ + url = f"{self.base_url}/run" + + # Parse sequence if FASTA format + if sequence.startswith(">"): + sequence = ( + "\n".join(sequence.split("\n")[1:]).replace("\n", "").replace(" ", "") + ) + + params = { + "email": self.email, + "title": title or "GraphGen_Analysis", + "sequence": sequence, + "stype": "protein", + "appl": "Pfam,PANTHER,Gene3D,SMART", # Multiple databases + "goterms": "true", + "pathways": "true", + "format": "json", + } + + try: + response = requests.post(url, data=params, timeout=self.api_timeout) + if response.status_code == 200: + job_id = response.text.strip() + logger.debug("InterProScan job submitted: %s", job_id) + return job_id + logger.error( + "Failed to submit InterProScan job: %d - %s", + response.status_code, + response.text, + ) + return None + except RequestException as e: + logger.error("Request error while submitting job: %s", e) + raise + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=5), + retry=retry_if_exception_type(RequestException), + reraise=True, + ) + def _check_status(self, job_id: str) -> Optional[str]: + """Check the status of a submitted job.""" + url = f"{self.base_url}/status/{job_id}" + try: + response = requests.get(url, timeout=self.api_timeout) + if response.status_code == 200: + return response.text.strip() + logger.warning( + "Failed to check job status for %s: %d", + job_id, + response.status_code, + ) + return None + except RequestException as e: + logger.error("Request error while checking status: %s", e) + raise + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=5), + retry=retry_if_exception_type(RequestException), + reraise=True, + ) + def _get_results(self, job_id: str) -> Optional[dict]: + """Retrieve the analysis results for a completed job.""" + url = f"{self.base_url}/result/{job_id}/json" + try: + response = requests.get(url, timeout=self.api_timeout) + if response.status_code == 200: + return response.json() 
+ logger.warning( + "Failed to retrieve results for job %s: %d", + job_id, + response.status_code, + ) + return None + except RequestException as e: + logger.error("Request error while retrieving results: %s", e) + raise + + def _poll_job(self, job_id: str) -> Optional[dict]: + """ + Poll a job until completion and retrieve results. + + Args: + job_id (str): The job ID to poll. + + Returns: + Results dictionary if successful, None otherwise. + """ + for attempt in range(self.max_polls): + status = self._check_status(job_id) + + if status == "FINISHED": + logger.debug( + "Job %s completed after %d polls", + job_id, + attempt + 1, + ) + return self._get_results(job_id) + + if status in ["FAILED", "NOT_FOUND"]: + logger.warning("Job %s has status: %s", job_id, status) + return None + + if status == "RUNNING": + logger.debug( + "Job %s still running (attempt %d/%d)", + job_id, + attempt + 1, + self.max_polls, + ) + time.sleep(self.poll_interval) + else: + logger.debug("Job %s status: %s", job_id, status) + time.sleep(self.poll_interval) + + logger.warning( + "Job %s polling timed out after %d attempts", job_id, self.max_polls + ) + return None + + @staticmethod + def _parse_results(results: dict) -> Optional[dict]: + """ + Parse InterProScan results into a structured format. + + Args: + results (dict): Raw InterProScan JSON results. + + Returns: + Parsed results with structured domain information. 
+ """ + if not results: + return None + + domains = [] + go_terms = set() + pathways = set() + + # Extract matches from results + for result in results.get("results", []): + matches = result.get("matches", []) + + for match in matches: + signature = match.get("signature", {}) + ipr = match.get("ipr", {}) + + domain_info = { + "signature_id": signature.get("accession"), + "signature_name": signature.get("name"), + "database": signature.get("database"), + "interpro_id": ipr.get("id"), + "interpro_name": ipr.get("name"), + "start": match.get("start"), + "end": match.get("end"), + "score": match.get("score"), + "evalue": match.get("evalue"), + } + + # Collect GO terms + for go in ipr.get("go", []): + go_id = go.get("id") + if go_id: + go_terms.add(go_id) + + # Collect pathways + for pathway in ipr.get("pathways", []): + pathway_id = pathway.get("id") + if pathway_id: + pathways.add(pathway_id) + + domains.append(domain_info) + + return { + "domains": domains, + "go_terms": sorted(list(go_terms)) if go_terms else [], + "pathways": sorted(list(pathways)) if pathways else [], + "domain_count": len(domains), + } + + def search_by_sequence(self, sequence: str) -> Optional[Dict]: + """ + Search for protein domains in a sequence using InterProScan API. + + Args: + sequence (str): Protein sequence in FASTA or raw format. + + Returns: + Dictionary with domain analysis results or None if failed. 
+ """ + if not sequence or not isinstance(sequence, str): + logger.error("Invalid sequence provided") + return None + + sequence = sequence.strip() + + if not self._is_protein_sequence(sequence): + logger.error("Invalid protein sequence format") + return None + + # Submit job + job_id = self._submit_job(sequence) + if not job_id: + logger.error("Failed to submit InterProScan job") + return None + + # Poll for results + results = self._poll_job(job_id) + if not results: + logger.error("Failed to retrieve InterProScan results for job %s", job_id) + return None + + # Parse results + parsed = self._parse_results(results) + if parsed: + parsed["molecule_type"] = "protein" + parsed["database"] = "InterPro" + parsed["job_id"] = job_id + parsed["url"] = "https://www.ebi.ac.uk/interpro/" + + return parsed + + def _extract_domain_info(self, entry: dict, accession: str) -> list: + """Extract domain information for a specific accession from an entry.""" + domains = [] + proteins = entry.get("proteins", {}) + protein_data = proteins.get(accession) + if protein_data: + entry_acc = entry.get("accession") + entry_name = entry.get("name") + entry_type = entry.get("type") + locations = protein_data.get("locations", []) + for location in locations: + domain_info = { + "interpro_id": entry_acc, + "interpro_name": entry_name, + "type": entry_type, + "start": location.get("start"), + "end": location.get("end"), + } + domains.append(domain_info) + return domains + + def _collect_annotation_terms(self, entry: dict) -> tuple: + """Collect GO terms and pathway annotations from entry.""" + go_terms = set() + pathways = set() + + go_list = entry.get("go_terms", []) + for go_item in go_list: + go_id = go_item.get("identifier") if isinstance(go_item, dict) else go_item + if go_id: + go_terms.add(go_id) + + pathway_list = entry.get("pathways", []) + for pathway in pathway_list: + pathway_id = pathway.get("id") if isinstance(pathway, dict) else pathway + if pathway_id: + pathways.add(pathway_id) 
+ + return go_terms, pathways + + def search_by_uniprot_id(self, accession: str) -> Optional[Dict]: + """ + Search InterPro database by UniProt accession number. + + This method queries the EBI API to get pre-computed domain information + for a known UniProt entry. + + Args: + accession (str): UniProt accession number. + + Returns: + Dictionary with domain information or None if not found. + """ + if not accession or not isinstance(accession, str): + logger.error("Invalid accession provided") + return None + + accession = accession.strip().upper() + + # Query InterPro REST API for UniProt entry + url = f"https://www.ebi.ac.uk/interpro/api/entry/protein/uniprot/{accession}/" + + response = requests.get(url, timeout=self.api_timeout) + + if response.status_code == 404: + logger.info("UniProt accession %s not found in InterPro", accession) + return None + if response.status_code != 200: + logger.warning( + "Failed to search InterPro for accession %s: %d", + accession, + response.status_code, + ) + return None + + data = response.json() + + domains = [] + go_terms = set() + pathways = set() + + # Parse entry information + for entry in data.get("results", []): + entry_domains = self._extract_domain_info(entry, accession) + domains.extend(entry_domains) + + entry_go_terms, entry_pathways = self._collect_annotation_terms(entry) + go_terms.update(entry_go_terms) + pathways.update(entry_pathways) + + result = { + "molecule_type": "protein", + "database": "InterPro", + "id": accession, + "domains": domains, + "go_terms": sorted(list(go_terms)) if go_terms else [], + "pathways": sorted(list(pathways)) if pathways else [], + "domain_count": len(domains), + "url": f"https://www.ebi.ac.uk/interpro/protein/uniprot/{accession}/", + } + + return result + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=5), + retry=retry_if_exception_type(RequestException), + reraise=True, + ) + def search(self, query: str, **kwargs) -> Optional[Dict]: + """ + 
Search InterPro for protein domain information. + + Automatically detects query type: + - UniProt accession number → lookup pre-computed domains + - Protein sequence (FASTA or raw) → submit for InterProScan analysis + + Args: + query (str): Search query (UniProt ID or protein sequence). + **kwargs: Additional arguments (unused). + + Returns: + Dictionary with domain information or None if not found. + """ + if not query or not isinstance(query, str): + logger.error("Empty or non-string input") + return None + + query = query.strip() + logger.debug("InterPro search query: %s", query[:100]) + + result = None + + # Check if UniProt accession + if self._is_uniprot_accession(query): + logger.debug("Detected UniProt accession: %s", query) + result = self.search_by_uniprot_id(query) + + # Check if protein sequence + elif self._is_protein_sequence(query): + logger.debug("Detected protein sequence (length: %d)", len(query)) + result = self.search_by_sequence(query) + + else: + # Try as UniProt ID first (in case format is non-standard) + logger.debug("Trying as UniProt accession: %s", query) + result = self.search_by_uniprot_id(query) + + if result: + result["_search_query"] = query + + return result diff --git a/graphgen/operators/search/search_service.py b/graphgen/operators/search/search_service.py index 1a599e25..220db049 100644 --- a/graphgen/operators/search/search_service.py +++ b/graphgen/operators/search/search_service.py @@ -58,6 +58,11 @@ def _init_searcher(self): params = self.kwargs.get("rnacentral_params", {}) self.searcher = RNACentralSearch(**params) + elif self.data_source == "interpro": + from graphgen.models import InterProSearch + + params = self.kwargs.get("interpro_params", {}) + self.searcher = InterProSearch(**params) else: logger.error(f"Unknown data source: {self.data_source}") From 7fbb19d9106aaf72d4fd000f276f53aa403be8c4 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Mon, 9 Feb 2026 14:07:30 +0800 Subject: [PATCH 2/8] refactor: refactor 
interpro_searcher --- .../models/searcher/db/interpro_searcher.py | 142 ++---------------- 1 file changed, 11 insertions(+), 131 deletions(-) diff --git a/graphgen/models/searcher/db/interpro_searcher.py b/graphgen/models/searcher/db/interpro_searcher.py index b90d3899..9a6c539a 100644 --- a/graphgen/models/searcher/db/interpro_searcher.py +++ b/graphgen/models/searcher/db/interpro_searcher.py @@ -90,8 +90,8 @@ def _submit_job(self, sequence: str, title: str = "") -> Optional[str]: "email": self.email, "title": title or "GraphGen_Analysis", "sequence": sequence, - "stype": "protein", - "appl": "Pfam,PANTHER,Gene3D,SMART", # Multiple databases + "stype": "p", + "appl": "NCBIfam,SMART,CDD,HAMAP", # Multiple databases "goterms": "true", "pathways": "true", "format": "json", @@ -201,65 +201,6 @@ def _poll_job(self, job_id: str) -> Optional[dict]: ) return None - @staticmethod - def _parse_results(results: dict) -> Optional[dict]: - """ - Parse InterProScan results into a structured format. - - Args: - results (dict): Raw InterProScan JSON results. - - Returns: - Parsed results with structured domain information. 
- """ - if not results: - return None - - domains = [] - go_terms = set() - pathways = set() - - # Extract matches from results - for result in results.get("results", []): - matches = result.get("matches", []) - - for match in matches: - signature = match.get("signature", {}) - ipr = match.get("ipr", {}) - - domain_info = { - "signature_id": signature.get("accession"), - "signature_name": signature.get("name"), - "database": signature.get("database"), - "interpro_id": ipr.get("id"), - "interpro_name": ipr.get("name"), - "start": match.get("start"), - "end": match.get("end"), - "score": match.get("score"), - "evalue": match.get("evalue"), - } - - # Collect GO terms - for go in ipr.get("go", []): - go_id = go.get("id") - if go_id: - go_terms.add(go_id) - - # Collect pathways - for pathway in ipr.get("pathways", []): - pathway_id = pathway.get("id") - if pathway_id: - pathways.add(pathway_id) - - domains.append(domain_info) - - return { - "domains": domains, - "go_terms": sorted(list(go_terms)) if go_terms else [], - "pathways": sorted(list(pathways)) if pathways else [], - "domain_count": len(domains), - } - def search_by_sequence(self, sequence: str) -> Optional[Dict]: """ Search for protein domains in a sequence using InterProScan API. 
@@ -292,55 +233,13 @@ def search_by_sequence(self, sequence: str) -> Optional[Dict]: logger.error("Failed to retrieve InterProScan results for job %s", job_id) return None - # Parse results - parsed = self._parse_results(results) - if parsed: - parsed["molecule_type"] = "protein" - parsed["database"] = "InterPro" - parsed["job_id"] = job_id - parsed["url"] = "https://www.ebi.ac.uk/interpro/" - - return parsed - - def _extract_domain_info(self, entry: dict, accession: str) -> list: - """Extract domain information for a specific accession from an entry.""" - domains = [] - proteins = entry.get("proteins", {}) - protein_data = proteins.get(accession) - if protein_data: - entry_acc = entry.get("accession") - entry_name = entry.get("name") - entry_type = entry.get("type") - locations = protein_data.get("locations", []) - for location in locations: - domain_info = { - "interpro_id": entry_acc, - "interpro_name": entry_name, - "type": entry_type, - "start": location.get("start"), - "end": location.get("end"), - } - domains.append(domain_info) - return domains - - def _collect_annotation_terms(self, entry: dict) -> tuple: - """Collect GO terms and pathway annotations from entry.""" - go_terms = set() - pathways = set() - - go_list = entry.get("go_terms", []) - for go_item in go_list: - go_id = go_item.get("identifier") if isinstance(go_item, dict) else go_item - if go_id: - go_terms.add(go_id) - - pathway_list = entry.get("pathways", []) - for pathway in pathway_list: - pathway_id = pathway.get("id") if isinstance(pathway, dict) else pathway - if pathway_id: - pathways.add(pathway_id) - - return go_terms, pathways + return { + "molecule_type": "protein", + "database": "InterPro", + "job_id": job_id, + "content": results, + "url": f"https://www.ebi.ac.uk/interpro/result/{job_id}/", + } def search_by_uniprot_id(self, accession: str) -> Optional[Dict]: """ @@ -362,13 +261,10 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]: accession = 
accession.strip().upper() # Query InterPro REST API for UniProt entry - url = f"https://www.ebi.ac.uk/interpro/api/entry/protein/uniprot/{accession}/" + url = f"https://www.ebi.ac.uk/interpro/api/entry/interpro/protein/uniprot/{accession}/" response = requests.get(url, timeout=self.api_timeout) - if response.status_code == 404: - logger.info("UniProt accession %s not found in InterPro", accession) - return None if response.status_code != 200: logger.warning( "Failed to search InterPro for accession %s: %d", @@ -379,27 +275,11 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]: data = response.json() - domains = [] - go_terms = set() - pathways = set() - - # Parse entry information - for entry in data.get("results", []): - entry_domains = self._extract_domain_info(entry, accession) - domains.extend(entry_domains) - - entry_go_terms, entry_pathways = self._collect_annotation_terms(entry) - go_terms.update(entry_go_terms) - pathways.update(entry_pathways) - result = { "molecule_type": "protein", "database": "InterPro", "id": accession, - "domains": domains, - "go_terms": sorted(list(go_terms)) if go_terms else [], - "pathways": sorted(list(pathways)) if pathways else [], - "domain_count": len(domains), + "content": data.get("results", []), "url": f"https://www.ebi.ac.uk/interpro/protein/uniprot/{accession}/", } From 36e504f0cb953499c6c792ebde455f3f8029cc56 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Mon, 9 Feb 2026 14:28:49 +0800 Subject: [PATCH 3/8] feat: add example for interpro_searcher --- .../input_examples/search_interpro_demo.jsonl | 3 + .../search_protein/search_interpro/README.md | 108 ++++++++++++++++++ .../search_interpro/search_interpro.sh | 5 + .../search_interpro_config.yaml | 28 +++++ .../search/search_protein/search_uniprot.sh | 2 - .../{ => search_uniprot}/README.md | 0 .../{ => search_uniprot}/build_db.sh | 0 .../search_protein_config.yaml | 0 .../search_uniprot/search_uniprot.sh | 2 + 9 files changed, 146 insertions(+), 2 
deletions(-) create mode 100644 examples/input_examples/search_interpro_demo.jsonl create mode 100644 examples/search/search_protein/search_interpro/README.md create mode 100644 examples/search/search_protein/search_interpro/search_interpro.sh create mode 100644 examples/search/search_protein/search_interpro/search_interpro_config.yaml delete mode 100644 examples/search/search_protein/search_uniprot.sh rename examples/search/search_protein/{ => search_uniprot}/README.md (100%) rename examples/search/search_protein/{ => search_uniprot}/build_db.sh (100%) rename examples/search/search_protein/{ => search_uniprot}/search_protein_config.yaml (100%) create mode 100644 examples/search/search_protein/search_uniprot/search_uniprot.sh diff --git a/examples/input_examples/search_interpro_demo.jsonl b/examples/input_examples/search_interpro_demo.jsonl new file mode 100644 index 00000000..9fadb4b1 --- /dev/null +++ b/examples/input_examples/search_interpro_demo.jsonl @@ -0,0 +1,3 @@ +{"type": "protein", "content": "P01308"} +{"type": "protein", "content": "Q96KN2"} +{"type": "protein", "content": "MGHHHHHHHGSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} diff --git a/examples/search/search_protein/search_interpro/README.md b/examples/search/search_protein/search_interpro/README.md new file mode 100644 index 00000000..1e051eb7 --- /dev/null +++ b/examples/search/search_protein/search_interpro/README.md @@ -0,0 +1,108 @@ +# Search Protein Domains with InterPro + +This example demonstrates how to search for protein domain information and functional annotations using the InterPro database. 
+ +## Overview + +The InterPro search pipeline reads protein queries (UniProt accession numbers or protein sequences) and searches the InterPro database to find domain matches, functional annotations, GO terms, and pathways. + +InterPro supports two search modes: +1. **UniProt Accession Lookup**: Fast lookup of pre-computed domain information for known UniProt entries +2. **Protein Sequence Analysis**: Submit protein sequences for InterProScan analysis to discover domains + +## Quick Start + +### 1. Configure Search Parameters + +Edit `search_interpro_config.yaml` to set: + +- **Input file path**: Set the path to your protein sequence or UniProt ID queries +- **InterPro parameters**: + - `email`: Your email address for EBI API requests (default: graphgen@example.com) + - `api_timeout`: Request timeout in seconds (default: 30) + +Example configuration: +```yaml +input_path: + - examples/input_examples/search_interpro_demo.jsonl + +data_sources: [interpro] +interpro_params: + email: your_email@example.com + api_timeout: 30 +``` + +### 2.
Run the Search + +```bash +./search_interpro.sh +``` + +Or run directly with Python: + +```bash +python3 -m graphgen.run \ + --config_file examples/search/search_protein/search_interpro/search_interpro_config.yaml \ + --output_dir cache/ +``` + +## Input Format + +The input file should be in JSONL format with protein queries: + +```jsonl +{"type": "protein", "content": "P01308"} +{"type": "protein", "content": "Q96KN2"} +{"type": "protein", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} +``` + +Or in FASTA format: +``` +>P01308 +MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK + +>insulin_sequence +MHHHHHHSSGVDLGTENLYFQS...
+``` + + +## Output + +The search results will be saved in the output directory with: + +```json +{ + "molecule_type": "protein", + "database": "InterPro", + "id": "P01308", + "job_id": "iprscan5-R20240123-123456-xxxx-p1m", + "content": { + "results": [ + { + "xref": [ + { + "ref": "INTERPRO", + "id": "IPR000001", + "name": "Domain Name" + } + ], + "signature_acc": "PF00001", + "go_annotations": [ + { + "id": "GO:0001234", + "description": "biological process" + } + ] + } + ] + }, + "url": "https://www.ebi.ac.uk/interpro/protein/uniprot/P01308/", + "_search_query": "P01308" +} +``` + +## References + +- **InterPro Database**: https://www.ebi.ac.uk/interpro/ +- **EBI InterProScan API**: https://www.ebi.ac.uk/Tools/services/rest/iprscan5 +- **UniProt Database**: https://www.uniprot.org/ diff --git a/examples/search/search_protein/search_interpro/search_interpro.sh b/examples/search/search_protein/search_interpro/search_interpro.sh new file mode 100644 index 00000000..cfc0309e --- /dev/null +++ b/examples/search/search_protein/search_interpro/search_interpro.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# Search InterPro for protein domain annotations + +python3 -m graphgen.run \ + --config_file examples/search/search_protein/search_interpro/search_interpro_config.yaml diff --git a/examples/search/search_protein/search_interpro/search_interpro_config.yaml b/examples/search/search_protein/search_interpro/search_interpro_config.yaml new file mode 100644 index 00000000..c2ab2bfa --- /dev/null +++ b/examples/search/search_protein/search_interpro/search_interpro_config.yaml @@ -0,0 +1,28 @@ +global_params: + working_dir: cache + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + graph_backend: kuzu # graph database backend, support: kuzu, networkx + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/search_interpro_demo.jsonl # input file path, support json, jsonl, txt, pdf. 
See examples/input_examples for examples + + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 + save_output: true + params: + data_source: interpro # data source for searcher, support: wikipedia, google, uniprot, ncbi, interpro + interpro_params: + email: test@example.com # Email address for EBI API requests + api_timeout: 30 # Request timeout in seconds diff --git a/examples/search/search_protein/search_uniprot.sh b/examples/search/search_protein/search_uniprot.sh deleted file mode 100644 index 627735a0..00000000 --- a/examples/search/search_protein/search_uniprot.sh +++ /dev/null @@ -1,2 +0,0 @@ -python3 -m graphgen.run \ ---config_file examples/search/search_protein/search_protein_config.yaml diff --git a/examples/search/search_protein/README.md b/examples/search/search_protein/search_uniprot/README.md similarity index 100% rename from examples/search/search_protein/README.md rename to examples/search/search_protein/search_uniprot/README.md diff --git a/examples/search/search_protein/build_db.sh b/examples/search/search_protein/search_uniprot/build_db.sh similarity index 100% rename from examples/search/search_protein/build_db.sh rename to examples/search/search_protein/search_uniprot/build_db.sh diff --git a/examples/search/search_protein/search_protein_config.yaml b/examples/search/search_protein/search_uniprot/search_protein_config.yaml similarity index 100% rename from examples/search/search_protein/search_protein_config.yaml rename to examples/search/search_protein/search_uniprot/search_protein_config.yaml diff --git a/examples/search/search_protein/search_uniprot/search_uniprot.sh b/examples/search/search_protein/search_uniprot/search_uniprot.sh new file mode 100644 index 00000000..e4862572 --- /dev/null +++ b/examples/search/search_protein/search_uniprot/search_uniprot.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file 
examples/search/search_protein/search_uniprot/search_protein_config.yaml From ae695d77a541619b5c338bea154317b87f70368a Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Mon, 9 Feb 2026 18:44:09 +0800 Subject: [PATCH 4/8] feat: fetch detailed interpro result --- .../input_examples/search_interpro_demo.jsonl | 1 - .../search_interpro_config.yaml | 1 - .../models/searcher/db/interpro_searcher.py | 265 +++--------------- 3 files changed, 41 insertions(+), 226 deletions(-) diff --git a/examples/input_examples/search_interpro_demo.jsonl b/examples/input_examples/search_interpro_demo.jsonl index 9fadb4b1..2427fe0b 100644 --- a/examples/input_examples/search_interpro_demo.jsonl +++ b/examples/input_examples/search_interpro_demo.jsonl @@ -1,3 +1,2 @@ {"type": "protein", "content": "P01308"} {"type": "protein", "content": "Q96KN2"} -{"type": "protein", "content": "MGHHHHHHHGSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} diff --git a/examples/search/search_protein/search_interpro/search_interpro_config.yaml b/examples/search/search_protein/search_interpro/search_interpro_config.yaml index c2ab2bfa..c1089af9 100644 --- a/examples/search/search_protein/search_interpro/search_interpro_config.yaml +++ b/examples/search/search_protein/search_interpro/search_interpro_config.yaml @@ -24,5 +24,4 @@ nodes: params: data_source: interpro # data source for searcher, support: wikipedia, google, uniprot, ncbi, interpro interpro_params: - email: test@example.com # Email address for EBI API requests api_timeout: 30 # Request timeout in seconds diff --git a/graphgen/models/searcher/db/interpro_searcher.py b/graphgen/models/searcher/db/interpro_searcher.py index 9a6c539a..d7b140ec 100644 --- 
a/graphgen/models/searcher/db/interpro_searcher.py +++ b/graphgen/models/searcher/db/interpro_searcher.py @@ -1,5 +1,4 @@ import re -import time from typing import Dict, Optional import requests @@ -20,40 +19,22 @@ class InterProSearch(BaseSearcher): InterPro Search client to search protein domains and functional annotations. Supports: 1) Get protein domain information by UniProt accession number. - 2) Search with protein sequence using EBI InterProScan API. - 3) Parse domain matches and associated GO terms, pathways. - API Documentation: https://www.ebi.ac.uk/Tools/services/rest/iprscan5 + API Documentation: https://www.ebi.ac.uk/interpro/api/ """ def __init__( self, - email: str = "graphgen@example.com", api_timeout: int = 30, ): """ Initialize the InterPro Search client. Args: - email (str): Email address for EBI API requests. api_timeout (int): Request timeout in seconds. """ - self.base_url = "https://www.ebi.ac.uk/Tools/services/rest/iprscan5" - self.email = email self.api_timeout = api_timeout - self.poll_interval = 5 # Fixed interval between status checks - self.max_polls = 120 # Maximum polling attempts (10 minutes with 5s interval) - - @staticmethod - def _is_protein_sequence(text: str) -> bool: - """Check if text looks like a protein sequence.""" - # Remove common FASTA header prefix - if text.startswith(">"): - text = "\n".join(text.split("\n")[1:]) - # Check if contains mostly protein amino acids - text = text.strip().replace("\n", "").replace(" ", "") - # Protein sequences contain only A-Z letters (standard amino acids) - return bool(re.fullmatch(r"[A-Z]+", text, re.I)) and len(text) > 10 + self.BASE_URL = "https://www.ebi.ac.uk/interpro/api" @staticmethod def _is_uniprot_accession(text: str) -> bool: @@ -61,186 +42,6 @@ def _is_uniprot_accession(text: str) -> bool: # UniProt: 6-10 chars starting with letter, e.g., P01308, Q96KN2 return bool(re.fullmatch(r"[A-Z][A-Z0-9]{5,9}", text.strip(), re.I)) - @retry( - stop=stop_after_attempt(3), - 
wait=wait_exponential(multiplier=1, min=2, max=5), - retry=retry_if_exception_type(RequestException), - reraise=True, - ) - def _submit_job(self, sequence: str, title: str = "") -> Optional[str]: - """ - Submit a protein sequence for InterProScan analysis. - - Args: - sequence (str): Protein sequence (FASTA or raw). - title (str): Optional job title. - - Returns: - Job ID if successful, None otherwise. - """ - url = f"{self.base_url}/run" - - # Parse sequence if FASTA format - if sequence.startswith(">"): - sequence = ( - "\n".join(sequence.split("\n")[1:]).replace("\n", "").replace(" ", "") - ) - - params = { - "email": self.email, - "title": title or "GraphGen_Analysis", - "sequence": sequence, - "stype": "p", - "appl": "NCBIfam,SMART,CDD,HAMAP", # Multiple databases - "goterms": "true", - "pathways": "true", - "format": "json", - } - - try: - response = requests.post(url, data=params, timeout=self.api_timeout) - if response.status_code == 200: - job_id = response.text.strip() - logger.debug("InterProScan job submitted: %s", job_id) - return job_id - logger.error( - "Failed to submit InterProScan job: %d - %s", - response.status_code, - response.text, - ) - return None - except RequestException as e: - logger.error("Request error while submitting job: %s", e) - raise - - @retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=2, max=5), - retry=retry_if_exception_type(RequestException), - reraise=True, - ) - def _check_status(self, job_id: str) -> Optional[str]: - """Check the status of a submitted job.""" - url = f"{self.base_url}/status/{job_id}" - try: - response = requests.get(url, timeout=self.api_timeout) - if response.status_code == 200: - return response.text.strip() - logger.warning( - "Failed to check job status for %s: %d", - job_id, - response.status_code, - ) - return None - except RequestException as e: - logger.error("Request error while checking status: %s", e) - raise - - @retry( - stop=stop_after_attempt(3), - 
wait=wait_exponential(multiplier=1, min=2, max=5), - retry=retry_if_exception_type(RequestException), - reraise=True, - ) - def _get_results(self, job_id: str) -> Optional[dict]: - """Retrieve the analysis results for a completed job.""" - url = f"{self.base_url}/result/{job_id}/json" - try: - response = requests.get(url, timeout=self.api_timeout) - if response.status_code == 200: - return response.json() - logger.warning( - "Failed to retrieve results for job %s: %d", - job_id, - response.status_code, - ) - return None - except RequestException as e: - logger.error("Request error while retrieving results: %s", e) - raise - - def _poll_job(self, job_id: str) -> Optional[dict]: - """ - Poll a job until completion and retrieve results. - - Args: - job_id (str): The job ID to poll. - - Returns: - Results dictionary if successful, None otherwise. - """ - for attempt in range(self.max_polls): - status = self._check_status(job_id) - - if status == "FINISHED": - logger.debug( - "Job %s completed after %d polls", - job_id, - attempt + 1, - ) - return self._get_results(job_id) - - if status in ["FAILED", "NOT_FOUND"]: - logger.warning("Job %s has status: %s", job_id, status) - return None - - if status == "RUNNING": - logger.debug( - "Job %s still running (attempt %d/%d)", - job_id, - attempt + 1, - self.max_polls, - ) - time.sleep(self.poll_interval) - else: - logger.debug("Job %s status: %s", job_id, status) - time.sleep(self.poll_interval) - - logger.warning( - "Job %s polling timed out after %d attempts", job_id, self.max_polls - ) - return None - - def search_by_sequence(self, sequence: str) -> Optional[Dict]: - """ - Search for protein domains in a sequence using InterProScan API. - - Args: - sequence (str): Protein sequence in FASTA or raw format. - - Returns: - Dictionary with domain analysis results or None if failed. 
- """ - if not sequence or not isinstance(sequence, str): - logger.error("Invalid sequence provided") - return None - - sequence = sequence.strip() - - if not self._is_protein_sequence(sequence): - logger.error("Invalid protein sequence format") - return None - - # Submit job - job_id = self._submit_job(sequence) - if not job_id: - logger.error("Failed to submit InterProScan job") - return None - - # Poll for results - results = self._poll_job(job_id) - if not results: - logger.error("Failed to retrieve InterProScan results for job %s", job_id) - return None - - return { - "molecule_type": "protein", - "database": "InterPro", - "job_id": job_id, - "content": results, - "url": f"https://www.ebi.ac.uk/interpro/result/{job_id}/", - } - def search_by_uniprot_id(self, accession: str) -> Optional[Dict]: """ Search InterPro database by UniProt accession number. @@ -261,7 +62,7 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]: accession = accession.strip().upper() # Query InterPro REST API for UniProt entry - url = f"https://www.ebi.ac.uk/interpro/api/entry/interpro/protein/uniprot/{accession}/" + url = f"{self.BASE_URL}/entry/interpro/protein/uniprot/{accession}/" response = requests.get(url, timeout=self.api_timeout) @@ -275,6 +76,14 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]: data = response.json() + # Get entry details for each InterPro entry found + for result in data.get("results", []): + interpro_acc = result.get("metadata", {}).get("accession") + if interpro_acc: + entry_details = self.get_entry_details(interpro_acc) + if entry_details: + result["entry_details"] = entry_details + result = { "molecule_type": "protein", "database": "InterPro", @@ -285,6 +94,31 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]: return result + def get_entry_details(self, interpro_accession: str) -> Optional[Dict]: + """ + Get detailed information for a specific InterPro entry. 
+ + Args: + interpro_accession (str): InterPro accession number (e.g., IPR000001). + Returns: + Dictionary with entry details or None if not found. + """ + if not interpro_accession or not isinstance(interpro_accession, str): + return None + + url = f"{self.BASE_URL}/entry/interpro/{interpro_accession}/" + + response = requests.get(url, timeout=self.api_timeout) + if response.status_code != 200: + logger.warning( + "Failed to get InterPro entry %s: %d", + interpro_accession, + response.status_code, + ) + return None + + return response.json() + @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=5), @@ -293,14 +127,10 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]: ) def search(self, query: str, **kwargs) -> Optional[Dict]: """ - Search InterPro for protein domain information. - - Automatically detects query type: - - UniProt accession number → lookup pre-computed domains - - Protein sequence (FASTA or raw) → submit for InterProScan analysis + Search InterPro for protein domain information by UniProt accession. Args: - query (str): Search query (UniProt ID or protein sequence). + query (str): UniProt accession number (e.g., P01308, Q96KN2). **kwargs: Additional arguments (unused). 
Returns: @@ -313,22 +143,9 @@ def search(self, query: str, **kwargs) -> Optional[Dict]: query = query.strip() logger.debug("InterPro search query: %s", query[:100]) - result = None - - # Check if UniProt accession - if self._is_uniprot_accession(query): - logger.debug("Detected UniProt accession: %s", query) - result = self.search_by_uniprot_id(query) - - # Check if protein sequence - elif self._is_protein_sequence(query): - logger.debug("Detected protein sequence (length: %d)", len(query)) - result = self.search_by_sequence(query) - - else: - # Try as UniProt ID first (in case format is non-standard) - logger.debug("Trying as UniProt accession: %s", query) - result = self.search_by_uniprot_id(query) + # Search by UniProt ID + logger.debug("Searching for UniProt accession: %s", query) + result = self.search_by_uniprot_id(query) if result: result["_search_query"] = query From d0b230c283195b62706e455fb0179c50ab20e524 Mon Sep 17 00:00:00 2001 From: chenzihong <58508660+ChenZiHong-Gavin@users.noreply.github.com> Date: Mon, 9 Feb 2026 18:57:40 +0800 Subject: [PATCH 5/8] Update graphgen/models/searcher/db/interpro_searcher.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- graphgen/models/searcher/db/interpro_searcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphgen/models/searcher/db/interpro_searcher.py b/graphgen/models/searcher/db/interpro_searcher.py index d7b140ec..9d3e7c06 100644 --- a/graphgen/models/searcher/db/interpro_searcher.py +++ b/graphgen/models/searcher/db/interpro_searcher.py @@ -55,7 +55,7 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]: Returns: Dictionary with domain information or None if not found. 
""" - if not accession or not isinstance(accession, str): + if not accession or not isinstance(accession, str) or not self._is_uniprot_accession(accession): logger.error("Invalid accession provided") return None From e445052cdab3a1b6fdf6e998cb617466f4220d59 Mon Sep 17 00:00:00 2001 From: chenzihong <58508660+ChenZiHong-Gavin@users.noreply.github.com> Date: Mon, 9 Feb 2026 18:57:53 +0800 Subject: [PATCH 6/8] Update examples/search/search_protein/search_interpro/README.md Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- examples/search/search_protein/search_interpro/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/search/search_protein/search_interpro/README.md b/examples/search/search_protein/search_interpro/README.md index 1e051eb7..5a848c05 100644 --- a/examples/search/search_protein/search_interpro/README.md +++ b/examples/search/search_protein/search_interpro/README.md @@ -26,7 +26,7 @@ Example configuration: input_path: - examples/input_examples/search_interpro_demo.jsonl -data_sources: [interpro] +data_source: interpro interpro_params: email: your_email@example.com api_timeout: 30 From 13326339cb5778ed1de6fae75dda9db3efc937e8 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Mon, 9 Feb 2026 18:59:30 +0800 Subject: [PATCH 7/8] docs: update README --- .../search_protein/search_interpro/README.md | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/examples/search/search_protein/search_interpro/README.md b/examples/search/search_protein/search_interpro/README.md index 5a848c05..052d1f6c 100644 --- a/examples/search/search_protein/search_interpro/README.md +++ b/examples/search/search_protein/search_interpro/README.md @@ -4,11 +4,7 @@ This example demonstrates how to search for protein domain information and funct ## Overview -The InterPro search pipeline reads protein queries (UniProt accession numbers or protein sequences) and searches 
the InterPro database to find domain matches, functional annotations, GO terms, and pathways. - -InterPro supports two search modes: -1. **UniProt Accession Lookup**: Fast lookup of pre-computed domain information for known UniProt entries -2. **Protein Sequence Analysis**: Submit protein sequences for InterProScan analysis to discover domains +The InterPro search pipeline reads protein queries (UniProt accession numbers) and searches the InterPro database to find domain matches, functional annotations, GO terms, and pathways. ## Quick Start @@ -18,7 +14,6 @@ Edit `search_interpro_config.yaml` to set: - **Input file path**: Set the path to your protein sequence or UniProt ID queries - **InterPro parameters**: - - `email`: Your email address for EBI API requests (required) - `api_timeout`: Request timeout in seconds (default: 30) Example configuration: @@ -53,16 +48,6 @@ The input file should be in JSONL format with protein queries: ```jsonl {"type": "protein", "content": "P01308"} {"type": "protein", "content": "Q96KN2"} -{"type": "protein", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} -``` - -Or in FASTA format: -``` ->P01308 -MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK - ->insulin_sequence -MHHHHHHSSGVDLGTENLYFQS... 
``` From bf68f13e38d8f5f95e731ad035e27cf0df7b9978 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Mon, 9 Feb 2026 19:03:30 +0800 Subject: [PATCH 8/8] docs: update README --- examples/search/search_protein/search_interpro/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/search/search_protein/search_interpro/README.md b/examples/search/search_protein/search_interpro/README.md index 052d1f6c..2a394562 100644 --- a/examples/search/search_protein/search_interpro/README.md +++ b/examples/search/search_protein/search_interpro/README.md @@ -23,7 +23,6 @@ input_path: data_source: interpro interpro_params: - email: your_email@example.com api_timeout: 30 ```