diff --git a/examples/input_examples/search_reactome_demo.jsonl b/examples/input_examples/search_reactome_demo.jsonl new file mode 100644 index 00000000..57ebf31f --- /dev/null +++ b/examples/input_examples/search_reactome_demo.jsonl @@ -0,0 +1,4 @@ +{"content": "R-HSA-69278", "type": "reactome_id"} +{"content": "apoptosis", "type": "keyword"} +{"content": "TP53", "type": "gene_symbol"} +{"content": "MAPK1\nERK2\nPTEN", "type": "gene_list"} diff --git a/examples/search/search_reactome/search_reactome_config.yaml b/examples/search/search_reactome/search_reactome_config.yaml new file mode 100644 index 00000000..b05ae91d --- /dev/null +++ b/examples/search/search_reactome/search_reactome_config.yaml @@ -0,0 +1,29 @@ +global_params: + working_dir: cache + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + graph_backend: kuzu # graph database backend, support: kuzu, networkx + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/search_reactome_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 + save_output: true + params: + data_source: reactome # data source for searcher, support: uniprot, ncbi, rnacentral, reactome + reactome_params: + species: "Homo sapiens" # species name, support: Homo sapiens, Mus musculus, Rattus norvegicus, etc. 
class ReactomeSearcher:
    """
    Reactome pathway search client keyed by UniProt accession.

    Supports:
    1) Searching pathways associated with a protein by UniProt accession.
    2) Ranking pathways by a heuristic relevance score (curated vs inferred,
       diagram availability, disease flag, informative name).
    3) Fetching detailed annotations for the top-ranked pathways.

    Only UniProt accessions are supported as queries; anything else is
    rejected by :meth:`search` and yields ``None``.

    API Documentation: https://reactome.org/ContentService
    """

    CONTENT_URL = "https://reactome.org/ContentService"

    # UniProt accession pattern (e.g., P04637, Q96KN2, O14763, A0A024R161).
    # The alternation is wrapped in a group so that the anchors apply to BOTH
    # branches; without the group, "P04637XYZ" would be accepted.
    UNIPROT_PATTERN = re.compile(
        r"^(?:[OPQ][0-9][A-Z0-9]{3}[0-9]"
        r"|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})$"
    )

    # Species code -> full species name accepted by the constructor.
    _SPECIES_CODES = {
        "HSA": "Homo sapiens",
        "MMU": "Mus musculus",
        "RNO": "Rattus norvegicus",
        "GGA": "Gallus gallus",
        "CEL": "Caenorhabditis elegans",
        "DME": "Drosophila melanogaster",
    }

    # Name fragments that mark a pathway as specific rather than generic.
    _SPECIFIC_TERMS = (
        "signaling",
        "regulation",
        "activation",
        "pathway",
        "synthesis",
        "degradation",
        "repair",
        "apoptosis",
    )

    # Pause between consecutive detail requests, in seconds.
    _DETAIL_REQUEST_DELAY = 0.1

    def __init__(
        self,
        species: str = "Homo sapiens",
        timeout: int = 30,
        top_n_details: int = 5,
        max_retries: int = 3,
    ):
        """
        Initialize the Reactome Pathway Search client.

        Args:
            species: Species name (e.g., "Homo sapiens", "Mus musculus") or code ("HSA").
            timeout: Request timeout in seconds.
            top_n_details: Number of top pathways to fetch detailed annotations for.
            max_retries: Number of additional attempts made after a failed
                request (0 disables retrying). Negative values are treated as 0.
        """
        self.timeout = timeout
        self.species = self._normalize_species(species)
        self.top_n_details = top_n_details
        self.max_retries = max(0, int(max_retries))
        self.session = requests.Session()
        self.session.headers.update({"Accept": "application/json"})

    @staticmethod
    def _normalize_species(species: str) -> str:
        """Convert a known species code to its full name; pass other values through."""
        cleaned = species.strip()
        return ReactomeSearcher._SPECIES_CODES.get(cleaned.upper(), cleaned)

    @staticmethod
    def _is_uniprot_accession(text: str) -> bool:
        """
        Check whether text is a well-formed UniProt accession number.

        Surrounding whitespace and letter case are ignored, matching the
        normalization applied before the accession is sent to Reactome.
        """
        if not text or not isinstance(text, str):
            return False
        candidate = text.strip().upper()
        return ReactomeSearcher.UNIPROT_PATTERN.fullmatch(candidate) is not None

    def _calculate_relevance_score(self, pathway: Dict[str, Any]) -> int:
        """
        Calculate the relevance score used for pathway ranking.

        Scoring criteria:
        - Manual curation (``isInferred`` is falsy): +10
        - Has pathway diagram: +5
        - Disease-related: +3
        - Specific biological term in the name: +2

        Args:
            pathway: Pathway record; missing keys fall back to the least
                favourable value (a missing ``isInferred`` counts as inferred).

        Returns:
            Integer score; higher means ranked earlier.
        """
        score = 0

        # Prioritize manually curated over computational predictions
        if not pathway.get("isInferred", True):
            score += 10

        # Visual representations indicate well-characterized pathways
        if pathway.get("hasDiagram", False):
            score += 5

        # Disease pathways often have higher clinical relevance
        if pathway.get("isInDisease", False):
            score += 3

        # displayName may be present but None, so do not rely on the .get default
        name = str(pathway.get("displayName") or "").lower()
        if any(term in name for term in self._SPECIFIC_TERMS):
            score += 2

        return score

    def _get_json(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
        """
        GET ``url`` and decode the JSON body, retrying failed requests.

        Up to ``self.max_retries`` additional attempts are made. Client errors
        (4xx other than 429) are not retried because repeating the same request
        cannot succeed.

        Returns:
            Decoded JSON payload, or None when the server answers 404.

        Raises:
            RequestException: When the final attempt fails.
            ValueError: When the body is not valid JSON (older ``requests``
                versions raise a plain ValueError here).
        """
        attempts = self.max_retries + 1
        for attempt in range(1, attempts + 1):
            try:
                response = self.session.get(url, params=params, timeout=self.timeout)
                if response.status_code == 404:
                    return None
                response.raise_for_status()
                return response.json()
            except RequestException as exc:
                status = getattr(getattr(exc, "response", None), "status_code", None)
                permanent = (
                    status is not None and 400 <= status < 500 and status != 429
                )
                if permanent or attempt == attempts:
                    raise
                logger.warning(
                    "Reactome request failed (attempt %d/%d): %s",
                    attempt,
                    attempts,
                    exc,
                )
                # Short exponential backoff, capped so a batch is not stalled
                time.sleep(min(0.5 * 2 ** (attempt - 1), 5.0))
        return None

    @staticmethod
    def _extract_details(data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Reduce a pathway record to the annotations kept in search results.

        Nested entries that are not JSON objects (for example ``null`` or a
        bare identifier) are skipped instead of raising.
        """

        def objects(key: str) -> list:
            return [item for item in (data.get(key) or []) if isinstance(item, dict)]

        summations = data.get("summation")
        first_summation = (
            summations[0] if isinstance(summations, list) and summations else None
        )

        return {
            "schemaClass": data.get("schemaClass"),
            "summation": first_summation,
            "compartment": [c.get("displayName") for c in objects("compartment")],
            "disease": [d.get("displayName") for d in objects("disease")],
            "sub_pathways": [
                {"stId": e.get("stId"), "name": e.get("displayName")}
                for e in objects("hasEvent")[:5]  # First 5 sub-events
            ],
            "literature_references": [
                {
                    "pubMedId": ref.get("pubMedIdentifier"),
                    "title": ref.get("displayName"),
                }
                for ref in objects("literatureReference")[:3]  # Top 3 refs
            ],
        }

    def _fetch_pathway_details(self, pathway_stid: str) -> Optional[Dict[str, Any]]:
        """
        Fetch detailed information for a specific pathway.

        Args:
            pathway_stid: Reactome stable ID (e.g., "R-HSA-111288").

        Returns:
            Dictionary with detailed annotations, or None if the pathway is
            unknown or the request fails.
        """
        url = f"{self.CONTENT_URL}/data/query/{pathway_stid}"

        try:
            data = self._get_json(url)
        except (RequestException, ValueError) as e:
            logger.error("Failed to fetch details for pathway %s: %s", pathway_stid, e)
            return None

        if data is None:
            logger.warning("Pathway %s not found in Reactome", pathway_stid)
            return None
        if not isinstance(data, dict):
            logger.error(
                "Unexpected response while fetching pathway %s", pathway_stid
            )
            return None

        return self._extract_details(data)

    @staticmethod
    def _hit_to_pathway(hit: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one search hit into the basic pathway record used for ranking."""
        stid = hit.get("stId")
        return {
            "stId": stid,
            "displayName": hit.get("displayName"),
            "dbId": hit.get("dbId"),
            "species": hit.get("species"),
            "isInDisease": hit.get("isInDisease", False),
            "isInferred": hit.get("isInferred", False),
            "hasDiagram": hit.get("hasDiagram", False),
            "url": f"https://reactome.org/PathwayBrowser/#/{stid}",
        }

    def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
        """
        Search Reactome pathways by UniProt accession number.

        Retrieves the pathways returned for the accession, ranks them by
        relevance, and fetches detailed annotations for the top N pathways.

        Args:
            accession: UniProt accession number (e.g., "P04637" for TP53).

        Returns:
            Dictionary with pathway information, or None if the accession is
            malformed, nothing is found, or the request fails:
            {
                "molecule_type": "protein",
                "database": "Reactome",
                "id": accession,
                "content": {
                    "total_found": int,
                    "pathways": List[Dict]  # Top pathways with details
                },
                "url": str  # Link to Reactome search
            }
        """
        if not self._is_uniprot_accession(accession):
            logger.error("Invalid UniProt accession format: %s", accession)
            return None

        accession = accession.strip().upper()
        logger.debug("Searching Reactome pathways for %s", accession)

        # Step 1: Search for all pathways
        # NOTE(review): the "type" parameter and the "searchHits" response key
        # are kept as originally written - confirm against the ContentService
        # /search/query schema.
        url = f"{self.CONTENT_URL}/search/query"
        params = {
            "query": accession,
            "species": self.species,
            "rows": 100,
            "type": "Pathway",
        }

        try:
            data = self._get_json(url, params=params)
        except (RequestException, ValueError) as e:
            logger.error("Failed to search Reactome for %s: %s", accession, e)
            return None

        hits = (data.get("searchHits") if isinstance(data, dict) else None) or []
        if not hits:
            logger.info("No pathways found for %s in %s", accession, self.species)
            return None

        # Step 2: Extract basic pathway info
        pathways = [
            self._hit_to_pathway(hit)
            for hit in hits
            if isinstance(hit, dict) and hit.get("type") == "Pathway"
        ]
        logger.info("Found %d pathways for %s", len(pathways), accession)

        # Step 3: Rank by relevance score (stable: ties keep the API's order)
        ranked = sorted(pathways, key=self._calculate_relevance_score, reverse=True)

        # Step 4: Fetch details for top N pathways
        top_pathways = ranked[: self.top_n_details]
        for index, pw in enumerate(top_pathways):
            if index:
                # Small delay between requests to avoid overwhelming the API
                time.sleep(self._DETAIL_REQUEST_DELAY)
            # A pathway is kept even when its details cannot be fetched
            pw["details"] = (
                self._fetch_pathway_details(pw["stId"]) if pw["stId"] else None
            )

        # Construct result in standard format
        return {
            "molecule_type": "protein",
            "database": "Reactome",
            "id": accession,
            "content": {
                "total_found": len(pathways),
                "pathways": top_pathways,
            },
            "url": f"https://reactome.org/content/query?q={accession}",
        }

    def search(self, query: str, **kwargs) -> Optional[Dict]:
        """
        Search Reactome for pathway information.

        Only UniProt accessions are supported; any other query is logged and
        rejected without contacting Reactome.

        Args:
            query: Search query (UniProt accession number).
            **kwargs: Additional arguments (unused).

        Returns:
            Dictionary with pathway information (plus a "_search_query" key
            holding the stripped query) or None if not found.
        """
        if not query or not isinstance(query, str):
            logger.error("Empty or invalid input for Reactome search")
            return None

        query = query.strip()
        logger.debug("Reactome search query: %s", query)

        if not self._is_uniprot_accession(query):
            logger.warning("Query %s not recognized as UniProt accession", query)
            return None

        logger.debug("Detected UniProt accession: %s", query)
        result = self.search_by_uniprot_id(query)
        if result:
            result["_search_query"] = query

        return result
from graphgen.models import ReactomeSearcher + + params = self.kwargs.get("reactome_params", {}) + self.searcher = ReactomeSearcher(**params) else: logger.error(f"Unknown data source: {self.data_source}")