From 3796c7c7a7820d746987d2fbda3c64cff558f94f Mon Sep 17 00:00:00 2001
From: CHERRY-ui8 <2693275288@qq.com>
Date: Fri, 5 Dec 2025 00:59:01 +0800
Subject: [PATCH 01/16] fix: fix dna/rna local blast
---
.gitignore | 4 +
graphgen/configs/search_dna_config.yaml | 2 +-
graphgen/configs/search_rna_config.yaml | 4 +-
graphgen/models/searcher/db/ncbi_searcher.py | 63 +++-
.../input_examples/search_dna_demo.jsonl | 13 +-
.../input_examples/search_rna_demo.jsonl | 3 +
scripts/search/build_db/build_dna_blast_db.sh | 69 ++++-
scripts/search/build_db/build_rna_blast_db.sh | 277 +++++++++++-------
uv.lock | 3 +
9 files changed, 308 insertions(+), 130 deletions(-)
create mode 100644 uv.lock
diff --git a/.gitignore b/.gitignore
index 678cdc50..b654d301 100644
--- a/.gitignore
+++ b/.gitignore
@@ -177,3 +177,7 @@ cache
*.pyc
*.html
.gradio
+
+# macOS
+.DS_Store
+**/.DS_Store
diff --git a/graphgen/configs/search_dna_config.yaml b/graphgen/configs/search_dna_config.yaml
index 5245ea0c..f53a5eb8 100644
--- a/graphgen/configs/search_dna_config.yaml
+++ b/graphgen/configs/search_dna_config.yaml
@@ -13,5 +13,5 @@ pipeline:
email: test@example.com # NCBI requires an email address
tool: GraphGen # tool name for NCBI API
use_local_blast: true # whether to use local blast for DNA search
- local_blast_db: /your_path/refseq_241 # path to local BLAST database (without .nhr extension)
+ local_blast_db: refseq_release/refseq_release # path to local BLAST database (without .nhr extension)
diff --git a/graphgen/configs/search_rna_config.yaml b/graphgen/configs/search_rna_config.yaml
index dae62ec2..10422988 100644
--- a/graphgen/configs/search_rna_config.yaml
+++ b/graphgen/configs/search_rna_config.yaml
@@ -11,6 +11,4 @@ pipeline:
data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
rnacentral_params:
use_local_blast: true # whether to use local blast for RNA search
- local_blast_db: /your_path/refseq_rna_241 # format: /path/to/refseq_rna_${RELEASE}
- # can also use DNA database with RNA sequences (if already built)
-
+ local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD # path to local BLAST database (without .nhr extension)
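
Both configs now point local_blast_db at a database name prefix rather than a single file: BLAST+ resolves the .nhr/.nin/.nsq volumes from that prefix at query time. Purely as an illustration (this is not the project's _local_blast code; the helper name and the tabular output format are assumptions), a prefix like the ones above would typically be handed to blastn like this:

import subprocess
from typing import Optional

def best_local_hit(seq: str, db_prefix: str, evalue: float = 1e-5) -> Optional[str]:
    """Return the subject accession of the best local hit, or None if there is none."""
    # db_prefix is e.g. "refseq_release/refseq_release" -- no .nhr/.nin/.nsq extension.
    proc = subprocess.run(
        ["blastn", "-db", db_prefix, "-query", "-",
         "-outfmt", "6 sacc evalue", "-max_target_seqs", "1", "-evalue", str(evalue)],
        input=f">query\n{seq}\n", capture_output=True, text=True, check=True,
    )
    lines = proc.stdout.splitlines()
    return lines[0].split("\t")[0] if lines else None
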
diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py
index 0de8ecc0..8b64d4ba 100644
--- a/graphgen/models/searcher/db/ncbi_searcher.py
+++ b/graphgen/models/searcher/db/ncbi_searcher.py
@@ -83,6 +83,29 @@ def _nested_get(data: dict, *keys, default=None):
data = data.get(key, default)
return data
+ @staticmethod
+ def _infer_molecule_type_detail(accession: Optional[str], gene_type: Optional[int] = None) -> Optional[str]:
+ """Infer molecule_type_detail from accession prefix or gene type."""
+ if accession:
+ if accession.startswith(("NM_", "XM_")):
+ return "mRNA"
+ elif accession.startswith(("NC_", "NT_")):
+ return "genomic DNA"
+ elif accession.startswith(("NR_", "XR_")):
+ return "RNA"
+ elif accession.startswith("NG_"):
+ return "genomic region"
+ # Fallback: infer from gene type if available
+ if gene_type is not None:
+ gene_type_map = {
+ 3: "rRNA",
+ 4: "tRNA",
+ 5: "snRNA",
+ 6: "ncRNA",
+ }
+ return gene_type_map.get(gene_type)
+ return None
+
def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
"""
Convert an Entrez gene record to a dictionary.
@@ -120,7 +143,7 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
else None
)
- # Extract representative accession
+ # Extract representative accession (prefer type 3 = mRNA/transcript)
representative_accession = next(
(
product.get("Gene-commentary_accession")
@@ -129,6 +152,17 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
),
None,
)
+ # Fallback: if no type 3 accession, try any available accession
+ # This is needed for genes that don't have mRNA transcripts but have other sequence records
+ if not representative_accession:
+ representative_accession = next(
+ (
+ product.get("Gene-commentary_accession")
+ for product in locus.get("Gene-commentary_products", [])
+ if product.get("Gene-commentary_accession")
+ ),
+ None,
+ )
# Extract function
function = data.get("Entrezgene_summary") or next(
@@ -169,18 +203,19 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
"sequence": None,
"sequence_length": None,
"gene_id": gene_id,
- "molecule_type_detail": None,
+ "molecule_type_detail": self._infer_molecule_type_detail(
+ representative_accession, data.get("Entrezgene_type")
+ ),
"_representative_accession": representative_accession,
}
def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]:
"""Get gene information by Gene ID."""
- def _extract_from_genbank(result: dict, accession: str):
- """Enrich result dictionary with sequence and summary information from accession."""
+ def _extract_metadata_from_genbank(result: dict, accession: str):
+ """Extract metadata from GenBank format (title, features, organism, etc.)."""
with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle:
record = SeqIO.read(handle, "genbank")
- result["sequence"] = str(record.seq)
- result["sequence_length"] = len(record.seq)
+
result["title"] = record.description
result["molecule_type_detail"] = (
"mRNA" if accession.startswith(("NM_", "XM_")) else
@@ -203,7 +238,20 @@ def _extract_from_genbank(result: dict, accession: str):
if not result.get("organism") and 'organism' in record.annotations:
result["organism"] = record.annotations['organism']
+
+ return result
+ def _extract_sequence_from_fasta(result: dict, accession: str):
+ """Extract sequence from FASTA format (more reliable than GenBank for CON-type records)."""
+ try:
+ with Entrez.efetch(db="nuccore", id=accession, rettype="fasta", retmode="text") as fasta_handle:
+ fasta_record = SeqIO.read(fasta_handle, "fasta")
+ result["sequence"] = str(fasta_record.seq)
+ result["sequence_length"] = len(fasta_record.seq)
+ except Exception as fasta_exc:
+ logger.warning("Failed to extract sequence from accession %s using FASTA format: %s", accession, fasta_exc)
+ result["sequence"] = None
+ result["sequence_length"] = None
return result
try:
@@ -214,7 +262,8 @@ def _extract_from_genbank(result: dict, accession: str):
result = self._gene_record_to_dict(gene_record, gene_id)
if accession := (preferred_accession or result.get("_representative_accession")):
- result = _extract_from_genbank(result, accession)
+ result = _extract_metadata_from_genbank(result, accession)
+ result = _extract_sequence_from_fasta(result, accession)
result.pop("_representative_accession", None)
return result
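
For quick reference, the accession-prefix mapping added in _infer_molecule_type_detail (and reused in _extract_metadata_from_genbank) can be spot-checked in isolation. The snippet below restates it outside the searcher class purely as an illustration, using accessions that appear in the demo inputs:

from typing import Optional

PREFIX_TO_DETAIL = {
    ("NM_", "XM_"): "mRNA",
    ("NC_", "NT_"): "genomic DNA",
    ("NR_", "XR_"): "RNA",
    ("NG_",): "genomic region",
}

def infer_detail(accession: str) -> Optional[str]:
    # Mirrors the prefix checks in the patch above; first match wins.
    for prefixes, detail in PREFIX_TO_DETAIL.items():
        if accession.startswith(prefixes):
            return detail
    return None

assert infer_detail("NM_000546") == "mRNA"             # TP53 mRNA
assert infer_detail("NG_033923") == "genomic region"   # from search_dna_demo.jsonl
assert infer_detail("AB123456") is None                # unknown prefix falls through
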
diff --git a/resources/input_examples/search_dna_demo.jsonl b/resources/input_examples/search_dna_demo.jsonl
index 346b65f0..f423e1c1 100644
--- a/resources/input_examples/search_dna_demo.jsonl
+++ b/resources/input_examples/search_dna_demo.jsonl
@@ -1,9 +1,4 @@
-{"type": "text", "content": "TP53"}
-{"type": "text", "content": "BRCA1"}
-{"type": "text", "content": "672"}
-{"type": "text", "content": "11998"}
-{"type": "text", "content": "NM_000546"}
-{"type": "text", "content": "NM_024140"}
-{"type": "text", "content": ">query\nCTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"}
-{"type": "text", "content": "CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"}
-
+{"type": "text", "content": "NG_033923"}
+{"type": "text", "content": "NG_056118"}
+{"type": "text", "content": ">query\nACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"}
+{"type": "text", "content": "ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"}
diff --git a/resources/input_examples/search_rna_demo.jsonl b/resources/input_examples/search_rna_demo.jsonl
index 16e99479..896473e2 100644
--- a/resources/input_examples/search_rna_demo.jsonl
+++ b/resources/input_examples/search_rna_demo.jsonl
@@ -1,5 +1,8 @@
{"type": "text", "content": "hsa-let-7a-1"}
+{"type": "text", "content": "XIST regulator"}
{"type": "text", "content": "URS0000123456"}
{"type": "text", "content": "URS0000000001"}
+{"type": "text", "content": "URS0000000787"}
+{"type": "text", "content": "GCAGTTCTCAGCCATGACAGATGGGAGTTTCGGCCCAATTGACCAGTATTCCTTACTGATAAGAGACACTGACCATGGAGTGGTTCTGGTGAGATGACATGACCCTCGTGAAGGGGCCTGAAGCTTCATTGTGTTTGTGTATGTTTCTCTCTTCAAAAATATTCATGACTTCTCCTGTAGCTTGATAAATATGTATATTTACACACTGCA"}
{"type": "text", "content": ">query\nCUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"}
{"type": "text", "content": "CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"}
diff --git a/scripts/search/build_db/build_dna_blast_db.sh b/scripts/search/build_db/build_dna_blast_db.sh
index b53b4249..1928d7d0 100755
--- a/scripts/search/build_db/build_dna_blast_db.sh
+++ b/scripts/search/build_db/build_dna_blast_db.sh
@@ -24,7 +24,8 @@ set -e
# - {category}.{number}.genomic.fna.gz (genomic sequences)
# - {category}.{number}.rna.fna.gz (RNA sequences)
#
-# Usage: ./build_dna_blast_db.sh [representative|complete|all]
+# Usage: ./build_dna_blast_db.sh [human_mouse|representative|complete|all]
+# human_mouse: Download only Homo sapiens and Mus musculus sequences (minimal, smallest)
# representative: Download genomic sequences from major categories (recommended, smaller)
# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi
# complete: Download all complete genomic sequences from complete/ directory (very large)
@@ -35,7 +36,7 @@ set -e
# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
-DOWNLOAD_TYPE=${1:-representative}
+DOWNLOAD_TYPE=${1:-human_mouse}
# Better to use a stable DOWNLOAD_TMP name to support resuming downloads
DOWNLOAD_TMP=_downloading_dna
@@ -57,8 +58,66 @@ else
echo "Using date as release identifier: ${RELEASE}"
fi
+# Function to check if a file contains target species
+check_file_for_species() {
+ local url=$1
+ local filename=$2
+ local temp_file="/tmp/check_${filename//\//_}"
+
+ # Download first 500KB (enough to get many sequence headers)
+ # This should be sufficient to identify the species in most cases
+ if curl -s --max-time 30 --range 0-512000 "${url}" -o "${temp_file}" 2>/dev/null && [ -s "${temp_file}" ]; then
+ # Try to decompress and check for species names
+ if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus)"; then
+ rm -f "${temp_file}"
+ return 0 # Contains target species
+ else
+ rm -f "${temp_file}"
+ return 1 # Does not contain target species
+ fi
+ else
+ # If partial download fails, skip this file (don't download it)
+ rm -f "${temp_file}"
+ return 1
+ fi
+}
+
# Download based on type
case ${DOWNLOAD_TYPE} in
+ human_mouse)
+ echo "Downloading RefSeq sequences for Homo sapiens and Mus musculus only (minimal size)..."
+ echo "This will check each file to see if it contains human or mouse sequences..."
+ category="vertebrate_mammalian"
+ echo "Checking files in ${category} category..."
+
+ # Get list of files and save to temp file to avoid subshell issues
+ curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
+ grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
+ sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files.txt
+
+ file_count=0
+ download_count=0
+
+ while read filename; do
+ file_count=$((file_count + 1))
+ url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}"
+ echo -n "[${file_count}] Checking ${filename}... "
+
+ if check_file_for_species "${url}" "${filename}"; then
+ echo "✓ contains target species, downloading..."
+ download_count=$((download_count + 1))
+ wget -c -q --show-progress "${url}" || {
+ echo "Warning: Failed to download ${filename}"
+ }
+ else
+ echo "✗ skipping (no human/mouse data)"
+ fi
+ done < /tmp/refseq_files.txt
+
+ rm -f /tmp/refseq_files.txt
+ echo ""
+ echo "Summary: Checked ${file_count} files, downloaded ${download_count} files containing human or mouse sequences."
+ ;;
representative)
echo "Downloading RefSeq representative sequences (recommended, smaller size)..."
# Download major categories for representative coverage
@@ -109,7 +168,11 @@ case ${DOWNLOAD_TYPE} in
;;
*)
echo "Error: Unknown download type '${DOWNLOAD_TYPE}'"
- echo "Usage: $0 [representative|complete|all]"
+ echo "Usage: $0 [human_mouse|representative|complete|all]"
+ echo " human_mouse: Download only Homo sapiens and Mus musculus (minimal)"
+ echo " representative: Download major categories (recommended)"
+ echo " complete: Download all complete genomic sequences (very large)"
+ echo " all: Download all genomic sequences (extremely large)"
echo "Note: For RNA sequences, use build_rna_blast_db.sh instead"
exit 1
;;
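
check_file_for_species keeps the human_mouse download small by fetching only the first ~500 KB of each .genomic.fna.gz file and grepping the visible FASTA headers for the target species. A rough Python equivalent of that heuristic (illustrative only; the function name is made up, and truncated gzip data is handled with zlib instead of gunzip):

import urllib.request
import zlib

def contains_human_or_mouse(url: str, nbytes: int = 512_000) -> bool:
    """Peek at the start of a remote .fna.gz file and look for target species names."""
    req = urllib.request.Request(url, headers={"Range": f"bytes=0-{nbytes}"})
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            chunk = resp.read()
        # wbits=47 auto-detects the gzip header and tolerates a truncated stream.
        text = zlib.decompressobj(wbits=47).decompress(chunk).decode("ascii", "ignore")
    except (OSError, zlib.error):
        return False  # mirror the shell version: if the probe fails, skip the file
    return "Homo sapiens" in text or "Mus musculus" in text
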
diff --git a/scripts/search/build_db/build_rna_blast_db.sh b/scripts/search/build_db/build_rna_blast_db.sh
index 89b9dc0e..a3a7a16f 100755
--- a/scripts/search/build_db/build_rna_blast_db.sh
+++ b/scripts/search/build_db/build_rna_blast_db.sh
@@ -2,156 +2,219 @@
set -e
-# Downloads NCBI RefSeq RNA sequences and creates BLAST databases.
-# This script specifically downloads RNA sequences (mRNA, rRNA, tRNA, etc.)
-# from RefSeq, which is suitable for RNA sequence searches.
+# Downloads RNAcentral sequences and creates BLAST databases.
+# This script downloads the RNAcentral active database, which is the same
+# data source used for online RNAcentral searches, ensuring consistency
+# between local and online search results.
#
-# Usage: ./build_rna_blast_db.sh [representative|complete|all]
-# representative: Download RNA sequences from major categories (recommended, smaller)
-# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi, invertebrate, plant, viral
-# complete: Download all RNA sequences from complete/ directory (very large)
-# all: Download all RNA sequences from all categories (very large)
+# RNAcentral is a comprehensive database of non-coding RNA sequences that
+# integrates data from multiple expert databases including RefSeq, Rfam, etc.
+#
+# Usage: ./build_rna_blast_db.sh [all|list|database_name]
+# all (default): Download complete active database (~8.4G compressed)
+# list: List all available database subsets
+# database_name: Download specific database subset (e.g., refseq, rfam, mirbase)
+#
+# Available database subsets (examples):
+# - refseq.fasta (~98M): RefSeq RNA sequences
+# - rfam.fasta (~1.5G): Rfam RNA families
+# - mirbase.fasta (~10M): microRNA sequences
+# - ensembl.fasta (~2.9G): Ensembl annotations
+# - See "list" option for complete list
+#
+# The complete "active" database contains all sequences from all expert databases.
+# Using a specific database subset provides a smaller, focused database.
#
# We need makeblastdb on our PATH
# For Ubuntu/Debian: sudo apt install ncbi-blast+
# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
-DOWNLOAD_TYPE=${1:-representative}
+# RNAcentral HTTP base URL (using HTTPS for better reliability)
+RNACENTRAL_BASE="https://ftp.ebi.ac.uk/pub/databases/RNAcentral"
+RNACENTRAL_RELEASE_URL="${RNACENTRAL_BASE}/current_release"
+RNACENTRAL_SEQUENCES_URL="${RNACENTRAL_RELEASE_URL}/sequences"
+RNACENTRAL_BY_DB_URL="${RNACENTRAL_SEQUENCES_URL}/by-database"
+
+# Parse command line argument
+DB_SELECTION=${1:-all}
+
+# List available databases if requested
+if [ "${DB_SELECTION}" = "list" ]; then
+ echo "Available RNAcentral database subsets:"
+ echo ""
+ echo "Fetching list from RNAcentral FTP..."
+ curl -s "${RNACENTRAL_BY_DB_URL}/" | \
+        grep -oE 'href="[^"]*\.fasta"' | \
+        sed 's/href="\(.*\)"/\1/' | \
+ sort | \
+ while read db; do
+ size=$(curl -s "${RNACENTRAL_BY_DB_URL}/" | grep -A 1 "${db}" | grep -oE '[0-9.]+[GMK]' | head -1 || echo "unknown")
+ echo " - ${db%.fasta}: ${size}"
+ done
+ echo ""
+ echo "Usage: $0 [database_name]"
+ echo " Example: $0 refseq # Download only RefSeq sequences (~98M)"
+ echo " Example: $0 rfam # Download only Rfam sequences (~1.5G)"
+ echo " Example: $0 all # Download complete active database (~8.4G)"
+ exit 0
+fi
# Better to use a stable DOWNLOAD_TMP name to support resuming downloads
-DOWNLOAD_TMP=_downloading_rna
+DOWNLOAD_TMP=_downloading_rnacentral
mkdir -p ${DOWNLOAD_TMP}
cd ${DOWNLOAD_TMP}
-# Download RefSeq release information
-echo "Downloading RefSeq release information..."
-wget -c "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER" || {
- echo "Warning: Could not download RELEASE_NUMBER, using current date as release identifier"
+# Get RNAcentral release version from release notes
+echo "Getting RNAcentral release information..."
+RELEASE_NOTES_URL="${RNACENTRAL_RELEASE_URL}/release_notes.txt"
+RELEASE_NOTES="release_notes.txt"
+wget -q "${RELEASE_NOTES_URL}" 2>/dev/null || {
+ echo "Warning: Could not download release notes, using current date as release identifier"
RELEASE=$(date +%Y%m%d)
}
-if [ -f "RELEASE_NUMBER" ]; then
- RELEASE=$(cat RELEASE_NUMBER | tr -d '\n')
- echo "RefSeq release: ${RELEASE}"
+if [ -f "${RELEASE_NOTES}" ]; then
+ # Try to extract version from release notes (first line usually contains version info)
+ RELEASE=$(head -1 "${RELEASE_NOTES}" | grep -oE '[0-9]+\.[0-9]+' | head -1 | tr -d '.' || date +%Y%m%d)
+ if [ -z "${RELEASE}" ] || [ "${RELEASE}" = "$(date +%Y%m%d)" ]; then
+ RELEASE=$(date +%Y%m%d)
+ echo "Using date as release identifier: ${RELEASE}"
+ else
+ echo "RNAcentral release: ${RELEASE}"
+ fi
else
RELEASE=$(date +%Y%m%d)
echo "Using date as release identifier: ${RELEASE}"
fi
-# Download based on type
-case ${DOWNLOAD_TYPE} in
- representative)
- echo "Downloading RefSeq representative RNA sequences (recommended, smaller size)..."
- echo "Downloading RNA sequences from major categories..."
- for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral; do
- echo "Downloading ${category} RNA sequences..."
- curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
- grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \
- sed 's/href="\(.*\)"/\1/' | \
- while read filename; do
- echo " Downloading ${filename}..."
- wget -c -q --show-progress \
- "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || {
- echo "Warning: Failed to download ${filename}"
- }
- done
- done
- ;;
- complete)
- echo "Downloading RefSeq complete RNA sequences (WARNING: very large, may take hours)..."
- curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/" | \
- grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \
- sed 's/href="\(.*\)"/\1/' | \
- while read filename; do
- echo " Downloading ${filename}..."
- wget -c -q --show-progress \
- "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/${filename}" || {
- echo "Warning: Failed to download ${filename}"
- }
- done
- ;;
- all)
- echo "Downloading all RefSeq RNA sequences from all categories (WARNING: extremely large, may take many hours)..."
- for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do
- echo "Downloading ${category} RNA sequences..."
- curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
- grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \
- sed 's/href="\(.*\)"/\1/' | \
- while read filename; do
- echo " Downloading ${filename}..."
- wget -c -q --show-progress \
- "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || {
- echo "Warning: Failed to download ${filename}"
- }
- done
- done
- ;;
- *)
- echo "Error: Unknown download type '${DOWNLOAD_TYPE}'"
- echo "Usage: $0 [representative|complete|all]"
+# Download RNAcentral FASTA file
+if [ "${DB_SELECTION}" = "all" ]; then
+ # Download complete active database
+ FASTA_FILE="rnacentral_active.fasta.gz"
+ DB_NAME="rnacentral"
+ echo "Downloading RNAcentral active sequences (~8.4G)..."
+ echo " Contains sequences currently present in at least one expert database"
+ echo " Uses standard URS IDs (e.g., URS000149A9AF)"
+ echo " ⭐ MATCHES the online RNAcentral API database - ensures consistency"
+ FASTA_URL="${RNACENTRAL_SEQUENCES_URL}/${FASTA_FILE}"
+ IS_COMPRESSED=true
+else
+ # Download specific database subset
+ DB_NAME="${DB_SELECTION}"
+ FASTA_FILE="${DB_SELECTION}.fasta"
+ echo "Downloading RNAcentral database subset: ${DB_SELECTION}"
+ echo " This is a subset of the active database from a specific expert database"
+ echo " File: ${FASTA_FILE}"
+ FASTA_URL="${RNACENTRAL_BY_DB_URL}/${FASTA_FILE}"
+ IS_COMPRESSED=false
+
+ # Check if database exists
+ if ! curl -s -o /dev/null -w "%{http_code}" "${FASTA_URL}" | grep -q "200"; then
+ echo "Error: Database '${DB_SELECTION}' not found"
+ echo "Run '$0 list' to see available databases"
exit 1
- ;;
-esac
-
-cd ..
-
-# Create release directory
-mkdir -p refseq_rna_${RELEASE}
-mv ${DOWNLOAD_TMP}/* refseq_rna_${RELEASE}/ 2>/dev/null || true
-rmdir ${DOWNLOAD_TMP} 2>/dev/null || true
-
-cd refseq_rna_${RELEASE}
-
-# Extract and combine sequences
-echo "Extracting and combining RNA sequences..."
-
-# Extract all downloaded RNA sequences
-if [ $(find . -name "*.rna.fna.gz" -type f | wc -l) -gt 0 ]; then
- echo "Extracting RNA sequences..."
- find . -name "*.rna.fna.gz" -type f -exec gunzip {} \;
+ fi
fi
-# Combine all FASTA files into one
-echo "Combining all FASTA files..."
-FASTA_FILES=$(find . -name "*.fna" -type f)
-if [ -z "$FASTA_FILES" ]; then
- FASTA_FILES=$(find . -name "*.fa" -type f)
+echo "Downloading from: ${FASTA_URL}"
+echo "This may take a while depending on your internet connection..."
+if [ "${DB_SELECTION}" = "all" ]; then
+ echo "File size is approximately 8-9GB, please be patient..."
+else
+ echo "Downloading database subset..."
fi
+wget -c --progress=bar:force "${FASTA_URL}" 2>&1 || {
+ echo "Error: Failed to download RNAcentral FASTA file"
+ echo "Please check your internet connection and try again"
+ echo "You can also try downloading manually from: ${FASTA_URL}"
+ exit 1
+}
-if [ -z "$FASTA_FILES" ]; then
- echo "Error: No FASTA files found to combine"
+if [ ! -f "${FASTA_FILE}" ]; then
+ echo "Error: Downloaded file not found"
exit 1
fi
-echo "$FASTA_FILES" | while read -r file; do
- if [ -f "$file" ]; then
- cat "$file" >> refseq_rna_${RELEASE}.fasta
+cd ..
+
+# Create release directory
+if [ "${DB_SELECTION}" = "all" ]; then
+ OUTPUT_DIR="rnacentral_${RELEASE}"
+else
+ OUTPUT_DIR="rnacentral_${DB_NAME}_${RELEASE}"
+fi
+mkdir -p ${OUTPUT_DIR}
+mv ${DOWNLOAD_TMP}/* ${OUTPUT_DIR}/ 2>/dev/null || true
+rmdir ${DOWNLOAD_TMP} 2>/dev/null || true
+
+cd ${OUTPUT_DIR}
+
+# Extract FASTA file if compressed
+echo "Preparing RNAcentral sequences..."
+if [ -f "${FASTA_FILE}" ]; then
+ if [ "${IS_COMPRESSED}" = "true" ]; then
+ echo "Decompressing ${FASTA_FILE}..."
+ OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta"
+ gunzip -c "${FASTA_FILE}" > "${OUTPUT_FASTA}" || {
+ echo "Error: Failed to decompress FASTA file"
+ exit 1
+ }
+ # Optionally remove the compressed file to save space
+ # rm "${FASTA_FILE}"
+ else
+ # File is not compressed, just copy/rename
+ OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta"
+ cp "${FASTA_FILE}" "${OUTPUT_FASTA}" || {
+ echo "Error: Failed to copy FASTA file"
+ exit 1
+ }
fi
-done
+else
+ echo "Error: FASTA file not found"
+ exit 1
+fi
# Check if we have sequences
-if [ ! -s "refseq_rna_${RELEASE}.fasta" ]; then
- echo "Error: Combined FASTA file is empty"
+if [ ! -s "${OUTPUT_FASTA}" ]; then
+ echo "Error: FASTA file is empty"
exit 1
fi
+# Get file size for user information
+FILE_SIZE=$(du -h "${OUTPUT_FASTA}" | cut -f1)
+echo "FASTA file size: ${FILE_SIZE}"
+
echo "Creating BLAST database..."
# Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide)
-makeblastdb -in refseq_rna_${RELEASE}.fasta \
- -out refseq_rna_${RELEASE} \
+# Note: RNAcentral uses RNAcentral IDs (URS...) as sequence identifiers,
+# which matches the format expected by the RNACentralSearch class
+DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}"
+makeblastdb -in "${OUTPUT_FASTA}" \
+ -out "${DB_OUTPUT_NAME}" \
-dbtype nucl \
-parse_seqids \
- -title "RefSeq_RNA_${RELEASE}"
+ -title "RNAcentral_${DB_NAME}_${RELEASE}"
+echo ""
echo "BLAST database created successfully!"
-echo "Database location: $(pwd)/refseq_rna_${RELEASE}"
+echo "Database location: $(pwd)/${DB_OUTPUT_NAME}"
echo ""
-echo "To use this database, set in your config:"
-echo " local_blast_db: $(pwd)/refseq_rna_${RELEASE}"
+echo "To use this database, set in your config (search_rna_config.yaml):"
+echo " rnacentral_params:"
+echo " use_local_blast: true"
+echo " local_blast_db: $(pwd)/${DB_OUTPUT_NAME}"
echo ""
echo "Note: The database files are:"
-ls -lh refseq_rna_${RELEASE}.*
+ls -lh ${DB_OUTPUT_NAME}.* | head -5
+echo ""
+if [ "${DB_SELECTION}" = "all" ]; then
+ echo "This database uses RNAcentral IDs (URS...), which matches the online"
+ echo "RNAcentral search API, ensuring consistent results between local and online searches."
+else
+ echo "This is a subset database from ${DB_SELECTION} expert database."
+ echo "For full coverage matching online API, use 'all' option."
+fi
cd ..
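
After makeblastdb finishes, the resulting database can be sanity-checked before wiring it into search_rna_config.yaml. A small helper for that, assuming BLAST+ is installed so blastdbcmd is on PATH (the helper name and example prefix are made up):

import subprocess

def blast_db_info(db_prefix: str) -> str:
    """Return the summary blastdbcmd prints for a database prefix (sequence count, size, date)."""
    return subprocess.run(
        ["blastdbcmd", "-db", db_prefix, "-info"],
        capture_output=True, text=True, check=True,
    ).stdout

# e.g. print(blast_db_info("rnacentral_refseq_20251205/refseq_20251205"))
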
diff --git a/uv.lock b/uv.lock
new file mode 100644
index 00000000..a02a6a37
--- /dev/null
+++ b/uv.lock
@@ -0,0 +1,3 @@
+version = 1
+revision = 3
+requires-python = ">=3.10"
From 9bc4ac3cdd4d8939c9fc0d96b152332e5aa08ff5 Mon Sep 17 00:00:00 2001
From: CHERRY-ui8 <2693275288@qq.com>
Date: Wed, 10 Dec 2025 01:50:37 +0800
Subject: [PATCH 02/16] fix: fix rna search with no gene info
---
graphgen/models/searcher/db/rnacentral_searcher.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py
index 58c5e86e..ba7da499 100644
--- a/graphgen/models/searcher/db/rnacentral_searcher.py
+++ b/graphgen/models/searcher/db/rnacentral_searcher.py
@@ -58,7 +58,8 @@ def _rna_data_to_dict(
acc = xref.get("accession", {})
if s := acc.get("species"):
organisms.add(s)
- if g := acc.get("gene", "").strip():
+ gene_value = acc.get("gene")
+ if gene_value and isinstance(gene_value, str) and (g := gene_value.strip()):
gene_names.add(g)
if m := xref.get("modifications"):
modifications.extend(m)
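
The underlying issue in this patch: xref payloads can carry "gene": null, and dict.get only substitutes its default when the key is missing, not when the stored value is None, so the old acc.get("gene", "").strip() raised AttributeError. A minimal reproduction (the payload dict is made up):

acc = {"species": "Homo sapiens", "gene": None}  # hypothetical xref accession payload

# Old expression: acc.get("gene", "") evaluates to None here, so .strip() raises
#   AttributeError: 'NoneType' object has no attribute 'strip'

# New guard from the patch: only strip real, non-empty strings.
gene_value = acc.get("gene")
if isinstance(gene_value, str) and (g := gene_value.strip()):
    print(g)  # not reached for this payload
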
From 16a6b187201c1a112d908c1e2d4ab7979f4564d6 Mon Sep 17 00:00:00 2001
From: Yuchen Hua <2693275288@qq.com>
Date: Wed, 10 Dec 2025 01:58:20 +0800
Subject: [PATCH 03/16] Update
graphgen/models/searcher/db/rnacentral_searcher.py
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
graphgen/models/searcher/db/rnacentral_searcher.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py
index ba7da499..e5b91c00 100644
--- a/graphgen/models/searcher/db/rnacentral_searcher.py
+++ b/graphgen/models/searcher/db/rnacentral_searcher.py
@@ -59,7 +59,7 @@ def _rna_data_to_dict(
if s := acc.get("species"):
organisms.add(s)
gene_value = acc.get("gene")
- if gene_value and isinstance(gene_value, str) and (g := gene_value.strip()):
+ if isinstance(gene_value, str) and (g := gene_value.strip()):
gene_names.add(g)
if m := xref.get("modifications"):
modifications.extend(m)
From 622b605de6538fefa6590a15423a6323f8bc41d6 Mon Sep 17 00:00:00 2001
From: CHERRY-ui8 <2693275288@qq.com>
Date: Wed, 10 Dec 2025 19:29:05 +0800
Subject: [PATCH 04/16] fix: disable API fallback when local BLAST is enabled
---
graphgen/models/searcher/db/ncbi_searcher.py | 15 ++--
.../models/searcher/db/rnacentral_searcher.py | 17 +++--
.../models/searcher/db/uniprot_searcher.py | 69 ++++++++++---------
3 files changed, 60 insertions(+), 41 deletions(-)
diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py
index f453c700..55ae4daf 100644
--- a/graphgen/models/searcher/db/ncbi_searcher.py
+++ b/graphgen/models/searcher/db/ncbi_searcher.py
@@ -393,11 +393,18 @@ def _process_network_blast_result(blast_record, seq: str, threshold: float) -> O
return None
# Try local BLAST first if enabled
- if self.use_local_blast and (accession := self._local_blast(seq, threshold)):
- logger.debug("Local BLAST found accession: %s", accession)
- return self.get_by_accession(accession)
+ if self.use_local_blast:
+ accession = self._local_blast(seq, threshold)
+ if accession:
+ logger.debug("Local BLAST found accession: %s", accession)
+ return self.get_by_accession(accession)
+ logger.info(
+ "Local BLAST found no match for sequence. "
+ "API fallback disabled when using local database."
+ )
+ return None
- # Fall back to network BLAST
+ # Fall back to network BLAST only if local BLAST is not enabled
logger.debug("Falling back to NCBIWWW.qblast")
with NCBIWWW.qblast("blastn", "nr", seq, hitlist_size=1, expect=threshold) as result_handle:
diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py
index e5b91c00..a6884a61 100644
--- a/graphgen/models/searcher/db/rnacentral_searcher.py
+++ b/graphgen/models/searcher/db/rnacentral_searcher.py
@@ -255,8 +255,13 @@ def _extract_sequence(sequence: str) -> Optional[str]:
if accession:
logger.debug("Local BLAST found accession: %s", accession)
return self.get_by_rna_id(accession)
+ logger.info(
+ "Local BLAST found no match for sequence. "
+ "API fallback disabled when using local database."
+ )
+ return None
- # Fall back to RNAcentral API if local BLAST didn't find result
+ # Fall back to RNAcentral API only if local BLAST is not enabled
logger.debug("Falling back to RNAcentral API.")
md5_hash = self._calculate_md5(seq)
@@ -272,11 +277,13 @@ def _extract_sequence(sequence: str) -> Optional[str]:
if not results:
logger.info("No exact match found in RNAcentral for sequence")
return None
+
rna_id = results[0].get("rnacentral_id")
- if not rna_id:
- logger.error("No RNAcentral ID found in search results.")
- return None
- return self.get_by_rna_id(rna_id)
+ if rna_id:
+ return self.get_by_rna_id(rna_id)
+
+ logger.error("No RNAcentral ID found in search results.")
+ return None
except Exception as e:
logger.error("Sequence search failed: %s", e)
return None
diff --git a/graphgen/models/searcher/db/uniprot_searcher.py b/graphgen/models/searcher/db/uniprot_searcher.py
index f5542f8c..a1ae2fe8 100644
--- a/graphgen/models/searcher/db/uniprot_searcher.py
+++ b/graphgen/models/searcher/db/uniprot_searcher.py
@@ -124,47 +124,52 @@ def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]:
logger.error("Empty FASTA sequence provided.")
return None
- accession = None
if self.use_local_blast:
accession = self._local_blast(seq, threshold)
if accession:
logger.debug("Local BLAST found accession: %s", accession)
+ return self.get_by_accession(accession)
+ logger.info(
+ "Local BLAST found no match for sequence. "
+ "API fallback disabled when using local database."
+ )
+ return None
- if not accession:
- logger.debug("Falling back to NCBIWWW.qblast.")
+ # Fall back to network BLAST only if local BLAST is not enabled
+ logger.debug("Falling back to NCBIWWW.qblast.")
- # UniProtKB/Swiss-Prot BLAST API
- try:
- logger.debug(
- "Performing BLAST searcher for the given sequence: %s", seq
- )
- result_handle = NCBIWWW.qblast(
- program="blastp",
- database="swissprot",
- sequence=seq,
- hitlist_size=1,
- expect=threshold,
- )
- blast_record = NCBIXML.read(result_handle)
- except RequestException:
- raise
- except Exception as e: # pylint: disable=broad-except
- logger.error("BLAST searcher failed: %s", e)
- return None
+ # UniProtKB/Swiss-Prot BLAST API
+ try:
+ logger.debug(
+ "Performing BLAST searcher for the given sequence: %s", seq
+ )
+ result_handle = NCBIWWW.qblast(
+ program="blastp",
+ database="swissprot",
+ sequence=seq,
+ hitlist_size=1,
+ expect=threshold,
+ )
+ blast_record = NCBIXML.read(result_handle)
+ except RequestException:
+ raise
+ except Exception as e: # pylint: disable=broad-except
+ logger.error("BLAST searcher failed: %s", e)
+ return None
- if not blast_record.alignments:
- logger.info("No BLAST hits found for the given sequence.")
- return None
+ if not blast_record.alignments:
+ logger.info("No BLAST hits found for the given sequence.")
+ return None
- best_alignment = blast_record.alignments[0]
- best_hsp = best_alignment.hsps[0]
- if best_hsp.expect > threshold:
- logger.info("No BLAST hits below the threshold E-value.")
- return None
- hit_id = best_alignment.hit_id
+ best_alignment = blast_record.alignments[0]
+ best_hsp = best_alignment.hsps[0]
+ if best_hsp.expect > threshold:
+ logger.info("No BLAST hits below the threshold E-value.")
+ return None
- # like sp|P01308.1|INS_HUMAN
- accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id
+ # like sp|P01308.1|INS_HUMAN
+ hit_id = best_alignment.hit_id
+ accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id
return self.get_by_accession(accession)
def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
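
All three searchers now follow the same shape: when use_local_blast is set, a miss in the local database is final and the web APIs are never contacted. Written out as a generic helper purely for illustration (each searcher keeps its own inline version; the names here are made up):

from typing import Callable, Optional

def resolve_sequence(
    seq: str,
    threshold: float,
    use_local_blast: bool,
    local_blast: Callable[[str, float], Optional[str]],
    fetch_by_accession: Callable[[str], Optional[dict]],
    api_fallback: Callable[[str, float], Optional[dict]],
) -> Optional[dict]:
    if use_local_blast:
        accession = local_blast(seq, threshold)
        if accession:
            return fetch_by_accession(accession)
        return None  # local miss is final: no API fallback when a local database is configured
    # Network BLAST / web API is only used when no local database is configured.
    return api_fallback(seq, threshold)
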
From 9a650dc904805e58609b8059bfa660fd300c74eb Mon Sep 17 00:00:00 2001
From: CHERRY-ui8 <2693275288@qq.com>
Date: Fri, 12 Dec 2025 00:40:25 +0800
Subject: [PATCH 05/16] add: add local rna databases and merge
---
scripts/search/build_db/build_rna_blast_db.sh | 480 +++++++++++++-----
1 file changed, 347 insertions(+), 133 deletions(-)
diff --git a/scripts/search/build_db/build_rna_blast_db.sh b/scripts/search/build_db/build_rna_blast_db.sh
index 26e1cd33..503c654b 100755
--- a/scripts/search/build_db/build_rna_blast_db.sh
+++ b/scripts/search/build_db/build_rna_blast_db.sh
@@ -10,16 +10,20 @@ set -e
# RNAcentral is a comprehensive database of non-coding RNA sequences that
# integrates data from multiple expert databases including RefSeq, Rfam, etc.
#
-# Usage: ./build_rna_blast_db.sh [all|list|database_name]
+# Usage: ./build_rna_blast_db.sh [all|list|selected|database_name...]
# all (default): Download complete active database (~8.4G compressed)
# list: List all available database subsets
+# selected: Download predefined database subsets (ensembl_gencode, mirbase, gtrnadb, refseq, lncbase)
# database_name: Download specific database subset (e.g., refseq, rfam, mirbase)
+# database_name1 database_name2 ...: Download multiple database subsets
#
# Available database subsets (examples):
# - refseq.fasta (~98M): RefSeq RNA sequences
# - rfam.fasta (~1.5G): Rfam RNA families
# - mirbase.fasta (~10M): microRNA sequences
-# - ensembl.fasta (~2.9G): Ensembl annotations
+# - ensembl_gencode.fasta (~337M): Ensembl/GENCODE annotations (human)
+# - gtrnadb.fasta (~38M): tRNA sequences
+# - lncbase.fasta (~106K): Human lncRNA database
# - See "list" option for complete list
#
# The complete "active" database contains all sequences from all expert databases.
@@ -30,20 +34,24 @@ set -e
# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
-# RNAcentral HTTP base URL (using HTTPS for better reliability)
+# RNAcentral base URL (using EBI HTTPS)
+# NOTE: RNAcentral only has one official mirror at EBI
RNACENTRAL_BASE="https://ftp.ebi.ac.uk/pub/databases/RNAcentral"
RNACENTRAL_RELEASE_URL="${RNACENTRAL_BASE}/current_release"
RNACENTRAL_SEQUENCES_URL="${RNACENTRAL_RELEASE_URL}/sequences"
RNACENTRAL_BY_DB_URL="${RNACENTRAL_SEQUENCES_URL}/by-database"
-# Parse command line argument
+# Parse command line arguments
DB_SELECTION=${1:-all}
+# Predefined database list for "selected" option
+SELECTED_DATABASES=("ensembl_gencode" "mirbase" "gtrnadb" "refseq" "lncbase")
+
# List available databases if requested
if [ "${DB_SELECTION}" = "list" ]; then
echo "Available RNAcentral database subsets:"
echo ""
- echo "Fetching list from RNAcentral FTP..."
+ echo "Fetching list from RNAcentral..."
listing=$(curl -s "${RNACENTRAL_BY_DB_URL}/")
echo "${listing}" | \
grep -oE 'href="[^"]*\.fasta"' | \
@@ -54,30 +62,41 @@ if [ "${DB_SELECTION}" = "list" ]; then
echo " - ${db%.fasta}: ${size}"
done
echo ""
- echo "Usage: $0 [database_name]"
+ echo "Usage: $0 [all|list|selected|database_name...]"
echo " Example: $0 refseq # Download only RefSeq sequences (~98M)"
echo " Example: $0 rfam # Download only Rfam sequences (~1.5G)"
+ echo " Example: $0 selected # Download predefined databases (ensembl_gencode, mirbase, gtrnadb, refseq, lncbase)"
+ echo " Example: $0 refseq mirbase # Download multiple databases"
echo " Example: $0 all # Download complete active database (~8.4G)"
exit 0
fi
-# Better to use a stable DOWNLOAD_TMP name to support resuming downloads
-DOWNLOAD_TMP=_downloading_rnacentral
-mkdir -p ${DOWNLOAD_TMP}
-cd ${DOWNLOAD_TMP}
+# Determine which databases to download
+if [ "${DB_SELECTION}" = "selected" ]; then
+ # Use predefined database list
+ DATABASES=("${SELECTED_DATABASES[@]}")
+ echo "Downloading selected databases: ${DATABASES[*]}"
+elif [ "${DB_SELECTION}" = "all" ]; then
+ # Single database mode (all)
+ DATABASES=("all")
+else
+ # Multiple databases provided as arguments
+ DATABASES=("$@")
+fi
-# Get RNAcentral release version from release notes
+# Get RNAcentral release version from release notes (once for all databases)
echo "Getting RNAcentral release information..."
RELEASE_NOTES_URL="${RNACENTRAL_RELEASE_URL}/release_notes.txt"
-RELEASE_NOTES="release_notes.txt"
-wget -q "${RELEASE_NOTES_URL}" 2>/dev/null || {
+RELEASE_NOTES_TMP=$(mktemp)
+wget -q "${RELEASE_NOTES_URL}" -O "${RELEASE_NOTES_TMP}" 2>/dev/null || {
echo "Warning: Could not download release notes, using current date as release identifier"
RELEASE=$(date +%Y%m%d)
}
-if [ -f "${RELEASE_NOTES}" ]; then
+if [ -f "${RELEASE_NOTES_TMP}" ] && [ -s "${RELEASE_NOTES_TMP}" ]; then
# Try to extract version from release notes (first line usually contains version info)
- RELEASE=$(head -1 "${RELEASE_NOTES}" | grep -oE '[0-9]+\.[0-9]+' | head -1 | tr -d '.')
+ RELEASE=$(head -1 "${RELEASE_NOTES_TMP}" | grep -oE '[0-9]+\.[0-9]+' | head -1 | tr -d '.')
+ rm -f "${RELEASE_NOTES_TMP}"
fi
if [ -z "${RELEASE}" ]; then
@@ -87,133 +106,328 @@ else
echo "RNAcentral release: ${RELEASE}"
fi
-# Download RNAcentral FASTA file
-if [ "${DB_SELECTION}" = "all" ]; then
- # Download complete active database
- FASTA_FILE="rnacentral_active.fasta.gz"
- DB_NAME="rnacentral"
- echo "Downloading RNAcentral active sequences (~8.4G)..."
- echo " Contains sequences currently present in at least one expert database"
- echo " Uses standard URS IDs (e.g., URS000149A9AF)"
- echo " ⭐ MATCHES the online RNAcentral API database - ensures consistency"
- FASTA_URL="${RNACENTRAL_SEQUENCES_URL}/${FASTA_FILE}"
- IS_COMPRESSED=true
-else
- # Download specific database subset
- DB_NAME="${DB_SELECTION}"
- FASTA_FILE="${DB_SELECTION}.fasta"
- echo "Downloading RNAcentral database subset: ${DB_SELECTION}"
- echo " This is a subset of the active database from a specific expert database"
- echo " File: ${FASTA_FILE}"
- FASTA_URL="${RNACENTRAL_BY_DB_URL}/${FASTA_FILE}"
- IS_COMPRESSED=false
-
- # Check if database exists
- if ! curl -s -o /dev/null -w "%{http_code}" "${FASTA_URL}" | grep -q "200"; then
- echo "Error: Database '${DB_SELECTION}' not found"
- echo "Run '$0 list' to see available databases"
+# Process each database
+DB_COUNT=${#DATABASES[@]}
+DB_INDEX=0
+
+for DB_SELECTION in "${DATABASES[@]}"; do
+ DB_INDEX=$((DB_INDEX + 1))
+ echo ""
+ echo "=========================================="
+ echo "Processing database ${DB_INDEX}/${DB_COUNT}: ${DB_SELECTION}"
+ echo "=========================================="
+ echo ""
+
+ # Check if database already exists and is complete
+ # First check with current release version
+ if [ "${DB_SELECTION}" = "all" ]; then
+ OUTPUT_DIR="rnacentral_${RELEASE}"
+ DB_NAME="rnacentral"
+ DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}"
+ else
+ OUTPUT_DIR="rnacentral_${DB_SELECTION}_${RELEASE}"
+ DB_NAME="${DB_SELECTION}"
+ DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}"
+ fi
+
+ # Check if BLAST database already exists with current release
+ if [ -d "${OUTPUT_DIR}" ] && [ -f "${OUTPUT_DIR}/${DB_OUTPUT_NAME}.nhr" ] && [ -f "${OUTPUT_DIR}/${DB_OUTPUT_NAME}.nin" ]; then
+ echo "✓ Database ${DB_SELECTION} already exists and appears complete: ${OUTPUT_DIR}/"
+ echo " BLAST database: ${OUTPUT_DIR}/${DB_OUTPUT_NAME}"
+ echo " Skipping download and database creation..."
+ continue
+ fi
+
+ # Also check for any existing version of this database (e.g., different release dates)
+ EXISTING_DIR=$(ls -d rnacentral_${DB_SELECTION}_* 2>/dev/null | head -1)
+ if [ -n "${EXISTING_DIR}" ] && [ "${DB_SELECTION}" != "all" ]; then
+ EXISTING_DB_NAME=$(basename "${EXISTING_DIR}" | sed "s/rnacentral_${DB_SELECTION}_//")
+ if [ -f "${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}.nhr" ] && [ -f "${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}.nin" ]; then
+ echo "✓ Database ${DB_SELECTION} already exists (version ${EXISTING_DB_NAME}): ${EXISTING_DIR}/"
+ echo " BLAST database: ${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}"
+ echo " Skipping download and database creation..."
+ echo " Note: Using existing version ${EXISTING_DB_NAME} instead of ${RELEASE}"
+ continue
+ fi
+ fi
+
+ # Better to use a stable DOWNLOAD_TMP name to support resuming downloads
+ DOWNLOAD_TMP="_downloading_rnacentral_${DB_SELECTION}"
+ mkdir -p ${DOWNLOAD_TMP}
+ cd ${DOWNLOAD_TMP}
+
+ # Download RNAcentral FASTA file
+ if [ "${DB_SELECTION}" = "all" ]; then
+ # Download complete active database
+ FASTA_FILE="rnacentral_active.fasta.gz"
+ DB_NAME="rnacentral"
+ echo "Downloading RNAcentral active sequences (~8.4G)..."
+ echo " Contains sequences currently present in at least one expert database"
+ echo " Uses standard URS IDs (e.g., URS000149A9AF)"
+ echo " ⭐ MATCHES the online RNAcentral API database - ensures consistency"
+ FASTA_URL="${RNACENTRAL_SEQUENCES_URL}/${FASTA_FILE}"
+ IS_COMPRESSED=true
+ else
+ # Download specific database subset
+ DB_NAME="${DB_SELECTION}"
+ FASTA_FILE="${DB_SELECTION}.fasta"
+ echo "Downloading RNAcentral database subset: ${DB_SELECTION}"
+ echo " This is a subset of the active database from a specific expert database"
+ echo " File: ${FASTA_FILE}"
+ FASTA_URL="${RNACENTRAL_BY_DB_URL}/${FASTA_FILE}"
+ IS_COMPRESSED=false
+
+ # Check if database exists (use HTTP status code check for HTTPS)
+ HTTP_CODE=$(curl -s --max-time 10 -o /dev/null -w "%{http_code}" "${FASTA_URL}" 2>/dev/null | tail -1 || echo "000")
+ if ! echo "${HTTP_CODE}" | grep -q "^200$"; then
+ echo "Error: Database '${DB_SELECTION}' not found (HTTP code: ${HTTP_CODE})"
+ echo "Run '$0 list' to see available databases"
+ cd ..
+ rm -rf ${DOWNLOAD_TMP}
+ exit 1
+ fi
+ fi
+
+ echo "Downloading from: ${FASTA_URL}"
+ echo "This may take a while depending on your internet connection..."
+ if [ "${DB_SELECTION}" = "all" ]; then
+ echo "File size is approximately 8-9GB, please be patient..."
+ else
+ echo "Downloading database subset..."
+ fi
+
+ wget -c "${FASTA_URL}" || {
+ echo "Error: Failed to download RNAcentral FASTA file"
+ echo "Please check your internet connection and try again"
+ echo "URL: ${FASTA_URL}"
+ cd ..
+ rm -rf ${DOWNLOAD_TMP}
+ exit 1
+ }
+
+ if [ ! -f "${FASTA_FILE}" ]; then
+ echo "Error: Downloaded file not found"
+ cd ..
+ rm -rf ${DOWNLOAD_TMP}
exit 1
fi
-fi
-
-echo "Downloading from: ${FASTA_URL}"
-echo "This may take a while depending on your internet connection..."
-if [ "${DB_SELECTION}" = "all" ]; then
- echo "File size is approximately 8-9GB, please be patient..."
-else
- echo "Downloading database subset..."
-fi
-wget -c --progress=bar:force "${FASTA_URL}" 2>&1 || {
- echo "Error: Failed to download RNAcentral FASTA file"
- echo "Please check your internet connection and try again"
- echo "You can also try downloading manually from: ${FASTA_URL}"
- exit 1
-}
-
-if [ ! -f "${FASTA_FILE}" ]; then
- echo "Error: Downloaded file not found"
- exit 1
-fi
+
+ cd ..
+
+ # Create release directory
+ if [ "${DB_SELECTION}" = "all" ]; then
+ OUTPUT_DIR="rnacentral_${RELEASE}"
+ else
+ OUTPUT_DIR="rnacentral_${DB_NAME}_${RELEASE}"
+ fi
+ mkdir -p ${OUTPUT_DIR}
+ mv ${DOWNLOAD_TMP}/* ${OUTPUT_DIR}/ 2>/dev/null || true
+ rmdir ${DOWNLOAD_TMP} 2>/dev/null || true
+
+ cd ${OUTPUT_DIR}
+
+ # Extract FASTA file if compressed
+ echo "Preparing RNAcentral sequences..."
+ if [ -f "${FASTA_FILE}" ]; then
+ if [ "${IS_COMPRESSED}" = "true" ]; then
+ echo "Decompressing ${FASTA_FILE}..."
+ OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta"
+ gunzip -c "${FASTA_FILE}" > "${OUTPUT_FASTA}" || {
+ echo "Error: Failed to decompress FASTA file"
+ cd ..
+ exit 1
+ }
+ # Optionally remove the compressed file to save space
+ # rm "${FASTA_FILE}"
+ else
+ # File is not compressed, just copy/rename
+ OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta"
+ cp "${FASTA_FILE}" "${OUTPUT_FASTA}" || {
+ echo "Error: Failed to copy FASTA file"
+ cd ..
+ exit 1
+ }
+ fi
+ else
+ echo "Error: FASTA file not found"
+ cd ..
+ exit 1
+ fi
+
+ # Check if we have sequences
+ if [ ! -s "${OUTPUT_FASTA}" ]; then
+ echo "Error: FASTA file is empty"
+ cd ..
+ exit 1
+ fi
+
+ # Get file size for user information
+ FILE_SIZE=$(du -h "${OUTPUT_FASTA}" | cut -f1)
+ echo "FASTA file size: ${FILE_SIZE}"
+
+ echo "Creating BLAST database..."
+ # Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide)
+ # Note: RNAcentral uses RNAcentral IDs (URS...) as sequence identifiers,
+ # which matches the format expected by the RNACentralSearch class
+ DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}"
+ makeblastdb -in "${OUTPUT_FASTA}" \
+ -out "${DB_OUTPUT_NAME}" \
+ -dbtype nucl \
+ -parse_seqids \
+ -title "RNAcentral_${DB_NAME}_${RELEASE}"
+
+ echo ""
+ echo "BLAST database created successfully!"
+ echo "Database location: $(pwd)/${DB_OUTPUT_NAME}"
+ echo ""
+ echo "To use this database, set in your config (search_rna_config.yaml):"
+ echo " rnacentral_params:"
+ echo " use_local_blast: true"
+ echo " local_blast_db: $(pwd)/${DB_OUTPUT_NAME}"
+ echo ""
+ echo "Note: The database files are:"
+ ls -lh ${DB_OUTPUT_NAME}.* | head -5
+ echo ""
+ if [ "${DB_SELECTION}" = "all" ]; then
+ echo "This database uses RNAcentral IDs (URS...), which matches the online"
+ echo "RNAcentral search API, ensuring consistent results between local and online searches."
+ else
+ echo "This is a subset database from ${DB_SELECTION} expert database."
+ echo "For full coverage matching online API, use 'all' option."
+ fi
+
+ cd ..
+done
-cd ..
+echo ""
+echo "=========================================="
+echo "All databases processed successfully!"
+echo "=========================================="
+echo ""
-# Create release directory
-if [ "${DB_SELECTION}" = "all" ]; then
- OUTPUT_DIR="rnacentral_${RELEASE}"
-else
- OUTPUT_DIR="rnacentral_${DB_NAME}_${RELEASE}"
-fi
-mkdir -p ${OUTPUT_DIR}
-mv ${DOWNLOAD_TMP}/* ${OUTPUT_DIR}/ 2>/dev/null || true
-rmdir ${DOWNLOAD_TMP} 2>/dev/null || true
-
-cd ${OUTPUT_DIR}
-
-# Extract FASTA file if compressed
-echo "Preparing RNAcentral sequences..."
-if [ -f "${FASTA_FILE}" ]; then
- if [ "${IS_COMPRESSED}" = "true" ]; then
- echo "Decompressing ${FASTA_FILE}..."
- OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta"
- gunzip -c "${FASTA_FILE}" > "${OUTPUT_FASTA}" || {
- echo "Error: Failed to decompress FASTA file"
- exit 1
+# If multiple databases were downloaded, offer to merge them
+if [ ${#DATABASES[@]} -gt 1 ] && [ "${DATABASES[0]}" != "all" ]; then
+ echo "Multiple databases downloaded. Creating merged database for unified search..."
+ MERGED_DIR="rnacentral_merged_${RELEASE}"
+ mkdir -p ${MERGED_DIR}
+ cd ${MERGED_DIR}
+
+ MERGED_FASTA="rnacentral_merged_${RELEASE}.fasta"
+ MERGED_FASTA_TMP="${MERGED_FASTA}.tmp"
+ echo "Combining FASTA files from all databases..."
+ echo " Note: Duplicate sequence IDs will be removed (keeping first occurrence)..."
+
+ # Combine all FASTA files into a temporary file
+ # Find actual database directories (may have different release versions)
+ FOUND_ANY=false
+ for DB_SELECTION in "${DATABASES[@]}"; do
+ [ "${DB_SELECTION}" = "all" ] && continue
+
+ # Try current release version first, then search for any existing version
+ OUTPUT_FASTA="../rnacentral_${DB_SELECTION}_${RELEASE}/${DB_SELECTION}_${RELEASE}.fasta"
+ [ ! -f "${OUTPUT_FASTA}" ] && {
+ EXISTING_DIR=$(ls -d ../rnacentral_${DB_SELECTION}_* 2>/dev/null | head -1)
+ [ -n "${EXISTING_DIR}" ] && {
+ EXISTING_VERSION=$(basename "${EXISTING_DIR}" | sed "s/rnacentral_${DB_SELECTION}_//")
+ OUTPUT_FASTA="${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_VERSION}.fasta"
+ }
}
- # Optionally remove the compressed file to save space
- # rm "${FASTA_FILE}"
- else
- # File is not compressed, just copy/rename
- OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta"
- cp "${FASTA_FILE}" "${OUTPUT_FASTA}" || {
- echo "Error: Failed to copy FASTA file"
- exit 1
+
+ if [ -f "${OUTPUT_FASTA}" ]; then
+ echo " Adding ${DB_SELECTION} sequences..."
+ cat "${OUTPUT_FASTA}" >> "${MERGED_FASTA_TMP}"
+ FOUND_ANY=true
+ else
+ echo " Warning: Could not find FASTA file for ${DB_SELECTION}"
+ fi
+ done
+
+ # Validate that we have files to merge
+ if [ "${FOUND_ANY}" = "false" ] || [ ! -s "${MERGED_FASTA_TMP}" ]; then
+ echo "Error: No FASTA files found to merge"
+ cd ..
+ rm -rf ${MERGED_DIR}
+ exit 1
+ fi
+
+ # Remove duplicates based on sequence ID (keeping first occurrence)
+ echo " Removing duplicate sequence IDs..."
+ awk '
+ /^>/ {
+ # Process previous sequence if we have one
+ if (current_id != "" && !seen[current_id]) {
+ print current_header ORS current_seq
+ seen[current_id] = 1
+ }
+ # Start new sequence
+ current_header = $0
+ current_id = substr($0, 2)
+ sub(/[ \t].*/, "", current_id) # Extract ID up to first space/tab
+ current_seq = ""
+ next
+ }
+ {
+ # Accumulate sequence data (preserve newlines)
+ current_seq = (current_seq == "" ? $0 : current_seq "\n" $0)
+ }
+ END {
+ # Process last sequence
+ if (current_id != "" && !seen[current_id]) {
+ print current_header ORS current_seq
}
+ }
+ ' "${MERGED_FASTA_TMP}" > "${MERGED_FASTA}"
+ rm -f "${MERGED_FASTA_TMP}"
+
+ # Check if merged file was created and has content
+ if [ ! -s "${MERGED_FASTA}" ]; then
+ echo "Warning: Merged FASTA file is empty or not created"
+ cd ..
+ rm -rf ${MERGED_DIR}
+ else
+ FILE_SIZE=$(du -h "${MERGED_FASTA}" | cut -f1)
+ echo "Merged FASTA file size: ${FILE_SIZE}"
+
+ echo "Creating merged BLAST database..."
+ MERGED_DB_NAME="rnacentral_merged_${RELEASE}"
+ makeblastdb -in "${MERGED_FASTA}" \
+ -out "${MERGED_DB_NAME}" \
+ -dbtype nucl \
+ -parse_seqids \
+ -title "RNAcentral_Merged_${RELEASE}"
+
+ echo ""
+ echo "✓ Merged BLAST database created successfully!"
+ echo "Database location: $(pwd)/${MERGED_DB_NAME}"
+ echo ""
+ echo "To use the merged database, set in your config (search_rna_config.yaml):"
+ echo " rnacentral_params:"
+ echo " use_local_blast: true"
+ echo " local_blast_db: $(pwd)/${MERGED_DB_NAME}"
+ echo ""
+ echo "Note: The merged database includes: ${DATABASES[*]}"
+ cd ..
fi
-else
- echo "Error: FASTA file not found"
- exit 1
fi
-# Check if we have sequences
-if [ ! -s "${OUTPUT_FASTA}" ]; then
- echo "Error: FASTA file is empty"
- exit 1
-fi
-
-# Get file size for user information
-FILE_SIZE=$(du -h "${OUTPUT_FASTA}" | cut -f1)
-echo "FASTA file size: ${FILE_SIZE}"
-
-echo "Creating BLAST database..."
-# Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide)
-# Note: RNAcentral uses RNAcentral IDs (URS...) as sequence identifiers,
-# which matches the format expected by the RNACentralSearch class
-DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}"
-makeblastdb -in "${OUTPUT_FASTA}" \
- -out "${DB_OUTPUT_NAME}" \
- -dbtype nucl \
- -parse_seqids \
- -title "RNAcentral_${DB_NAME}_${RELEASE}"
-
echo ""
-echo "BLAST database created successfully!"
-echo "Database location: $(pwd)/${DB_OUTPUT_NAME}"
-echo ""
-echo "To use this database, set in your config (search_rna_config.yaml):"
-echo " rnacentral_params:"
-echo " use_local_blast: true"
-echo " local_blast_db: $(pwd)/${DB_OUTPUT_NAME}"
-echo ""
-echo "Note: The database files are:"
-ls -lh ${DB_OUTPUT_NAME}.* | head -5
-echo ""
-if [ "${DB_SELECTION}" = "all" ]; then
- echo "This database uses RNAcentral IDs (URS...), which matches the online"
- echo "RNAcentral search API, ensuring consistent results between local and online searches."
-else
- echo "This is a subset database from ${DB_SELECTION} expert database."
- echo "For full coverage matching online API, use 'all' option."
-fi
+echo "Summary of downloaded databases:"
+for DB_SELECTION in "${DATABASES[@]}"; do
+ if [ "${DB_SELECTION}" = "all" ]; then
+ OUTPUT_DIR="rnacentral_${RELEASE}"
+ DB_NAME="rnacentral"
+ else
+ OUTPUT_DIR="rnacentral_${DB_SELECTION}_${RELEASE}"
+ DB_NAME="${DB_SELECTION}"
+ fi
+ if [ -d "${OUTPUT_DIR}" ]; then
+ echo " - ${DB_NAME}: ${OUTPUT_DIR}/"
+ fi
+done
-cd ..
+if [ -d "rnacentral_merged_${RELEASE}" ]; then
+ echo " - merged (all databases): rnacentral_merged_${RELEASE}/"
+ echo ""
+ echo "💡 Recommendation: Use the merged database for searching across all databases."
+fi
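
As a quick sanity check once the merged database exists, a single blastn query against it should return an RNAcentral URS accession. The sketch below is not part of the patch; it mirrors the subprocess call the RNAcentral searcher uses later in this series, and the database path and query sequence are placeholders.

    # Minimal sanity-check sketch (not part of the patch): query the merged
    # RNAcentral BLAST database and print the best-hit accession.
    import os
    import subprocess
    import tempfile

    db = "rnacentral_merged_25/rnacentral_merged_25"   # placeholder; use the path the script printed
    query_seq = "ACGTACGTACGTACGTACGT"                 # placeholder query, written in the DNA alphabet

    with tempfile.NamedTemporaryFile("w", suffix=".fa", delete=False) as tmp:
        tmp.write(f">query\n{query_seq}\n")
        tmp_name = tmp.name

    try:
        out = subprocess.check_output(
            ["blastn", "-db", db, "-query", tmp_name,
             "-evalue", "0.01", "-max_target_seqs", "1", "-outfmt", "6 sacc"],
            text=True,
        ).strip()
        print(out.split("\n", 1)[0] if out else "no hit")
    finally:
        os.remove(tmp_name)
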
From d80c5db0e65c997837492859ca77c86324e1fa23 Mon Sep 17 00:00:00 2001
From: CHERRY-ui8 <2693275288@qq.com>
Date: Fri, 12 Dec 2025 01:52:59 +0800
Subject: [PATCH 06/16] add: add local DNA databases of more species
---
scripts/search/build_db/build_dna_blast_db.sh | 80 +++++++++++--------
1 file changed, 46 insertions(+), 34 deletions(-)
diff --git a/scripts/search/build_db/build_dna_blast_db.sh b/scripts/search/build_db/build_dna_blast_db.sh
index 1928d7d0..21b86141 100755
--- a/scripts/search/build_db/build_dna_blast_db.sh
+++ b/scripts/search/build_db/build_dna_blast_db.sh
@@ -24,8 +24,8 @@ set -e
# - {category}.{number}.genomic.fna.gz (genomic sequences)
# - {category}.{number}.rna.fna.gz (RNA sequences)
#
-# Usage: ./build_dna_blast_db.sh [human_mouse|representative|complete|all]
-# human_mouse: Download only Homo sapiens and Mus musculus sequences (minimal, smallest)
+# Usage: ./build_dna_blast_db.sh [human_mouse_drosophila_yeast|representative|complete|all]
+# human_mouse_drosophila_yeast: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae sequences (minimal, smallest)
# representative: Download genomic sequences from major categories (recommended, smaller)
# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi
# complete: Download all complete genomic sequences from complete/ directory (very large)
@@ -36,7 +36,7 @@ set -e
# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
-DOWNLOAD_TYPE=${1:-human_mouse}
+DOWNLOAD_TYPE=${1:-human_mouse_drosophila_yeast}
# Better to use a stable DOWNLOAD_TMP name to support resuming downloads
DOWNLOAD_TMP=_downloading_dna
@@ -68,7 +68,8 @@ check_file_for_species() {
# This should be sufficient to identify the species in most cases
if curl -s --max-time 30 --range 0-512000 "${url}" -o "${temp_file}" 2>/dev/null && [ -s "${temp_file}" ]; then
# Try to decompress and check for species names
- if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus)"; then
+        # Check for: Homo sapiens (human), Mus musculus (mouse), Drosophila melanogaster (fruit fly), Saccharomyces cerevisiae (yeast)
+ if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then
rm -f "${temp_file}"
return 0 # Contains target species
else
@@ -84,39 +85,50 @@ check_file_for_species() {
# Download based on type
case ${DOWNLOAD_TYPE} in
- human_mouse)
- echo "Downloading RefSeq sequences for Homo sapiens and Mus musculus only (minimal size)..."
- echo "This will check each file to see if it contains human or mouse sequences..."
- category="vertebrate_mammalian"
- echo "Checking files in ${category} category..."
+ human_mouse_drosophila_yeast)
+ echo "Downloading RefSeq sequences for Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal size)..."
+ echo "This will check each file to see if it contains target species sequences..."
- # Get list of files and save to temp file to avoid subshell issues
- curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
- grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
- sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files.txt
-
- file_count=0
- download_count=0
+        # Check multiple categories: vertebrate_mammalian (human, mouse), invertebrate (fruit fly), fungi (yeast)
+ categories="vertebrate_mammalian invertebrate fungi"
+ total_file_count=0
+ total_download_count=0
- while read filename; do
- file_count=$((file_count + 1))
- url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}"
- echo -n "[${file_count}] Checking ${filename}... "
+ for category in ${categories}; do
+ echo "Checking files in ${category} category..."
- if check_file_for_species "${url}" "${filename}"; then
- echo "✓ contains target species, downloading..."
- download_count=$((download_count + 1))
- wget -c -q --show-progress "${url}" || {
- echo "Warning: Failed to download ${filename}"
- }
- else
- echo "✗ skipping (no human/mouse data)"
- fi
- done < /tmp/refseq_files.txt
+ # Get list of files and save to temp file to avoid subshell issues
+ curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
+ grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
+ sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt
+
+ file_count=0
+ download_count=0
+
+ while read filename; do
+ file_count=$((file_count + 1))
+ total_file_count=$((total_file_count + 1))
+ url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}"
+ echo -n "[${total_file_count}] Checking ${category}/${filename}... "
+
+ if check_file_for_species "${url}" "${filename}"; then
+ echo "✓ contains target species, downloading..."
+ download_count=$((download_count + 1))
+ total_download_count=$((total_download_count + 1))
+ wget -c -q --show-progress "${url}" || {
+ echo "Warning: Failed to download ${filename}"
+ }
+ else
+ echo "✗ skipping (no target species data)"
+ fi
+ done < /tmp/refseq_files_${category}.txt
+
+ rm -f /tmp/refseq_files_${category}.txt
+ echo " ${category}: Checked ${file_count} files, downloaded ${download_count} files."
+ done
- rm -f /tmp/refseq_files.txt
echo ""
- echo "Summary: Checked ${file_count} files, downloaded ${download_count} files containing human or mouse sequences."
+ echo "Summary: Checked ${total_file_count} files total, downloaded ${total_download_count} files containing target species (human, mouse, fruit fly, yeast)."
;;
representative)
echo "Downloading RefSeq representative sequences (recommended, smaller size)..."
@@ -168,8 +180,8 @@ case ${DOWNLOAD_TYPE} in
;;
*)
echo "Error: Unknown download type '${DOWNLOAD_TYPE}'"
- echo "Usage: $0 [human_mouse|representative|complete|all]"
- echo " human_mouse: Download only Homo sapiens and Mus musculus (minimal)"
+ echo "Usage: $0 [human_mouse_drosophila_yeast|representative|complete|all]"
+ echo " human_mouse_drosophila_yeast: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal)"
echo " representative: Download major categories (recommended)"
echo " complete: Download all complete genomic sequences (very large)"
echo " all: Download all genomic sequences (extremely large)"
From aa76650043adf97458fe4bc0c05395039336373c Mon Sep 17 00:00:00 2001
From: CHERRY-ui8 <2693275288@qq.com>
Date: Fri, 12 Dec 2025 01:54:43 +0800
Subject: [PATCH 07/16] add: add local UniProt mirror and more download options
---
.../search/build_db/build_protein_blast_db.sh | 123 +++++++++++++++---
1 file changed, 106 insertions(+), 17 deletions(-)
diff --git a/scripts/search/build_db/build_protein_blast_db.sh b/scripts/search/build_db/build_protein_blast_db.sh
index 9292875a..a9169959 100755
--- a/scripts/search/build_db/build_protein_blast_db.sh
+++ b/scripts/search/build_db/build_protein_blast_db.sh
@@ -9,48 +9,137 @@ set -e
# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
+# NOTE: UniProt mirror
+# Available mirrors:
+# - UK/EBI: ftp://ftp.ebi.ac.uk/pub/databases/uniprot (current, recommended)
+# - US: ftp://ftp.uniprot.org/pub/databases/uniprot
+# - CH: ftp://ftp.expasy.org/databases/uniprot
+UNIPROT_BASE="ftp://ftp.ebi.ac.uk/pub/databases/uniprot"
+
+# Parse command line arguments
+DOWNLOAD_MODE="sprot" # sprot (Swiss-Prot) or full (sprot + trembl)
+
+usage() {
+ echo "Usage: $0 [OPTIONS]"
+ echo ""
+ echo "Options:"
+ echo " -s, --sprot-only Download only Swiss-Prot database (recommended, high quality)"
+ echo " -f, --full Download full release (Swiss-Prot + TrEMBL, merged as uniprot_\${RELEASE})"
+ echo " -h, --help Show this help message"
+ echo ""
+ echo "Examples:"
+ echo " $0 --sprot-only # Download only uniprot_sprot"
+ echo " $0 --full # Download uniprot_\${RELEASE} (Swiss-Prot + TrEMBL)"
+}
+
+while [[ $# -gt 0 ]]; do
+ case $1 in
+ -s|--sprot-only)
+ DOWNLOAD_MODE="sprot"
+ shift
+ ;;
+ -f|--full)
+ DOWNLOAD_MODE="full"
+ shift
+ ;;
+ -h|--help)
+ usage
+ exit 0
+ ;;
+ *)
+ echo "Unknown option: $1"
+ usage
+ exit 1
+ ;;
+ esac
+done
+
+echo "Download mode: ${DOWNLOAD_MODE}"
+if [ "${DOWNLOAD_MODE}" = "sprot" ]; then
+ echo " - Will download: uniprot_sprot only"
+else
+ echo " - Will download: uniprot_\${RELEASE} (Swiss-Prot + TrEMBL merged)"
+fi
+echo "Using mirror: ${UNIPROT_BASE} (EBI/UK - fast for Asia/Europe)"
+echo ""
+
# Better to use a stable DOWNLOAD_TMP name to support resuming downloads
DOWNLOAD_TMP=_downloading
mkdir -p ${DOWNLOAD_TMP}
cd ${DOWNLOAD_TMP}
-wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/RELEASE.metalink"
+echo "Downloading RELEASE.metalink..."
+wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/RELEASE.metalink"
# Extract the release name (like 2017_10 or 2017_1)
# Use sed for cross-platform compatibility (works on both macOS and Linux)
RELEASE=$(sed -n 's/.*\([0-9]\{4\}_[0-9]\{1,2\}\)<\/version>.*/\1/p' RELEASE.metalink | head -1)
-wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
-wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz"
-wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt"
-wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/README"
-wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/LICENSE"
+echo "UniProt release: ${RELEASE}"
+echo ""
+
+# Download Swiss-Prot (always needed)
+echo "Downloading uniprot_sprot.fasta.gz..."
+wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
+
+# Download TrEMBL only if full mode
+if [ "${DOWNLOAD_MODE}" = "full" ]; then
+ echo "Downloading uniprot_trembl.fasta.gz..."
+ wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz"
+fi
+
+# Download metadata files
+echo "Downloading metadata files..."
+wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/reldate.txt"
+wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/README"
+wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/LICENSE"
cd ..
-mkdir ${RELEASE}
+mkdir -p ${RELEASE}
mv ${DOWNLOAD_TMP}/* ${RELEASE}
rmdir ${DOWNLOAD_TMP}
cd ${RELEASE}
+echo ""
+echo "Extracting files..."
gunzip uniprot_sprot.fasta.gz
-gunzip uniprot_trembl.fasta.gz
-cat uniprot_sprot.fasta uniprot_trembl.fasta >uniprot_${RELEASE}.fasta
+if [ "${DOWNLOAD_MODE}" = "full" ]; then
+ gunzip uniprot_trembl.fasta.gz
+ echo "Merging Swiss-Prot and TrEMBL..."
+ cat uniprot_sprot.fasta uniprot_trembl.fasta >uniprot_${RELEASE}.fasta
+fi
+
+echo ""
+echo "Building BLAST databases..."
-makeblastdb -in uniprot_${RELEASE}.fasta -out uniprot_${RELEASE} -dbtype prot -parse_seqids -title uniprot_${RELEASE}
+# Always build Swiss-Prot database
makeblastdb -in uniprot_sprot.fasta -out uniprot_sprot -dbtype prot -parse_seqids -title uniprot_sprot
-makeblastdb -in uniprot_trembl.fasta -out uniprot_trembl -dbtype prot -parse_seqids -title uniprot_trembl
+
+# Build full release database only if in full mode
+if [ "${DOWNLOAD_MODE}" = "full" ]; then
+ makeblastdb -in uniprot_${RELEASE}.fasta -out uniprot_${RELEASE} -dbtype prot -parse_seqids -title uniprot_${RELEASE}
+ makeblastdb -in uniprot_trembl.fasta -out uniprot_trembl -dbtype prot -parse_seqids -title uniprot_trembl
+fi
cd ..
+echo ""
echo "BLAST databases created successfully!"
echo "Database locations:"
-echo " - Combined: $(pwd)/${RELEASE}/uniprot_${RELEASE}"
-echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot"
-echo " - TrEMBL: $(pwd)/${RELEASE}/uniprot_trembl"
-echo ""
-echo "To use these databases, set in your config:"
-echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot # or uniprot_${RELEASE} or uniprot_trembl"
+if [ "${DOWNLOAD_MODE}" = "sprot" ]; then
+ echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot"
+ echo ""
+ echo "To use this database, set in your config:"
+ echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot"
+else
+ echo " - Combined: $(pwd)/${RELEASE}/uniprot_${RELEASE}"
+ echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot"
+ echo " - TrEMBL: $(pwd)/${RELEASE}/uniprot_trembl"
+ echo ""
+ echo "To use these databases, set in your config:"
+ echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot # or uniprot_${RELEASE} or uniprot_trembl"
+fi
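
To verify the build before pointing the searcher at it, blastdbcmd can print a summary of the new database. A minimal sketch follows (not part of the patch; the release directory is a placeholder). Note that the UniProt searcher expects the companion .phr index file to exist next to this database name.

    # Print title, sequence count, total residues, and build date for the new database.
    import subprocess

    db = "2024_01/uniprot_sprot"  # placeholder: ${RELEASE}/uniprot_sprot from the script output
    print(subprocess.check_output(["blastdbcmd", "-db", db, "-info"], text=True))
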
From 8143ffffed3f54357c5841d1c15f3d88b2e3dbc1 Mon Sep 17 00:00:00 2001
From: CHERRY-ui8 <2693275288@qq.com>
Date: Sat, 13 Dec 2025 03:43:44 +0800
Subject: [PATCH 08/16] feat: enable faster search
---
graphgen/models/searcher/db/ncbi_searcher.py | 29 +++-
.../models/searcher/db/rnacentral_searcher.py | 123 ++++++++++++--
.../models/searcher/db/uniprot_searcher.py | 151 ++++++++++++++++--
3 files changed, 274 insertions(+), 29 deletions(-)
diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py
index 55ae4daf..89217e66 100644
--- a/graphgen/models/searcher/db/ncbi_searcher.py
+++ b/graphgen/models/searcher/db/ncbi_searcher.py
@@ -24,7 +24,7 @@
@lru_cache(maxsize=None)
def _get_pool():
- return ThreadPoolExecutor(max_workers=10)
+    return ThreadPoolExecutor(max_workers=20)  # NOTE: can increase for better parallelism
# ensure only one NCBI request at a time
@@ -432,16 +432,29 @@ async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optiona
loop = asyncio.get_running_loop()
- # limit concurrent requests (NCBI rate limit: max 3 requests per second)
- async with _ncbi_lock:
- # Auto-detect query type and execute in thread pool
- if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I):
+ # Auto-detect query type and execute in thread pool
+ # Only use lock for network API calls (NCBI rate limit: max 3 requests per second)
+ # Local BLAST can run in parallel
+ if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I):
+ # FASTA sequence: use lock only if using network BLAST
+ if self.use_local_blast:
+ # Local BLAST can run in parallel, no lock needed
result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold)
- elif re.fullmatch(r"^\d+$", query):
+ else:
+ # Network BLAST needs lock to respect rate limits
+ async with _ncbi_lock:
+ result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold)
+ elif re.fullmatch(r"^\d+$", query):
+ # Gene ID: always use lock (network API call)
+ async with _ncbi_lock:
result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query)
- elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I):
+ elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I):
+ # Accession: always use lock (network API call)
+ async with _ncbi_lock:
result = await loop.run_in_executor(_get_pool(), self.get_by_accession, query)
- else:
+ else:
+ # Keyword: always use lock (network API call)
+ async with _ncbi_lock:
result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query)
if result:
diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py
index a6884a61..3de2fd0f 100644
--- a/graphgen/models/searcher/db/rnacentral_searcher.py
+++ b/graphgen/models/searcher/db/rnacentral_searcher.py
@@ -18,12 +18,12 @@
)
from graphgen.bases import BaseSearcher
-from graphgen.utils import logger
+from graphgen.utils import logger, load_json
@lru_cache(maxsize=None)
def _get_pool():
- return ThreadPoolExecutor(max_workers=10)
+    return ThreadPoolExecutor(max_workers=20)  # NOTE: can increase for better parallelism
class RNACentralSearch(BaseSearcher):
"""
@@ -35,12 +35,28 @@ class RNACentralSearch(BaseSearcher):
API Documentation: https://rnacentral.org/api/v1
"""
- def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db"):
+ def __init__(
+ self,
+ use_local_blast: bool = False,
+ local_blast_db: str = "rna_db",
+ api_timeout: int = 5,
+ metadata_db_file: Optional[str] = None,
+ blast_num_threads: int = 4
+ ):
super().__init__()
self.base_url = "https://rnacentral.org/api/v1"
self.headers = {"Accept": "application/json"}
self.use_local_blast = use_local_blast
self.local_blast_db = local_blast_db
+ self.api_timeout = api_timeout
+ self.metadata_db_file = metadata_db_file
+ self.blast_num_threads = blast_num_threads # Number of threads for BLAST search
+
+ # Load pre-built metadata database if provided
+ self._metadata_db: Optional[Dict[str, Optional[dict]]] = None
+ if self.metadata_db_file:
+ self._load_metadata_db()
+
if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"):
logger.error("Local BLAST database files not found. Please check the path.")
self.use_local_blast = False
@@ -142,22 +158,60 @@ def _calculate_md5(sequence: str) -> str:
return hashlib.md5(normalized_seq.encode("ascii")).hexdigest()
+ def _load_metadata_db(self) -> None:
+ """Load pre-built metadata database from file."""
+ if not self.metadata_db_file:
+ return
+
+ try:
+ if os.path.isfile(self.metadata_db_file):
+ self._metadata_db = load_json(self.metadata_db_file)
+ if self._metadata_db and isinstance(self._metadata_db, dict):
+ logger.info("Loaded %d RNA ID entries from metadata database: %s",
+ len(self._metadata_db), self.metadata_db_file)
+ else:
+ logger.warning("Metadata database file %s exists but contains invalid data",
+ self.metadata_db_file)
+ self._metadata_db = None
+ else:
+ logger.warning("Metadata database file not found: %s", self.metadata_db_file)
+ logger.info("To build the database, run: python -m graphgen.models.searcher.db.build_rna_metadata_db")
+ except Exception as e:
+ logger.warning("Failed to load metadata database from %s: %s", self.metadata_db_file, e)
+ self._metadata_db = None
+
def get_by_rna_id(self, rna_id: str) -> Optional[dict]:
"""
Get RNA information by RNAcentral ID.
+ First checks pre-built metadata database if available, then falls back to API.
:param rna_id: RNAcentral ID (e.g., URS0000000001).
:return: A dictionary containing RNA information or None if not found.
"""
+ # Check pre-built metadata database first
+ if self._metadata_db is not None:
+ if rna_id in self._metadata_db:
+ result = self._metadata_db[rna_id]
+ logger.debug("Found RNA ID %s in metadata database", rna_id)
+ return result
+ else:
+ logger.debug("RNA ID %s not found in metadata database, skipping API call", rna_id)
+ return None
+
+ # Fall back to API if metadata database not available
try:
url = f"{self.base_url}/rna/{rna_id}"
url += "?flat=true"
- resp = requests.get(url, headers=self.headers, timeout=30)
+ resp = requests.get(url, headers=self.headers, timeout=self.api_timeout)
resp.raise_for_status()
rna_data = resp.json()
xrefs_data = rna_data.get("xrefs", [])
- return self._rna_data_to_dict(rna_id, rna_data, xrefs_data)
+ result = self._rna_data_to_dict(rna_id, rna_data, xrefs_data)
+ return result
+ except requests.Timeout as e:
+ logger.warning("Timeout getting RNA ID %s (timeout=%ds): %s", rna_id, self.api_timeout, e)
+ return None
except requests.RequestException as e:
logger.error("Network error getting RNA ID %s: %s", rna_id, e)
return None
@@ -179,7 +233,7 @@ def get_best_hit(self, keyword: str) -> Optional[dict]:
try:
url = f"{self.base_url}/rna"
params = {"search": keyword, "format": "json"}
- resp = requests.get(url, params=params, headers=self.headers, timeout=30)
+ resp = requests.get(url, params=params, headers=self.headers, timeout=self.api_timeout)
resp.raise_for_status()
data = resp.json()
@@ -207,22 +261,54 @@ def get_best_hit(self, keyword: str) -> Optional[dict]:
return None
def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
- """Perform local BLAST search using local BLAST database."""
+ """
+ Perform local BLAST search using local BLAST database.
+ Optimized with multi-threading and faster output format.
+ """
try:
+ # Use temporary file for query sequence
with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp:
tmp.write(f">query\n{seq}\n")
tmp_name = tmp.name
+ # Optimized BLAST command with:
+ # - num_threads: Use multiple threads for faster search
+ # - outfmt 6 sacc: Only return accession (minimal output)
+ # - max_target_seqs 1: Only need the best hit
+ # - evalue: Threshold for significance
cmd = [
"blastn", "-db", self.local_blast_db, "-query", tmp_name,
- "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc"
+ "-evalue", str(threshold),
+ "-max_target_seqs", "1",
+ "-num_threads", str(self.blast_num_threads),
+ "-outfmt", "6 sacc" # Only accession, tab-separated
]
- logger.debug("Running local blastn for RNA: %s", " ".join(cmd))
- out = subprocess.check_output(cmd, text=True).strip()
+ logger.debug("Running local blastn for RNA (threads=%d): %s",
+ self.blast_num_threads, " ".join(cmd))
+
+ # Run BLAST with timeout to avoid hanging
+ try:
+ out = subprocess.check_output(
+ cmd,
+ text=True,
+ timeout=300, # 5 minute timeout for BLAST search
+ stderr=subprocess.DEVNULL # Suppress BLAST warnings to reduce I/O
+ ).strip()
+ except subprocess.TimeoutExpired:
+ logger.warning("BLAST search timed out after 5 minutes for sequence")
+ os.remove(tmp_name)
+ return None
+
os.remove(tmp_name)
return out.split("\n", maxsplit=1)[0] if out else None
except Exception as exc:
logger.error("Local blastn failed: %s", exc)
+ # Clean up temp file if it still exists
+ try:
+ if 'tmp_name' in locals():
+ os.remove(tmp_name)
+ except Exception:
+ pass
return None
def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]:
@@ -254,7 +340,15 @@ def _extract_sequence(sequence: str) -> Optional[str]:
accession = self._local_blast(seq, threshold)
if accession:
logger.debug("Local BLAST found accession: %s", accession)
- return self.get_by_rna_id(accession)
+ detailed = self.get_by_rna_id(accession)
+ if detailed:
+ return detailed
+ logger.info(
+ "Local BLAST found accession %s but metadata not available in database. "
+ "API fallback disabled when using local database.",
+ accession
+ )
+ return None
logger.info(
"Local BLAST found no match for sequence. "
"API fallback disabled when using local database."
@@ -280,7 +374,12 @@ def _extract_sequence(sequence: str) -> Optional[str]:
rna_id = results[0].get("rnacentral_id")
if rna_id:
- return self.get_by_rna_id(rna_id)
+ detailed = self.get_by_rna_id(rna_id)
+ if detailed:
+ return detailed
+ # Fallback: use search result data if get_by_rna_id returns None
+ logger.debug("Using search result data for %s (get_by_rna_id returned None)", rna_id)
+ return self._rna_data_to_dict(rna_id, results[0])
logger.error("No RNAcentral ID found in search results.")
return None
diff --git a/graphgen/models/searcher/db/uniprot_searcher.py b/graphgen/models/searcher/db/uniprot_searcher.py
index a1ae2fe8..df6a7f10 100644
--- a/graphgen/models/searcher/db/uniprot_searcher.py
+++ b/graphgen/models/searcher/db/uniprot_searcher.py
@@ -6,7 +6,7 @@
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
from io import StringIO
-from typing import Dict, Optional
+from typing import Dict, Optional, List
from Bio import ExPASy, SeqIO, SwissProt, UniProt
from Bio.Blast import NCBIWWW, NCBIXML
@@ -19,12 +19,12 @@
)
from graphgen.bases import BaseSearcher
-from graphgen.utils import logger
+from graphgen.utils import logger, load_json
@lru_cache(maxsize=None)
def _get_pool():
- return ThreadPoolExecutor(max_workers=10)
+    return ThreadPoolExecutor(max_workers=20)  # NOTE: can increase for better parallelism
# ensure only one BLAST searcher at a time
@@ -39,15 +39,76 @@ class UniProtSearch(BaseSearcher):
3) Search with FASTA sequence (BLAST searcher). Note that NCBIWWW does not support async.
"""
- def __init__(self, use_local_blast: bool = False, local_blast_db: str = "sp_db"):
+ def __init__(
+ self,
+ use_local_blast: bool = False,
+ local_blast_db: str = "sp_db",
+ metadata_db_file: Optional[str] = None,
+ blast_num_threads: int = 4
+ ):
super().__init__()
self.use_local_blast = use_local_blast
self.local_blast_db = local_blast_db
+ self.metadata_db_file = metadata_db_file
+ self.blast_num_threads = blast_num_threads # Number of threads for BLAST search
+
+ # Load pre-built metadata database if provided
+ self._metadata_db: Optional[Dict[str, Optional[dict]]] = None
+ self._search_index: Optional[Dict[str, List[str]]] = None
+ if self.metadata_db_file:
+ self._load_metadata_db()
+
if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.phr"):
logger.error("Local BLAST database files not found. Please check the path.")
self.use_local_blast = False
+ def _load_metadata_db(self) -> None:
+ """Load pre-built metadata database from file."""
+ if not self.metadata_db_file:
+ return
+
+ try:
+ if os.path.isfile(self.metadata_db_file):
+ data = load_json(self.metadata_db_file)
+ if data and isinstance(data, dict):
+ # New format with metadata and search_index
+ if "metadata" in data:
+ self._metadata_db = data["metadata"]
+ self._search_index = data.get("search_index", {})
+ else:
+ # Legacy format - assume entire dict is metadata
+ self._metadata_db = data
+ self._search_index = {}
+
+ if self._metadata_db:
+ logger.info("Loaded %d protein entries from metadata database: %s",
+ len(self._metadata_db), self.metadata_db_file)
+ if self._search_index:
+ logger.info("Loaded search index with %d keywords", len(self._search_index))
+ else:
+ logger.warning("Metadata database file %s exists but contains invalid data",
+ self.metadata_db_file)
+ self._metadata_db = None
+ self._search_index = None
+ else:
+ logger.warning("Metadata database file not found: %s", self.metadata_db_file)
+ logger.info("To build the database, run: python -m graphgen.models.searcher.db.build_protein_metadata_db")
+ except Exception as e:
+ logger.warning("Failed to load metadata database from %s: %s", self.metadata_db_file, e)
+ self._metadata_db = None
+ self._search_index = None
+
def get_by_accession(self, accession: str) -> Optional[dict]:
+ # Check pre-built metadata database first
+ if self._metadata_db is not None:
+ if accession in self._metadata_db:
+ result = self._metadata_db[accession]
+ logger.debug("Found accession %s in metadata database", accession)
+ return result
+ else:
+ logger.debug("Accession %s not found in metadata database, falling back to API", accession)
+
+ # Fall back to API if metadata database not available or not found
try:
handle = ExPASy.get_sprot_raw(accession)
record = SwissProt.read(handle)
@@ -85,12 +146,52 @@ def _swissprot_to_dict(record: SwissProt.Record) -> dict:
def get_best_hit(self, keyword: str) -> Optional[Dict]:
"""
Search UniProt with a keyword and return the best hit.
+ First tries local metadata database if available, then falls back to API.
:param keyword: The searcher keyword.
:return: A dictionary containing the best hit information or None if not found.
"""
if not keyword.strip():
return None
+ # Try local metadata database first if available
+ if self._search_index is not None and self._metadata_db is not None:
+ keyword_lower = keyword.lower().strip()
+
+ # Direct match
+ if keyword_lower in self._search_index:
+ accession_ids = self._search_index[keyword_lower]
+ if accession_ids:
+ accession = accession_ids[0] # Get first match
+ result = self._metadata_db.get(accession)
+ if result:
+ logger.debug("Found keyword '%s' in local database: %s", keyword, accession)
+ return result
+
+ # Partial match - search for keywords that contain the search term
+ matching_accessions = []
+ for index_keyword, accessions in self._search_index.items():
+ if keyword_lower in index_keyword or index_keyword in keyword_lower:
+ matching_accessions.extend(accessions)
+
+ if matching_accessions:
+ # Remove duplicates while preserving order
+ seen = set()
+ unique_accessions = []
+ for acc in matching_accessions:
+ if acc not in seen:
+ seen.add(acc)
+ unique_accessions.append(acc)
+
+ # Try each match until we find a valid result
+ for accession in unique_accessions[:10]: # Limit to first 10 matches
+ result = self._metadata_db.get(accession)
+ if result:
+ logger.debug("Found keyword '%s' via partial match in local database: %s", keyword, accession)
+ return result
+
+ logger.debug("Keyword '%s' not found in local database, falling back to API", keyword)
+
+ # Fall back to API search
try:
iterator = UniProt.search(keyword, fields=None, batch_size=1)
hit = next(iterator, None)
@@ -175,6 +276,7 @@ def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]:
def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
"""
Perform local BLAST search using local BLAST database.
+ Optimized with multi-threading and faster output format.
:param seq: The protein sequence.
:param threshold: E-value threshold for BLAST searcher.
:return: The accession number of the best hit or None if not found.
@@ -186,6 +288,11 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
tmp.write(f">query\n{seq}\n")
tmp_name = tmp.name
+ # Optimized BLAST command with:
+ # - num_threads: Use multiple threads for faster search
+ # - outfmt 6 sacc: Only return accession (minimal output)
+ # - max_target_seqs 1: Only need the best hit
+ # - evalue: Threshold for significance
cmd = [
"blastp",
"-db",
@@ -196,11 +303,27 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
str(threshold),
"-max_target_seqs",
"1",
+ "-num_threads",
+ str(self.blast_num_threads),
"-outfmt",
- "6 sacc", # only return accession
+ "6 sacc", # Only accession, tab-separated
]
- logger.debug("Running local blastp: %s", " ".join(cmd))
- out = subprocess.check_output(cmd, text=True).strip()
+ logger.debug("Running local blastp (threads=%d): %s",
+ self.blast_num_threads, " ".join(cmd))
+
+ # Run BLAST with timeout to avoid hanging
+ try:
+ out = subprocess.check_output(
+ cmd,
+ text=True,
+ timeout=300, # 5 minute timeout for BLAST search
+ stderr=subprocess.DEVNULL # Suppress BLAST warnings to reduce I/O
+ ).strip()
+ except subprocess.TimeoutExpired:
+ logger.warning("BLAST search timed out after 5 minutes for sequence")
+ os.remove(tmp_name)
+ return None
+
os.remove(tmp_name)
if out:
return out.split("\n", maxsplit=1)[0]
@@ -239,13 +362,23 @@ async def search(
if query.startswith(">") or re.fullmatch(
r"[ACDEFGHIKLMNPQRSTVWY\s]+", query, re.I
):
- async with _blast_lock:
+ # Only use lock for network BLAST (NCBIWWW), local BLAST can run in parallel
+ if self.use_local_blast:
+ # Local BLAST can run in parallel, no lock needed
result = await loop.run_in_executor(
_get_pool(), self.get_by_fasta, query, threshold
)
+ else:
+ # Network BLAST needs lock to respect rate limits
+ async with _blast_lock:
+ result = await loop.run_in_executor(
+ _get_pool(), self.get_by_fasta, query, threshold
+ )
# check if accession number
- elif re.fullmatch(r"[A-NR-Z0-9]{6,10}", query, re.I):
+ # UniProt accession IDs: 6-10 characters, must start with a letter
+ # Format: [A-Z][A-Z0-9]{5,9} (6-10 chars total: 1 letter + 5-9 alphanumeric)
+ elif re.fullmatch(r"[A-Z][A-Z0-9]{5,9}", query, re.I):
result = await loop.run_in_executor(
_get_pool(), self.get_by_accession, query
)
From af49ba2e9367e4774eb22fa95c81b15b40fdfc42 Mon Sep 17 00:00:00 2001
From: CHERRY-ui8 <2693275288@qq.com>
Date: Sat, 13 Dec 2025 04:00:43 +0800
Subject: [PATCH 09/16] add: enable periodic auto-save in searcher
---
graphgen/graphgen.py | 41 ++++++++++++++++++---
graphgen/operators/search/search_all.py | 47 ++++++++++++++++++++++++-
graphgen/utils/run_concurrent.py | 37 +++++++++++++++++++
3 files changed, 120 insertions(+), 5 deletions(-)
diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py
index bc7e7742..188a5d90 100644
--- a/graphgen/graphgen.py
+++ b/graphgen/graphgen.py
@@ -1,3 +1,4 @@
+import hashlib
import os
import time
from typing import Dict
@@ -173,20 +174,52 @@ async def search(self, search_config: Dict):
if len(seeds) == 0:
            logger.warning("All documents have already been searched")
return
+
+ # Get save_interval from config (default: 1000, 0 to disable)
+ save_interval = search_config.get("save_interval", 1000)
+
search_results = await search_all(
seed_data=seeds,
search_config=search_config,
+ search_storage=self.search_storage if save_interval > 0 else None,
+ save_interval=save_interval,
)
- _add_search_keys = self.search_storage.filter_keys(list(search_results.keys()))
+ # Convert search_results from {data_source: [results]} to {key: result}
+ # This maintains backward compatibility
+ flattened_results = {}
+ for data_source, result_list in search_results.items():
+ if not isinstance(result_list, list):
+ continue
+ for result in result_list:
+ if result is None:
+ continue
+ # Use _search_query as key if available, otherwise generate a key
+ if isinstance(result, dict) and "_search_query" in result:
+ query = result["_search_query"]
+ key = f"{data_source}:{query}"
+ else:
+ # Generate a unique key
+ result_str = str(result)
+ key_hash = hashlib.md5(result_str.encode()).hexdigest()[:8]
+ key = f"{data_source}:{key_hash}"
+ flattened_results[key] = result
+
+ _add_search_keys = self.search_storage.filter_keys(list(flattened_results.keys()))
search_results = {
- k: v for k, v in search_results.items() if k in _add_search_keys
+ k: v for k, v in flattened_results.items() if k in _add_search_keys
}
if len(search_results) == 0:
logger.warning("All search results are already in the storage")
return
- self.search_storage.upsert(search_results)
- self.search_storage.index_done_callback()
+
+ # Only save if not using periodic saving (to avoid duplicate saves)
+ if save_interval == 0:
+ self.search_storage.upsert(search_results)
+ self.search_storage.index_done_callback()
+ else:
+ # Results were already saved periodically, just update index
+ self.search_storage.index_done_callback()
@async_to_sync_method
async def quiz_and_judge(self, quiz_and_judge_config: Dict):
diff --git a/graphgen/operators/search/search_all.py b/graphgen/operators/search/search_all.py
index 6017cfee..17b6a417 100644
--- a/graphgen/operators/search/search_all.py
+++ b/graphgen/operators/search/search_all.py
@@ -15,12 +15,16 @@
async def search_all(
seed_data: dict,
search_config: dict,
+ search_storage=None,
+ save_interval: int = 1000,
) -> dict:
"""
Perform searches across multiple search types and aggregate the results.
:param seed_data: A dictionary containing seed data with entity names.
:param search_config: A dictionary specifying which data sources to use for searching.
- :return: A dictionary with
+ :param search_storage: Optional storage instance for periodic saving of results.
+ :param save_interval: Number of search results to accumulate before saving (default: 1000, 0 to disable).
+ :return: A dictionary with search results
"""
results = {}
@@ -31,6 +35,41 @@ async def search_all(
data = [d["content"] for d in data if "content" in d]
data = list(set(data)) # Remove duplicates
+ # Prepare save callback for this data source
+ def make_save_callback(source_name):
+ def save_callback(intermediate_results, completed_count):
+ """Save intermediate search results."""
+ if search_storage is None:
+ return
+
+ # Convert results list to dict format
+ # Results are tuples of (query, result_dict) or just result_dict
+ batch_results = {}
+ for result in intermediate_results:
+ if result is None:
+ continue
+ # Check if result is a dict with _search_query key
+ if isinstance(result, dict) and "_search_query" in result:
+ query = result["_search_query"]
+ # Create a key for the result (using query as key)
+ key = f"{source_name}:{query}"
+ batch_results[key] = result
+ elif isinstance(result, dict):
+ # If no _search_query, use a generated key
+ key = f"{source_name}:{completed_count}"
+ batch_results[key] = result
+
+ if batch_results:
+ # Filter out already existing keys
+ new_keys = search_storage.filter_keys(list(batch_results.keys()))
+ new_results = {k: v for k, v in batch_results.items() if k in new_keys}
+ if new_results:
+ search_storage.upsert(new_results)
+ search_storage.index_done_callback()
+ logger.debug("Saved %d intermediate results for %s", len(new_results), source_name)
+
+ return save_callback
+
if data_source == "uniprot":
from graphgen.models import UniProtSearch
@@ -43,6 +82,8 @@ async def search_all(
data,
desc="Searching UniProt database",
unit="keyword",
+ save_interval=save_interval if save_interval > 0 else 0,
+ save_callback=make_save_callback("uniprot") if search_storage and save_interval > 0 else None,
)
results[data_source] = uniprot_results
@@ -58,6 +99,8 @@ async def search_all(
data,
desc="Searching NCBI database",
unit="keyword",
+ save_interval=save_interval if save_interval > 0 else 0,
+ save_callback=make_save_callback("ncbi") if search_storage and save_interval > 0 else None,
)
results[data_source] = ncbi_results
@@ -73,6 +116,8 @@ async def search_all(
data,
desc="Searching RNAcentral database",
unit="keyword",
+ save_interval=save_interval if save_interval > 0 else 0,
+ save_callback=make_save_callback("rnacentral") if search_storage and save_interval > 0 else None,
)
results[data_source] = rnacentral_results
diff --git a/graphgen/utils/run_concurrent.py b/graphgen/utils/run_concurrent.py
index ac63f87b..6ea949b6 100644
--- a/graphgen/utils/run_concurrent.py
+++ b/graphgen/utils/run_concurrent.py
@@ -17,11 +17,26 @@ async def run_concurrent(
desc: str = "processing",
unit: str = "item",
progress_bar: Optional[gr.Progress] = None,
+ save_interval: int = 0,
+ save_callback: Optional[Callable[[List[R], int], None]] = None,
) -> List[R]:
+ """
+ Run coroutines concurrently with optional periodic saving.
+
+ :param coro_fn: Coroutine function to run for each item
+ :param items: List of items to process
+ :param desc: Description for progress bar
+ :param unit: Unit name for progress bar
+ :param progress_bar: Optional Gradio progress bar
+ :param save_interval: Number of completed tasks before calling save_callback (0 to disable)
+ :param save_callback: Callback function to save intermediate results (results, completed_count)
+ :return: List of results
+ """
tasks = [asyncio.create_task(coro_fn(it)) for it in items]
completed_count = 0
results = []
+ pending_save_results = []
pbar = tqdm_async(total=len(items), desc=desc, unit=unit)
@@ -32,6 +47,8 @@ async def run_concurrent(
try:
result = await future
results.append(result)
+ if save_interval > 0 and save_callback is not None:
+ pending_save_results.append(result)
except Exception as e: # pylint: disable=broad-except
logger.exception("Task failed: %s", e)
# even if failed, record it to keep results consistent with tasks
@@ -44,11 +61,31 @@ async def run_concurrent(
progress = completed_count / len(items)
progress_bar(progress, desc=f"{desc} ({completed_count}/{len(items)})")
+ # Periodic save
+ if save_interval > 0 and save_callback is not None and completed_count % save_interval == 0:
+ try:
+ # Filter out exceptions before saving
+ valid_results = [res for res in pending_save_results if not isinstance(res, Exception)]
+ save_callback(valid_results, completed_count)
+ pending_save_results = [] # Clear after saving
+ logger.info("Saved intermediate results: %d/%d completed", completed_count, len(items))
+ except Exception as e:
+ logger.warning("Failed to save intermediate results: %s", e)
+
pbar.close()
if progress_bar is not None:
progress_bar(1.0, desc=f"{desc} (completed)")
+ # Save remaining results if any
+ if save_interval > 0 and save_callback is not None and pending_save_results:
+ try:
+ valid_results = [res for res in pending_save_results if not isinstance(res, Exception)]
+ save_callback(valid_results, completed_count)
+ logger.info("Saved final intermediate results: %d completed", completed_count)
+ except Exception as e:
+ logger.warning("Failed to save final intermediate results: %s", e)
+
# filter out exceptions
results = [res for res in results if not isinstance(res, Exception)]
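
A minimal usage sketch of the new periodic-save hooks follows; the fake searcher, the print-based callback, and the import path are stand-ins, since in the pipeline search_all builds the callback around search_storage.

    # Usage sketch for run_concurrent's save hooks (stand-in searcher and callback).
    import asyncio
    from graphgen.utils.run_concurrent import run_concurrent  # import path assumed

    async def fake_search(query: str) -> dict:
        await asyncio.sleep(0)                      # stands in for a network/BLAST call
        return {"_search_query": query, "hit": None}

    def save_batch(results, completed_count):
        # The real callback upserts into search_storage; here we just report.
        print(f"flushing {len(results)} results after {completed_count} completions")

    async def main():
        queries = [f"seq_{i}" for i in range(25)]
        results = await run_concurrent(
            fake_search,
            queries,
            desc="demo search",
            unit="query",
            save_interval=10,        # flush every 10 completed tasks
            save_callback=save_batch,
        )
        print(len(results), "results kept")

    asyncio.run(main())
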
From af71e07d7bb708aa6b79c86891891fbe5e639163 Mon Sep 17 00:00:00 2001
From: CHERRY-ui8 <2693275288@qq.com>
Date: Sat, 13 Dec 2025 19:38:39 +0800
Subject: [PATCH 10/16] fix: accept both U and T as RNA seq
---
graphgen/models/searcher/db/rnacentral_searcher.py | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py
index 3de2fd0f..d0b8c4f0 100644
--- a/graphgen/models/searcher/db/rnacentral_searcher.py
+++ b/graphgen/models/searcher/db/rnacentral_searcher.py
@@ -327,7 +327,8 @@ def _extract_sequence(sequence: str) -> Optional[str]:
seq = "".join(seq_lines[1:])
else:
seq = sequence.strip().replace(" ", "").replace("\n", "")
- return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None
+ # Accept both U (original RNA) and T (converted for local BLAST compatibility)
+ return seq if seq and re.fullmatch(r"[AUCGTN\s]+", seq, re.I) else None
try:
seq = _extract_sequence(sequence)
@@ -404,10 +405,13 @@ async def search(self, query: str, threshold: float = 0.1, **kwargs) -> Optional
loop = asyncio.get_running_loop()
- # check if RNA sequence (AUCG characters, contains U)
- if query.startswith(">") or (
- re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper()
- ):
+ # check if RNA sequence (AUCG or ATCG characters, contains U or T)
+        # Note: sequences written with T instead of U are still treated as RNA
+ is_rna_sequence = query.startswith(">") or (
+ re.fullmatch(r"[AUCGTN\s]+", query, re.I) and
+ ("U" in query.upper() or "T" in query.upper())
+ )
+ if is_rna_sequence:
result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold)
# check if RNAcentral ID (typically starts with URS)
elif re.fullmatch(r"URS\d+", query, re.I):
From 3a0f02dca9f96381b5b380207101db7e1fdeb937 Mon Sep 17 00:00:00 2001
From: CHERRY-ui8 <2693275288@qq.com>
Date: Sun, 14 Dec 2025 14:06:39 +0800
Subject: [PATCH 11/16] add: add blast threads and search max_concurrent
---
graphgen/configs/search_dna_config.yaml | 2 +
graphgen/configs/search_protein_config.yaml | 2 +
graphgen/configs/search_rna_config.yaml | 2 +
graphgen/models/searcher/db/ncbi_searcher.py | 42 +++++++-
.../models/searcher/db/rnacentral_searcher.py | 50 +---------
.../models/searcher/db/uniprot_searcher.py | 96 +------------------
graphgen/operators/search/search_all.py | 21 +++-
graphgen/utils/run_concurrent.py | 24 ++++-
8 files changed, 90 insertions(+), 149 deletions(-)
diff --git a/graphgen/configs/search_dna_config.yaml b/graphgen/configs/search_dna_config.yaml
index f53a5eb8..82368754 100644
--- a/graphgen/configs/search_dna_config.yaml
+++ b/graphgen/configs/search_dna_config.yaml
@@ -14,4 +14,6 @@ pipeline:
tool: GraphGen # tool name for NCBI API
use_local_blast: true # whether to use local blast for DNA search
local_blast_db: refseq_release/refseq_release # path to local BLAST database (without .nhr extension)
+ blast_num_threads: 2 # number of threads for BLAST search (reduce to save memory)
+ max_concurrent: 5 # maximum number of concurrent search tasks (reduce to prevent OOM, default: unlimited)
diff --git a/graphgen/configs/search_protein_config.yaml b/graphgen/configs/search_protein_config.yaml
index bfbf84eb..ed04ff12 100644
--- a/graphgen/configs/search_protein_config.yaml
+++ b/graphgen/configs/search_protein_config.yaml
@@ -13,3 +13,5 @@ pipeline:
use_local_blast: true # whether to use local blast for uniprot search
local_blast_db: /your_path/2024_01/uniprot_sprot # format: /path/to/${RELEASE}/uniprot_sprot
# options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database)
+ blast_num_threads: 2 # number of threads for BLAST search (reduce to save memory)
+ max_concurrent: 5 # maximum number of concurrent search tasks (reduce to prevent OOM, default: unlimited)
diff --git a/graphgen/configs/search_rna_config.yaml b/graphgen/configs/search_rna_config.yaml
index 10422988..83bbca7d 100644
--- a/graphgen/configs/search_rna_config.yaml
+++ b/graphgen/configs/search_rna_config.yaml
@@ -12,3 +12,5 @@ pipeline:
rnacentral_params:
use_local_blast: true # whether to use local blast for RNA search
local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD # path to local BLAST database (without .nhr extension)
+ blast_num_threads: 2 # number of threads for BLAST search (reduce to save memory)
+ max_concurrent: 5 # maximum number of concurrent search tasks (reduce to prevent OOM, default: unlimited)
diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py
index 89217e66..0bacbfaf 100644
--- a/graphgen/models/searcher/db/ncbi_searcher.py
+++ b/graphgen/models/searcher/db/ncbi_searcher.py
@@ -49,6 +49,7 @@ def __init__(
email: str = "email@example.com",
api_key: str = "",
tool: str = "GraphGen",
+ blast_num_threads: int = 4,
):
"""
Initialize the NCBI Search client.
@@ -59,6 +60,7 @@ def __init__(
email (str): Email address for NCBI API requests.
api_key (str): API key for NCBI API requests, see https://account.ncbi.nlm.nih.gov/settings/.
tool (str): Tool name for NCBI API requests.
+ blast_num_threads (int): Number of threads for BLAST search.
"""
super().__init__()
Entrez.timeout = 60 # 60 seconds timeout
@@ -70,6 +72,7 @@ def __init__(
Entrez.sleep_between_tries = 5
self.use_local_blast = use_local_blast
self.local_blast_db = local_blast_db
+ self.blast_num_threads = blast_num_threads
if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"):
logger.error("Local BLAST database files not found. Please check the path.")
self.use_local_blast = False
@@ -329,22 +332,53 @@ def get_best_hit(self, keyword: str) -> Optional[dict]:
return None
def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
- """Perform local BLAST search using local BLAST database."""
+ """
+ Perform local BLAST search using local BLAST database.
+ Optimized with multi-threading and faster output format.
+ """
try:
with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp:
tmp.write(f">query\n{seq}\n")
tmp_name = tmp.name
+ # Optimized BLAST command with:
+ # - num_threads: Use multiple threads for faster search
+ # - outfmt 6 sacc: Only return accession (minimal output)
+ # - max_target_seqs 1: Only need the best hit
+ # - evalue: Threshold for significance
cmd = [
"blastn", "-db", self.local_blast_db, "-query", tmp_name,
- "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc"
+ "-evalue", str(threshold),
+ "-max_target_seqs", "1",
+ "-num_threads", str(self.blast_num_threads),
+ "-outfmt", "6 sacc" # Only accession, tab-separated
]
- logger.debug("Running local blastn: %s", " ".join(cmd))
- out = subprocess.check_output(cmd, text=True).strip()
+ logger.debug("Running local blastn (threads=%d): %s",
+ self.blast_num_threads, " ".join(cmd))
+
+ # Run BLAST with timeout to avoid hanging
+ try:
+ out = subprocess.check_output(
+ cmd,
+ text=True,
+ timeout=300, # 5 minute timeout for BLAST search
+ stderr=subprocess.DEVNULL # Suppress BLAST warnings to reduce I/O
+ ).strip()
+ except subprocess.TimeoutExpired:
+ logger.warning("BLAST search timed out after 5 minutes for sequence")
+ os.remove(tmp_name)
+ return None
+
os.remove(tmp_name)
return out.split("\n", maxsplit=1)[0] if out else None
except Exception as exc:
logger.error("Local blastn failed: %s", exc)
+ # Clean up temp file if it still exists
+ try:
+ if 'tmp_name' in locals():
+ os.remove(tmp_name)
+ except Exception:
+ pass
return None
def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]:
diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py
index d0b8c4f0..8e409ed6 100644
--- a/graphgen/models/searcher/db/rnacentral_searcher.py
+++ b/graphgen/models/searcher/db/rnacentral_searcher.py
@@ -18,7 +18,7 @@
)
from graphgen.bases import BaseSearcher
-from graphgen.utils import logger, load_json
+from graphgen.utils import logger
@lru_cache(maxsize=None)
@@ -39,8 +39,7 @@ def __init__(
self,
use_local_blast: bool = False,
local_blast_db: str = "rna_db",
- api_timeout: int = 5,
- metadata_db_file: Optional[str] = None,
+ api_timeout: int = 30,
blast_num_threads: int = 4
):
super().__init__()
@@ -49,14 +48,8 @@ def __init__(
self.use_local_blast = use_local_blast
self.local_blast_db = local_blast_db
self.api_timeout = api_timeout
- self.metadata_db_file = metadata_db_file
self.blast_num_threads = blast_num_threads # Number of threads for BLAST search
- # Load pre-built metadata database if provided
- self._metadata_db: Optional[Dict[str, Optional[dict]]] = None
- if self.metadata_db_file:
- self._load_metadata_db()
-
if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"):
logger.error("Local BLAST database files not found. Please check the path.")
self.use_local_blast = False
@@ -158,46 +151,12 @@ def _calculate_md5(sequence: str) -> str:
return hashlib.md5(normalized_seq.encode("ascii")).hexdigest()
- def _load_metadata_db(self) -> None:
- """Load pre-built metadata database from file."""
- if not self.metadata_db_file:
- return
-
- try:
- if os.path.isfile(self.metadata_db_file):
- self._metadata_db = load_json(self.metadata_db_file)
- if self._metadata_db and isinstance(self._metadata_db, dict):
- logger.info("Loaded %d RNA ID entries from metadata database: %s",
- len(self._metadata_db), self.metadata_db_file)
- else:
- logger.warning("Metadata database file %s exists but contains invalid data",
- self.metadata_db_file)
- self._metadata_db = None
- else:
- logger.warning("Metadata database file not found: %s", self.metadata_db_file)
- logger.info("To build the database, run: python -m graphgen.models.searcher.db.build_rna_metadata_db")
- except Exception as e:
- logger.warning("Failed to load metadata database from %s: %s", self.metadata_db_file, e)
- self._metadata_db = None
-
def get_by_rna_id(self, rna_id: str) -> Optional[dict]:
"""
Get RNA information by RNAcentral ID.
- First checks pre-built metadata database if available, then falls back to API.
:param rna_id: RNAcentral ID (e.g., URS0000000001).
:return: A dictionary containing RNA information or None if not found.
"""
- # Check pre-built metadata database first
- if self._metadata_db is not None:
- if rna_id in self._metadata_db:
- result = self._metadata_db[rna_id]
- logger.debug("Found RNA ID %s in metadata database", rna_id)
- return result
- else:
- logger.debug("RNA ID %s not found in metadata database, skipping API call", rna_id)
- return None
-
- # Fall back to API if metadata database not available
try:
url = f"{self.base_url}/rna/{rna_id}"
url += "?flat=true"
@@ -327,7 +286,7 @@ def _extract_sequence(sequence: str) -> Optional[str]:
seq = "".join(seq_lines[1:])
else:
seq = sequence.strip().replace(" ", "").replace("\n", "")
- # Accept both U (original RNA) and T (converted for local BLAST compatibility)
+ # Accept both U (original RNA) and T
return seq if seq and re.fullmatch(r"[AUCGTN\s]+", seq, re.I) else None
try:
@@ -345,8 +304,7 @@ def _extract_sequence(sequence: str) -> Optional[str]:
if detailed:
return detailed
logger.info(
- "Local BLAST found accession %s but metadata not available in database. "
- "API fallback disabled when using local database.",
+ "Local BLAST found accession %s but could not retrieve metadata from API.",
accession
)
return None
diff --git a/graphgen/models/searcher/db/uniprot_searcher.py b/graphgen/models/searcher/db/uniprot_searcher.py
index df6a7f10..d39031d3 100644
--- a/graphgen/models/searcher/db/uniprot_searcher.py
+++ b/graphgen/models/searcher/db/uniprot_searcher.py
@@ -19,7 +19,7 @@
)
from graphgen.bases import BaseSearcher
-from graphgen.utils import logger, load_json
+from graphgen.utils import logger
@lru_cache(maxsize=None)
@@ -43,72 +43,18 @@ def __init__(
self,
use_local_blast: bool = False,
local_blast_db: str = "sp_db",
- metadata_db_file: Optional[str] = None,
blast_num_threads: int = 4
):
super().__init__()
self.use_local_blast = use_local_blast
self.local_blast_db = local_blast_db
- self.metadata_db_file = metadata_db_file
self.blast_num_threads = blast_num_threads # Number of threads for BLAST search
- # Load pre-built metadata database if provided
- self._metadata_db: Optional[Dict[str, Optional[dict]]] = None
- self._search_index: Optional[Dict[str, List[str]]] = None
- if self.metadata_db_file:
- self._load_metadata_db()
-
if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.phr"):
logger.error("Local BLAST database files not found. Please check the path.")
self.use_local_blast = False
- def _load_metadata_db(self) -> None:
- """Load pre-built metadata database from file."""
- if not self.metadata_db_file:
- return
-
- try:
- if os.path.isfile(self.metadata_db_file):
- data = load_json(self.metadata_db_file)
- if data and isinstance(data, dict):
- # New format with metadata and search_index
- if "metadata" in data:
- self._metadata_db = data["metadata"]
- self._search_index = data.get("search_index", {})
- else:
- # Legacy format - assume entire dict is metadata
- self._metadata_db = data
- self._search_index = {}
-
- if self._metadata_db:
- logger.info("Loaded %d protein entries from metadata database: %s",
- len(self._metadata_db), self.metadata_db_file)
- if self._search_index:
- logger.info("Loaded search index with %d keywords", len(self._search_index))
- else:
- logger.warning("Metadata database file %s exists but contains invalid data",
- self.metadata_db_file)
- self._metadata_db = None
- self._search_index = None
- else:
- logger.warning("Metadata database file not found: %s", self.metadata_db_file)
- logger.info("To build the database, run: python -m graphgen.models.searcher.db.build_protein_metadata_db")
- except Exception as e:
- logger.warning("Failed to load metadata database from %s: %s", self.metadata_db_file, e)
- self._metadata_db = None
- self._search_index = None
-
def get_by_accession(self, accession: str) -> Optional[dict]:
- # Check pre-built metadata database first
- if self._metadata_db is not None:
- if accession in self._metadata_db:
- result = self._metadata_db[accession]
- logger.debug("Found accession %s in metadata database", accession)
- return result
- else:
- logger.debug("Accession %s not found in metadata database, falling back to API", accession)
-
- # Fall back to API if metadata database not available or not found
try:
handle = ExPASy.get_sprot_raw(accession)
record = SwissProt.read(handle)
@@ -146,52 +92,12 @@ def _swissprot_to_dict(record: SwissProt.Record) -> dict:
def get_best_hit(self, keyword: str) -> Optional[Dict]:
"""
Search UniProt with a keyword and return the best hit.
- First tries local metadata database if available, then falls back to API.
:param keyword: The searcher keyword.
:return: A dictionary containing the best hit information or None if not found.
"""
if not keyword.strip():
return None
- # Try local metadata database first if available
- if self._search_index is not None and self._metadata_db is not None:
- keyword_lower = keyword.lower().strip()
-
- # Direct match
- if keyword_lower in self._search_index:
- accession_ids = self._search_index[keyword_lower]
- if accession_ids:
- accession = accession_ids[0] # Get first match
- result = self._metadata_db.get(accession)
- if result:
- logger.debug("Found keyword '%s' in local database: %s", keyword, accession)
- return result
-
- # Partial match - search for keywords that contain the search term
- matching_accessions = []
- for index_keyword, accessions in self._search_index.items():
- if keyword_lower in index_keyword or index_keyword in keyword_lower:
- matching_accessions.extend(accessions)
-
- if matching_accessions:
- # Remove duplicates while preserving order
- seen = set()
- unique_accessions = []
- for acc in matching_accessions:
- if acc not in seen:
- seen.add(acc)
- unique_accessions.append(acc)
-
- # Try each match until we find a valid result
- for accession in unique_accessions[:10]: # Limit to first 10 matches
- result = self._metadata_db.get(accession)
- if result:
- logger.debug("Found keyword '%s' via partial match in local database: %s", keyword, accession)
- return result
-
- logger.debug("Keyword '%s' not found in local database, falling back to API", keyword)
-
- # Fall back to API search
try:
iterator = UniProt.search(keyword, fields=None, batch_size=1)
hit = next(iterator, None)
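With the pre-built metadata database removed, accession lookups in the UniProt searcher now go straight to the ExPASy API. A minimal sketch of that remaining path, assuming only the Biopython calls already visible in the surrounding context (ExPASy.get_sprot_raw and SwissProt.read); the flattened fields are illustrative, not the searcher's exact output schema:

from typing import Optional

from Bio import ExPASy, SwissProt


def fetch_uniprot_entry(accession: str) -> Optional[dict]:
    """Fetch a SwissProt record by accession via ExPASy and flatten a few fields."""
    try:
        handle = ExPASy.get_sprot_raw(accession)
        record = SwissProt.read(handle)
    except Exception:  # unknown accession, network error, or parse failure
        return None
    return {
        "accession": accession,
        "description": record.description,
        "organism": record.organism,
        "sequence_length": record.sequence_length,
    }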
diff --git a/graphgen/operators/search/search_all.py b/graphgen/operators/search/search_all.py
index 17b6a417..00acefab 100644
--- a/graphgen/operators/search/search_all.py
+++ b/graphgen/operators/search/search_all.py
@@ -73,9 +73,13 @@ def save_callback(intermediate_results, completed_count):
if data_source == "uniprot":
from graphgen.models import UniProtSearch
+ uniprot_params = search_config.get("uniprot_params", {})
uniprot_search_client = UniProtSearch(
- **search_config.get("uniprot_params", {})
+ **uniprot_params
)
+
+ # Get max_concurrent from config, default to None (unlimited) for backward compatibility
+ max_concurrent = uniprot_params.get("max_concurrent")
uniprot_results = await run_concurrent(
uniprot_search_client.search,
@@ -84,15 +88,20 @@ def save_callback(intermediate_results, completed_count):
unit="keyword",
save_interval=save_interval if save_interval > 0 else 0,
save_callback=make_save_callback("uniprot") if search_storage and save_interval > 0 else None,
+ max_concurrent=max_concurrent,
)
results[data_source] = uniprot_results
elif data_source == "ncbi":
from graphgen.models import NCBISearch
+ ncbi_params = search_config.get("ncbi_params", {})
ncbi_search_client = NCBISearch(
- **search_config.get("ncbi_params", {})
+ **ncbi_params
)
+
+ # Get max_concurrent from config, default to None (unlimited) for backward compatibility
+ max_concurrent = ncbi_params.get("max_concurrent")
ncbi_results = await run_concurrent(
ncbi_search_client.search,
@@ -101,15 +110,20 @@ def save_callback(intermediate_results, completed_count):
unit="keyword",
save_interval=save_interval if save_interval > 0 else 0,
save_callback=make_save_callback("ncbi") if search_storage and save_interval > 0 else None,
+ max_concurrent=max_concurrent,
)
results[data_source] = ncbi_results
elif data_source == "rnacentral":
from graphgen.models import RNACentralSearch
+ rnacentral_params = search_config.get("rnacentral_params", {})
rnacentral_search_client = RNACentralSearch(
- **search_config.get("rnacentral_params", {})
+ **rnacentral_params
)
+
+ # Get max_concurrent from config, default to None (unlimited) for backward compatibility
+ max_concurrent = rnacentral_params.get("max_concurrent")
rnacentral_results = await run_concurrent(
rnacentral_search_client.search,
@@ -118,6 +132,7 @@ def save_callback(intermediate_results, completed_count):
unit="keyword",
save_interval=save_interval if save_interval > 0 else 0,
save_callback=make_save_callback("rnacentral") if search_storage and save_interval > 0 else None,
+ max_concurrent=max_concurrent,
)
results[data_source] = rnacentral_results
diff --git a/graphgen/utils/run_concurrent.py b/graphgen/utils/run_concurrent.py
index 6ea949b6..2a8c492c 100644
--- a/graphgen/utils/run_concurrent.py
+++ b/graphgen/utils/run_concurrent.py
@@ -19,6 +19,7 @@ async def run_concurrent(
progress_bar: Optional[gr.Progress] = None,
save_interval: int = 0,
save_callback: Optional[Callable[[List[R], int], None]] = None,
+ max_concurrent: Optional[int] = None,
) -> List[R]:
"""
Run coroutines concurrently with optional periodic saving.
@@ -30,9 +31,30 @@ async def run_concurrent(
:param progress_bar: Optional Gradio progress bar
:param save_interval: Number of completed tasks before calling save_callback (0 to disable)
:param save_callback: Callback function to save intermediate results (results, completed_count)
+ :param max_concurrent: Maximum number of concurrent tasks (None for unlimited, default: None)
:return: List of results
"""
- tasks = [asyncio.create_task(coro_fn(it)) for it in items]
+ if not items:
+ return []
+
+ # Use semaphore to limit concurrent tasks if max_concurrent is specified
+ semaphore = asyncio.Semaphore(max_concurrent) if max_concurrent is not None and max_concurrent > 0 else None
+
+ async def run_with_semaphore(item: T) -> R:
+ """Wrapper to apply semaphore if needed."""
+ if semaphore:
+ async with semaphore:
+ return await coro_fn(item)
+ else:
+ return await coro_fn(item)
+
+ # Create tasks with concurrency limit
+ if max_concurrent is not None and max_concurrent > 0:
+ # Use semaphore-controlled wrapper
+ tasks = [asyncio.create_task(run_with_semaphore(it)) for it in items]
+ else:
+ # Original behavior: create all tasks at once
+ tasks = [asyncio.create_task(coro_fn(it)) for it in items]
completed_count = 0
results = []
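The run_concurrent change above gates task execution behind an asyncio.Semaphore whenever max_concurrent is set, and otherwise keeps the original fire-everything behaviour. A self-contained sketch of that bounding pattern, with a placeholder coroutine standing in for the real search calls:

import asyncio
from typing import Awaitable, Callable, List, Optional, TypeVar

T = TypeVar("T")
R = TypeVar("R")


async def gather_bounded(
    coro_fn: Callable[[T], Awaitable[R]],
    items: List[T],
    max_concurrent: Optional[int] = None,
) -> List[R]:
    """Run coro_fn over items, allowing at most max_concurrent coroutines at a time."""
    semaphore = asyncio.Semaphore(max_concurrent) if max_concurrent and max_concurrent > 0 else None

    async def _run(item: T) -> R:
        if semaphore is None:
            return await coro_fn(item)
        async with semaphore:
            return await coro_fn(item)

    return await asyncio.gather(*(_run(it) for it in items))


async def _demo() -> None:
    async def fake_search(keyword: str) -> str:
        await asyncio.sleep(0.1)  # stand-in for a rate-limited API call
        return keyword.upper()

    print(await gather_bounded(fake_search, ["tp53", "brca1", "egfr"], max_concurrent=2))


if __name__ == "__main__":
    asyncio.run(_demo())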
From cf15bd1c9f7707bba05e193af161e8609ded2e28 Mon Sep 17 00:00:00 2001
From: CHERRY-ui8 <2693275288@qq.com>
Date: Sun, 14 Dec 2025 15:13:18 +0800
Subject: [PATCH 12/16] fix: fix max_concurrent parameter in search_all
---
graphgen/operators/search/search_all.py | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/graphgen/operators/search/search_all.py b/graphgen/operators/search/search_all.py
index 00acefab..85119327 100644
--- a/graphgen/operators/search/search_all.py
+++ b/graphgen/operators/search/search_all.py
@@ -73,13 +73,13 @@ def save_callback(intermediate_results, completed_count):
if data_source == "uniprot":
from graphgen.models import UniProtSearch
- uniprot_params = search_config.get("uniprot_params", {})
+ uniprot_params = search_config.get("uniprot_params", {}).copy()
+ # Get max_concurrent from config before passing params to constructor
+ max_concurrent = uniprot_params.pop("max_concurrent", None)
+
uniprot_search_client = UniProtSearch(
**uniprot_params
)
-
- # Get max_concurrent from config, default to None (unlimited) for backward compatibility
- max_concurrent = uniprot_params.get("max_concurrent")
uniprot_results = await run_concurrent(
uniprot_search_client.search,
@@ -95,13 +95,13 @@ def save_callback(intermediate_results, completed_count):
elif data_source == "ncbi":
from graphgen.models import NCBISearch
- ncbi_params = search_config.get("ncbi_params", {})
+ ncbi_params = search_config.get("ncbi_params", {}).copy()
+ # Get max_concurrent from config before passing params to constructor
+ max_concurrent = ncbi_params.pop("max_concurrent", None)
+
ncbi_search_client = NCBISearch(
**ncbi_params
)
-
- # Get max_concurrent from config, default to None (unlimited) for backward compatibility
- max_concurrent = ncbi_params.get("max_concurrent")
ncbi_results = await run_concurrent(
ncbi_search_client.search,
@@ -117,13 +117,13 @@ def save_callback(intermediate_results, completed_count):
elif data_source == "rnacentral":
from graphgen.models import RNACentralSearch
- rnacentral_params = search_config.get("rnacentral_params", {})
+ rnacentral_params = search_config.get("rnacentral_params", {}).copy()
+ # Get max_concurrent from config before passing params to constructor
+ max_concurrent = rnacentral_params.pop("max_concurrent", None)
+
rnacentral_search_client = RNACentralSearch(
**rnacentral_params
)
-
- # Get max_concurrent from config, default to None (unlimited) for backward compatibility
- max_concurrent = rnacentral_params.get("max_concurrent")
rnacentral_results = await run_concurrent(
rnacentral_search_client.search,
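The fix in this patch is purely about ordering: max_concurrent is an argument for run_concurrent, not for the searcher constructors, so it must be popped from a copy of the params dict before **-unpacking. A minimal sketch of the pattern with a hypothetical client class (not the real UniProtSearch/NCBISearch signature):

from typing import Any, Dict, Optional, Tuple


class DummySearchClient:
    """Stand-in client that, like the real searchers, rejects unknown kwargs."""

    def __init__(self, use_local_blast: bool = False, blast_num_threads: int = 4):
        self.use_local_blast = use_local_blast
        self.blast_num_threads = blast_num_threads


def build_client(search_config: Dict[str, Any]) -> Tuple[DummySearchClient, Optional[int]]:
    # Copy so the caller's config is not mutated, then pop the orchestration-only key.
    params = dict(search_config.get("ncbi_params", {}))
    max_concurrent = params.pop("max_concurrent", None)
    client = DummySearchClient(**params)  # would raise TypeError if max_concurrent stayed in params
    return client, max_concurrent


client, max_concurrent = build_client(
    {"ncbi_params": {"use_local_blast": True, "blast_num_threads": 8, "max_concurrent": 4}}
)
print(max_concurrent, client.blast_num_threads)  # 4 8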
From 3a7f64b7bdbf0150dbe71d0276fc6f2f421f9a9f Mon Sep 17 00:00:00 2001
From: CHERRY-ui8 <2693275288@qq.com>
Date: Sun, 14 Dec 2025 15:23:57 +0800
Subject: [PATCH 13/16] add: support multi-file BLAST databases in DNA search
---
graphgen/models/searcher/db/ncbi_searcher.py | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py
index 0bacbfaf..dd5e3f2d 100644
--- a/graphgen/models/searcher/db/ncbi_searcher.py
+++ b/graphgen/models/searcher/db/ncbi_searcher.py
@@ -73,9 +73,16 @@ def __init__(
self.use_local_blast = use_local_blast
self.local_blast_db = local_blast_db
self.blast_num_threads = blast_num_threads
- if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"):
- logger.error("Local BLAST database files not found. Please check the path.")
- self.use_local_blast = False
+ if self.use_local_blast:
+ # Check for single-file database (.nhr) or multi-file database (.00.nhr)
+ db_exists = (
+ os.path.isfile(f"{self.local_blast_db}.nhr") or
+ os.path.isfile(f"{self.local_blast_db}.00.nhr")
+ )
+ if not db_exists:
+ logger.error("Local BLAST database files not found. Please check the path.")
+ logger.error("Expected: %s.nhr or %s.00.nhr", self.local_blast_db, self.local_blast_db)
+ self.use_local_blast = False
@staticmethod
def _nested_get(data: dict, *keys, default=None):
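makeblastdb splits large nucleotide databases into numbered volumes (db.00.nhr, db.01.nhr, ...), so an existence check has to accept either layout. A small sketch of that detection, assuming only the standard volume naming convention; the database prefix below is the one from the example config, not a guaranteed path:

import glob
import os


def blast_db_exists(db_prefix: str) -> bool:
    """Return True if db_prefix points at a single-file or multi-volume nucleotide BLAST DB."""
    if os.path.isfile(f"{db_prefix}.nhr"):
        return True  # single-volume database
    # Multi-volume databases expose numbered header files such as prefix.00.nhr, prefix.01.nhr, ...
    return bool(glob.glob(f"{db_prefix}.[0-9][0-9].nhr"))


print(blast_db_exists("refseq_release/refseq_release"))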
From 2aca768a8b5e6f1f5d2d155a963f205abb801df6 Mon Sep 17 00:00:00 2001
From: CHERRY-ui8 <2693275288@qq.com>
Date: Sun, 14 Dec 2025 21:23:01 +0800
Subject: [PATCH 14/16] feat: enhance JSONL reading and storage with streaming
and batch processing capabilities
---
graphgen/graphgen.py | 109 +++++++++++++++---------
graphgen/models/reader/jsonl_reader.py | 56 +++++++++++-
graphgen/models/storage/json_storage.py | 37 ++++++++
graphgen/operators/read/read_files.py | 8 +-
4 files changed, 170 insertions(+), 40 deletions(-)
diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py
index 188a5d90..6cff74b1 100644
--- a/graphgen/graphgen.py
+++ b/graphgen/graphgen.py
@@ -88,24 +88,45 @@ def __init__(
@async_to_sync_method
async def read(self, read_config: Dict):
"""
- read files from input sources
+ read files from input sources with batch processing
"""
+ # Get batch_size from config, default to 10000
+ batch_size = read_config.pop("batch_size", 10000)
+
doc_stream = read_files(**read_config, cache_dir=self.working_dir)
batch = {}
+ total_processed = 0
+
for doc in doc_stream:
doc_id = compute_mm_hash(doc, prefix="doc-")
batch[doc_id] = doc
+
+ # Process batch when it reaches batch_size
+ if len(batch) >= batch_size:
+ _add_doc_keys = self.full_docs_storage.filter_keys(list(batch.keys()))
+ new_docs = {k: v for k, v in batch.items() if k in _add_doc_keys}
+ if new_docs:
+ self.full_docs_storage.upsert(new_docs)
+ total_processed += len(new_docs)
+ logger.info("Processed batch: %d new documents (total: %d)", len(new_docs), total_processed)
+ batch.clear()
# TODO: configurable whether to use coreference resolution
- _add_doc_keys = self.full_docs_storage.filter_keys(list(batch.keys()))
- new_docs = {k: v for k, v in batch.items() if k in _add_doc_keys}
- if len(new_docs) == 0:
+ # Process remaining documents in batch
+ if batch:
+ _add_doc_keys = self.full_docs_storage.filter_keys(list(batch.keys()))
+ new_docs = {k: v for k, v in batch.items() if k in _add_doc_keys}
+ if new_docs:
+ self.full_docs_storage.upsert(new_docs)
+ total_processed += len(new_docs)
+ logger.info("Processed final batch: %d new documents (total: %d)", len(new_docs), total_processed)
+
+ if total_processed == 0:
logger.warning("All documents are already in the storage")
- return
- self.full_docs_storage.upsert(new_docs)
- self.full_docs_storage.index_done_callback()
+ else:
+ self.full_docs_storage.index_done_callback()
@async_to_sync_method
async def chunk(self, chunk_config: Dict):
@@ -170,44 +191,56 @@ async def build_kg(self):
async def search(self, search_config: Dict):
logger.info("[Search] %s ...", ", ".join(search_config["data_sources"]))
- seeds = self.full_docs_storage.get_all()
- if len(seeds) == 0:
- logger.warning("All documents are already been searched")
- return
+ # Get search_batch_size from config (default: 10000)
+ search_batch_size = search_config.get("search_batch_size", 10000)
# Get save_interval from config (default: 1000, 0 to disable)
save_interval = search_config.get("save_interval", 1000)
- search_results = await search_all(
- seed_data=seeds,
- search_config=search_config,
- search_storage=self.search_storage if save_interval > 0 else None,
- save_interval=save_interval,
- )
-
- # Convert search_results from {data_source: [results]} to {key: result}
- # This maintains backward compatibility
- flattened_results = {}
- for data_source, result_list in search_results.items():
- if not isinstance(result_list, list):
+ # Process in batches to avoid OOM
+ all_flattened_results = {}
+ batch_num = 0
+
+ for seeds_batch in self.full_docs_storage.iter_batches(batch_size=search_batch_size):
+ if len(seeds_batch) == 0:
continue
- for result in result_list:
- if result is None:
+
+ batch_num += 1
+ logger.info("Processing search batch %d with %d documents", batch_num, len(seeds_batch))
+
+ search_results = await search_all(
+ seed_data=seeds_batch,
+ search_config=search_config,
+ search_storage=self.search_storage if save_interval > 0 else None,
+ save_interval=save_interval,
+ )
+
+ # Convert search_results from {data_source: [results]} to {key: result}
+ # This maintains backward compatibility
+ for data_source, result_list in search_results.items():
+ if not isinstance(result_list, list):
continue
- # Use _search_query as key if available, otherwise generate a key
- if isinstance(result, dict) and "_search_query" in result:
- query = result["_search_query"]
- key = f"{data_source}:{query}"
- else:
- # Generate a unique key
- result_str = str(result)
- key_hash = hashlib.md5(result_str.encode()).hexdigest()[:8]
- key = f"{data_source}:{key_hash}"
- flattened_results[key] = result
-
- _add_search_keys = self.search_storage.filter_keys(list(flattened_results.keys()))
+ for result in result_list:
+ if result is None:
+ continue
+ # Use _search_query as key if available, otherwise generate a key
+ if isinstance(result, dict) and "_search_query" in result:
+ query = result["_search_query"]
+ key = f"{data_source}:{query}"
+ else:
+ # Generate a unique key
+ result_str = str(result)
+ key_hash = hashlib.md5(result_str.encode()).hexdigest()[:8]
+ key = f"{data_source}:{key_hash}"
+ all_flattened_results[key] = result
+
+ if len(all_flattened_results) == 0:
+ logger.warning("No search results generated")
+ return
+
+ _add_search_keys = self.search_storage.filter_keys(list(all_flattened_results.keys()))
search_results = {
- k: v for k, v in flattened_results.items() if k in _add_search_keys
+ k: v for k, v in all_flattened_results.items() if k in _add_search_keys
}
if len(search_results) == 0:
logger.warning("All search results are already in the storage")
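Both reworked methods in graphgen.py follow the same shape: accumulate up to batch_size items, filter out keys already present in storage, upsert the remainder, and repeat until the stream is exhausted. A stripped-down sketch of that loop against an in-memory stand-in for the KV storage (not GraphGen's actual storage class):

from typing import Dict, Iterable, Tuple


class InMemoryKV:
    def __init__(self) -> None:
        self._data: Dict[str, dict] = {}

    def filter_keys(self, keys: list) -> set:
        return {k for k in keys if k not in self._data}

    def upsert(self, docs: Dict[str, dict]) -> None:
        self._data.update(docs)


def ingest_in_batches(storage: InMemoryKV, doc_stream: Iterable[Tuple[str, dict]], batch_size: int = 3) -> int:
    """Deduplicate and upsert documents batch by batch; returns the number of new documents."""
    batch: Dict[str, dict] = {}
    total = 0

    def flush() -> int:
        new_keys = storage.filter_keys(list(batch.keys()))
        new_docs = {k: v for k, v in batch.items() if k in new_keys}
        if new_docs:
            storage.upsert(new_docs)
        batch.clear()
        return len(new_docs)

    for doc_id, doc in doc_stream:
        batch[doc_id] = doc
        if len(batch) >= batch_size:
            total += flush()
    if batch:
        total += flush()
    return total


store = InMemoryKV()
docs = ((f"doc-{i}", {"content": f"text {i}"}) for i in range(7))
print(ingest_in_batches(store, docs))  # 7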
diff --git a/graphgen/models/reader/jsonl_reader.py b/graphgen/models/reader/jsonl_reader.py
index 31bc3195..f84aeadd 100644
--- a/graphgen/models/reader/jsonl_reader.py
+++ b/graphgen/models/reader/jsonl_reader.py
@@ -1,5 +1,6 @@
import json
-from typing import Any, Dict, List
+import os
+from typing import Any, Dict, Iterator, List
from graphgen.bases.base_reader import BaseReader
from graphgen.utils import logger
@@ -28,3 +29,56 @@ def read(self, file_path: str) -> List[Dict[str, Any]]:
except json.JSONDecodeError as e:
logger.error("Error decoding JSON line: %s. Error: %s", line, e)
return self.filter(docs)
+
+ def read_stream(self, file_path: str) -> Iterator[Dict[str, Any]]:
+ """
+ Stream read JSONL files line by line without loading entire file into memory.
+ Returns an iterator that yields filtered documents.
+
+ :param file_path: Path to the JSONL file.
+ :return: Iterator of dictionaries containing the data.
+ """
+ with open(file_path, "r", encoding="utf-8") as f:
+ for line in f:
+ try:
+ doc = json.loads(line)
+ assert "type" in doc, f"Missing 'type' in document: {doc}"
+ if doc.get("type") == "text" and self.text_column not in doc:
+ raise ValueError(
+ f"Missing '{self.text_column}' in document: {doc}"
+ )
+
+ # Apply filtering logic inline (similar to BaseReader.filter)
+ if doc.get("type") == "text":
+ content = doc.get(self.text_column, "").strip()
+ if content:
+ yield doc
+ elif doc.get("type") in ("image", "table", "equation"):
+ img_path = doc.get("img_path")
+ if self._image_exists(img_path):
+ yield doc
+ else:
+ yield doc
+ except json.JSONDecodeError as e:
+ logger.error("Error decoding JSON line: %s. Error: %s", line, e)
+
+ @staticmethod
+ def _image_exists(path_or_url: str, timeout: int = 3) -> bool:
+ """
+ Check if an image exists at the given local path or URL.
+ :param path_or_url: Local file path or remote URL of the image.
+ :param timeout: Timeout for remote URL requests in seconds.
+ :return: True if the image exists, False otherwise.
+ """
+ if not path_or_url:
+ return False
+ if not path_or_url.startswith(("http://", "https://", "ftp://")):
+ path = path_or_url.replace("file://", "", 1)
+ path = os.path.abspath(path)
+ return os.path.isfile(path)
+ try:
+ import requests
+ resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout)
+ return resp.status_code == 200
+ except Exception:
+ return False
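read_stream trades the list-building read() for a generator, so only one JSONL line is decoded at a time. A minimal, self-contained illustration of the same idea (the file name and text column are placeholders, not GraphGen defaults):

import json
from typing import Any, Dict, Iterator


def stream_jsonl(path: str, text_column: str = "content") -> Iterator[Dict[str, Any]]:
    """Yield one parsed, non-empty text document per JSONL line."""
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                doc = json.loads(line)
            except json.JSONDecodeError:
                continue  # skip malformed lines instead of aborting the whole file
            if doc.get("type") == "text" and doc.get(text_column, "").strip():
                yield doc


# The generator is consumed lazily, so a multi-gigabyte file never lives in memory at once:
# for doc in stream_jsonl("search_dna_demo.jsonl"):
#     handle(doc)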
diff --git a/graphgen/models/storage/json_storage.py b/graphgen/models/storage/json_storage.py
index 53962117..ae41fa21 100644
--- a/graphgen/models/storage/json_storage.py
+++ b/graphgen/models/storage/json_storage.py
@@ -1,5 +1,6 @@
import os
from dataclasses import dataclass
+from typing import Iterator, Tuple
from graphgen.bases.base_storage import BaseKVStorage, BaseListStorage
from graphgen.utils import load_json, logger, write_json
@@ -42,6 +43,42 @@ def get_by_ids(self, ids, fields=None) -> list:
def get_all(self) -> dict[str, dict]:
return self._data
+ def iter_items(self) -> Iterator[Tuple[str, dict]]:
+ """
+ Iterate lazily over all stored items.
+ Returns an iterator of (key, value) tuples.
+ """
+ for key, value in self._data.items():
+ yield key, value
+
+ def get_batch(self, keys: list[str]) -> dict[str, dict]:
+ """
+ Get a batch of items by their keys.
+
+ :param keys: List of keys to retrieve.
+ :return: Dictionary of {key: value} for the requested keys.
+ """
+ return {key: self._data.get(key) for key in keys if key in self._data}
+
+ def iter_batches(self, batch_size: int = 10000) -> Iterator[dict[str, dict]]:
+ """
+ Iterate over stored items in fixed-size batches so downstream steps can process them incrementally.
+
+ :param batch_size: Number of items per batch.
+ :return: Iterator of dictionaries, each containing up to batch_size items.
+ """
+ batch = {}
+ count = 0
+ for key, value in self._data.items():
+ batch[key] = value
+ count += 1
+ if count >= batch_size:
+ yield batch
+ batch = {}
+ count = 0
+ if batch:
+ yield batch
+
def filter_keys(self, data: list[str]) -> set[str]:
return {s for s in data if s not in self._data}
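iter_batches simply chunks the backing dict, which is what lets the search loop in graphgen.py stay a plain for-loop. A quick sketch of how a consumer drives such a generator, written against a plain dict rather than JsonKVStorage:

from typing import Dict, Iterator


def iter_batches(data: Dict[str, dict], batch_size: int = 2) -> Iterator[Dict[str, dict]]:
    """Yield successive dicts of at most batch_size items from data."""
    batch: Dict[str, dict] = {}
    for key, value in data.items():
        batch[key] = value
        if len(batch) >= batch_size:
            yield batch
            batch = {}
    if batch:
        yield batch  # final, possibly short, batch


docs = {f"doc-{i}": {"content": f"text {i}"} for i in range(5)}
for i, chunk in enumerate(iter_batches(docs), start=1):
    print(f"batch {i}: {sorted(chunk)}")  # 2 + 2 + 1 items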
diff --git a/graphgen/operators/read/read_files.py b/graphgen/operators/read/read_files.py
index d9e7f673..39723e76 100644
--- a/graphgen/operators/read/read_files.py
+++ b/graphgen/operators/read/read_files.py
@@ -93,7 +93,13 @@ def read_files(
suffix = Path(file_path).suffix.lstrip(".").lower()
reader = _build_reader(suffix, cache_dir)
- yield from reader.read(file_path)
+ # Prefer stream reading if available (for memory efficiency)
+ if hasattr(reader, "read_stream"):
+ yield from reader.read_stream(file_path)
+ else:
+ # Fallback to regular read() method
+ for doc in reader.read(file_path):
+ yield doc
except Exception as e: # pylint: disable=broad-except
logger.exception("Error reading %s: %s", file_info.get("path"), e)
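The dispatch added to read_files is plain duck typing: any reader that exposes read_stream gets the streaming path, everything else keeps using read(). A compact sketch of that pattern with two toy readers (names are illustrative):

from typing import Any, Dict, Iterator, List


class ListReader:
    def read(self, file_path: str) -> List[Dict[str, Any]]:
        return [{"path": file_path, "content": "loaded eagerly"}]


class StreamReader(ListReader):
    def read_stream(self, file_path: str) -> Iterator[Dict[str, Any]]:
        yield {"path": file_path, "content": "loaded lazily"}


def read_any(reader, file_path: str) -> Iterator[Dict[str, Any]]:
    # Prefer the streaming method when the reader provides one.
    if hasattr(reader, "read_stream"):
        yield from reader.read_stream(file_path)
    else:
        yield from reader.read(file_path)


print(list(read_any(ListReader(), "a.jsonl")))
print(list(read_any(StreamReader(), "b.jsonl")))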
From 52566706baa78b579a488a4cbcbb3a246b867f6e Mon Sep 17 00:00:00 2001
From: CHERRY-ui8 <2693275288@qq.com>
Date: Sun, 14 Dec 2025 22:50:53 +0800
Subject: [PATCH 15/16] add: add retries for all API calls and support extracting
sequences from the local DB
---
graphgen/models/searcher/db/ncbi_searcher.py | 168 +++++++++++++------
1 file changed, 119 insertions(+), 49 deletions(-)
diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py
index dd5e3f2d..73b3eba0 100644
--- a/graphgen/models/searcher/db/ncbi_searcher.py
+++ b/graphgen/models/searcher/db/ncbi_searcher.py
@@ -28,7 +28,7 @@ def _get_pool():
# ensure only one NCBI request at a time
-_ncbi_lock = asyncio.Lock()
+_blast_lock = asyncio.Lock()
class NCBISearch(BaseSearcher):
@@ -97,14 +97,16 @@ def _nested_get(data: dict, *keys, default=None):
def _infer_molecule_type_detail(accession: Optional[str], gene_type: Optional[int] = None) -> Optional[str]:
"""Infer molecule_type_detail from accession prefix or gene type."""
if accession:
- if accession.startswith(("NM_", "XM_")):
- return "mRNA"
- if accession.startswith(("NC_", "NT_")):
- return "genomic DNA"
- if accession.startswith(("NR_", "XR_")):
- return "RNA"
- if accession.startswith("NG_"):
- return "genomic region"
+ # Map accession prefixes to molecule types
+ prefix_map = {
+ ("NM_", "XM_"): "mRNA",
+ ("NC_", "NT_"): "genomic DNA",
+ ("NR_", "XR_"): "RNA",
+ ("NG_",): "genomic region",
+ }
+ for prefixes, mol_type in prefix_map.items():
+ if accession.startswith(prefixes):
+ return mol_type
# Fallback: infer from gene type if available
if gene_type is not None:
gene_type_map = {
@@ -163,7 +165,6 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
None,
)
# Fallback: if no type 3 accession, try any available accession
- # This is needed for genes that don't have mRNA transcripts but have other sequence records
if not representative_accession:
representative_accession = next(
(
@@ -219,6 +220,12 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
"_representative_accession": representative_accession,
}
+ @retry(
+ stop=stop_after_attempt(5),
+ wait=wait_exponential(multiplier=1, min=4, max=10),
+ retry=retry_if_exception_type((RequestException, IncompleteRead)),
+ reraise=True,
+ )
def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]:
"""Get gene information by Gene ID."""
def _extract_metadata_from_genbank(result: dict, accession: str):
@@ -227,12 +234,7 @@ def _extract_metadata_from_genbank(result: dict, accession: str):
record = SeqIO.read(handle, "genbank")
result["title"] = record.description
- result["molecule_type_detail"] = (
- "mRNA" if accession.startswith(("NM_", "XM_")) else
- "genomic DNA" if accession.startswith(("NC_", "NT_")) else
- "RNA" if accession.startswith(("NR_", "XR_")) else
- "genomic region" if accession.startswith("NG_") else "N/A"
- )
+ result["molecule_type_detail"] = self._infer_molecule_type_detail(accession) or "N/A"
for feature in record.features:
if feature.type == "source":
@@ -267,25 +269,62 @@ def _extract_sequence_from_fasta(result: dict, accession: str):
result["sequence_length"] = None
return result
+ def _extract_sequence(result: dict, accession: str):
+ """
+ Extract sequence using the appropriate method based on configuration.
+ If use_local_blast=True, use local database. Otherwise, use NCBI API.
+ Always fetches sequence (no option to skip).
+ """
+ # If using local BLAST, use local database
+ if self.use_local_blast:
+ sequence = self._extract_sequence_from_local_db(accession)
+
+ if sequence:
+ result["sequence"] = sequence
+ result["sequence_length"] = len(sequence)
+ else:
+ # Failed to extract from local DB, set to None (no fallback to API)
+ result["sequence"] = None
+ result["sequence_length"] = None
+ logger.warning(
+ "Failed to extract sequence from local DB for accession %s. "
+ "Not falling back to NCBI API as use_local_blast=True.",
+ accession
+ )
+ else:
+ # Use NCBI API to fetch sequence
+ result = _extract_sequence_from_fasta(result, accession)
+
+ return result
+
try:
with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle:
gene_record = Entrez.read(handle)
- if not gene_record:
- return None
+
+ if not gene_record:
+ return None
- result = self._gene_record_to_dict(gene_record, gene_id)
- if accession := (preferred_accession or result.get("_representative_accession")):
- result = _extract_metadata_from_genbank(result, accession)
- result = _extract_sequence_from_fasta(result, accession)
+ result = self._gene_record_to_dict(gene_record, gene_id)
+
+ if accession := (preferred_accession or result.get("_representative_accession")):
+ result = _extract_metadata_from_genbank(result, accession)
+ # Extract sequence using appropriate method
+ result = _extract_sequence(result, accession)
- result.pop("_representative_accession", None)
- return result
+ result.pop("_representative_accession", None)
+ return result
except (RequestException, IncompleteRead):
raise
except Exception as exc:
logger.error("Gene ID %s not found: %s", gene_id, exc)
return None
+ @retry(
+ stop=stop_after_attempt(5),
+ wait=wait_exponential(multiplier=1, min=4, max=10),
+ retry=retry_if_exception_type((RequestException, IncompleteRead)),
+ reraise=True,
+ )
def get_by_accession(self, accession: str) -> Optional[dict]:
"""Get sequence information by accession number."""
def _extract_gene_id(link_handle):
@@ -311,9 +350,11 @@ def _extract_gene_id(link_handle):
return None
result = self.get_by_gene_id(gene_id, preferred_accession=accession)
+
if result:
result["id"] = accession
result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}"
+
return result
except (RequestException, IncompleteRead):
raise
@@ -321,6 +362,12 @@ def _extract_gene_id(link_handle):
logger.error("Accession %s not found: %s", accession, exc)
return None
+ @retry(
+ stop=stop_after_attempt(5),
+ wait=wait_exponential(multiplier=1, min=4, max=10),
+ retry=retry_if_exception_type((RequestException, IncompleteRead)),
+ reraise=True,
+ )
def get_best_hit(self, keyword: str) -> Optional[dict]:
"""Search NCBI Gene database with a keyword and return the best hit."""
if not keyword.strip():
@@ -330,14 +377,39 @@ def get_best_hit(self, keyword: str) -> Optional[dict]:
for search_term in [f"{keyword}[Gene] OR {keyword}[All Fields]", keyword]:
with Entrez.esearch(db="gene", term=search_term, retmax=1, sort="relevance") as search_handle:
search_results = Entrez.read(search_handle)
- if len(gene_id := search_results.get("IdList", [])) > 0:
- return self.get_by_gene_id(gene_id)
+
+ if len(gene_id := search_results.get("IdList", [])) > 0:
+ result = self.get_by_gene_id(gene_id)
+ return result
except (RequestException, IncompleteRead):
raise
except Exception as e:
logger.error("Keyword %s not found: %s", keyword, e)
return None
+ def _extract_sequence_from_local_db(self, accession: str) -> Optional[str]:
+ """Extract sequence from local BLAST database using blastdbcmd."""
+ try:
+ cmd = [
+ "blastdbcmd",
+ "-db", self.local_blast_db,
+ "-entry", accession,
+ "-outfmt", "%s" # Only sequence, no header
+ ]
+ sequence = subprocess.check_output(
+ cmd,
+ text=True,
+ timeout=10, # 10 second timeout for local extraction
+ stderr=subprocess.DEVNULL
+ ).strip()
+ return sequence if sequence else None
+ except subprocess.TimeoutExpired:
+ logger.warning("Timeout extracting sequence from local DB for accession %s", accession)
+ return None
+ except Exception as exc:
+ logger.warning("Failed to extract sequence from local DB for accession %s: %s", accession, exc)
+ return None
+
def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
"""
Perform local BLAST search using local BLAST database.
@@ -436,20 +508,22 @@ def _process_network_blast_result(blast_record, seq: str, threshold: float) -> O
# Try local BLAST first if enabled
if self.use_local_blast:
accession = self._local_blast(seq, threshold)
+
if accession:
logger.debug("Local BLAST found accession: %s", accession)
- return self.get_by_accession(accession)
- logger.info(
- "Local BLAST found no match for sequence. "
- "API fallback disabled when using local database."
- )
+ # With local BLAST, metadata is still fetched via the Entrez API, while the
+ # sequence itself comes from the local database (see _extract_sequence).
+ result = self.get_by_accession(accession)
+ return result
+
+ logger.info("Local BLAST found no match for sequence. API fallback disabled when using local database.")
return None
# Fall back to network BLAST only if local BLAST is not enabled
logger.debug("Falling back to NCBIWWW.qblast")
-
with NCBIWWW.qblast("blastn", "nr", seq, hitlist_size=1, expect=threshold) as result_handle:
- return _process_network_blast_result(NCBIXML.read(result_handle), seq, threshold)
+ result = _process_network_blast_result(NCBIXML.read(result_handle), seq, threshold)
+ return result
except (RequestException, IncompleteRead):
raise
except Exception as e:
@@ -474,29 +548,25 @@ async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optiona
loop = asyncio.get_running_loop()
# Auto-detect query type and execute in thread pool
- # Only use lock for network API calls (NCBI rate limit: max 3 requests per second)
- # Local BLAST can run in parallel
+ # All methods need lock because they all call NCBI API (rate limit: max 3 requests per second)
+ # Even if get_by_fasta uses local BLAST, it still calls get_by_accession which needs API
+ async def _execute_with_lock(func, *args):
+ """Execute function with lock for NCBI API calls."""
+ async with _blast_lock:
+ return await loop.run_in_executor(_get_pool(), func, *args)
+
if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I):
- # FASTA sequence: use lock only if using network BLAST
- if self.use_local_blast:
- # Local BLAST can run in parallel, no lock needed
- result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold)
- else:
- # Network BLAST needs lock to respect rate limits
- async with _ncbi_lock:
- result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold)
+ # FASTA sequence: always use lock (even with local BLAST, get_by_accession needs API)
+ result = await _execute_with_lock(self.get_by_fasta, query, threshold)
elif re.fullmatch(r"^\d+$", query):
# Gene ID: always use lock (network API call)
- async with _ncbi_lock:
- result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query)
+ result = await _execute_with_lock(self.get_by_gene_id, query)
elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I):
# Accession: always use lock (network API call)
- async with _ncbi_lock:
- result = await loop.run_in_executor(_get_pool(), self.get_by_accession, query)
+ result = await _execute_with_lock(self.get_by_accession, query)
else:
# Keyword: always use lock (network API call)
- async with _ncbi_lock:
- result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query)
+ result = await _execute_with_lock(self.get_best_hit, query)
if result:
result["_search_query"] = query
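Serializing every NCBI request behind a single asyncio.Lock, while still pushing the blocking Biopython/BLAST work onto a thread pool, is the core concurrency change in this patch. A self-contained sketch of that wrapper; the worker function is a stand-in for the real Entrez/BLAST calls:

import asyncio
import time
from concurrent.futures import ThreadPoolExecutor

_pool = ThreadPoolExecutor(max_workers=4)
_api_lock = asyncio.Lock()


def blocking_api_call(query: str) -> str:
    """Stand-in for a blocking Entrez/BLAST request."""
    time.sleep(0.2)
    return f"result for {query}"


async def call_serialized(query: str) -> str:
    # The lock guarantees at most one in-flight request (NCBI allows only a few per second);
    # run_in_executor keeps the blocking call off the event loop.
    loop = asyncio.get_running_loop()
    async with _api_lock:
        return await loop.run_in_executor(_pool, blocking_api_call, query)


async def main() -> None:
    results = await asyncio.gather(*(call_serialized(q) for q in ["tp53", "brca1"]))
    print(results)


if __name__ == "__main__":
    asyncio.run(main())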
From 6eaa3c52dfddc303cb8e324770025b7c89f89c08 Mon Sep 17 00:00:00 2001
From: CHERRY-ui8 <2693275288@qq.com>
Date: Sun, 14 Dec 2025 23:48:43 +0800
Subject: [PATCH 16/16] add: add retry for all API usage in RNA search
---
graphgen/models/searcher/db/rnacentral_searcher.py | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py
index 8e409ed6..7fcba467 100644
--- a/graphgen/models/searcher/db/rnacentral_searcher.py
+++ b/graphgen/models/searcher/db/rnacentral_searcher.py
@@ -151,6 +151,12 @@ def _calculate_md5(sequence: str) -> str:
return hashlib.md5(normalized_seq.encode("ascii")).hexdigest()
+ @retry(
+ stop=stop_after_attempt(3),
+ wait=wait_exponential(multiplier=1, min=2, max=10),
+ retry=retry_if_exception_type((requests.Timeout, requests.RequestException)),
+ reraise=False,
+ )
def get_by_rna_id(self, rna_id: str) -> Optional[dict]:
"""
Get RNA information by RNAcentral ID.
@@ -178,6 +184,12 @@ def get_by_rna_id(self, rna_id: str) -> Optional[dict]:
logger.error("Unexpected error getting RNA ID %s: %s", rna_id, e)
return None
+ @retry(
+ stop=stop_after_attempt(3),
+ wait=wait_exponential(multiplier=1, min=2, max=10),
+ retry=retry_if_exception_type((requests.Timeout, requests.RequestException)),
+ reraise=False,
+ )
def get_best_hit(self, keyword: str) -> Optional[dict]:
"""
Search RNAcentral with a keyword and return the best hit.
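The decorators added in this patch follow tenacity's standard recipe: a bounded number of attempts, exponential backoff, and retrying only on transient network exceptions; with reraise=False an exhausted retry surfaces as tenacity's RetryError rather than the original exception. A minimal sketch of the same recipe around a plain requests call (the endpoint and ID below are illustrative, not the searcher's exact request):

import requests
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    retry=retry_if_exception_type((requests.Timeout, requests.RequestException)),
    reraise=False,
)
def fetch_rna(rna_id: str) -> dict:
    """Fetch one RNAcentral entry, retrying transient network failures with backoff."""
    resp = requests.get(
        f"https://rnacentral.org/api/v1/rna/{rna_id}",
        params={"format": "json"},
        timeout=10,
    )
    resp.raise_for_status()
    return resp.json()


if __name__ == "__main__":
    print(sorted(fetch_rna("URS0000049E57")))  # example RNAcentral ID; prints the response keys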