offTargetNCBIBlast/app.py at main · RicoLeiser/offTargetNCBIBlast · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import streamlit as st
from Bio.Blast import NCBIWWW, NCBIXML
import pandas as pd
import time
from io import StringIO

KMER_SIZE = 21
DELAY = 1.5  # seconds

st.title("21-mer NCBI BLAST Pipeline (Cloud Compatible)")

# ------------------- File Upload -------------------
uploaded_file = st.file_uploader(
    "Upload a FASTA file (drag-and-drop supported)",
    type=["fasta", "fa", "fna"]
)

# ------------------- Functions -------------------
def read_fasta(file) -> str:
    """Read a FASTA file and return concatenated sequence(s)"""
    seq = ""
    for line in file:
        line = line.decode().strip() if isinstance(line, bytes) else line.strip()
        if line.startswith(">"):
            continue
        seq += line
    return seq

def generate_kmers(sequence: str, k=21):
    """Return list of all kmers of length k"""
    return [sequence[i:i+k] for i in range(len(sequence) - k + 1)]

def blast_short(seq: str) -> str:
    """Submit a short sequence to NCBI BLAST and return XML"""
    return NCBIWWW.qblast("blastn", "nt", seq, format_type="XML").read()

def extract_species_from_xml(xml_data) -> list:
    """
    Parse BLAST XML and extract species with 100% identity and coverage.
    Handles cases where the title contains 'PREDICTED:'.
    """
    hits = []
    blast_record = NCBIXML.read(xml_data)

    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            query_cov = (hsp.align_length / blast_record.query_length) * 100
            identity = (hsp.identities / hsp.align_length) * 100
            if identity == 100.0 and query_cov == 100.0:
                title = alignment.title
                parts = title.split()

                # Look for 'PREDICTED:' in the title
                if "PREDICTED:" in parts:
                    idx = parts.index("PREDICTED:")
                    if len(parts) > idx + 2:
                        species = f"{parts[idx+1]} {parts[idx+2]}"
                        hits.append(species)
                else:
                    # Fallback: take the first two words after the accession (usually parts[1] and parts[2])
                    if len(parts) >= 3:
                        species = f"{parts[1]} {parts[2]}"
                        hits.append(species)
    return hits

# ------------------- Main Pipeline -------------------
if uploaded_file is not None:
    st.info("Reading uploaded FASTA...")
    sequence = read_fasta(uploaded_file)
    st.success("FASTA loaded!")

    st.info(f"Generating {KMER_SIZE}-mers...")
    kmers = generate_kmers(sequence, KMER_SIZE)
    st.success(f"{len(kmers)} kmers generated!")

    st.info("Starting BLAST for all kmers (may take some time)...")
    progress_bar = st.progress(0)
    status_text = st.empty()
    species_counts = {}

    for idx, kmer in enumerate(kmers):
        status_text.text(f"BLASTing kmer {idx+1}/{len(kmers)}")
        try:
            xml_string = blast_short(kmer)
            xml_data = StringIO(xml_string)
            hits = extract_species_from_xml(xml_data)

            # Count each species only once per kmer
            unique_hits = set(hits)
            for sp in unique_hits:
                species_counts[sp] = species_counts.get(sp, 0) + 1

        except Exception as e:
            st.warning(f"Error BLASTing kmer {idx+1}: {e}")

        time.sleep(DELAY)
        progress_bar.progress((idx + 1) / len(kmers))

    st.success("BLAST complete!")

    if species_counts:
        df = pd.DataFrame(
            list(species_counts.items()),
            columns=["Latin species name", "Count"]
        )
        st.dataframe(df)

        # Excel download
        excel_file = "blast_output.xlsx"
        df.to_excel(excel_file, index=False)
        with open(excel_file, "rb") as f:
            st.download_button(
                label="Download Excel",
                data=f,
                file_name="blast_output.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
    else:
        st.warning("No hits with 100% identity and coverage found.")