|
| 1 | +#!/usr/bin/env python |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +# This script is to identify the HGT events from fungi to yeast or yeast to yeast. |
| 5 | + |
| 6 | +import re |
| 7 | +from ete3 import NCBITaxa |
| 8 | +import sys |
| 9 | +import os |
| 10 | +import math |
| 11 | +import csv |
| 12 | +from Bio import SeqIO |
| 13 | +from Bio import Entrez |
| 14 | +from subprocess import Popen, PIPE |
| 15 | + |
| 16 | + |
| 17 | +# LINNAEUS_FILTER = ["species","genus","family","order","class","subphylum","phylum","kingdom","superkingdom"] |
| 18 | + |
def parse_NCBI(filename):
    """Parse a tab-separated BLAST hit table for one query gene.

    Each non-blank line is split on tabs: column 1 is the query gene id,
    column 2 is the accession of the hit and the last column is the bitscore.
    Blank lines are ignored.

    Args:
        filename: path of the tab-separated hit table.

    Returns:
        A ``(gene, accession_number, accession_bitscore)`` tuple where
        ``gene`` is the query id taken from the first data line,
        ``accession_number`` is the list of hit accessions in file order
        (repeats are kept) and ``accession_bitscore`` maps each accession to
        its bitscore as a float.  When an accession occurs more than once the
        bitscore of its last occurrence is the one stored.

    Raises:
        ValueError: if the file contains no data lines, a line has fewer than
            two columns, or a bitscore is not a valid float.
    """
    gene = None
    accession_number = []
    accession_bitscore = {}

    # Stream the file instead of loading every line into memory first.
    with open(filename, "r") as infile:
        for line_no, raw_line in enumerate(infile, start=1):
            line = raw_line.strip("\n")
            if not line.strip():
                # Tolerate blank lines rather than failing with IndexError.
                continue

            fields = line.split("\t")
            if len(fields) < 2:
                raise ValueError(
                    "%s, line %d: expected at least two tab-separated columns"
                    % (filename, line_no)
                )

            if gene is None:
                # The query id is read once, from the first data line.
                gene = fields[0]

            accession = fields[1]
            accession_number.append(accession)
            accession_bitscore[accession] = float(fields[-1])

    if gene is None:
        raise ValueError("%s contains no BLAST hits" % filename)

    return gene, accession_number, accession_bitscore
| 35 | + |
| 36 | +# It can work, but HttpError: Too many requests when running parallel |
def getTaxid2(accession):
    """Fetch the NCBI taxonomy id of a protein accession through Entrez.

    Downloads the GenBank-format record for *accession* from the ``protein``
    database and reads the ``taxon:<id>`` cross-reference from the record's
    first feature.  Every call is a network request; per the note kept with
    this function, running it in parallel triggers "Too many requests"
    errors, which is why the file-backed ``getTaxid`` exists.

    References:
        https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
        https://biopython.org/DIST/docs/api/Bio.Entrez-module.html
        https://www.biostars.org/p/304175/

    Args:
        accession: protein accession, e.g. ``"NP_012706.1"``.

    Returns:
        The taxonomy id as a string.

    Raises:
        KeyError: if the first feature has no ``taxon`` db_xref (the same
            exception type ``getTaxid`` raises for an unknown accession).
    """
    Entrez.email = "leyu@example.org"

    handle = Entrez.efetch(db='protein', id=accession, rettype='gb')
    try:
        record = SeqIO.read(handle, 'genbank')
    finally:
        # Release the HTTP connection even when parsing fails.
        handle.close()

    # Scan every cross-reference rather than only the first one, so a record
    # that lists another database before "taxon:" is still resolved.
    for xref in record.features[0].qualifiers.get('db_xref', []):
        parts = xref.split(":")
        if parts[0] == 'taxon' and len(parts) > 1:
            return parts[1]  # the type is a string

    raise KeyError(accession)
| 59 | + |
# Accession -> taxid lookup table, filled once at import time from a
# comma-separated file: column 1 is the accession, column 2 the taxid.
# Taxids are stored as strings, exactly as they appear in the file.
accession_taxid = dict()
with open("acc2taxid_final.txt", "r") as infile:
    for line in infile:
        fields = line.strip('\n').split(',')
        if fields == ['']:
            # Skip blank lines instead of failing with IndexError on import.
            continue
        accession_taxid[fields[0]] = fields[1]
| 67 | + |
def getTaxid(accession):
    """Return the taxid stored for *accession* in ``accession_taxid``.

    Raises:
        KeyError: if the accession is not present in the lookup table.
    """
    return accession_taxid[accession]
| 71 | + |
# NCBI taxonomy ids used to classify BLAST hits.
_SACCHAROMYCOTINA_TAXID = 147537  # subphylum holding the recipient yeasts
_FUNGI_TAXID = 4751               # kingdom Fungi
_ASCOMYCOTA_TAXID = 4890          # phylum Ascomycota

# Accessions whose taxonomy is not resolved when they are the best donor hit.
_SKIPPED_DONOR_ACCESSIONS = frozenset(
    ['pir|S67133|', 'pir|S39953|', '5OQM_l', 'pir|JC7966|']
)

_DONOR_RANKS = ('kingdom', 'phylum', 'subphylum', 'species')


def _ranks_to_taxids(ncbi, taxid):
    """Return a ``{rank: taxid}`` mapping for the lineage of *taxid*."""
    lineage = ncbi.get_lineage(taxid)
    # get_rank yields {taxid: rank}; invert it so ranks can be looked up by name.
    return {rank: tid for tid, rank in ncbi.get_rank(lineage).items()}


def _classify_hits(ncbi, accessions):
    """Split hit accessions into recipient-yeast and other-fungi groups.

    Returns ``(yeast_hits, yeast_species, fungi_hits, fungi_species)``.
    Accessions without a known taxid are skipped.  A hit whose lineage lacks
    one of the inspected ranks stops being processed at that point, so it may
    be recorded as a hit without contributing a species.
    """
    yeast_hits, yeast_species = [], []
    fungi_hits, fungi_species = [], []

    for accession in accessions:
        try:
            taxid = getTaxid(accession)
        except KeyError:
            # Accession missing from the accession -> taxid table.
            continue

        ranks = _ranks_to_taxids(ncbi, taxid)
        try:
            if ranks['subphylum'] == _SACCHAROMYCOTINA_TAXID:
                yeast_hits.append(accession)
                yeast_species.append(ranks['species'])

            # "Other fungi" are fungi outside the Ascomycota phylum.
            if ranks['kingdom'] == _FUNGI_TAXID and ranks['phylum'] != _ASCOMYCOTA_TAXID:
                fungi_hits.append(accession)
                fungi_species.append(ranks['species'])
        except KeyError:
            # Lineage has no entry for one of the ranks looked up above.
            continue

    return yeast_hits, yeast_species, fungi_hits, fungi_species


def main(blast_dir="./blast_file", output_path="./all_fungi2yeast.tsv", max_hits=200):
    """Scan BLAST hit tables and report candidate fungi-to-yeast HGT genes.

    For every ``*txt`` file in *blast_dir* the first *max_hits* hits are
    classified as recipient yeast (subphylum Saccharomycotina) or other fungi
    (kingdom Fungi outside phylum Ascomycota).  From these the function derives

    * ``bitscore_index`` = best other-fungi bitscore / best yeast bitscore
    * ``HGT_index`` = other-fungi species / (other-fungi + yeast species)

    and writes one row to *output_path* for each gene whose best other-fungi
    bitscore is > 100, whose bitscore_index is > 0.5 and whose HGT_index is
    >= 0.9.  Progress and intermediate values are printed to stdout.

    A gene is skipped when it has no yeast hit or no other-fungi hit (the
    ratios are undefined) or when its best other-fungi accession is in the
    skip list.  Every per-gene value is computed afresh, so nothing can carry
    over from the previously processed file.

    Args:
        blast_dir: directory containing the BLAST hit tables.
        output_path: path of the TSV report to (over)write.
        max_hits: number of leading hits of each table to inspect.
    """
    # Build the taxonomy handle once; it does not depend on the file processed.
    ncbi = NCBITaxa()

    # newline="" is what the csv module asks for on files it writes to.
    with open(output_path, "wt", newline="") as outfile:
        tsv_writer = csv.writer(outfile, delimiter="\t")
        tsv_writer.writerow(["gene id", "max_bitscore_recipient_yeast", "max_bitscore_other_fungi", "bitscore_percentage", "recipient_species_number", "other_fungi_species_number", "HGT_index", "max_bitscore_accession",
                             "max_bitscore_taxid", "other_fungi_kingdom", "other_fungi_phylum", "other_fungi_subphylum", "other_fungi_species"])

        filenames = [name for name in os.listdir(blast_dir) if name.endswith('txt')]
        for i, filename in enumerate(filenames, start=1):
            print(filename)
            print('This is', i, '----------------------------------------')
            gene, accession_number, accession_bitscore = parse_NCBI(os.path.join(blast_dir, filename))

            (recipient_yeast, recipient_species,
             other_fungi_accession, other_fungi_species) = _classify_hits(
                ncbi, accession_number[:max_hits])

            if not recipient_yeast or not other_fungi_accession:
                # Without a hit in both groups the ratios below are undefined.
                continue

            max_recipient_yeast_accession_key = max(recipient_yeast, key=accession_bitscore.get)
            max_recipient_yeast_bitscore = accession_bitscore[max_recipient_yeast_accession_key]

            max_other_fungi_accession_key = max(other_fungi_accession, key=accession_bitscore.get)
            max_other_fungi_bitscore = accession_bitscore[max_other_fungi_accession_key]

            if max_other_fungi_accession_key in _SKIPPED_DONOR_ACCESSIONS:
                # No taxonomy is resolved for these accessions, so there is
                # nothing valid to report for this gene.
                continue

            max_taxid = getTaxid(max_other_fungi_accession_key)
            max_ranks2lineage = _ranks_to_taxids(ncbi, max_taxid)
            max_taxid2name = ncbi.get_taxid_translator(
                [max_ranks2lineage[rank] for rank in _DONOR_RANKS if rank in max_ranks2lineage])
            # A rank missing from the lineage is reported as an empty cell.
            donor_names = [max_taxid2name.get(max_ranks2lineage.get(rank), "") for rank in _DONOR_RANKS]

            print(gene)
            print(max_recipient_yeast_bitscore)
            print(max_other_fungi_bitscore)
            print(max_taxid2name)

            recipient_species_number = len(set(recipient_species))
            other_fungi_species_number = len(set(other_fungi_species))

            bitscore_index = max_other_fungi_bitscore / max_recipient_yeast_bitscore
            HGT_index = other_fungi_species_number / (other_fungi_species_number + recipient_species_number)

            print(recipient_species_number)
            print(other_fungi_species_number)
            print(HGT_index)  # >60% or >70% or >80% or >90%

            if max_other_fungi_bitscore > 100 and bitscore_index > 0.5 and HGT_index >= 0.9:
                print("This is a potential HGT event from fungi to yeast!!!")

                # NOTE(review): the row is written only for candidates,
                # mirroring the earlier per-gene report; confirm that this is
                # the intended scope of the combined report.
                tsv_writer.writerow([gene, max_recipient_yeast_bitscore, max_other_fungi_bitscore, bitscore_index,
                                     recipient_species_number, other_fungi_species_number, HGT_index,
                                     max_other_fungi_accession_key, max_taxid] + donor_names)
| 203 | + |
# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()
| 206 | + |
0 commit comments