Skip to content

Commit 4b5ad59

Browse files
authored
Merge pull request #63 from SysBioChalmers/Dev
Update HGT and gene families analysis
2 parents 0e34f9c + 6c8a61e commit 4b5ad59

149 files changed

Lines changed: 169172 additions & 837 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.DS_Store

6 KB
Binary file not shown.

evolution_analysis/.DS_Store

8 KB
Binary file not shown.
10 KB
Binary file not shown.
14 KB
Binary file not shown.
Binary file not shown.
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
# This file is to get 300 homolog accession id and sequences for the following multiple sequence alignment.
5+
6+
7+
import re
8+
import sys
9+
import os
10+
from Bio import SeqIO
11+
from Bio import Entrez
12+
from ete3 import NCBITaxa
13+
from subprocess import Popen, PIPE
14+
15+
16+
# PATH="/Users/leyu/Documents/coding/evolution_code/HGT_analysis/HGT_prediction_tool/"
17+
18+
LINNAEUS_FILTER = ["species","genus","family","order","class","subphylum","phylum","kingdom","superkingdom"]
19+
20+
def parse_NCBI(filename):
21+
with open(filename, "r") as infile :
22+
lines = infile.readlines()
23+
24+
accession_number = list()
25+
accession_similarity = dict()
26+
27+
for line in lines :
28+
accession = line.strip("\n").split("\t")[1]
29+
similarity = line.strip("\n").split("\t")[2]
30+
accession_number.append(accession)
31+
accession_similarity[accession] = float(similarity)
32+
33+
# print(len(accession_number))
34+
return accession_number, accession_similarity
35+
36+
def get_refSeq(gene) :
37+
# get the protein sequence accoding to protein sequence id
38+
with open("/Users/leyu/Documents/Le/Data/orthomcl_output/343taxa_proteins.fasta", "r") as handleGene :
39+
proteinSeq = dict()
40+
for record in SeqIO.parse(handleGene, "fasta") :
41+
# ['__add__', '__bool__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__',
42+
# '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__le___', '__len__', '__lt__',
43+
# '__module__', '__ne__', '__new__', '__nonzero__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__',
44+
# '__subclasshook__', '__weakref__', '_per_letter_annotations', '_seq', '_set_per_letter_annotations', '_set_seq', 'annotations', 'dbxrefs', 'description',
45+
# 'features', 'format', 'id', 'letter_annotations', 'lower', 'name', 'reverse_complement', 'seq', 'translate', 'upper']
46+
# if record.id.startswith("Candida_albicans") :
47+
if record.id == gene :
48+
proteinSeq[record.id] = str(record.seq)
49+
# print("The protein number of %s is: %d" % (gene,len(proteinSeq)))
50+
51+
return proteinSeq[gene]
52+
53+
accession_taxid = dict()
54+
with open("acc2taxid_final.txt","r") as infile :
55+
lines = infile.readlines()
56+
for line in lines :
57+
accession = line.strip('\n').split(',')[0]
58+
taxid = line.strip('\n').split(',')[1]
59+
accession_taxid[accession] = taxid
60+
61+
def getTaxid(accession) :
62+
taxid = accession_taxid[accession]
63+
return taxid
64+
65+
def getSeq(accession):
66+
# Retrieving data in the GenBank using only the GenBank code accession in biopython
67+
# https://www.ncbi.nlm.nih.gov/books/NBK25497/table/chapter2.T._entrez_unique_identifiers_ui/?report=objectonly
68+
# https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
69+
# https://biopython.org/DIST/docs/api/Bio.Entrez-module.html
70+
Entrez.email = "leyu@example.org"
71+
72+
# https://www.biostars.org/p/304175/
73+
# get tax id using only the GenBank code accession in biopython
74+
# handle = Entrez.efetch(db='protein', id="NP_012706.1", rettype='gb')
75+
handle = Entrez.efetch(db='protein', id=accession, rettype='gb')
76+
record = SeqIO.read(handle,'genbank')
77+
78+
seq = record.seq
79+
80+
print(seq)
81+
82+
return seq
83+
84+
def main() :
85+
# accession_number = parse_NCBI(filename)
86+
# accession_number = parse_NCBI("AOX_test.txt")
87+
with open("all_fungi2yeast.tsv", "r") as infile :
88+
lines = infile.readlines()[1:]
89+
90+
genes = [line.strip().split('\t')[0] for line in lines]
91+
print(genes[:7])
92+
print(len(genes)) # 100
93+
94+
i = 0
95+
for gene in genes :
96+
i += 1
97+
print('This is', str(i) + '--------------------------------------------------------')
98+
print(gene)
99+
accession_number, accession_similarity = parse_NCBI("./blast_file/%s.txt" % gene)
100+
101+
ncbi = NCBITaxa()
102+
id_seq = dict()
103+
for accession in accession_number :
104+
# accession = "XP_003959759.1"
105+
print(accession)
106+
print(accession_similarity[accession])
107+
try :
108+
seq = str(getSeq(accession))
109+
except :
110+
continue
111+
if accession not in ['pir|S67133|', 'pir|S39953|', '5OQM_l', 'pir|JC7966|'] :
112+
taxid = getTaxid(accession)
113+
lineage = ncbi.get_lineage(taxid)
114+
lineage2ranks = ncbi.get_rank(lineage)
115+
ranks2lineage = dict((rank, taxid) for (taxid, rank) in lineage2ranks.items())
116+
# print(ranks2lineage)
117+
try :
118+
taxid2name = ncbi.get_taxid_translator([ranks2lineage['kingdom'], ranks2lineage['phylum'], ranks2lineage['subphylum'], ranks2lineage['class'], ranks2lineage['species']])
119+
print(taxid2name[ranks2lineage["subphylum"]])
120+
# The Eurotiomycetes are a class of ascomycetes within the subphylum Pezizomycotina.
121+
# Proteobacteria is a major phylum of Gram-negative bacteria.
122+
if accession_similarity[accession] < 80 : # Sequences with more than 80% similarity were eliminated.
123+
if taxid2name[ranks2lineage["subphylum"]] == "Saccharomycotina" :
124+
accession = accession + "-" + taxid2name[ranks2lineage["species"]].replace(" ", "_") + "@Saccharomycotina"
125+
if taxid2name[ranks2lineage["subphylum"]] != "Saccharomycotina" and taxid2name[ranks2lineage["phylum"]] == "Ascomycota":
126+
accession = accession + "-" + taxid2name[ranks2lineage["species"]].replace(" ", "_") + "@other_Ascomycota"
127+
if taxid2name[ranks2lineage["phylum"]] != "Ascomycota":
128+
accession = accession + "-" + taxid2name[ranks2lineage["species"]].replace(" ", "_") + "@" + taxid2name[ranks2lineage["phylum"]]
129+
print(accession)
130+
id_seq[accession] = seq
131+
else :
132+
continue
133+
except :
134+
continue
135+
136+
gene_seq = get_refSeq(gene)
137+
gene_query = "QUERY_" + gene
138+
print(gene_query)
139+
print(gene_seq)
140+
id_seq[gene_query] = gene_seq
141+
142+
print(len(id_seq))
143+
144+
os.system("mkdir ./fasta/%s" % gene)
145+
with open("./fasta/%s/%s.fasta" % (gene,gene), "w") as outfile :
146+
for accession,seq in id_seq.items() :
147+
outfile.write(">"+accession+"\n")
148+
outfile.write(seq+"\n")
149+
150+
151+
if __name__== "__main__":
152+
# parse_NCBI("AOX_test.txt") # Try to read the blastp result file to get 300 homologs
153+
main()
154+
155+
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
# This script is to identify the HGT events from fungi to yeast or yeast to yeast.
5+
6+
import re
7+
from ete3 import NCBITaxa
8+
import sys
9+
import os
10+
import math
11+
import csv
12+
from Bio import SeqIO
13+
from Bio import Entrez
14+
from subprocess import Popen, PIPE
15+
16+
17+
# LINNAEUS_FILTER = ["species","genus","family","order","class","subphylum","phylum","kingdom","superkingdom"]
18+
19+
def parse_NCBI(filename):
20+
with open(filename, "r") as infile :
21+
lines = infile.readlines()
22+
23+
accession_number = list()
24+
accession_bitscore = dict()
25+
26+
gene = lines[0].strip("\n").split("\t")[0]
27+
for line in lines :
28+
accession = line.strip("\n").split("\t")[1]
29+
bitscore = line.strip("\n").split("\t")[-1]
30+
accession_number.append(accession)
31+
accession_bitscore[accession] = float(bitscore)
32+
33+
# print(len(accession_number))
34+
return gene, accession_number, accession_bitscore
35+
36+
# It can work, but HttpError: Too many requests when running parallel
37+
def getTaxid2(accession):
38+
# Retrieving data in the GenBank using only the GenBank code accession in biopython
39+
# https://www.ncbi.nlm.nih.gov/books/NBK25497/table/chapter2.T._entrez_unique_identifiers_ui/?report=objectonly
40+
# https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
41+
# https://biopython.org/DIST/docs/api/Bio.Entrez-module.html
42+
Entrez.email = "leyu@example.org"
43+
44+
# https://www.biostars.org/p/304175/
45+
# get tax id using only the GenBank code accession in biopython
46+
# handle = Entrez.efetch(db='protein', id="NP_012706.1", rettype='gb')
47+
handle = Entrez.efetch(db='protein', id=accession, rettype='gb')
48+
record = SeqIO.read(handle,'genbank')
49+
# print(record.features[0].qualifiers)
50+
if record.features[0].qualifiers['db_xref'][0].split(":")[0] == 'taxon':
51+
taxid = record.features[0].qualifiers['db_xref'][0].split(":")[1] # the type is a string
52+
# organism = record.features[0].qualifiers['organism'][0]
53+
# seq = record.seq
54+
# print(taxid,organism)
55+
56+
# return taxid,organism
57+
# return seq
58+
return taxid
59+
60+
accession_taxid = dict()
61+
with open("acc2taxid_final.txt","r") as infile :
62+
lines = infile.readlines()
63+
for line in lines :
64+
accession = line.strip('\n').split(',')[0]
65+
taxid = line.strip('\n').split(',')[1]
66+
accession_taxid[accession] = taxid
67+
68+
def getTaxid(accession) :
69+
taxid = accession_taxid[accession]
70+
return taxid
71+
72+
def main() :
73+
# filename = sys.argv[1]
74+
# gene, accession_number, accession_bitscore = parse_NCBI(filename)
75+
76+
# gene, accession_number, accession_bitscore = parse_NCBI("AOX_test.txt")
77+
# accession_number, accession_bitscore = parse_NCBI("./gain_blast/gain_OG_3.txt")
78+
79+
outfile = open("./all_fungi2yeast.tsv", "wt")
80+
tsv_writer = csv.writer(outfile, delimiter="\t")
81+
tsv_writer.writerow(["gene id", "max_bitscore_recipient_yeast", "max_bitscore_other_fungi", "bitscore_percentage", "recipient_species_number", "other_fungi_species_number", "HGT_index", "max_bitscore_accession", \
82+
"max_bitscore_taxid", "other_fungi_kingdom", "other_fungi_phylum", "other_fungi_subphylum", "other_fungi_species"])
83+
84+
i = 0
85+
filenames =[filename for filename in os.listdir("./blast_file") if filename.endswith('txt')]
86+
for filename in filenames :
87+
print(filename)
88+
i += 1
89+
print('This is', i, '----------------------------------------')
90+
gene, accession_number, accession_bitscore = parse_NCBI("./blast_file/%s" % filename)
91+
92+
recipient_yeast = list()
93+
other_fungi_accession = list()
94+
recipient_yeast_accession_bitscore = dict()
95+
other_fungi_accession_bitscore = dict()
96+
recipient_species = list()
97+
other_fungi_species = list()
98+
99+
# i = 0
100+
k = 0
101+
ncbi = NCBITaxa()
102+
for accession in accession_number[:200] :
103+
# i += 1
104+
# print('This is', i, '----------------------------------------')
105+
# print(accession)
106+
try :
107+
taxid = getTaxid(accession)
108+
# taxid = getTaxid2(accession)
109+
# print(taxid)
110+
except :
111+
k+=1
112+
# print('------------------------------',k)
113+
continue
114+
lineage = ncbi.get_lineage(taxid)
115+
lineage2ranks = ncbi.get_rank(lineage)
116+
ranks2lineage = dict((rank, taxid) for (taxid, rank) in lineage2ranks.items())
117+
# print(ranks2lineage)
118+
# # taxid2name = ncbi.get_taxid_translator([ranks2lineage['kingdom'], ranks2lineage['subphylum'], ranks2lineage['species']])
119+
# # {4751: 'Fungi', 147537: 'Saccharomycotina', 460519: 'Komagataella phaffii'}
120+
121+
# # taxid2name = ncbi.get_taxid_translator(lineage)
122+
# # print(taxid2name)
123+
# # print(taxid2name[ranks2lineage['species']])
124+
125+
taxonomy_alignment = ranks2lineage
126+
127+
try :
128+
if taxonomy_alignment['subphylum'] == 147537 :
129+
recipient_yeast.append(accession)
130+
recipient_species.append(taxonomy_alignment['species'])
131+
132+
# if taxonomy_alignment['kingdom'] == 4751 and taxonomy_alignment['subphylum'] != 147537 :
133+
if taxonomy_alignment['kingdom'] == 4751 and taxonomy_alignment['phylum'] != 4890 : # 'Ascomycota' phylum
134+
other_fungi_accession.append(accession)
135+
other_fungi_species.append(taxonomy_alignment['species'])
136+
except :
137+
continue
138+
139+
for accession_id in recipient_yeast :
140+
recipient_yeast_accession_bitscore[accession_id] = accession_bitscore[accession_id]
141+
142+
for accession_id in other_fungi_accession :
143+
other_fungi_accession_bitscore[accession_id] = accession_bitscore[accession_id]
144+
145+
if recipient_yeast_accession_bitscore :
146+
max_recipient_yeast_accession_key = max(recipient_yeast_accession_bitscore,key=recipient_yeast_accession_bitscore.get)
147+
max_recipient_yeast_bitscore = recipient_yeast_accession_bitscore[max_recipient_yeast_accession_key]
148+
149+
if other_fungi_accession_bitscore :
150+
max_other_fungi_accession_key = max(other_fungi_accession_bitscore,key=other_fungi_accession_bitscore.get)
151+
max_other_fungi_bitscore = other_fungi_accession_bitscore[max_other_fungi_accession_key]
152+
if max_other_fungi_accession_key not in ['pir|S67133|', 'pir|S39953|', '5OQM_l', 'pir|JC7966|'] :
153+
max_taxid = getTaxid(max_other_fungi_accession_key)
154+
max_lineage = ncbi.get_lineage(max_taxid)
155+
max_lineage2ranks = ncbi.get_rank(max_lineage)
156+
max_ranks2lineage = dict((rank, taxid) for (taxid, rank) in max_lineage2ranks.items())
157+
max_taxid2name = ncbi.get_taxid_translator([max_ranks2lineage['kingdom'], max_ranks2lineage['phylum'], max_ranks2lineage['subphylum'], max_ranks2lineage['species']])
158+
159+
print(gene)
160+
print(max_recipient_yeast_bitscore)
161+
print(max_other_fungi_bitscore)
162+
print(max_taxid2name)
163+
164+
if recipient_species :
165+
recipient_species_number = len(set(recipient_species))
166+
if other_fungi_species :
167+
other_fungi_species_number = len(set(other_fungi_species))
168+
169+
bitscore_index = max_other_fungi_bitscore/max_recipient_yeast_bitscore
170+
HGT_index = other_fungi_species_number/(other_fungi_species_number+recipient_species_number)
171+
172+
print(recipient_species_number)
173+
print(other_fungi_species_number)
174+
print(HGT_index) # >60% or >70% or >80% or >90%
175+
# XP_002494271.1
176+
# 1398.3
177+
# 1020.4
178+
# {4751: 'Fungi', 4890: 'Ascomycota', 147538: 'Pezizomycotina', 337075: 'Pyronema omphalodes'}
179+
# 16
180+
# 142
181+
# 0.8987341772151899
182+
# This is a potential HGT event from fungi to yeast!!!
183+
184+
# if max_other_fungi_bitscore>100 and bitscore_index>0.5 and HGT_index>0.7 :
185+
# print("This is a potential HGT event from fungi to yeast!!!")
186+
# outfile = open("./results/%s.tsv" % gene, "wt")
187+
# tsv_writer = csv.writer(outfile, delimiter="\t")
188+
# tsv_writer.writerow(["gene id", "max_bitscore_recipient_yeast", "max_bitscore_other_fungi", "bitscore_percentage", "HGT_index", "max_bitscore_accession", \
189+
# "max_bitscore_taxid", "other_fungi_kingdom", "other_fungi_phylum", "other_fungi_subphylum", "other_fungi_species"])
190+
# tsv_writer.writerow([gene, max_recipient_yeast_bitscore, max_other_fungi_bitscore, bitscore_index, HGT_index, max_other_fungi_accession_key, \
191+
# max_taxid, max_taxid2name[max_ranks2lineage['kingdom']], max_taxid2name[max_ranks2lineage['phylum']], \
192+
# max_taxid2name[max_ranks2lineage['subphylum']], max_taxid2name[max_ranks2lineage['species']]])
193+
# outfile.close()
194+
195+
if max_other_fungi_bitscore>100 and bitscore_index>0.5 and HGT_index>=0.9 :
196+
print("This is a potential HGT event from fungi to yeast!!!")
197+
198+
tsv_writer.writerow([gene, max_recipient_yeast_bitscore, max_other_fungi_bitscore, bitscore_index, recipient_species_number, other_fungi_species_number, HGT_index, max_other_fungi_accession_key, \
199+
max_taxid, max_taxid2name[max_ranks2lineage['kingdom']], max_taxid2name[max_ranks2lineage['phylum']], \
200+
max_taxid2name[max_ranks2lineage['subphylum']], max_taxid2name[max_ranks2lineage['species']]])
201+
outfile.close()
202+
203+
204+
if __name__== "__main__":
205+
main()
206+

0 commit comments

Comments
 (0)