Skip to content

Commit b052893

Browse files
committed
Converted phylo from muscle to biotite
1 parent 79dad2f commit b052893

4 files changed

Lines changed: 48 additions & 29 deletions

File tree

README.md

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,12 @@ VCF-kit is a command-line based collection of utilities for performing analysis
2424

2525
__VCF-Kit has been upgraded to Python 3__
2626

27-
VCF-kit has been tested with Python 3.6. VCF-kit makes use of additional software for a variety of tasks:
27+
VCF-kit has been tested with Python 3.10. VCF-kit makes use of additional software for a variety of tasks:
2828

2929
* bwa (v 0.7.12)
3030
* samtools (v 1.3)
3131
* bcftools (v 1.3)
3232
* blast (v 2.2.31+)
33-
* muscle (v 3.8.31)
3433
* primer3 (v 2.5.0)
3534

3635
You can install these dependencies and VCF-kit using conda, or you can use a Docker image.
@@ -46,7 +45,6 @@ conda create -n vcf-kit \
4645
"samtools>=1.10" \
4746
"bcftools>=1.10" \
4847
"blast>=2.2.31" \
49-
"muscle>=3.8.31" \
5048
"primer3>=2.5.0"
5149

5250
conda activate vcf-kit

docs/phylo.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ vk phylo fasta <vcf> [<region>]
55
vk phylo tree (nj|upgma) [--plot] <vcf> [<region>]
66
```
77

8-
The `phylo` command can be used to generate dendrograms, tree files, or a fasta file of variants concatenated together (equivalent to a multiple sequence alignment) from a VCF. Tree files are generated in [Newick format](http://evolution.genetics.washington.edu/phylip/newicktree.html) with [MUSCLE](http://drive5.com/muscle/) using [UPGMA](https://en.wikipedia.org/wiki/UPGMA) or [neighbor-joining](https://en.wikipedia.org/wiki/Neighbor_joining). VCF-kit can use the output tree file to generate a plot of the tree/phylogeny.
8+
The `phylo` command can be used to generate dendrograms, tree files, or a fasta file of variants concatenated together (equivalent to a multiple sequence alignment) from a VCF. Tree files are generated in [Newick format](http://evolution.genetics.washington.edu/phylip/newicktree.html) with [biotite](https://www.biotite-python.org/latest/index.html) using [UPGMA](https://en.wikipedia.org/wiki/UPGMA) or [neighbor-joining](https://en.wikipedia.org/wiki/Neighbor_joining). VCF-kit can use the output tree file to generate a plot of the tree/phylogeny.
99

1010
`phylo` can read a VCF directly or from stdin by using `-`.
1111

@@ -38,7 +38,7 @@ bcftools filter --set-GTs . --exclude 'FMT/DP < 20' data/test.vcf.gz | vk phylo
3838

3939
## Generating a tree/phylogeny
4040

41-
`vk phylo tree` can be used to generate a tree/phylogeny from a vcf file. This command uses a fasta file (identical to what is produced using `vk phylo fasta`), and uses [MUSCLE](https://en.wikipedia.org/wiki/MUSCLE_(alignment_software)) to produce a tree file.
41+
`vk phylo tree` can be used to generate a tree/phylogeny from a vcf file. This command uses a fasta file (identical to what is produced using `vk phylo fasta`), and uses [biotite](https://www.biotite-python.org/latest/index.html) to produce a tree file.
4242

4343
### Generate a UPGMA tree
4444

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ cyvcf2
88
docopt
99
biopython==1.77
1010
logzero
11+
biotite
1112
pomegranate
1213
clint
1314
requests

vcfkit/phylo.py

Lines changed: 44 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@
1818

1919
import numpy as np
2020
from jinja2 import Template
21+
import biotite.sequence.align as align, phylo
22+
import biotite.sequence.phylo as phylo
23+
from biotite.sequence import NucleotideSequence
24+
import networkx as nx
25+
import matplotlib.pyplot as plt
2126

2227
from clint.textui import colored, indent, puts_err
2328
from docopt import docopt
@@ -52,56 +57,71 @@ def first(s):
5257
"""
5358
Generate an aligned fasta from a VCF file.
5459
"""
55-
gt_set = np.chararray((0,len(v.samples)))
5660
gt_set = []
5761
for line in variant_set:
5862
if line.is_snp:
5963
gt_set.append(firstv(line.gt_bases))
6064
if len(gt_set) == 0:
6165
exit(puts_err("No genotypes"))
6266
gt_set = np.vstack(gt_set)
63-
seqs = list(zip(v.samples, np.transpose(gt_set)))
67+
seqs = [''.join(gt_set[:, i]) for i in range(gt_set.shape[1])]
68+
6469
if args["fasta"]:
65-
for sample, seq in seqs:
66-
print((">" + sample))
67-
print((''.join(seq)))
70+
for sample, seq in list(zip(v.samples, seqs)):
71+
print(">" + sample)
72+
print(seq)
6873

6974
elif args["tree"]:
7075
"""
7176
Generate a phylogenetic tree using an aligned fasta with muscle.
7277
"""
7378

74-
########
75-
# Consider biotite to replace deprecated --maketree function in muscle
76-
########
77-
78-
# Check for muscle dependency
79-
check_program_exists("muscle")
80-
fasta = ""
8179
with indent(4):
8280
puts_err(colored.blue("\nGenerating Fasta\n"))
83-
for sample, seq in seqs:
84-
fasta += ">" + sample + "\n" + ''.join(seq) + "\n"
85-
tree_type = "upgma" # default is upgma
81+
trace = align.Alignment.trace_from_strings(seqs)
82+
aligned_seqs = align.Alignment([NucleotideSequence(seq) for seq in seqs], trace)
83+
distances = 1 - align.get_pairwise_sequence_identity(aligned_seqs, mode="all")
84+
8685
if args["nj"]:
87-
tree_type = "neighborjoining"
88-
with indent(4):
89-
puts_err(colored.blue("\nGenerating " + tree_type + " Tree\n"))
90-
comm = ["muscle", "-maketree", "-in", "-", "-cluster", tree_type]
91-
tree, err = Popen(comm, stdin=PIPE, stdout=PIPE).communicate(input=fasta.encode())
92-
86+
tree = phylo.neighbor_joining(distances)
87+
else:
88+
tree = phylo.upgma(distances)
89+
9390
# output tree
94-
print(tree.decode("utf-8"))
91+
print(tree.to_newick(labels=v.samples))
9592

9693
if args["--plot"]:
94+
# graph = tree.as_graph().to_undirected()
95+
# fig = plt.figure(figsize=(8.0, 8.0))
96+
# ax = fig.gca()
97+
# ax.axis("off")
98+
# # Calculate position of nodes in the plot
99+
# pos = nx.kamada_kawai_layout(graph)
100+
# # Assign the gene names to the nodes that represent a reference index
101+
# node_labels = {i: name for i, name in enumerate(v.samples)}
102+
# nx.draw_networkx_edges(graph, pos, ax=ax)
103+
# nx.draw_networkx_labels(
104+
# graph,
105+
# pos,
106+
# ax=ax,
107+
# labels=node_labels,
108+
# font_size=7,
109+
# # Draw a white background behind the labeled nodes
110+
# # for better readability
111+
# bbox=dict(pad=0, color="white"),
112+
# )
113+
# fig.tight_layout()
114+
115+
# plt.show()
116+
97117
prefix = os.path.dirname(os.path.abspath(sys.modules['vcfkit'].__file__)) + "/static"
98118
template = open(prefix + "/tree.html",'r').read()
99119
tree_template = Template(template)
100120
html_out = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
101121
with html_out as f:
102-
tree = tree.replace("\n", "")
122+
tree = tree.to_newick(labels=v.samples)
103123
sample_len = len(v.samples)
104-
f.write(tree_template.render(**locals()))
124+
f.write(tree_template.render(tree=tree, prefix=prefix, sample_len=sample_len).encode())
105125
webbrowser.open("file://" + html_out.name)
106126

107127
if __name__ == '__main__':

0 commit comments

Comments
 (0)