-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathalignmentDist.py
More file actions
61 lines (54 loc) · 3.6 KB
/
alignmentDist.py
File metadata and controls
61 lines (54 loc) · 3.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import numpy as np
import sys
# Manually defining the two sequences to be compared
# Hopefully, a future version will be able to automatically parse
## input files for these
seq1 = "AACAGGTTTGGTCCTAGCCTTTCTATTAGCCCTTAGTGAGATTACACATGCAAGCATCCCCGCCCCAGTGAGTCGCCCTCCAAGCCACTCTGACTAAGAGGAACAAGCATCAAGCACGCAACAACGCAGCTCAAGACGCTTAGCCTAGCCACACCCCCACGGGAGACAGCAGTGACAAATCTTTAGCAATGAACGAAAGTTCAACTAAGCTACACTAGCCACAGGGCTGGTCAACTTCGTGCCAGCCACCGCGGTCACACGATTAGCCCAAGTTAATAGAGATCGGCGTAGAGAGTGTTTTAGATTTTTTTCCCCCCAATAAAGCTAAAATTTACCTGAGTTGTAGAAAACTTAAGTTAATACAAAATAAACTACGAAAGTGGCTTTAATATATCTGAACACACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGCCCTAAACTTTAACAGTTGAATCAACAAAACTGCTCGCCAGAACACTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATACCCCCCTAGAGGAGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCACCCCTTGCTCAGCCTATATACCGCCATCTTCAGCAAACCCTGATGAAGGCCACGAAGTAAGCGCAAACACCCACGTAAAGACGTTAGGTCAAGGTGTAGCCCATGGAGTGGCAAGAAATGGGCTACATTTTCTACTTCAGAAAACTACGATAACCCTCATGAAATTTGAAGGTCGAAGGTGGATTTAGCAGTAAACTAAGAGTAGAGTGCTTAGTTGAACAAGGCCCTGAAGCGCGTACACACCGCCCGTCACCCTCTTCAAGTATATTTCAGGGACTACCTAACTAAAACCCCCACGCATCTATATAGAAGAGGCAAGTCGTAACATGGTAAGCGTACTGGAAAGTGCGCTTGGACGAAC"
seq2 = "AATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAAGCATCCCCGTTCCAGTGAGTCACCCTCTAAATCACCACGATCAAAAGGGACAAGCATCAAGCACGCAACAATGCAGCTCAAAACGCTTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATAAACCTTTAGCAATAAACGAAAGTTTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGCGGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTAGATCACCCCCTCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAACTCCAGTTGACACAAAATAAACTACGAAAGTGGCTTTAACATATCTGAATACACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGCCCTAAACCTCAACAGTTAAATCAACAAAACTGCTCGCCAGAACACTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGAGGAGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCACCTCTTGCTCAGCCTATATACCGCCATCTTCAGCAAACCCTGATGAAGGCTACAAAGTAAGCGCAAGTACCCACGTAAAGACGTTAGGTCAAGGTGTAGCCCATGAGGTGGCAAGAAATGGGCTACATTTTCTACCCCAGAAAACTACGATAGCCCTTATGAAACTTAAGGGTCGAAGGTGGATTTAGCAGTAAACTGAGAGTAGAGTGCTTAGTTGAACAGGGCCCTGAAGCGCGTACACACCGCCCGTCACCCTCCTCAAGTATACTTCAAAGGACATTTAACTAAAACCCCTACGCATTTATATAGAGGAGACAAGTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAAC"
# Check to verify that the sequences are of equal length
if len(seq1) == len(seq2):
pass
else:
print("The sequences are not properly aligned or they are of different lengths.")
sys.exit()
# Uses splicing to cut the sequences into individual letters/bases
seq1s = []
seq2s = []
seq1s[:0] = seq1
seq2s[:0] = seq2
# Zips both sequences together for base to base comparison and keeps
## a running tally of transitions (s) and transversions (v)
def compSeq(lst1,lst2):
s = 0; v = 0
for i,j in zip(lst1,lst2):
if i == j:
pass
elif i == 'A' and j == 'G':
s += 1
elif i == 'G' and j == 'A':
s += 1
elif i == 'C' and j == 'T':
s += 1
elif i == 'T' and j == 'C':
s += 1
elif i == 'T' and j == 'G' or 'A':
v += 1
elif i == 'C' and j == 'G' or 'A':
v += 1
elif i == 'A' and j == 'T' or 'C':
v += 1
elif i == 'G' and j == 'T' or 'C':
v += 1
return s,v
# Calls the compSeq function with our lists of base pairs and stores the
## output as a list for easier access
SV = list(compSeq(seq1s,seq2s))
# Calculates the percentage of transitions (S) and transversions (V) as they
## relate to the total number of sites in the sequences
S = SV[0] / len(seq1); V = SV[1] / len(seq1)
# Function to calculate the distance between the two sequences using S and V
def D(sit, ver):
d = ((-.5) * np.log(1 - (2 * sit) - ver)) - ((.25) * np.log(1 - (2 * ver)))
return d
print(SV)
print(D(S,V))