TDC_evaluation/alignWithMEME.py at main · bhcooper/TDC_evaluation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python

import sys
import numpy as np
import pandas as pd

from pandas import read_csv

Nlookup = {"A":0, "C":1, "G":2, "T":3}

comp = {"A":"T", "C":"G", "G":"C", "T":"A", "N":"N"}
def rc(seqs):
    return np.array(["".join([comp[x] for x in seq][::-1]) for seq in seqs])

table = read_csv(sys.argv[1], sep="\t", index_col=0)

table_rc = table.copy()
table_rc.index = rc(table.index)
table = pd.concat((table, table_rc))
table = table.groupby(table.index).mean().sort_values(table.columns[0], ascending=False)
table.iloc[:,0] = table.iloc[:,0]/table.iloc[:,0].max()

unique = set()
for seq in table.index:
    if(not seq in unique and not rc([seq])[0] in unique):
        unique.add(seq)

table = table.loc[list(unique)]

seqs = np.array([seq.replace("N", "") for seq in table.index])
seqlen = len(seqs[0])

matrix = None
with open(sys.argv[2], 'r') as f:
    matrix = f.read()

matrix = matrix.split("letter-probability matrix")[1]
matrix = matrix.split('\n')[1:-1]
matrix = pd.DataFrame([line.split() for line in matrix], columns=['A', 'C', 'G', 'T'], dtype=float).transpose()
print(f"PFM Shape: {matrix.shape}")
matrix = np.concatenate((np.full((4, seqlen-1), 0.25), matrix, np.full((4, seqlen-1), 0.25)), axis=1)

nshifts = matrix.shape[1] - seqlen + 1

background = 0.25**seqlen
scores = np.zeros((len(seqs), 2, nshifts))

for j in range(nshifts):
    scores[:,0,j] = [np.product([matrix[Nlookup[c], i+j] for i,c in enumerate(seq)]) for seq in seqs]
    scores[:,1,j] = [np.product([matrix[3-Nlookup[c], i+j] for i,c in enumerate(seq[::-1])]) for seq in seqs]

score = np.max(np.concatenate((np.max(scores, axis=2), np.full((len(seqs), 1), background)), axis=1), axis=1)
strand = np.argmax(np.concatenate((np.max(scores, axis=2), np.full((len(seqs), 1), background)), axis=1), axis=1)
argF = strand == 0
argR = strand == 1
argN = strand == 2
shift = np.zeros_like(strand)
shift[argF] = np.argmax(scores[argF,0,:], axis=1) - seqlen + 1
shift[argR] = np.argmax(scores[argR,1,:], axis=1) - seqlen + 1

seqs[argR] = rc(seqs[argR])
table.index = seqs
table = table.reset_index()
table.rename(columns={table.columns[0]:"Seq"}, inplace=True)

table["Shift"] = shift
table["score"] = score
table['background?'] = argN

minShift = -np.min(table["Shift"])
maxShift = np.max(table["Shift"])
for i, item in table.iterrows():
    shift = item["Shift"]
    table.at[i, table.columns[0]] = "N" * (minShift+shift) + item[table.columns[0]] + "N" * (maxShift-shift)

out = table[[table.columns[0], table.columns[1], 'score', "Shift"]].drop_duplicates().sort_values(table.columns[1], ascending=False).reset_index(drop=True)
out.to_csv(sys.argv[1][:-4] + "_MEME.tsv", sep="\t", index=False)