-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathembed.py
More file actions
executable file
·92 lines (75 loc) · 2.95 KB
/
embed.py
File metadata and controls
executable file
·92 lines (75 loc) · 2.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python3
import json
import os
import sys
import time
from esm.models.esmc import ESMC
from tqdm import tqdm
import torch
# --- CLI argument handling -------------------------------------------------
# Expects exactly two positional arguments: the input BGC JSON file and the
# path where the per-BGC feature vectors will be written.
if len(sys.argv) < 3:
    # Usage errors belong on stderr; use sys.exit (bare exit() comes from the
    # site module and is not guaranteed outside interactive sessions).
    print('Usage: %s <input BGC file path> <output BGC features file path>' % sys.argv[0],
          file=sys.stderr)
    sys.exit(1)
input_path = sys.argv[1]
output_path = sys.argv[2]
# Fix the RNG seed so repeated embedding runs are reproducible.
torch.manual_seed(123)
def now():
    """Return the current local time as a 'YYYY-MM-DD HH:MM:SS' string.

    Used as a prefix on the script's progress log lines.
    """
    # Pass time.localtime() explicitly; this is what strftime defaults to.
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
# Use current dir for data_root, don't try to download models, load from data/weights
os.environ['INFRA_PROVIDER'] = '1'

print(now(), 'Loading model')
# Load the ESM-C 600M checkpoint, then move it onto the second GPU.
pretrained = ESMC.from_pretrained('esmc_600m')
model = pretrained.to('cuda:1')

print(now(), 'Loading BGCs')
# NOTE(review): the JSON presumably maps BGC name -> list of gene sequences
# (that is how BgcDataLoader below consumes it) — confirm against the producer.
with open(input_path, 'r') as handle:
    bgcs = json.load(handle)
class BgcDataLoader:
    """Batches BGC gene sequences under a token budget for ESM-C embedding.

    Flattens ``{bgc_name: [gene_seq, ...]}`` into a single list of sequences
    tagged with ``(bgc_name, gene_index)`` ids, then yields greedy batches
    whose total token count (sequence length + BOS + EOS) stays at or under
    ``batch_token_limit``.
    """

    def __init__(self, bgcs, model, batch_token_limit=4096):
        """Flatten *bgcs* into parallel id/sequence lists.

        bgcs: mapping of BGC name -> list of gene sequence strings.
        model: object exposing ``_tokenize(list[str])`` (ESMC model).
        batch_token_limit: max tokens per yielded batch (default 4096).

        Raises ValueError if duplicate (bgc_name, index) labels are found.
        """
        self.bgcs = bgcs
        self.batch_token_limit = batch_token_limit
        self.model = model
        self.ids = []
        self.sequences = []
        for bgc_name, bgc in bgcs.items():
            for i, gene_seq in enumerate(bgc):
                self.sequences.append(gene_seq)
                self.ids.append((bgc_name, i))
        self.total_sequences = len(self.sequences)
        # Explicit raise instead of assert: asserts vanish under `python -O`.
        if len(set(self.ids)) != len(self.ids):
            raise ValueError("Found duplicate sequence labels")

    def __len__(self):
        """Lower-bound estimate of the number of batches __iter__ yields."""
        # BUG FIX: sequences are plain strings from json.load, so the old
        # `len(str(seq.seq))` raised AttributeError (there is no `.seq`).
        # Use len(seq) + 2, matching the token count used in __iter__.
        total_tokens = sum(len(seq) + 2 for seq in self.sequences)  # +2 for BOS and EOS tokens
        return (total_tokens + self.batch_token_limit - 1) // self.batch_token_limit

    def __iter__(self):
        """Yield (ids, lengths, tokens) batches within the token budget.

        A single sequence longer than the budget still gets its own batch.
        """
        ids, lengths, seqs = [], [], []
        current_token_count = 0
        for i, seq in enumerate(self.sequences):
            seq_length = len(seq)
            token_count = seq_length + 2
            # Flush the current batch before it would exceed the limit
            # (the `and ids` guard keeps oversize singletons from yielding
            # an empty batch first).
            if current_token_count + token_count > self.batch_token_limit and ids:
                tokens = self.model._tokenize(seqs)
                yield ids, lengths, tokens
                ids, lengths, seqs = [], [], []
                current_token_count = 0
            ids.append(self.ids[i])
            lengths.append(seq_length)
            seqs.append(seq)
            current_token_count += token_count
        # Flush the final partial batch.
        if ids:
            tokens = self.model._tokenize(seqs)
            yield ids, lengths, tokens
print(now(), 'Preparing data loader')
data_loader = BgcDataLoader(bgcs, model)

# Per-BGC running sum of gene embeddings and gene counts; the final feature
# vector for a BGC is the mean of its genes' first-token embeddings.
embedding_sums = {}
gene_counts = {}
with torch.no_grad():
    for batch_ids, batch_lengths, batch_tokens in tqdm(data_loader, desc='Processing batches'):
        output = model(batch_tokens)
        for row, (bgc_name, _gene_idx) in enumerate(batch_ids):
            # First-token (BOS position) embedding, moved off the GPU.
            emb = output.embeddings[row, 0, :].to('cpu')
            if bgc_name in embedding_sums:
                embedding_sums[bgc_name] += emb
                gene_counts[bgc_name] += 1
            else:
                # clone() so the in-place += above never touches the
                # model output tensor.
                embedding_sums[bgc_name] = emb.clone()
                gene_counts[bgc_name] = 1

# Mean-pool: divide each BGC's summed embedding by its gene count.
features = {
    name: torch.div(total, gene_counts[name]).tolist()
    for name, total in embedding_sums.items()
}

print(now(), 'Saving embeddings')
with open(output_path, 'w') as out_file:
    json.dump(features, out_file)
print(now(), 'Done')