-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathembed.py
More file actions
executable file
·92 lines (75 loc) · 2.95 KB
/
embed.py
File metadata and controls
executable file
·92 lines (75 loc) · 2.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python3
import json
import os
import sys
import time
from esm.models.esmc import ESMC
from tqdm import tqdm
import torch
# --- CLI argument handling -------------------------------------------------
# Expects exactly two positional arguments: the input BGC JSON file and the
# path where the per-BGC feature vectors will be written.
if len(sys.argv) < 3:
    # Usage errors belong on stderr; use sys.exit (bare exit() comes from the
    # site module and is not guaranteed outside interactive sessions).
    print('Usage: %s <input BGC file path> <output BGC features file path>' % sys.argv[0],
          file=sys.stderr)
    sys.exit(1)
input_path = sys.argv[1]
output_path = sys.argv[2]
# Fix the RNG seed so repeated embedding runs are reproducible.
torch.manual_seed(123)
def now():
    """Return the current local time as a 'YYYY-MM-DD HH:MM:SS' string.

    Used as a prefix on the script's progress log lines.
    """
    # Pass time.localtime() explicitly; this is what strftime defaults to.
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
# Use current dir for data_root, don't try to download models, load from data/weights
os.environ['INFRA_PROVIDER'] = '1'

print(now(), 'Loading model')
# Load the ESM-C 600M checkpoint, then move it onto the second GPU.
pretrained = ESMC.from_pretrained('esmc_600m')
model = pretrained.to('cuda:1')

print(now(), 'Loading BGCs')
# NOTE(review): the JSON presumably maps BGC name -> list of gene sequences
# (that is how BgcDataLoader below consumes it) — confirm against the producer.
with open(input_path, 'r') as handle:
    bgcs = json.load(handle)
class BgcDataLoader:
    """Batches BGC gene sequences under a token budget for ESM-C embedding.

    Flattens ``{bgc_name: [gene_seq, ...]}`` into a single list of sequences
    tagged with ``(bgc_name, gene_index)`` ids, then yields greedy batches
    whose total token count (sequence length + BOS + EOS) stays at or under
    ``batch_token_limit``.
    """

    def __init__(self, bgcs, model, batch_token_limit=4096):
        """Flatten *bgcs* into parallel id/sequence lists.

        bgcs: mapping of BGC name -> list of gene sequence strings.
        model: object exposing ``_tokenize(list[str])`` (ESMC model).
        batch_token_limit: max tokens per yielded batch (default 4096).

        Raises ValueError if duplicate (bgc_name, index) labels are found.
        """
        self.bgcs = bgcs
        self.batch_token_limit = batch_token_limit
        self.model = model
        self.ids = []
        self.sequences = []
        for bgc_name, bgc in bgcs.items():
            for i, gene_seq in enumerate(bgc):
                self.sequences.append(gene_seq)
                self.ids.append((bgc_name, i))
        self.total_sequences = len(self.sequences)
        # Explicit raise instead of assert: asserts vanish under `python -O`.
        if len(set(self.ids)) != len(self.ids):
            raise ValueError("Found duplicate sequence labels")

    def __len__(self):
        """Lower-bound estimate of the number of batches __iter__ yields."""
        # BUG FIX: sequences are plain strings from json.load, so the old
        # `len(str(seq.seq))` raised AttributeError (there is no `.seq`).
        # Use len(seq) + 2, matching the token count used in __iter__.
        total_tokens = sum(len(seq) + 2 for seq in self.sequences)  # +2 for BOS and EOS tokens
        return (total_tokens + self.batch_token_limit - 1) // self.batch_token_limit

    def __iter__(self):
        """Yield (ids, lengths, tokens) batches within the token budget.

        A single sequence longer than the budget still gets its own batch.
        """
        ids, lengths, seqs = [], [], []
        current_token_count = 0
        for i, seq in enumerate(self.sequences):
            seq_length = len(seq)
            token_count = seq_length + 2
            # Flush the current batch before it would exceed the limit
            # (the `and ids` guard keeps oversize singletons from yielding
            # an empty batch first).
            if current_token_count + token_count > self.batch_token_limit and ids:
                tokens = self.model._tokenize(seqs)
                yield ids, lengths, tokens
                ids, lengths, seqs = [], [], []
                current_token_count = 0
            ids.append(self.ids[i])
            lengths.append(seq_length)
            seqs.append(seq)
            current_token_count += token_count
        # Flush the final partial batch.
        if ids:
            tokens = self.model._tokenize(seqs)
            yield ids, lengths, tokens
print(now(), 'Preparing data loader')
data_loader = BgcDataLoader(bgcs, model)

# Per-BGC running sum of gene embeddings and gene counts; the final feature
# vector for a BGC is the mean of its genes' first-token embeddings.
embedding_sums = {}
gene_counts = {}
with torch.no_grad():
    for batch_ids, batch_lengths, batch_tokens in tqdm(data_loader, desc='Processing batches'):
        output = model(batch_tokens)
        for row, (bgc_name, _gene_idx) in enumerate(batch_ids):
            # First-token (BOS position) embedding, moved off the GPU.
            emb = output.embeddings[row, 0, :].to('cpu')
            if bgc_name in embedding_sums:
                embedding_sums[bgc_name] += emb
                gene_counts[bgc_name] += 1
            else:
                # clone() so the in-place += above never touches the
                # model output tensor.
                embedding_sums[bgc_name] = emb.clone()
                gene_counts[bgc_name] = 1

# Mean-pool: divide each BGC's summed embedding by its gene count.
features = {
    name: torch.div(total, gene_counts[name]).tolist()
    for name, total in embedding_sums.items()
}

print(now(), 'Saving embeddings')
with open(output_path, 'w') as out_file:
    json.dump(features, out_file)
print(now(), 'Done')