main.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from config import *  # <-- shared hyperparameters and special tokens
from datetime import datetime
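# The star import above is expected to provide (names inferred from their usage in this file;
# config.py itself is not shown here):
#   TEXT_PATH, SAVE_PATH, DEVICE
#   PAD, BOS, EOS, SYS, USR, BOT                 (special-token strings)
#   CONTEXT_SIZE, BATCH_SIZE, EPOCHS, LR
#   EMBED_DIM, FF_DIM, HEADS, LAYERS, DROPOUT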
# ======================
# Load text & build tokenizer
# ======================
print("📖 Loading text...")
text = Path(TEXT_PATH).read_text(encoding="utf-8")

# Character-level vocabulary: special tokens first, then every distinct character in the corpus.
chars = sorted(set(text))
specials = [PAD, BOS, EOS, SYS, USR, BOT]
itos = specials + chars
stoi = {s: i for i, s in enumerate(itos)}
vocab_size = len(itos)
print("🔤 Vocab size:", vocab_size)


def encode(s):
    """Encode one line as <BOS> + character ids + <EOS>."""
    return [stoi[BOS]] + [stoi[c] for c in s] + [stoi[EOS]]


# Concatenate every encoded line into one long 1-D token stream.
data = torch.tensor(
    [i for line in text.splitlines() for i in encode(line)],
    dtype=torch.long,
)
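# Quick illustration (a sketch, not part of the original script): with the vocabulary above,
#   encode("hi") == [stoi[BOS], stoi["h"], stoi["i"], stoi[EOS]]
# i.e. every line of the corpus is wrapped in BOS/EOS before being flattened into `data`.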
# ======================
# Dataset
# ======================
class LMDataset(Dataset):
    def __len__(self):
        return len(data) - CONTEXT_SIZE - 1

    def __getitem__(self, idx):
        x = data[idx : idx + CONTEXT_SIZE]
        y = data[idx + 1 : idx + CONTEXT_SIZE + 1]
        # Loss mask: positions from the first <|bot|> token onward contribute to the loss,
        # so the model is optimised on the assistant side of the dialogue.
        bot_id = stoi[BOT]
        mask = torch.zeros_like(y, dtype=torch.float)
        in_bot = False
        for i, token in enumerate(x):
            if token.item() == bot_id:
                in_bot = True
            if in_bot:
                mask[i] = 1.0
        # If the window contains no <|bot|> at all, fall back to training on every position.
        if mask.sum() == 0:
            mask[:] = 1.0
        return x, y, mask


loader = DataLoader(LMDataset(), batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
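# Optional sanity check (an illustrative sketch, not part of the original script):
#   xb, yb, mb = next(iter(loader))
#   xb.shape, yb.shape, mb.shape   # each (BATCH_SIZE, CONTEXT_SIZE)
# y is x shifted right by one token; mask is 1.0 wherever the loss is counted.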
# ======================
# GPT model
# ======================
class GPTBlock(nn.Module):
    """Pre-norm Transformer block: self-attention + feed-forward, each with a residual connection."""

    def __init__(self, dim):
        super().__init__()
        self.ln1 = nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, HEADS, dropout=DROPOUT, batch_first=True)
        self.ln2 = nn.LayerNorm(dim)
        self.ff = nn.Sequential(
            nn.Linear(dim, FF_DIM),
            nn.GELU(),
            nn.Linear(FF_DIM, dim),
            nn.Dropout(DROPOUT),
        )

    def forward(self, x, mask):
        h = self.ln1(x)
        attn, _ = self.attn(h, h, h, attn_mask=mask)
        x = x + attn
        x = x + self.ff(self.ln2(x))
        return x


class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.token = nn.Embedding(vocab_size, EMBED_DIM)
        self.pos = nn.Embedding(CONTEXT_SIZE, EMBED_DIM)
        self.blocks = nn.ModuleList([GPTBlock(EMBED_DIM) for _ in range(LAYERS)])
        self.ln_f = nn.LayerNorm(EMBED_DIM)
        self.head = nn.Linear(EMBED_DIM, vocab_size, bias=False)
        self.head.weight = self.token.weight  # weight tying: output head shares the embedding matrix

    def forward(self, x):
        B, T = x.shape
        pos = torch.arange(T, device=x.device)
        x = self.token(x) + self.pos(pos)
        # Additive causal mask: -inf above the diagonal blocks attention to future tokens.
        mask = torch.triu(torch.ones(T, T, device=x.device) * float("-inf"), diagonal=1)
        for blk in self.blocks:
            x = blk(x, mask)
        return self.head(self.ln_f(x))
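# Worked illustration of the causal mask (not in the original script): with T = 4 it is
#   [[0., -inf, -inf, -inf],
#    [0.,   0., -inf, -inf],
#    [0.,   0.,   0., -inf],
#    [0.,   0.,   0.,   0.]]
# so position i can only attend to positions <= i, which keeps the model autoregressive.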
model = GPT().to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
# ======================
# Training
# ======================
print("🚀 Training...")
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for x, y, mask in loader:
        x, y, mask = x.to(DEVICE), y.to(DEVICE), mask.to(DEVICE)
        logits = model(x)
        # Per-token cross entropy, weighted by the loss mask and averaged over masked positions only.
        loss = F.cross_entropy(logits.view(-1, vocab_size), y.view(-1), reduction="none")
        loss = (loss * mask.view(-1)).sum() / mask.sum().clamp(min=1)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(
        f"Epoch {epoch+1}/{EPOCHS} | loss {total_loss/len(loader):.4f} "
        f"[{datetime.now().strftime('%H:%M:%S')}]"
    )
# ======================
# Save
# ======================
torch.save(
    {"model": model.state_dict(), "stoi": stoi, "itos": itos, "context": CONTEXT_SIZE},
    SAVE_PATH,
)
print("✅ Saved to", SAVE_PATH)