
Commit 4d205a2

Merge pull request #17 from Synthyra/fixing_pattention

Fixing pattention

2 parents 84c95b1 + b8bfa0c

5 files changed: 163 additions & 43 deletions


.gitignore

Lines changed: 1 addition & 0 deletions

@@ -4,3 +4,4 @@ __pycache__/
 *.bin
 /logs
 /experiments/*.yaml
+/.cache

Dockerfile

Lines changed: 2 additions & 3 deletions

@@ -1,6 +1,6 @@
 # sudo docker build -t speedrun_plm .
 # sudo docker run --gpus all --shm-size=128g -v ${PWD}:/workspace speedrun_plm torchrun --standalone --nproc_per_node=4 train.py
-
+# docker run --gpus all -v ${PWD}:/workspace speedrun_plm python train.py --bugfix
 # 1️⃣ CUDA / cuDNN base with no Python
 FROM nvidia/cuda:12.6.2-cudnn-devel-ubuntu24.04

@@ -34,8 +34,7 @@ WORKDIR /app
 COPY requirements.txt .

 RUN pip install --upgrade pip setuptools && \
-    # force-install torch built for CUDA 12.6
-    pip install --force-reinstall torch torchvision --index-url https://download.pytorch.org/whl/cu128 -U && \
+    pip install --force-reinstall torch torchvision --index-url https://download.pytorch.org/whl/cu128 && \
     pip install -r requirements.txt -U

 # 5️⃣ Copy the rest of the source

model/attention.py

Lines changed: 88 additions & 24 deletions

@@ -1,7 +1,9 @@
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
+import math
 from typing import Optional
-from torch.nn.attention.flex_attention import flex_attention, create_block_mask
+from torch.nn.attention.flex_attention import flex_attention

 from model.flex_mods import generate_tanh_softcap
 from model.utils import norm, Linear
@@ -61,6 +63,7 @@ def forward(
         x: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         vi: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> torch.Tensor:
         l, d = x.size()  # batch size must be 1 for FlexAttention
         q, k, v = self.Wq(x), self.Wk(x), self.Wv(x)
@@ -98,32 +101,32 @@ def __init__(self, config):
98101
self.config = config
99102
self.n_tokens = config.num_att_tokens
100103
self.Wq = Linear(config.hidden_size, config.hidden_size)
101-
self.Pk = nn.Parameter(torch.randn(1, self.n_tokens, config.hidden_size))
102-
self.Pv = nn.Parameter(torch.randn(1, self.n_tokens, config.hidden_size))
103-
self.sliding_window_size = config.sliding_window_size
104+
self.Pk = nn.Parameter(torch.randn(self.n_tokens, config.hidden_size))
105+
self.Pv = nn.Parameter(torch.randn(self.n_tokens, config.hidden_size))
104106

105-
def forward(
106-
self,
107-
x: torch.Tensor,
108-
sliding_window_size: Optional[int] = None,
109-
) -> torch.Tensor:
107+
def act(self, x: torch.Tensor) -> torch.Tensor:
108+
o = x / (torch.norm(x, p=2, dim=-1, keepdim=True) + 1e-3) * math.sqrt(x.shape[-1])
109+
o = F.gelu(o)
110+
return o
111+
112+
def forward(self, x: torch.Tensor, last_eos: Optional[int] = None) -> torch.Tensor:
113+
if last_eos is None:
114+
last_eos = x.shape[1] - 1
110115
Q_len, d = x.size() # batch size must be 1 for FlexAttention
111116

112-
if sliding_window_size is None:
113-
sliding_window_size = self.sliding_window_size
117+
attention_mask = torch.ones(Q_len, self.n_tokens, device=x.device)
118+
attention_mask[last_eos:, :] = 0
114119

115-
def doc_mask_mod(b, h, q_idx, kv_idx):
116-
bidirectional_sliding_window_mask = torch.abs(q_idx - kv_idx) < sliding_window_size
117-
return bidirectional_sliding_window_mask
120+
q = self.Wq(x) # (Q_len, d)
121+
k = self.Pk # (n_tokens, d)
122+
v = self.Pv # (n_tokens, d)
118123

119-
KV_len = self.n_tokens
120-
attention_mask = create_block_mask(doc_mask_mod, 1, 1, Q_len, KV_len)
124+
attn_weight = q @ k.transpose(0, 1) # (Q_len, n_tokens)
125+
attn_weight *= attention_mask
126+
attn_weight = self.act(attn_weight)
121127

122-
q = self.Wq(x).unsqueeze(0).unsqueeze(1) # (1, 1, Q_len, d)
123-
k = self.Pk.unsqueeze(1) # (1, 1, n_tokens, d)
124-
v = self.Pv.unsqueeze(1) # (1, 1, n_tokens, d)
125-
y = flex_attention(q, k, v, block_mask=attention_mask) # (1, 1, Q_len, d)
126-
return y.squeeze(1) # (1, Q_len, d)
128+
y = attn_weight @ v # (Q_len, d)
129+
return y.unsqueeze(0) # (1, Q_len, d)
127130

128131

129132
class MultiHeadPAttention(nn.Module):
@@ -148,12 +151,14 @@ def forward(
         x: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         vi: Optional[torch.Tensor] = None,
+        last_eos: Optional[int] = None,
+        **kwargs,
     ) -> torch.Tensor:
         # attention mask already prepped for sdpa shape (bs, 1, seq_len, seq_len)
         l, d = x.size()
-        q = self.Wq(x)  # (1, l, d)
-        k = self.Wk(x)  # (1, l, d)
-        v = self.Wv(x)  # (1, l, d)
+        q = self.Wq(x, last_eos)  # (1, l, d)
+        k = self.Wk(x, last_eos)  # (1, l, d)
+        v = self.Wv(x, last_eos)  # (1, l, d)

         if self.unet and vi is not None:
             # Reshape vi from (l, d) to (1, l, d) to match v's shape before applying it
@@ -175,3 +180,62 @@ def forward(

         y = y.contiguous().view(1, l, self.n_heads * self.d_head)  # (1, l, n_heads * d_head)
         return self.Wo(y).squeeze(0)  # (l, hidden_size)
+
+
+if __name__ == '__main__':
+    # test pattention
+    # py -m model.attention
+
+    # Simple config for testing
+    class TestConfig:
+        def __init__(self):
+            self.hidden_size = 64
+            self.num_att_tokens = 8
+
+    config = TestConfig()
+    pattention = PAttention(config)
+
+    # Test input: sequence length 10, hidden size 64
+    seq_len = 10
+    x = torch.randn(seq_len, config.hidden_size)
+
+    # Test mask logic with different last_eos values
+    print("Testing PAttention mask logic...")
+
+    # Case 1: last_eos = 5 (keep positions 0-4, mask positions 5-9)
+    last_eos = 5
+    output = pattention(x, last_eos=last_eos)
+
+    # Manually check the mask logic
+    q = pattention.Wq(x)
+    k = pattention.Pk
+    attn_weight = q @ k.transpose(0, 1)
+
+    # Create expected mask, matching forward: zero from last_eos onwards
+    expected_mask = torch.ones(seq_len, config.num_att_tokens)
+    expected_mask[last_eos:, :] = 0
+
+    # Apply mask
+    masked_attn = attn_weight * expected_mask
+
+    # Check that positions from last_eos onwards are zero
+    assert torch.allclose(masked_attn[last_eos:, :], torch.zeros(seq_len - last_eos, config.num_att_tokens)), \
+        "Attention weights from last_eos onwards should be zero"
+
+    # Check that positions before last_eos are non-zero (assuming non-zero input)
+    assert not torch.allclose(masked_attn[:last_eos, :], torch.zeros(last_eos, config.num_att_tokens)), \
+        "Attention weights before last_eos should be non-zero"
+
+    print(f"Test passed for last_eos={last_eos}")
+
+    # Case 2: last_eos = 0 (mask every position)
+    last_eos = 0
+    output = pattention(x, last_eos=last_eos)
+    print(f"Test passed for last_eos={last_eos}")
+
+    # Case 3: last_eos = seq_len - 1 (mask only the last position)
+    last_eos = seq_len - 1
+    output = pattention(x, last_eos=last_eos)
+    print(f"Test passed for last_eos={last_eos}")
+
+    print("All PAttention mask tests passed!")

model/model.py

Lines changed: 58 additions & 9 deletions

@@ -105,12 +105,25 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         vi: Optional[torch.Tensor] = None,
         x0: Optional[torch.Tensor] = None,
+        last_eos: Optional[int] = None,
+        **kwargs,
     ) -> torch.Tensor:
         if self.unet:
             x = self.lambdas[0] * x + self.lambdas[1] * x0
-            x = x + self.attn(norm(x), attention_mask, vi)
+            x = x + self.attn(
+                x=norm(x),
+                attention_mask=attention_mask,
+                vi=vi,
+                last_eos=last_eos,
+                **kwargs,
+            )
         else:
-            x = x + self.attn(norm(x), attention_mask)
+            x = x + self.attn(
+                x=norm(x),
+                attention_mask=attention_mask,
+                last_eos=last_eos,
+                **kwargs,
+            )
         x = x + self.mlp(norm(x))
         return x

@@ -120,9 +133,18 @@ def __init__(self, config: PLMConfig):
         super().__init__()
         self.layers = nn.ModuleList([TransformerBlock(config) for _ in range(config.num_hidden_layers)])

-    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def forward(
+        self,
+        x: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
         for layer in self.layers:
-            x = layer(x, attention_mask)
+            x = layer(
+                x=x,
+                attention_mask=attention_mask,
+                **kwargs,
+            )
         return x

@@ -137,17 +159,35 @@ def __init__(self, config: PLMConfig):

         self.layers = nn.ModuleList([TransformerBlock(config) for _ in range(config.num_hidden_layers)])

-    def forward(self, x: torch.Tensor, ve: List[torch.Tensor], attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def forward(
+        self,
+        x: torch.Tensor,
+        ve: List[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
         x0 = x
         ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]
         skip_connections = []
         for i in range(self.num_encoder_layers):
-            x = self.layers[i](x, attention_mask, ve_enc[i], x0)
+            x = self.layers[i](
+                x=x,
+                attention_mask=attention_mask,
+                vi=ve_enc[i],
+                x0=x0,
+                **kwargs,
+            )
             skip_connections.append(x)

         for i in range(self.num_decoder_layers):
             x = x + self.skip_weights[i] * skip_connections.pop()
-            x = self.layers[self.num_encoder_layers + i](x, attention_mask, ve_dec[i], x0)
+            x = self.layers[self.num_encoder_layers + i](
+                x=x,
+                attention_mask=attention_mask,
+                vi=ve_dec[i],
+                x0=x0,
+                **kwargs,
+            )
         return x

@@ -219,9 +259,18 @@ def doc_mask_mod(b, h, q_idx, kv_idx):
         x = norm(x)
         if self.unet:
             ve = self.value_embeds(input_ids)
-            x = self.transformer(x, ve, attention_mask)
+            x = self.transformer(
+                x=x,
+                ve=ve,
+                attention_mask=attention_mask,
+                last_eos=last_eos,
+            )
         else:
-            x = self.transformer(x, attention_mask)
+            x = self.transformer(
+                x=x,
+                attention_mask=attention_mask,
+                last_eos=last_eos,
+            )
         return x

     def get_vector_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
train.py

Lines changed: 14 additions & 7 deletions

@@ -88,9 +88,9 @@ def arg_parser():
     parser.add_argument("--bfloat16", action="store_true", help="Use bfloat16")

     # Data hyperparams
-    parser.add_argument("--input_bin", type=str, default='data/omgprot50/omgprot50_train_*.bin', help="Input training bin files pattern")
-    parser.add_argument("--input_valid_bin", type=str, default='data/omgprot50/omgprot50_valid_*.bin', help="Input validation bin files pattern")
-    parser.add_argument("--input_test_bin", type=str, default='data/omgprot50/omgprot50_test_*.bin', help="Input test bin files pattern")
+    parser.add_argument("--input_bin", type=str, default='data/omg_prot50/omg_prot50_train_*.bin', help="Input training bin files pattern")
+    parser.add_argument("--input_valid_bin", type=str, default='data/omg_prot50/omg_prot50_valid_*.bin', help="Input validation bin files pattern")
+    parser.add_argument("--input_test_bin", type=str, default='data/omg_prot50/omg_prot50_test_*.bin', help="Input test bin files pattern")
     parser.add_argument("--mlm", type=bool, default=False, help="Use masked language modeling")
     parser.add_argument("--mask_rate", type=float, default=0.2, help="Mask rate for masked language modeling")
     parser.add_argument("--starting_mask_rate", type=float, default=0.1, help="Starting mask rate for masked language modeling")

@@ -401,7 +401,7 @@ def init_schedulers(self):
         if self.args.mask_rate_schedule:
             mask_rate_scheduler = LerpFloat(
                 start_val=self.args.starting_mask_rate,
-                end_val=self.args.mask_rate + 0.01,
+                end_val=self.args.mask_rate,
                 precision=0.01
             )
         else:

@@ -561,10 +561,17 @@ def train(self):
             timed_steps = float('nan') if step <= 11 else (step - 10) + 1  # <= 11 to avoid bug in val

             frac_done = step / self.args.num_steps  # training progress
-            self.sliding_window_size = self.sliding_window_size_scheduler(frac_done)
+            if frac_done > 1:
+                self.sliding_window_size = self.args.max_length
+            else:
+                self.sliding_window_size = self.sliding_window_size_scheduler(frac_done)
+
             if self.mask_rate_scheduler:
                 frac_done_mask = step / self.args.mask_rate_steps
-                mask_rate = self.mask_rate_scheduler(frac_done_mask)
+                if frac_done_mask > 1:
+                    mask_rate = self.args.mask_rate
+                else:
+                    mask_rate = self.mask_rate_scheduler(frac_done_mask)
                 self.current_mask_rate = mask_rate
                 self.train_loader.set_mask_rate(mask_rate)

@@ -685,7 +692,7 @@ def train(self):
     args.num_att_tokens = 128
     args.expansion_ratio = 2.0
     args.soft_logit_cap = 16.0
-    args.p_attention = False
+    args.p_attention = True
     args.tie_embeddings = False
     args.unet = True
     args.batch_size = 2048
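The two scheduler guards above exist because step can exceed num_steps or mask_rate_steps, so the fraction handed to the scheduler can exceed 1; the loop now pins sliding_window_size to max_length and mask_rate to its final value instead. A toy illustration, assuming LerpFloat linearly interpolates between start_val and end_val (its implementation is outside this diff):

def lerp(start_val: float, end_val: float, frac: float) -> float:
    # hypothetical stand-in for LerpFloat's interpolation rule
    return start_val + (end_val - start_val) * frac

start_val, end_val = 0.1, 0.2  # the starting_mask_rate / mask_rate defaults
frac = 1.5  # e.g. step = 1.5 * mask_rate_steps
print(lerp(start_val, end_val, frac))                           # ~0.25: overshoots the target
print(end_val if frac > 1 else lerp(start_val, end_val, frac))  # 0.2: the clamped behavior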
