
Commit 7ecb48c

conv -> patch
1 parent 3b50598 commit 7ecb48c

8 files changed

Lines changed: 62 additions & 82 deletions


README.md

Lines changed: 13 additions & 33 deletions
@@ -91,56 +91,36 @@ flowchart TB
 emb --> L1 --> L2 --> LN --> head
 ```
 
-### 2. Conv1D UNet Transformer
-A U-Net style architecture that uses Conv1D for downsampling and ConvTranspose1D for upsampling. It progressively reduces sequence length while increasing hidden dimension, allowing the model to process information at hierarchically different resolutions.
+### 2. Transformer UNet
+A U-Net architecture that uses skip connections between encoder and decoder layers, but maintains the same sequence length and hidden dimension throughout (no downsampling). This allows the model to mix features from early and late layers.
 
 ```mermaid
 flowchart TB
 subgraph Input
-emb[Embedding Layer<br/>seq_len x hidden_size]
+emb[Embedding Layer]
 end
-
+
 subgraph Encoder[Encoder Path]
-e0[TransformerBlock 0<br/>FULL RESOLUTION<br/>1024 x 768]
-down1[Conv1D Downsample]
-e1[TransformerBlock 1<br/>512 x 832]
-down2[Conv1D Downsample]
-e2[TransformerBlock 2<br/>256 x 896]
-down3[...]
-eN[MLP Block if seq=1]
+e1[TransformerBlock 1]
+e2[TransformerBlock 2]
 end
-
+
 subgraph Decoder[Decoder Path]
-dN[MLP Block if seq=1]
-up1[ConvTranspose1D Upsample]
-d2[TransformerBlock + Skip<br/>256 x 896]
-up2[ConvTranspose1D Upsample]
-d1[TransformerBlock + Skip<br/>512 x 832]
-up3[ConvTranspose1D Upsample]
-d0[TransformerBlock N<br/>FULL RESOLUTION<br/>1024 x 768]
+d2[TransformerBlock 3 + Skip]
+d1[TransformerBlock 4 + Skip]
 end
-
-subgraph ExtraLayers[Extra Sequential Layers]
-ex1[TransformerBlock<br/>Full Resolution]
-ex2[TransformerBlock<br/>Full Resolution]
-end
-
+
 subgraph Output
 head[LM Head]
 end
-
-emb --> e0
-e0 --> down1 --> e1 --> down2 --> e2 --> down3 --> eN
-eN --> dN --> up1 --> d2 --> up2 --> d1 --> up3 --> d0
-d0 --> ex1 --> ex2 --> head
-
-e0 -.->|skip| d0
+
+emb --> e1 --> e2 --> d2 --> d1 --> head
 e1 -.->|skip| d1
 e2 -.->|skip| d2
 ```
 
 ### 3. Patch UNet Transformer
-An optimized U-Net architecture designed for speed. It uses "Patch Merging" (concatenating adjacent tokens) for downsampling instead of convolutions, which is faster and cleaner. It operates on batched inputs `(B, L)` and efficiently handles document boundaries and padding without complex dynamic shape logic.
+An optimized U-Net architecture designed for speed. It uses "Patch Merging" (concatenating adjacent tokens) for downsampling, which is faster and cleaner than convolutions. It operates on batched inputs `(B, L)` and efficiently handles document boundaries and padding without complex dynamic shape logic.
 
 ```mermaid
 flowchart TB
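
To make the added "Transformer UNet" description concrete, here is a minimal, self-contained sketch of the pattern it describes: encoder and decoder blocks at a constant sequence length and hidden size, with each decoder block consuming a skip connection from its mirrored encoder block. The `TransformerBlock`, the concat-plus-projection skip, and all parameter values are illustrative assumptions, not the repository's actual implementation.

```python
import torch
import torch.nn as nn

class TransformerBlock(nn.Module):
    """Stand-in block: any module mapping (B, L, D) -> (B, L, D)."""
    def __init__(self, hidden_size: int, num_heads: int):
        super().__init__()
        self.attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, 4 * hidden_size),
            nn.GELU(),
            nn.Linear(4 * hidden_size, hidden_size),
        )

    def forward(self, x):
        h = self.norm1(x)
        x = x + self.attn(h, h, h, need_weights=False)[0]
        return x + self.mlp(self.norm2(x))

class SkipUnetTransformer(nn.Module):
    """U-Net-style transformer: same sequence length and hidden size throughout,
    with a skip connection from encoder block i to the mirrored decoder block."""
    def __init__(self, hidden_size=768, num_heads=6, num_layers=4):
        super().__init__()
        assert num_layers % 2 == 0
        half = num_layers // 2
        self.encoder = nn.ModuleList([TransformerBlock(hidden_size, num_heads) for _ in range(half)])
        self.decoder = nn.ModuleList([TransformerBlock(hidden_size, num_heads) for _ in range(half)])
        # One common skip mechanism: concatenate on the feature dim and project back down.
        self.skip_proj = nn.ModuleList([nn.Linear(2 * hidden_size, hidden_size) for _ in range(half)])

    def forward(self, x):
        skips = []
        for block in self.encoder:
            x = block(x)
            skips.append(x)
        for block, proj in zip(self.decoder, self.skip_proj):
            skip = skips.pop()  # mirror order: last encoder output feeds the first decoder block
            x = block(proj(torch.cat([x, skip], dim=-1)))
        return x

x = torch.randn(2, 1024, 768)
print(SkipUnetTransformer()(x).shape)  # torch.Size([2, 1024, 768]) -- length and width unchanged
```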

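Likewise, a rough sketch of the "Patch Merging" downsample mentioned in the Patch UNet description: adjacent tokens are concatenated along the feature dimension and projected, halving the sequence length, and a patch expand reverses it. Class names and dimensions here are hypothetical; the repository's `BatchedUnetTransformer` may differ in detail.

```python
import torch
import torch.nn as nn

class PatchMerge(nn.Module):
    """Halve sequence length by concatenating each pair of adjacent tokens."""
    def __init__(self, dim_in: int, dim_out: int):
        super().__init__()
        self.proj = nn.Linear(2 * dim_in, dim_out)

    def forward(self, x):                      # x: (B, L, D_in), L assumed even
        B, L, D = x.shape
        x = x.reshape(B, L // 2, 2 * D)        # pair up neighbouring tokens
        return self.proj(x)                    # (B, L/2, D_out)

class PatchExpand(nn.Module):
    """Double sequence length by splitting each token back into two."""
    def __init__(self, dim_in: int, dim_out: int):
        super().__init__()
        self.proj = nn.Linear(dim_in, 2 * dim_out)

    def forward(self, x):                      # x: (B, L, D_in)
        B, L, D = x.shape
        x = self.proj(x)                       # (B, L, 2 * D_out)
        return x.reshape(B, 2 * L, -1)         # (B, 2L, D_out)

x = torch.randn(4, 128, 384)
down = PatchMerge(384, 512)(x)                 # (4, 64, 512)
up = PatchExpand(512, 384)(down)               # (4, 128, 384)
print(down.shape, up.shape)
```
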
example_yamls/debug.yaml

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ expansion_ratio: 2.0
 soft_logit_cap: 16.0
 tie_embeddings: false
 unet: true
-conv_unet: false
+patch_unet: false
 token_dropout: true
 bfloat16: true
 compile_model: false

example_yamls/default.yaml

Lines changed: 2 additions & 2 deletions
@@ -21,14 +21,14 @@ auto_grad_clip_p: 10.0
 hidden_size: 768
 num_attention_heads: 6
 num_hidden_layers: 24
-num_unet_layers: 0 # Number of layers for Conv1D UNet (set to > 0 to use)
+num_unet_layers: 0 # Number of layers for Patch UNet (set to > 0 to use)
 num_extra_layers: 0 # Number of extra transformer layers after UNet
 vocab_size: 33
 expansion_ratio: 2.0
 soft_logit_cap: 32.0
 tie_embeddings: false
 unet: true
-conv_unet: false # Use Conv1D UNet with downsampling
+patch_unet: false # Use Patch UNet with downsampling
 token_dropout: true
 bfloat16: false
 compile_model: true
Lines changed: 4 additions & 4 deletions
@@ -6,7 +6,7 @@
 
 # General Configuration
 bugfix: false
-save_path: "Synthyra/speedrun_conv_unet"
+save_path: "Synthyra/speedrun_patch_unet"
 data_name: "uniref50"
 num_chunks: 197
 log_name: null
@@ -24,15 +24,15 @@ auto_grad_clip_p: 10.0
 # With only 6 heads: hidden dimension is capped to keep head_dim <= 128.
 hidden_size: 768
 num_attention_heads: 12
-num_hidden_layers: 0 # Not used for conv_unet
+num_hidden_layers: 0 # Not used for patch_unet
 num_unet_layers: 12 # 6 encoder + 6 decoder
 num_extra_layers: 4 # Extra full-resolution transformer layers after UNet
 vocab_size: 33
 expansion_ratio: 2.0
 soft_logit_cap: 32.0
 tie_embeddings: false
 unet: false # Standard UNet off
-conv_unet: true # Batched Conv UNet on
+patch_unet: true # Batched Patch UNet on
 token_dropout: false
 bfloat16: true
 compile_model: true
@@ -69,7 +69,7 @@ muon_momentum_warmup_steps: 300
 
 # Evaluation & Logging
 eval_every: 1000
-hf_model_name: "lhallee/speedrun_conv_unet"
+hf_model_name: "lhallee/speedrun_patch_unet"
 save_every: null
 
 # Dataloader Parameters
Lines changed: 4 additions & 4 deletions
@@ -4,10 +4,10 @@
 
 # General Configuration
 bugfix: true
-save_path: "Synthyra/debug_conv_unet"
+save_path: "Synthyra/debug_patch_unet"
 data_name: "uniref50"
 num_chunks: 10
-log_name: "debug_conv_unet"
+log_name: "debug_patch_unet"
 
 # Distributed Training & Reproducibility
 seed: 42
@@ -27,7 +27,7 @@ expansion_ratio: 2.0
 soft_logit_cap: 16.0
 tie_embeddings: false
 unet: false
-conv_unet: true
+patch_unet: true
 token_dropout: false
 bfloat16: true
 compile_model: false # Faster startup for debugging
@@ -64,7 +64,7 @@ muon_momentum_warmup_steps: 10
 
 # Evaluation & Logging
 eval_every: 50
-hf_model_name: "Synthyra/debug_conv_unet"
+hf_model_name: "Synthyra/debug_patch_unet"
 save_every: null
 
 # Dataloader Parameters

example_yamls/test.yaml

Lines changed: 3 additions & 3 deletions
@@ -5,7 +5,7 @@
 
 # General Configuration
 bugfix: false
-save_path: "Synthyra/conv_unet_test"
+save_path: "Synthyra/patch_unet_test"
 data_name: "uniref50"
 num_chunks: 197
 log_name: null # If null, a random UUID will be generated
@@ -21,14 +21,14 @@ auto_grad_clip_p: 10.0
 hidden_size: 768
 num_attention_heads: 6
 num_hidden_layers: 24
-num_unet_layers: 12 # Number of layers for Conv1D UNet (set to > 0 to use)
+num_unet_layers: 12 # Number of layers for Patch UNet (set to > 0 to use)
 num_extra_layers: 4 # Number of extra transformer layers after UNet
 vocab_size: 33
 expansion_ratio: 2.0
 soft_logit_cap: 32.0
 tie_embeddings: false
 unet: true
-conv_unet: true # Use Conv1D UNet with downsampling
+patch_unet: true # Use Patch UNet with downsampling
 token_dropout: false
 bfloat16: true
 compile_model: true
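
For reference, the renamed flag maps onto the Python config in the same way as the test block in the model/model.py diff below. A hedged sketch only: the import path, and the idea that constructing `PLMConfig` directly is equivalent to setting these YAML options, are assumptions.

```python
# Rough Python equivalent of the renamed YAML flags above, mirroring the test
# code shown further down in model/model.py. The import path is an assumption.
from model.model import PLMConfig, PLM

config = PLMConfig(
    hidden_size=384,
    num_attention_heads=6,
    num_unet_layers=8,        # split between encoder and decoder (4 + 4)
    num_extra_layers=2,       # extra full-resolution layers after the UNet
    max_sequence_length=128,  # power of 2 so patch merging divides evenly
    expansion_ratio=8/3,
    patch_unet=True,          # formerly conv_unet
)
model = PLM(config)
```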

model/model.py

Lines changed: 25 additions & 25 deletions
@@ -28,7 +28,7 @@ def __init__(
 sliding_window_size: int = 2048,
 tie_embeddings: bool = False,
 unet: bool = False,
-conv_unet: bool = False,
+patch_unet: bool = False,
 mlm: bool = False,
 masked_diffusion: bool = False,
 token_dropout: bool = True,
@@ -48,7 +48,7 @@ def __init__(
 self.sliding_window_size = sliding_window_size
 self.tie_embeddings = tie_embeddings
 self.unet = unet
-self.conv_unet = conv_unet
+self.patch_unet = patch_unet
 self.mlm = mlm
 self.masked_diffusion = masked_diffusion
 self.token_dropout = token_dropout
@@ -661,11 +661,11 @@ def __init__(self, config: PLMConfig):
 self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
 
 self.unet = config.unet
-self.conv_unet = config.conv_unet
+self.patch_unet = config.patch_unet
 
-if config.conv_unet:
+if config.patch_unet:
 # Batched UNet with Swin-style patch merge/expand
-assert config.num_unet_layers > 0, "num_unet_layers must be > 0 for conv_unet"
+assert config.num_unet_layers > 0, "num_unet_layers must be > 0 for patch_unet"
 self.transformer = BatchedUnetTransformer(config)
 hidden_sizes = self.transformer.hidden_sizes
 self.value_embeds = BatchedValueEmbedding(config.vocab_size, hidden_sizes)
@@ -698,9 +698,9 @@ def __init__(self, config: PLMConfig):
 self.ce = nn.CrossEntropyLoss(ignore_index=-100, reduction='mean')
 
 def get_last_hidden_state(self, input_ids: torch.Tensor, sliding_window_size: int) -> torch.Tensor:
-if self.conv_unet:
+if self.patch_unet:
 # Batched UNet path: input_ids is (B, L)
-assert input_ids.dim() == 2, f"conv_unet expects (B, L) input, got shape {input_ids.shape}"
+assert input_ids.dim() == 2, f"patch_unet expects (B, L) input, got shape {input_ids.shape}"
 B, L = input_ids.shape
 
 # Pre-compute multi-resolution block masks
@@ -795,18 +795,18 @@ def get_vector_embeddings(self, input_ids: torch.Tensor, sliding_window_size: Op
 """Mean-pool hidden states per document to get per-document embeddings.
 
 Args:
-input_ids: (B, L) for conv_unet or (total_len,) for standard/unet
+input_ids: (B, L) for patch_unet or (total_len,) for standard/unet
 sliding_window_size: Override sliding window size
 
 Returns:
-For conv_unet (B, L): flattened (total_docs, hidden_size) across all batch elements
+For patch_unet (B, L): flattened (total_docs, hidden_size) across all batch elements
 For standard (total_len,): (num_docs, hidden_size)
 """
 if sliding_window_size is None:
 sliding_window_size = self.sliding_window_size
 x = self.get_last_hidden_state(input_ids, sliding_window_size)
 
-if self.conv_unet:
+if self.patch_unet:
 # Batched: x is (B, L, D), input_ids is (B, L)
 B, L, D = x.shape
 doc_ids = (input_ids == self.cls_token_id).cumsum(dim=1) # (B, L)
@@ -871,7 +871,7 @@ def get_logits(self, input_ids: torch.Tensor, sliding_window_size: Optional[int]
 """Get LM logits without computing loss.
 
 Args:
-input_ids: (B, L) for conv_unet or (total_len,) for standard/unet
+input_ids: (B, L) for patch_unet or (total_len,) for standard/unet
 sliding_window_size: Override sliding window size
 
 Returns:
@@ -892,7 +892,7 @@ def get_embeddings(
 """Get per-sequence pooled embeddings.
 
 Args:
-input_ids: (B, L) for conv_unet or (total_len,) for standard/unet
+input_ids: (B, L) for patch_unet or (total_len,) for standard/unet
 sliding_window_size: Override sliding window size
 pooling: 'mean' for mean pooling over non-pad tokens, 'cls' for CLS token embedding
 
@@ -903,7 +903,7 @@ def get_embeddings(
 sliding_window_size = self.sliding_window_size
 hidden = self.get_last_hidden_state(input_ids, sliding_window_size)
 
-if self.conv_unet:
+if self.patch_unet:
 # Batched: hidden is (B, L, D), input_ids is (B, L)
 assert input_ids.dim() == 2
 B, L, D = hidden.shape
@@ -1013,20 +1013,20 @@ def push_weights_to_hub(self, repo_id: str):
 print(f"Original UNet loss: {loss.item():.4f}")
 
 print("\n" + "=" * 80)
-print("Testing Batched UNet Transformer (conv_unet)")
+print("Testing Batched UNet Transformer (patch_unet)")
 print("=" * 80)
 max_length = 128 # Power of 2 for patch merging
-conv_config = PLMConfig(
+patch_config = PLMConfig(
 hidden_size=384,
 num_attention_heads=6,
 num_unet_layers=8, # 4 encoder + 4 decoder
 num_extra_layers=2,
 max_sequence_length=max_length,
 expansion_ratio=8/3,
-conv_unet=True,
+patch_unet=True,
 )
-conv_model = PLM(conv_config).cuda()
-print(f"Model parameters: {sum(p.numel() for p in conv_model.parameters()):,}")
+patch_model = PLM(patch_config).cuda()
+print(f"Model parameters: {sum(p.numel() for p in patch_model.parameters()):,}")
 
 # Create batched test input (B, max_length) with packed documents per element
 B = 4
@@ -1042,13 +1042,13 @@ def push_weights_to_hub(self, repo_id: str):
 batched_labels = batched_ids.clone()
 batched_labels[batched_labels != 32] = -100
 
-loss = conv_model(batched_ids, batched_labels, mask_rate)
+loss = patch_model(batched_ids, batched_labels, mask_rate)
 print(f"Batched UNet loss: {loss.item():.4f}")
 
-print(f"\nHidden sizes: {conv_model.transformer.hidden_sizes}")
-print(f"Vector depth (log2(max_length)): {conv_model.transformer.vector_depth}")
-print(f"Num encoder layers: {conv_model.transformer.num_encoder_layers}")
-print(f"Num decoder layers: {conv_model.transformer.num_decoder_layers}")
+print(f"\nHidden sizes: {patch_model.transformer.hidden_sizes}")
+print(f"Vector depth (log2(max_length)): {patch_model.transformer.vector_depth}")
+print(f"Num encoder layers: {patch_model.transformer.num_encoder_layers}")
+print(f"Num decoder layers: {patch_model.transformer.num_decoder_layers}")
 
 print("\n" + "=" * 80)
 print("Testing Batched UNet with deep layers (MLP at vector depth)")
@@ -1060,7 +1060,7 @@ def push_weights_to_hub(self, repo_id: str):
 num_extra_layers=1,
 max_sequence_length=128, # log2(128)=7, so layers 7+ become MLPs
 expansion_ratio=8/3,
-conv_unet=True,
+patch_unet=True,
 )
 deep_model = PLM(deep_config).cuda()
 
@@ -1086,7 +1086,7 @@ def push_weights_to_hub(self, repo_id: str):
 input_ids=batched_ids,
 cls_token_id=0,
 pad_token_id=1,
-num_levels=conv_model.transformer.num_resolution_levels,
+num_levels=patch_model.transformer.num_resolution_levels,
 sliding_window_size=128,
 n_heads=6,
 device=batched_ids.device,
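
The `doc_ids = (input_ids == self.cls_token_id).cumsum(dim=1)` line in the `get_vector_embeddings` hunk is the key to handling packed documents in the batched path: every CLS token starts a new document, so a cumulative sum labels each position with its document index. A small illustrative re-implementation of that pooling idea follows (not the repository's exact code, which presumably vectorizes this):

```python
import torch

def mean_pool_per_document(hidden, input_ids, cls_token_id=0, pad_token_id=1):
    # hidden: (B, L, D), input_ids: (B, L). Every CLS token starts a new document,
    # so a cumulative sum over the CLS mask gives each position a document index.
    B, L, D = hidden.shape
    doc_ids = (input_ids == cls_token_id).cumsum(dim=1)   # (B, L), 1-based per batch row
    valid = (input_ids != pad_token_id) & (doc_ids > 0)   # drop padding and pre-CLS positions
    pooled = []
    for b in range(B):
        for d in range(1, int(doc_ids[b].max().item()) + 1):
            mask = valid[b] & (doc_ids[b] == d)
            if mask.any():
                pooled.append(hidden[b][mask].mean(dim=0))
    return torch.stack(pooled)                            # (total_docs, D)

hidden = torch.randn(2, 16, 8)
input_ids = torch.randint(2, 30, (2, 16))
input_ids[:, 0] = 0      # CLS starts the first document in each row
input_ids[0, 8] = 0      # a second document packed into row 0
print(mean_pool_per_document(hidden, input_ids).shape)    # torch.Size([3, 8])
```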
