From bc50003dc7fdf3bd5d97a3d99d008ef5eda45e97 Mon Sep 17 00:00:00 2001 From: huseyincavusbi Date: Wed, 24 Jun 2026 18:31:42 +0300 Subject: [PATCH 1/6] =?UTF-8?q?fix:=20remove=20misleading=20'encoder-decod?= =?UTF-8?q?er'=20label=20from=20AutoModel=20log=20=E2=80=94=20can=20also?= =?UTF-8?q?=20be=20multimodal=20(AutoModelForImageTextToText)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- transformer_lens/benchmarks/main_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_lens/benchmarks/main_benchmark.py b/transformer_lens/benchmarks/main_benchmark.py index 05735095f..2fe3b0d40 100644 --- a/transformer_lens/benchmarks/main_benchmark.py +++ b/transformer_lens/benchmarks/main_benchmark.py @@ -935,7 +935,7 @@ def cleanup_model(model, model_name_str: str): # Use appropriate AutoModel class (e.g., AutoModelForSeq2SeqLM for T5) auto_model_class = get_auto_model_class(model_name, trust_remote_code=trust_remote_code) if verbose and auto_model_class != AutoModelForCausalLM: - print(f"Using {auto_model_class.__name__} for encoder-decoder model") + print(f"Using {auto_model_class.__name__}") # Ensure pad_token_id exists (some models crash without it during init). hf_config = AutoConfig.from_pretrained( model_name, trust_remote_code=trust_remote_code, token=_hf_token() From 1b25029d850bd02c8be620d96f5e8bdc1b82ad1e Mon Sep 17 00:00:00 2001 From: huseyincavusbi Date: Wed, 24 Jun 2026 18:32:14 +0300 Subject: [PATCH 2/6] =?UTF-8?q?fix:=20skip=20Phase=202=20header=20when=20p?= =?UTF-8?q?hase=20not=20selected=20=E2=80=94=20empty=20header=20printed=20?= =?UTF-8?q?in=20every=20run?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- transformer_lens/benchmarks/main_benchmark.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transformer_lens/benchmarks/main_benchmark.py b/transformer_lens/benchmarks/main_benchmark.py index 2fe3b0d40..d2ccbfa70 100644 --- a/transformer_lens/benchmarks/main_benchmark.py +++ b/transformer_lens/benchmarks/main_benchmark.py @@ -1209,14 +1209,14 @@ def cleanup_model(model, model_name_str: str): # PHASE 2: Bridge (unprocessed) + HookedTransformer (unprocessed) # ======================================================================== current_phase[0] = 2 - if verbose: - print(f"\n{'='*80}") - print("PHASE 2: TransformerBridge (unprocessed) + HookedTransformer (unprocessed)") - print(f"{'='*80}\n") # OPTIMIZATION: Run generation benchmarks first (only bridge in memory) # Then cleanup bridge before loading HT to reduce peak memory if should_run_phase(2) and bridge_unprocessed: + if verbose: + print(f"\n{'='*80}") + print("PHASE 2: TransformerBridge (unprocessed) + HookedTransformer (unprocessed)") + print(f"{'='*80}\n") if verbose: print("Running Phase 2 benchmarks...\n") From f41eaed4f7d0c47048dc4d1518cead2039fa3054 Mon Sep 17 00:00:00 2001 From: huseyincavusbi Date: Wed, 24 Jun 2026 18:34:54 +0300 Subject: [PATCH 3/6] =?UTF-8?q?fix:=20skip=20component=20benchmarking=20fo?= =?UTF-8?q?r=20DelegatedAttentionBlockBridge=20=E2=80=94=20attn/PLE/rotary?= =?UTF-8?q?=5Femb=20can't=20be=20tested=20in=20isolation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../benchmarks/component_outputs.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/transformer_lens/benchmarks/component_outputs.py b/transformer_lens/benchmarks/component_outputs.py index 1efe59f82..9aa260d93 100644 --- a/transformer_lens/benchmarks/component_outputs.py +++ b/transformer_lens/benchmarks/component_outputs.py @@ -419,6 +419,31 @@ def _test_component_recursive( ): return + # Skip attention and PLE submodules when using DelegatedAttentionBlockBridge. + # These architectures delegate all math to HF; the benchmark can't call the HF + # attention in isolation (missing position_embeddings, attention_mask, etc.) and + # PLE submodules receive per-layer inputs at a different dimension than hidden_states. + _is_delegated = ( + hasattr(self.bridge_model, "blocks") + and "hook_q_input" + not in getattr( + self.bridge_model.blocks, "hook_aliases", {"hook_q_input": True} + ) + ) + if _is_delegated and "attn" in component_path: + return + if _is_delegated and component_path == "rotary_emb": + return + if _is_delegated and any( + name in component_path + for name in ( + "per_layer_input_gate", + "per_layer_projection", + "post_per_layer_input_norm", + ) + ): + return + # Skip models whose MLP/attn forward signatures require extra context from the block: # - BLOOM: MLP requires residual and alibi bias # - T5: requires cache_position for relative position embeddings From 50dc4a206d94662e7d008bef850cd233ce142d2b Mon Sep 17 00:00:00 2001 From: huseyincavusbi Date: Wed, 24 Jun 2026 18:49:34 +0300 Subject: [PATCH 4/6] fix: detect DelegatedAttentionBlockBridge via adapter.component_mapping instead of bridge_model.blocks --- transformer_lens/benchmarks/component_outputs.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/transformer_lens/benchmarks/component_outputs.py b/transformer_lens/benchmarks/component_outputs.py index 9aa260d93..bc2add46a 100644 --- a/transformer_lens/benchmarks/component_outputs.py +++ b/transformer_lens/benchmarks/component_outputs.py @@ -423,12 +423,9 @@ def _test_component_recursive( # These architectures delegate all math to HF; the benchmark can't call the HF # attention in isolation (missing position_embeddings, attention_mask, etc.) and # PLE submodules receive per-layer inputs at a different dimension than hidden_states. - _is_delegated = ( - hasattr(self.bridge_model, "blocks") - and "hook_q_input" - not in getattr( - self.bridge_model.blocks, "hook_aliases", {"hook_q_input": True} - ) + _blocks_component = getattr(self.adapter, "component_mapping", {}).get("blocks") if self.adapter is not None else None + _is_delegated = _blocks_component is not None and ( + "hook_q_input" not in getattr(_blocks_component, "hook_aliases", {"hook_q_input": True}) ) if _is_delegated and "attn" in component_path: return From 8ebb72962bfbbc609754d30b4467f1322cc90379 Mon Sep 17 00:00:00 2001 From: huseyincavusbi Date: Wed, 24 Jun 2026 18:56:29 +0300 Subject: [PATCH 5/6] fix: add rotary_emb skip in _test_component for delegated attention blocks --- .../benchmarks/component_outputs.py | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/transformer_lens/benchmarks/component_outputs.py b/transformer_lens/benchmarks/component_outputs.py index bc2add46a..09688b611 100644 --- a/transformer_lens/benchmarks/component_outputs.py +++ b/transformer_lens/benchmarks/component_outputs.py @@ -199,6 +199,17 @@ def print_detailed_analysis(self) -> None: class ComponentBenchmarker: """Benchmarking utility for testing TransformerBridge components against HuggingFace.""" + def _is_delegated_block(self) -> bool: + """Return True if the blocks component uses DelegatedAttentionBlockBridge.""" + blocks = ( + getattr(self.adapter, "component_mapping", {}).get("blocks") + if self.adapter is not None + else None + ) + return blocks is not None and ( + "hook_q_input" not in getattr(blocks, "hook_aliases", {"hook_q_input": True}) + ) + def __init__( self, bridge_model: nn.Module, @@ -423,10 +434,7 @@ def _test_component_recursive( # These architectures delegate all math to HF; the benchmark can't call the HF # attention in isolation (missing position_embeddings, attention_mask, etc.) and # PLE submodules receive per-layer inputs at a different dimension than hidden_states. - _blocks_component = getattr(self.adapter, "component_mapping", {}).get("blocks") if self.adapter is not None else None - _is_delegated = _blocks_component is not None and ( - "hook_q_input" not in getattr(_blocks_component, "hook_aliases", {"hook_q_input": True}) - ) + _is_delegated = self._is_delegated_block() if _is_delegated and "attn" in component_path: return if _is_delegated and component_path == "rotary_emb": @@ -548,6 +556,12 @@ def _test_component( ComponentTestResult or None if the component cannot be tested """ try: + # Skip rotary_emb for DelegatedAttentionBlockBridge architectures. + # Gemma4's RotaryEmbeddingBridge wraps a rotary that returns a set-like + # structure which the benchmark comparison can't subscript. + if self._is_delegated_block() and component_path == "rotary_emb": + return None + # Get bridge component # The adapter returns nn.Module, but for bridge models it's actually GeneralizedComponent bridge_component = cast( From 4d64b5d0c86bb3484f08a7b34080448a55cce59b Mon Sep 17 00:00:00 2001 From: huseyincavusbi Date: Wed, 24 Jun 2026 21:09:28 +0300 Subject: [PATCH 6/6] fix: remove dead rotary_emb skip from _test_component_recursive (rotary is top-level, only _test_component handles it) --- transformer_lens/benchmarks/component_outputs.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/transformer_lens/benchmarks/component_outputs.py b/transformer_lens/benchmarks/component_outputs.py index 09688b611..adbb76924 100644 --- a/transformer_lens/benchmarks/component_outputs.py +++ b/transformer_lens/benchmarks/component_outputs.py @@ -437,8 +437,6 @@ def _test_component_recursive( _is_delegated = self._is_delegated_block() if _is_delegated and "attn" in component_path: return - if _is_delegated and component_path == "rotary_emb": - return if _is_delegated and any( name in component_path for name in (