Commit a8caabd

Support overriding attention scale
1 parent 0987a0c commit a8caabd

5 files changed

Lines changed: 29 additions & 25 deletions

File tree

lib/bumblebee/layers.ex
lib/bumblebee/layers/transformer.ex
lib/bumblebee/text/gpt2.ex
lib/bumblebee/text/gpt_big_code.ex
lib/bumblebee/text/t5.ex

lib/bumblebee/layers.ex

Lines changed: 13 additions & 9 deletions
@@ -219,8 +219,8 @@ defmodule Bumblebee.Layers do
   * `:window_size` - when set, enables sliding window attention.
     Should be a `{left, right}` tuple with window size on each side
 
-  * `:scale` - whether to scale attention weights by $\frac{1}{\sqrt{d}}$.
-    Defaults to `true`
+  * `:scale` - the scaling factor applied to the attention weights.
+    Defaults to $\frac{1}{\sqrt{d}}$
 
   * `:dropout_rate` - the dropout rate for attention weights dropout.
     Defaults to `0.0`
@@ -231,7 +231,7 @@ defmodule Bumblebee.Layers do
 
  """
  def attention(query, key, value, key_mask, head_mask, bias, offset, opts \\ []) do
-    opts = Keyword.validate!(opts, [:window_size, causal: false, scale: true, dropout_rate: 0.0])
+    opts = Keyword.validate!(opts, [:window_size, :scale, causal: false, dropout_rate: 0.0])
 
    weights =
      Axon.layer(
@@ -263,14 +263,18 @@ defmodule Bumblebee.Layers do
 
        weights = Nx.dot(query, [3], [0, 1], key, [3], [0, 1])
 
-        weights =
-          if opts[:scale] do
-            depth = Nx.axis_size(query, -1)
-            weights / Nx.as_type(Nx.sqrt(depth), Nx.type(query))
-          else
-            weights
+        scale =
+          case opts[:scale] do
+            nil ->
+              depth = Nx.axis_size(query, -1)
+              1 / Nx.as_type(Nx.sqrt(depth), Nx.type(query))
+
+            scale ->
+              scale
          end
 
+        weights = weights * scale
+
        key_mask =
          case key_mask do
            %Axon.None{} ->
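
The new `:scale` resolution can be exercised outside of the Axon graph with plain `Nx` calls. A minimal sketch follows; the `resolve_scale` helper and the tensor shapes are illustrative, not part of this commit, and it uses `Nx.divide/2` and `Nx.multiply/2` because it runs as a plain script rather than inside the layer's compiled function:

resolve_scale = fn scale, query ->
  case scale do
    nil ->
      # Default: scale by 1/sqrt(depth), matching the previous behaviour
      depth = Nx.axis_size(query, -1)
      Nx.divide(1, Nx.as_type(Nx.sqrt(depth), Nx.type(query)))

    scale ->
      # Any explicit number overrides the default
      scale
  end
end

query = Nx.iota({1, 2, 4}, type: :f32)
key = Nx.iota({1, 3, 4}, type: :f32)

# Raw attention scores: batched dot product over the depth axis
scores = Nx.dot(query, [2], [0], key, [2], [0])

Nx.multiply(scores, resolve_scale.(nil, query)) # default, 1/sqrt(4) = 0.5
Nx.multiply(scores, resolve_scale.(1, query))   # override, scores left unscaled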

lib/bumblebee/layers/transformer.ex

Lines changed: 12 additions & 12 deletions
@@ -53,7 +53,7 @@ defmodule Bumblebee.Layers.Transformer do
    :layer_norm,
    :block_type,
    :attention_window_size,
-    :scale_attention_weights,
+    :attention_scale,
    :query_norm,
    :key_norm
  ]
@@ -276,8 +276,8 @@ defmodule Bumblebee.Layers.Transformer do
   * `:attention_window_size` - when set, enables sliding window attention.
     Should be a `{left, right}` tuple with window size on each side
 
-  * `:scale_attention_weights` - whether to scale query in the traditional style of
-    multi-headed attention. Defaults to `true`
+  * `:attention_scale` - the scaling factor applied to the attention weights.
+    Defaults to $\frac{1}{\sqrt{d}}$.
 
   * `:rotary_embedding` - configuration of rotary embedding. If set,
     will apply rotary position embedding with the given options. Valid
@@ -331,7 +331,7 @@ defmodule Bumblebee.Layers.Transformer do
        block_type: :standard,
        layer_norm: [],
        attention_window_size: nil,
-        scale_attention_weights: true,
+        attention_scale: nil,
        rotary_embedding: nil,
        query_norm: nil,
        key_norm: nil
@@ -362,7 +362,7 @@ defmodule Bumblebee.Layers.Transformer do
    layer_norm = opts[:layer_norm]
    block_type = opts[:block_type]
    attention_window_size = opts[:attention_window_size]
-    scale_attention_weights = opts[:scale_attention_weights]
+    attention_scale = opts[:attention_scale]
    rotary_embedding = opts[:rotary_embedding]
    query_norm = opts[:query_norm]
    key_norm = opts[:key_norm]
@@ -422,7 +422,7 @@ defmodule Bumblebee.Layers.Transformer do
        value_use_bias: value_use_bias,
        output_use_bias: output_use_bias,
        attention_window_size: attention_window_size,
-        scale_attention_weights: scale_attention_weights,
+        attention_scale: attention_scale,
        rotary_embedding: rotary_embedding,
        query_norm: query_norm,
        key_norm: key_norm,
@@ -469,7 +469,7 @@ defmodule Bumblebee.Layers.Transformer do
          value_use_bias: value_use_bias,
          output_use_bias: output_use_bias,
          attention_window_size: attention_window_size,
-          scale_attention_weights: scale_attention_weights,
+          attention_scale: attention_scale,
          rotary_embedding: rotary_embedding,
          name: join(name, "cross_attention")
        )
@@ -699,8 +699,8 @@ defmodule Bumblebee.Layers.Transformer do
   * `:attention_window_size` - when set, enables sliding window attention.
     Should be a `{left, right}` tuple with window size on each side
 
-  * `:scale_attention_weights` - whether to scale query in the traditional style of
-    multi-headed attention. Defaults to `true`
+  * `:attention_scale` - the scaling factor applied to the attention weights.
+    Defaults to $\frac{1}{\sqrt{d}}$
 
   * `:rotary_embedding` - configuration of rotary embedding. If set,
     will apply rotary position embedding with the given options. Valid
@@ -742,7 +742,7 @@ defmodule Bumblebee.Layers.Transformer do
        offset: Layers.none(),
        causal: false,
        attention_window_size: nil,
-        scale_attention_weights: true,
+        attention_scale: nil,
        kernel_initializer: :glorot_uniform,
        dropout_rate: 0.0,
        attention_head_size: nil,
@@ -767,7 +767,7 @@ defmodule Bumblebee.Layers.Transformer do
    kernel_initializer = opts[:kernel_initializer]
    causal = opts[:causal]
    attention_window_size = opts[:attention_window_size]
-    scale_attention_weights = opts[:scale_attention_weights]
+    attention_scale = opts[:attention_scale]
    dropout_rate = opts[:dropout_rate]
    rotary_embedding = opts[:rotary_embedding]
    query_norm = opts[:query_norm]
@@ -908,7 +908,7 @@ defmodule Bumblebee.Layers.Transformer do
        attention_head_mask,
        attention_relative_bias,
        offset,
-        scale: scale_attention_weights,
+        scale: attention_scale,
        causal: causal,
        window_size: attention_window_size,
        dropout_rate: dropout_rate
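
For callers of the block helpers, the override is now expressed as a number rather than a boolean. A hypothetical options fragment (only the keys shown in this diff are real; everything else a caller would normally pass is omitted):

[
  attention_window_size: nil,
  # nil       -> default 1/sqrt(d)
  # 1         -> no scaling (what `scale_attention_weights: false` used to mean)
  # any float -> a custom factor
  attention_scale: nil,
  rotary_embedding: nil
]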

lib/bumblebee/text/gpt2.ex

Lines changed: 1 addition & 1 deletion
@@ -412,7 +412,7 @@ defmodule Bumblebee.Text.Gpt2 do
        activation: spec.activation
      ],
      block_type: :norm_first,
-      scale_attention_weights: spec.scale_attention_weights,
+      attention_scale: if(not spec.scale_attention_weights, do: 1),
      name: join(name, "blocks")
    ] ++
      if(spec.use_cross_attention,

lib/bumblebee/text/gpt_big_code.ex

Lines changed: 1 addition & 1 deletion
@@ -417,7 +417,7 @@ defmodule Bumblebee.Text.GptBigCode do
        activation: spec.activation
      ],
      block_type: :norm_first,
-      scale_attention_weights: spec.scale_attention_weights,
+      attention_scale: if(not spec.scale_attention_weights, do: 1),
      name: join(name, "blocks")
    ] ++
      if(spec.use_cross_attention,
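
Both GPT-2 and GPT-BigCode keep their boolean `scale_attention_weights` spec attribute and translate it at the call site. A small sketch of the idiom used above: `if/2` without an `else` branch returns `nil`, so the boolean maps directly onto the new numeric option.

scale_attention_weights = false

attention_scale = if(not scale_attention_weights, do: 1)
# scale_attention_weights: true  -> attention_scale: nil (default 1/sqrt(d))
# scale_attention_weights: false -> attention_scale: 1   (weights left unscaled)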

lib/bumblebee/text/t5.ex

Lines changed: 2 additions & 2 deletions
@@ -412,7 +412,7 @@ defmodule Bumblebee.Text.T5 do
        max_distance: spec.relative_attention_max_distance
      ],
      share_attention_relative_bias: true,
-      scale_attention_weights: false,
+      attention_scale: 1,
      name: join(name, "blocks")
    )
 
@@ -469,7 +469,7 @@ defmodule Bumblebee.Text.T5 do
        max_distance: spec.relative_attention_max_distance
      ],
      share_attention_relative_bias: true,
-      scale_attention_weights: false,
+      attention_scale: 1,
      name: join(name, "blocks")
    )
 
