@@ -3226,7 +3226,20 @@ def array_compact(array: Expr) -> Expr:
32263226
32273227
32283228def array_normalize (array : Expr ) -> Expr :
3229- """Returns the L2-normalized vector for a numeric array.
3229+ """Scales a numeric array so it has Euclidean length 1.
3230+
3231+ Treats the array as a vector and divides every element by the vector's
3232+ Euclidean (L2) norm — the square root of the sum of the squared
3233+ elements. The returned array points in the same direction as the input
3234+ but has a magnitude of 1, which makes it suitable for cosine-similarity
3235+ comparisons and other operations that expect unit vectors.
3236+
3237+ For the input ``[3.0, 4.0]`` the L2 norm is ``sqrt(3**2 + 4**2) = 5``,
3238+ so each element is divided by 5 to produce ``[0.6, 0.8]``.
3239+
3240+ Normalizing the zero vector is undefined (it would divide by zero), so
3241+ the function returns NULL for an all-zero input. NULL is also returned
3242+ if any element of the input array is NULL.
32303243
32313244 Examples:
32323245 >>> ctx = dfn.SessionContext()
@@ -3236,16 +3249,45 @@ def array_normalize(array: Expr) -> Expr:
32363249 ... )
32373250 >>> result.collect_column("result")[0].as_py()
32383251 [0.6, 0.8]
3252+
3253+ The zero vector has no direction to preserve, so the result is NULL:
3254+
3255+ >>> df_zero = ctx.from_pydict({"a": [[0.0, 0.0]]})
3256+ >>> result = df_zero.select(
3257+ ... dfn.functions.array_normalize(dfn.col("a")).alias("result")
3258+ ... )
3259+ >>> result.collect_column("result")[0].as_py() is None
3260+ True
32393261 """
32403262 return Expr (f .array_normalize (array .expr ))
32413263
32423264
32433265def cosine_distance (array1 : Expr , array2 : Expr ) -> Expr :
3244- """Returns the cosine distance between two numeric arrays.
3266+ """Measures how much two numeric arrays differ in direction.
3267+
3268+ Treats each input as a vector and compares the angle between them,
3269+ ignoring their magnitudes. The result is ``1 - cosine_similarity``,
3270+ where cosine similarity is the dot product of the two vectors divided
3271+ by the product of their Euclidean (L2) norms.
3272+
3273+ The returned value ranges from 0 to 2:
3274+
3275+ * ``0`` — vectors point in the same direction (any positive scaling
3276+ of one yields the other).
3277+ * ``1`` — vectors are orthogonal (no shared direction).
3278+ * ``2`` — vectors point in exactly opposite directions.
32453279
3246- Computed as ``1 - cosine_similarity(array1, array2)``.
3280+ This is the standard distance metric for comparing embedding vectors
3281+ (text, image, audio) where direction carries the meaning and overall
3282+ magnitude does not.
3283+
3284+ Both arrays must have the same length; otherwise execution fails. If
3285+ either input is the zero vector the cosine is undefined and the
3286+ function returns NULL.
32473287
32483288 Examples:
3289+ Identical vectors have distance ``0``:
3290+
32493291 >>> ctx = dfn.SessionContext()
32503292 >>> df = ctx.from_pydict(
32513293 ... {"a": [[1.0, 2.0, 3.0]], "b": [[1.0, 2.0, 3.0]]}
@@ -3257,6 +3299,19 @@ def cosine_distance(array1: Expr, array2: Expr) -> Expr:
32573299 ... )
32583300 >>> result.collect_column("result")[0].as_py()
32593301 0.0
3302+
3303+ Orthogonal vectors have distance ``1``:
3304+
3305+ >>> df_orth = ctx.from_pydict(
3306+ ... {"a": [[1.0, 0.0]], "b": [[0.0, 1.0]]}
3307+ ... )
3308+ >>> result = df_orth.select(
3309+ ... dfn.functions.cosine_distance(
3310+ ... dfn.col("a"), dfn.col("b")
3311+ ... ).alias("result")
3312+ ... )
3313+ >>> result.collect_column("result")[0].as_py()
3314+ 1.0
32603315 """
32613316 return Expr (f .cosine_distance (array1 .expr , array2 .expr ))
32623317
@@ -3319,7 +3374,7 @@ def list_compact(array: Expr) -> Expr:
33193374
33203375
33213376def list_normalize (array : Expr ) -> Expr :
3322- """Returns the L2-normalized vector for a numeric array .
3377+ """Scales a numeric array so it has Euclidean length 1 .
33233378
33243379 See Also:
33253380 This is an alias for :py:func:`array_normalize`.
0 commit comments