Netflix · tolgaki · Feb 15, 2026 · Feb 15, 2026 · Feb 15, 2026
diff --git a/libvmaf/src/feature/common/convolution.c b/libvmaf/src/feature/common/convolution.c
@@ -32,20 +32,24 @@ void convolution_x_c_s(const float *filter, int filter_width, const float *src,
 	int borders_right = vmaf_floorn(width - (filter_width - radius), step);
 
 	for (int i = 0; i < height; ++i) {
+		const float *src_row = src + i * src_stride;
+		float *dst_row = dst + i * dst_stride;
+
 		for (int j = 0; j < borders_left; j += step) {
-			dst[i * dst_stride + j / step] = convolution_edge_s(true, filter, filter_width, src, width, height, src_stride, i, j);
+			dst_row[j / step] = convolution_edge_s(true, filter, filter_width, src, width, height, src_stride, i, j);
 		}
 
 		for (int j = borders_left; j < borders_right; j += step) {
+			const float *src_ptr = src_row + j - radius;
 			float accum = 0;
 			for (int k = 0; k < filter_width; ++k) {
-				accum += filter[k] * src[i * src_stride + j - radius + k];
+				accum += filter[k] * src_ptr[k];
 			}
-			dst[i * dst_stride + j / step] = accum;
+			dst_row[j / step] = accum;
 		}
 
 		for (int j = borders_right; j < width; j += step) {
-			dst[i * dst_stride + j / step] = convolution_edge_s(true, filter, filter_width, src, width, height, src_stride, i, j);
+			dst_row[j / step] = convolution_edge_s(true, filter, filter_width, src, width, height, src_stride, i, j);
 		}
 	}
 }
@@ -62,12 +66,13 @@ void convolution_y_c_s(const float *filter, int filter_width, const float *src,
 		}
 	}
 	for (int i = borders_top; i < borders_bottom; i += step) {
+		float *dst_row = dst + (i / step) * dst_stride;
 		for (int j = 0; j < width; ++j) {
 			float accum = 0;
 			for (int k = 0; k < filter_width; ++k) {
 				accum += filter[k] * src[(i - radius + k) * src_stride + j];
 			}
-			dst[(i / step) * dst_stride + j] = accum;
+			dst_row[j] = accum;
 		}
 	}
 	for (int i = borders_bottom; i < height; i += step) {

diff --git a/libvmaf/src/feature/feature_collector.c b/libvmaf/src/feature/feature_collector.c
@@ -152,7 +152,7 @@ static int feature_vector_init(FeatureVector **const feature_vector,
     fv->name = malloc(strlen(name) + 1);
     if (!fv->name) goto free_fv;
     strcpy(fv->name, name);
-    fv->capacity = 8;
+    fv->capacity = 512;
     fv->score = malloc(sizeof(fv->score[0]) * fv->capacity);
     if (!fv->score) goto free_name;
     memset(fv->score, 0, sizeof(fv->score[0]) * fv->capacity);

diff --git a/libvmaf/src/feature/integer_adm.c b/libvmaf/src/feature/integer_adm.c
@@ -723,12 +723,13 @@ static void adm_decouple(AdmBuffer *buf, int w, int h, int stride,
             t_mag_sq = (int64_t)th * th + (int64_t)tv * tv;
 
             /**
-             * angle_flag is calculated in floating-point by converting fixed-point variables back to
-             * floating-point
+             * angle_flag is calculated using fixed-point arithmetic.
+             * The 4096.0 divisors cancel out in the squared comparison,
+             * so we can compare directly in the integer domain.
              */
-            int angle_flag = (((float)ot_dp / 4096.0) >= 0.0f) &&
-                (((float)ot_dp / 4096.0) * ((float)ot_dp / 4096.0) >=
-                    cos_1deg_sq * ((float)o_mag_sq / 4096.0) * ((float)t_mag_sq / 4096.0));
+            int angle_flag = (ot_dp >= 0) &&
+                ((double)ot_dp * ot_dp >=
+                    cos_1deg_sq * (double)o_mag_sq * t_mag_sq);
 
             /**
              * Division th/oh is carried using lookup table and converted to multiplication
@@ -753,18 +754,16 @@ static void adm_decouple(AdmBuffer *buf, int w, int h, int stride,
             rst_v = ((kv * ov) + 16384) >> 15;
             rst_d = ((kd * od) + 16384) >> 15;
 
-            const float rst_h_f = ((float)kh / 32768) * ((float)oh / 64);
-            const float rst_v_f = ((float)kv / 32768) * ((float)ov / 64);
-            const float rst_d_f = ((float)kd / 32768) * ((float)od / 64);
+            /* Sign of rst_*_f = sign(k) * sign(o), but k is clamped >= 0,
+             * so the sign is determined solely by oh/ov/od. */
+            if (angle_flag && (kh > 0) && (oh > 0)) rst_h = MIN((rst_h * adm_enhn_gain_limit), th);
+            if (angle_flag && (kh > 0) && (oh < 0)) rst_h = MAX((rst_h * adm_enhn_gain_limit), th);
 
-            if (angle_flag && (rst_h_f > 0.)) rst_h = MIN((rst_h * adm_enhn_gain_limit), th);
-            if (angle_flag && (rst_h_f < 0.)) rst_h = MAX((rst_h * adm_enhn_gain_limit), th);
+            if (angle_flag && (kv > 0) && (ov > 0)) rst_v = MIN(rst_v * adm_enhn_gain_limit, tv);
+            if (angle_flag && (kv > 0) && (ov < 0)) rst_v = MAX(rst_v * adm_enhn_gain_limit, tv);
 
-            if (angle_flag && (rst_v_f > 0.)) rst_v = MIN(rst_v * adm_enhn_gain_limit, tv);
-            if (angle_flag && (rst_v_f < 0.)) rst_v = MAX(rst_v * adm_enhn_gain_limit, tv);
-
-            if (angle_flag && (rst_d_f > 0.)) rst_d = MIN(rst_d * adm_enhn_gain_limit, td);
-            if (angle_flag && (rst_d_f < 0.)) rst_d = MAX(rst_d * adm_enhn_gain_limit, td);
+            if (angle_flag && (kd > 0) && (od > 0)) rst_d = MIN(rst_d * adm_enhn_gain_limit, td);
+            if (angle_flag && (kd > 0) && (od < 0)) rst_d = MAX(rst_d * adm_enhn_gain_limit, td);
 
             r->band_h[i * stride + j] = rst_h;
             r->band_v[i * stride + j] = rst_v;
@@ -854,9 +853,9 @@ static void adm_decouple_s123(AdmBuffer *buf, int w, int h, int stride,
             o_mag_sq = (int64_t)oh * oh + (int64_t)ov * ov;
             t_mag_sq = (int64_t)th * th + (int64_t)tv * tv;
 
-            int angle_flag = (((float)ot_dp / 4096.0) >= 0.0f) &&
-                (((float)ot_dp / 4096.0) * ((float)ot_dp / 4096.0) >=
-                    cos_1deg_sq * ((float)o_mag_sq / 4096.0) * ((float)t_mag_sq / 4096.0));
+            int angle_flag = (ot_dp >= 0) &&
+                ((double)ot_dp * ot_dp >=
+                    cos_1deg_sq * (double)o_mag_sq * t_mag_sq);
 
             /**
              * Division th/oh is carried using lookup table and converted to multiplication
@@ -905,18 +904,16 @@ static void adm_decouple_s123(AdmBuffer *buf, int w, int h, int stride,
             rst_v = ((kv * ov) + 16384) >> 15;
             rst_d = ((kd * od) + 16384) >> 15;
 
-            const float rst_h_f = ((float)kh / 32768) * ((float)oh / 64);
-            const float rst_v_f = ((float)kv / 32768) * ((float)ov / 64);
-            const float rst_d_f = ((float)kd / 32768) * ((float)od / 64);
-
-            if (angle_flag && (rst_h_f > 0.)) rst_h = MIN((rst_h * adm_enhn_gain_limit), th);
-            if (angle_flag && (rst_h_f < 0.)) rst_h = MAX((rst_h * adm_enhn_gain_limit), th);
+            /* Sign of rst_*_f = sign(k) * sign(o), but k is clamped >= 0,
+             * so the sign is determined solely by oh/ov/od. */
+            if (angle_flag && (kh > 0) && (oh > 0)) rst_h = MIN((rst_h * adm_enhn_gain_limit), th);
+            if (angle_flag && (kh > 0) && (oh < 0)) rst_h = MAX((rst_h * adm_enhn_gain_limit), th);
 
-            if (angle_flag && (rst_v_f > 0.)) rst_v = MIN(rst_v * adm_enhn_gain_limit, tv);
-            if (angle_flag && (rst_v_f < 0.)) rst_v = MAX(rst_v * adm_enhn_gain_limit, tv);
+            if (angle_flag && (kv > 0) && (ov > 0)) rst_v = MIN(rst_v * adm_enhn_gain_limit, tv);
+            if (angle_flag && (kv > 0) && (ov < 0)) rst_v = MAX(rst_v * adm_enhn_gain_limit, tv);
 
-            if (angle_flag && (rst_d_f > 0.)) rst_d = MIN(rst_d * adm_enhn_gain_limit, td);
-            if (angle_flag && (rst_d_f < 0.)) rst_d = MAX(rst_d * adm_enhn_gain_limit, td);
+            if (angle_flag && (kd > 0) && (od > 0)) rst_d = MIN(rst_d * adm_enhn_gain_limit, td);
+            if (angle_flag && (kd > 0) && (od < 0)) rst_d = MAX(rst_d * adm_enhn_gain_limit, td);
 
             r->band_h[i * stride + j] = rst_h;
             r->band_v[i * stride + j] = rst_v;
@@ -961,8 +958,8 @@ static void adm_csf(AdmBuffer *buf, int w, int h, int stride,
         i_rfactor[2] = 49417;
     }
     else {
-        const double pow2_21 = pow(2, 21);
-        const double pow2_23 = pow(2, 23);
+        const double pow2_21 = 2097152.0; /* 2^21 */
+        const double pow2_23 = 8388608.0; /* 2^23 */
         i_rfactor[0] = (uint16_t) (rfactor1[0] * pow2_21);
         i_rfactor[1] = (uint16_t) (rfactor1[1] * pow2_21);
         i_rfactor[2] = (uint16_t) (rfactor1[2] * pow2_23);
@@ -1039,7 +1036,7 @@ static void i4_adm_csf(AdmBuffer *buf, int scale, int w, int h, int stride,
     const float rfactor1[3] = { 1.0f / factor1, 1.0f / factor1, 1.0f / factor2 };
 
     //i_rfactor in fixed-point
-    const double pow2_32 = pow(2, 32);
+    const double pow2_32 = 4294967296.0; /* 2^32 */
     const uint32_t i_rfactor[3] = { (uint32_t)(rfactor1[0] * pow2_32),
                                     (uint32_t)(rfactor1[1] * pow2_32),
                                     (uint32_t)(rfactor1[2] * pow2_32) };
@@ -1167,10 +1164,13 @@ static float adm_csf_den_scale(const adm_dwt_band_t *src, int w, int h,
      * after cubing 18bits are to shifted
      * Hence final shift is 18-shift_accum
      */
-    double shift_csf = pow(2, (18 - shift_accum));
-    double csf_h = (double)(accum_h / shift_csf) * pow(rfactor[0], 3);
-    double csf_v = (double)(accum_v / shift_csf) * pow(rfactor[1], 3);
-    double csf_d = (double)(accum_d / shift_csf) * pow(rfactor[2], 3);
+    double shift_csf = ldexp(1.0, (int)(18 - shift_accum));
+    double rfactor0_cubed = (double)rfactor[0] * rfactor[0] * rfactor[0];
+    double rfactor1_cubed = (double)rfactor[1] * rfactor[1] * rfactor[1];
+    double rfactor2_cubed = (double)rfactor[2] * rfactor[2] * rfactor[2];
+    double csf_h = (double)(accum_h / shift_csf) * rfactor0_cubed;
+    double csf_v = (double)(accum_v / shift_csf) * rfactor1_cubed;
+    double csf_d = (double)(accum_d / shift_csf) * rfactor2_cubed;
 
     float powf_add = powf((bottom - top) * (right - left) / 32.0f, 1.0f / 3.0f);
     float den_scale_h = powf(csf_h, 1.0f / 3.0f) + powf_add;
@@ -1206,9 +1206,9 @@ static float adm_csf_den_s123(const i4_adm_dwt_band_t *src, int scale, int w, in
     const int bottom = h - top;
 
     uint32_t shift_cub = (uint32_t)ceil(log2(right - left));
-    uint32_t add_shift_cub = (uint32_t)pow(2, (shift_cub - 1));
+    uint32_t add_shift_cub = 1u << (shift_cub - 1);
     uint32_t shift_accum = (uint32_t)ceil(log2(bottom - top));
-    uint32_t add_shift_accum = (uint32_t)pow(2, (shift_accum - 1));
+    uint32_t add_shift_accum = 1u << (shift_accum - 1);
 
     int32_t *src_h = src->band_h + top * src_stride;
     int32_t *src_v = src->band_v + top * src_stride;
@@ -1250,10 +1250,13 @@ static float adm_csf_den_s123(const i4_adm_dwt_band_t *src, int scale, int w, in
      * All the results are converted to floating-point to calculate the scores
      * For all scales the final shift is 3*shifts from dwt - total shifts done here
      */
-    double shift_csf = pow(2, (accum_convert_float[scale - 1] - shift_accum - shift_cub));
-    double csf_h = (double)(accum_h / shift_csf) * pow(rfactor[0], 3);
-    double csf_v = (double)(accum_v / shift_csf) * pow(rfactor[1], 3);
-    double csf_d = (double)(accum_d / shift_csf) * pow(rfactor[2], 3);
+    double shift_csf = ldexp(1.0, (int)(accum_convert_float[scale - 1] - shift_accum - shift_cub));
+    double rfactor0_cubed = (double)rfactor[0] * rfactor[0] * rfactor[0];
+    double rfactor1_cubed = (double)rfactor[1] * rfactor[1] * rfactor[1];
+    double rfactor2_cubed = (double)rfactor[2] * rfactor[2] * rfactor[2];
+    double csf_h = (double)(accum_h / shift_csf) * rfactor0_cubed;
+    double csf_v = (double)(accum_v / shift_csf) * rfactor1_cubed;
+    double csf_d = (double)(accum_d / shift_csf) * rfactor2_cubed;
 
     float powf_add = powf((bottom - top) * (right - left) / 32.0f, 1.0f / 3.0f);
     float den_scale_h = powf(csf_h, 1.0f / 3.0f) + powf_add;
@@ -1291,8 +1294,8 @@ static float adm_cm(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_stri
         i_rfactor[2] = 49417;
     }
     else {
-        const double pow2_21 = pow(2, 21);
-        const double pow2_23 = pow(2, 23);
+        const double pow2_21 = 2097152.0; /* 2^21 */
+        const double pow2_23 = 8388608.0; /* 2^23 */
         i_rfactor[0] = (uint16_t) (rfactor1[0] * pow2_21);
         i_rfactor[1] = (uint16_t) (rfactor1[1] * pow2_21);
         i_rfactor[2] = (uint16_t) (rfactor1[2] * pow2_23);
@@ -1306,16 +1309,16 @@ static float adm_cm(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_stri
     const int32_t add_shift_xdsq = 536870912;
 
     const uint32_t shift_xhcub = (uint32_t)ceil(log2(w) - 4);
-    const uint32_t add_shift_xhcub = (uint32_t)pow(2, (shift_xhcub - 1));
+    const uint32_t add_shift_xhcub = 1u << (shift_xhcub - 1);
 
     const uint32_t shift_xvcub = (uint32_t)ceil(log2(w) - 4);
-    const uint32_t add_shift_xvcub = (uint32_t)pow(2, (shift_xvcub - 1));
+    const uint32_t add_shift_xvcub = 1u << (shift_xvcub - 1);
 
     const uint32_t shift_xdcub = (uint32_t)ceil(log2(w) - 3);
-    const uint32_t add_shift_xdcub = (uint32_t)pow(2, (shift_xdcub - 1));
+    const uint32_t add_shift_xdcub = 1u << (shift_xdcub - 1);
 
     const uint32_t shift_inner_accum = (uint32_t)ceil(log2(h));
-    const uint32_t add_shift_inner_accum = (uint32_t)pow(2, (shift_inner_accum - 1));
+    const uint32_t add_shift_inner_accum = 1u << (shift_inner_accum - 1);
 
     const int32_t shift_xhsub = 10;
     const int32_t shift_xvsub = 10;
@@ -1630,9 +1633,9 @@ static float adm_cm(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_stri
      * => after cubing (6+23)*3=87 after squaring shifted by 30
      * hence pending is 57-shift's done based on width and height
      */
-    float f_accum_h = (float)(accum_h / pow(2, (52 - shift_xhcub - shift_inner_accum)));
-    float f_accum_v = (float)(accum_v / pow(2, (52 - shift_xvcub - shift_inner_accum)));
-    float f_accum_d = (float)(accum_d / pow(2, (57 - shift_xdcub - shift_inner_accum)));
+    float f_accum_h = (float)(accum_h / ldexp(1.0, (int)(52 - shift_xhcub - shift_inner_accum)));
+    float f_accum_v = (float)(accum_v / ldexp(1.0, (int)(52 - shift_xvcub - shift_inner_accum)));
+    float f_accum_d = (float)(accum_d / ldexp(1.0, (int)(57 - shift_xdcub - shift_inner_accum)));
 
     float num_scale_h = powf(f_accum_h, 1.0f / 3.0f) + powf((bottom - top) *
                         (right - left) / 32.0f, 1.0f / 3.0f);
@@ -1657,9 +1660,9 @@ static float i4_adm_cm(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_s
     float factor2 = dwt_quant_step(&dwt_7_9_YCbCr_threshold[0], scale, 2, adm_norm_view_dist, adm_ref_display_height);
     float rfactor1[3] = { 1.0f / factor1, 1.0f / factor1, 1.0f / factor2 };
 
-    const uint32_t rfactor[3] = { (uint32_t)(rfactor1[0] * pow(2, 32)),
-                                  (uint32_t)(rfactor1[1] * pow(2, 32)),
-                                  (uint32_t)(rfactor1[2] * pow(2, 32)) };
+    const uint32_t rfactor[3] = { (uint32_t)(rfactor1[0] * 4294967296.0),
+                                  (uint32_t)(rfactor1[1] * 4294967296.0),
+                                  (uint32_t)(rfactor1[2] * 4294967296.0) };
 
     const uint32_t shift_dst[3] = { 28, 28, 28 };
     const uint32_t shift_flt[3] = { 32, 32, 32 };
@@ -1672,14 +1675,14 @@ static float i4_adm_cm(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_s
     }
 
     uint32_t shift_cub = (uint32_t)ceil(log2(w));
-    uint32_t add_shift_cub = (uint32_t)pow(2, (shift_cub - 1));
+    uint32_t add_shift_cub = 1u << (shift_cub - 1);
 
     uint32_t shift_inner_accum = (uint32_t)ceil(log2(h));
-    uint32_t add_shift_inner_accum = (uint32_t)pow(2, (shift_inner_accum - 1));
+    uint32_t add_shift_inner_accum = 1u << (shift_inner_accum - 1);
 
-    float final_shift[3] = { pow(2,(45 - shift_cub - shift_inner_accum)),
-                             pow(2,(39 - shift_cub - shift_inner_accum)),
-                             pow(2,(36 - shift_cub - shift_inner_accum)) };
+    float final_shift[3] = { ldexp(1.0f, (int)(45 - shift_cub - shift_inner_accum)),
+                             ldexp(1.0f, (int)(39 - shift_cub - shift_inner_accum)),
+                             ldexp(1.0f, (int)(36 - shift_cub - shift_inner_accum)) };
 
     const int32_t shift_sq = 30;
     const int32_t add_shift_sq = 536870912; //2^29

diff --git a/libvmaf/src/feature/integer_motion.c b/libvmaf/src/feature/integer_motion.c
@@ -122,7 +122,7 @@ y_convolution_16(void *src, uint16_t *dst, unsigned width,
     const unsigned radius = filter_width / 2;
     const unsigned top_edge = vmaf_ceiln(radius, 1);
     const unsigned bottom_edge = vmaf_floorn(height - (filter_width - radius), 1);
-    const unsigned add_before_shift = (int) pow(2, (inp_size_bits - 1));
+    const unsigned add_before_shift = 1u << (inp_size_bits - 1);
     const unsigned shift_var = inp_size_bits;
 
     uint16_t *src_p = (uint16_t*) src + (top_edge - radius) * src_stride;
@@ -189,7 +189,7 @@ y_convolution_8(void *src, uint16_t *dst, unsigned width,
     const unsigned top_edge = vmaf_ceiln(radius, 1);
     const unsigned bottom_edge = vmaf_floorn(height - (filter_width - radius), 1);
     const unsigned shift_var = 8;
-    const unsigned add_before_shift = (int) pow(2, (shift_var - 1));
+    const unsigned add_before_shift = 1u << (shift_var - 1);
 
     for (unsigned i = 0; i < top_edge; i++) {
         for (unsigned j = 0; j < width; ++j) {
@@ -241,6 +241,16 @@ static void sad_c(VmafPicture *pic_a, VmafPicture *pic_b, uint64_t *sad)
     }
 }
 
+#if ARCH_X86
+static void sad_avx2_wrapper(VmafPicture *pic_a, VmafPicture *pic_b, uint64_t *sad)
+{
+    sad_16_avx2(pic_a->data[0], pic_b->data[0],
+                pic_a->w[0], pic_a->h[0],
+                pic_a->stride[0] / 2, pic_b->stride[0] / 2,
+                sad);
+}
+#endif
+
 static int extract_force_zero(VmafFeatureExtractor *fex,
                               VmafPicture *ref_pic, VmafPicture *ref_pic_90,
                               VmafPicture *dist_pic, VmafPicture *dist_pic_90,
@@ -304,18 +314,19 @@ static int init(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt,
 
     s->y_convolution = bpc == 8 ? y_convolution_8 : y_convolution_16;
     s->x_convolution = x_convolution_16;
+    s->sad = sad_c;
 
 #if ARCH_X86
     unsigned flags = vmaf_get_cpu_flags();
-    if (flags & VMAF_X86_CPU_FLAG_AVX2)
+    if (flags & VMAF_X86_CPU_FLAG_AVX2) {
         s->x_convolution = x_convolution_16_avx2;
+        s->sad = sad_avx2_wrapper;
+    }
 #if HAVE_AVX512
     if (flags & VMAF_X86_CPU_FLAG_AVX512)
         s->x_convolution = x_convolution_16_avx512;
 #endif
 #endif
-
-    s->sad = sad_c;
     s->score = 0.;
 
     return 0;