diff --git a/libvmaf/src/feature/common/convolution.c b/libvmaf/src/feature/common/convolution.c index e74f51dc4..9cc4e3edf 100644 --- a/libvmaf/src/feature/common/convolution.c +++ b/libvmaf/src/feature/common/convolution.c @@ -32,20 +32,24 @@ void convolution_x_c_s(const float *filter, int filter_width, const float *src, int borders_right = vmaf_floorn(width - (filter_width - radius), step); for (int i = 0; i < height; ++i) { + const float *src_row = src + i * src_stride; + float *dst_row = dst + i * dst_stride; + for (int j = 0; j < borders_left; j += step) { - dst[i * dst_stride + j / step] = convolution_edge_s(true, filter, filter_width, src, width, height, src_stride, i, j); + dst_row[j / step] = convolution_edge_s(true, filter, filter_width, src, width, height, src_stride, i, j); } for (int j = borders_left; j < borders_right; j += step) { + const float *src_ptr = src_row + j - radius; float accum = 0; for (int k = 0; k < filter_width; ++k) { - accum += filter[k] * src[i * src_stride + j - radius + k]; + accum += filter[k] * src_ptr[k]; } - dst[i * dst_stride + j / step] = accum; + dst_row[j / step] = accum; } for (int j = borders_right; j < width; j += step) { - dst[i * dst_stride + j / step] = convolution_edge_s(true, filter, filter_width, src, width, height, src_stride, i, j); + dst_row[j / step] = convolution_edge_s(true, filter, filter_width, src, width, height, src_stride, i, j); } } } @@ -62,12 +66,13 @@ void convolution_y_c_s(const float *filter, int filter_width, const float *src, } } for (int i = borders_top; i < borders_bottom; i += step) { + float *dst_row = dst + (i / step) * dst_stride; for (int j = 0; j < width; ++j) { float accum = 0; for (int k = 0; k < filter_width; ++k) { accum += filter[k] * src[(i - radius + k) * src_stride + j]; } - dst[(i / step) * dst_stride + j] = accum; + dst_row[j] = accum; } } for (int i = borders_bottom; i < height; i += step) { diff --git a/libvmaf/src/feature/feature_collector.c 
b/libvmaf/src/feature/feature_collector.c index 2eb2ec643..7dd531c00 100644 --- a/libvmaf/src/feature/feature_collector.c +++ b/libvmaf/src/feature/feature_collector.c @@ -152,7 +152,7 @@ static int feature_vector_init(FeatureVector **const feature_vector, fv->name = malloc(strlen(name) + 1); if (!fv->name) goto free_fv; strcpy(fv->name, name); - fv->capacity = 8; + fv->capacity = 512; fv->score = malloc(sizeof(fv->score[0]) * fv->capacity); if (!fv->score) goto free_name; memset(fv->score, 0, sizeof(fv->score[0]) * fv->capacity); diff --git a/libvmaf/src/feature/integer_adm.c b/libvmaf/src/feature/integer_adm.c index 269dbb5e8..1b9c04c56 100644 --- a/libvmaf/src/feature/integer_adm.c +++ b/libvmaf/src/feature/integer_adm.c @@ -723,12 +723,13 @@ static void adm_decouple(AdmBuffer *buf, int w, int h, int stride, t_mag_sq = (int64_t)th * th + (int64_t)tv * tv; /** - * angle_flag is calculated in floating-point by converting fixed-point variables back to - * floating-point + * angle_flag is calculated using fixed-point arithmetic. + * The 4096.0 divisors cancel out in the squared comparison, + * so we can compare directly in the integer domain. 
*/ - int angle_flag = (((float)ot_dp / 4096.0) >= 0.0f) && - (((float)ot_dp / 4096.0) * ((float)ot_dp / 4096.0) >= - cos_1deg_sq * ((float)o_mag_sq / 4096.0) * ((float)t_mag_sq / 4096.0)); + int angle_flag = (ot_dp >= 0) && + ((double)ot_dp * ot_dp >= + cos_1deg_sq * (double)o_mag_sq * t_mag_sq); /** * Division th/oh is carried using lookup table and converted to multiplication @@ -753,18 +754,16 @@ static void adm_decouple(AdmBuffer *buf, int w, int h, int stride, rst_v = ((kv * ov) + 16384) >> 15; rst_d = ((kd * od) + 16384) >> 15; - const float rst_h_f = ((float)kh / 32768) * ((float)oh / 64); - const float rst_v_f = ((float)kv / 32768) * ((float)ov / 64); - const float rst_d_f = ((float)kd / 32768) * ((float)od / 64); + /* Sign of rst_*_f = sign(k) * sign(o), but k is clamped >= 0, + * so the sign is determined solely by oh/ov/od. */ + if (angle_flag && (kh > 0) && (oh > 0)) rst_h = MIN((rst_h * adm_enhn_gain_limit), th); + if (angle_flag && (kh > 0) && (oh < 0)) rst_h = MAX((rst_h * adm_enhn_gain_limit), th); - if (angle_flag && (rst_h_f > 0.)) rst_h = MIN((rst_h * adm_enhn_gain_limit), th); - if (angle_flag && (rst_h_f < 0.)) rst_h = MAX((rst_h * adm_enhn_gain_limit), th); + if (angle_flag && (kv > 0) && (ov > 0)) rst_v = MIN(rst_v * adm_enhn_gain_limit, tv); + if (angle_flag && (kv > 0) && (ov < 0)) rst_v = MAX(rst_v * adm_enhn_gain_limit, tv); - if (angle_flag && (rst_v_f > 0.)) rst_v = MIN(rst_v * adm_enhn_gain_limit, tv); - if (angle_flag && (rst_v_f < 0.)) rst_v = MAX(rst_v * adm_enhn_gain_limit, tv); - - if (angle_flag && (rst_d_f > 0.)) rst_d = MIN(rst_d * adm_enhn_gain_limit, td); - if (angle_flag && (rst_d_f < 0.)) rst_d = MAX(rst_d * adm_enhn_gain_limit, td); + if (angle_flag && (kd > 0) && (od > 0)) rst_d = MIN(rst_d * adm_enhn_gain_limit, td); + if (angle_flag && (kd > 0) && (od < 0)) rst_d = MAX(rst_d * adm_enhn_gain_limit, td); r->band_h[i * stride + j] = rst_h; r->band_v[i * stride + j] = rst_v; @@ -854,9 +853,9 @@ static void 
adm_decouple_s123(AdmBuffer *buf, int w, int h, int stride, o_mag_sq = (int64_t)oh * oh + (int64_t)ov * ov; t_mag_sq = (int64_t)th * th + (int64_t)tv * tv; - int angle_flag = (((float)ot_dp / 4096.0) >= 0.0f) && - (((float)ot_dp / 4096.0) * ((float)ot_dp / 4096.0) >= - cos_1deg_sq * ((float)o_mag_sq / 4096.0) * ((float)t_mag_sq / 4096.0)); + int angle_flag = (ot_dp >= 0) && + ((double)ot_dp * ot_dp >= + cos_1deg_sq * (double)o_mag_sq * t_mag_sq); /** * Division th/oh is carried using lookup table and converted to multiplication @@ -905,18 +904,16 @@ static void adm_decouple_s123(AdmBuffer *buf, int w, int h, int stride, rst_v = ((kv * ov) + 16384) >> 15; rst_d = ((kd * od) + 16384) >> 15; - const float rst_h_f = ((float)kh / 32768) * ((float)oh / 64); - const float rst_v_f = ((float)kv / 32768) * ((float)ov / 64); - const float rst_d_f = ((float)kd / 32768) * ((float)od / 64); - - if (angle_flag && (rst_h_f > 0.)) rst_h = MIN((rst_h * adm_enhn_gain_limit), th); - if (angle_flag && (rst_h_f < 0.)) rst_h = MAX((rst_h * adm_enhn_gain_limit), th); + /* Sign of rst_*_f = sign(k) * sign(o), but k is clamped >= 0, + * so the sign is determined solely by oh/ov/od. 
*/ + if (angle_flag && (kh > 0) && (oh > 0)) rst_h = MIN((rst_h * adm_enhn_gain_limit), th); + if (angle_flag && (kh > 0) && (oh < 0)) rst_h = MAX((rst_h * adm_enhn_gain_limit), th); - if (angle_flag && (rst_v_f > 0.)) rst_v = MIN(rst_v * adm_enhn_gain_limit, tv); - if (angle_flag && (rst_v_f < 0.)) rst_v = MAX(rst_v * adm_enhn_gain_limit, tv); + if (angle_flag && (kv > 0) && (ov > 0)) rst_v = MIN(rst_v * adm_enhn_gain_limit, tv); + if (angle_flag && (kv > 0) && (ov < 0)) rst_v = MAX(rst_v * adm_enhn_gain_limit, tv); - if (angle_flag && (rst_d_f > 0.)) rst_d = MIN(rst_d * adm_enhn_gain_limit, td); - if (angle_flag && (rst_d_f < 0.)) rst_d = MAX(rst_d * adm_enhn_gain_limit, td); + if (angle_flag && (kd > 0) && (od > 0)) rst_d = MIN(rst_d * adm_enhn_gain_limit, td); + if (angle_flag && (kd > 0) && (od < 0)) rst_d = MAX(rst_d * adm_enhn_gain_limit, td); r->band_h[i * stride + j] = rst_h; r->band_v[i * stride + j] = rst_v; @@ -961,8 +958,8 @@ static void adm_csf(AdmBuffer *buf, int w, int h, int stride, i_rfactor[2] = 49417; } else { - const double pow2_21 = pow(2, 21); - const double pow2_23 = pow(2, 23); + const double pow2_21 = 2097152.0; /* 2^21 */ + const double pow2_23 = 8388608.0; /* 2^23 */ i_rfactor[0] = (uint16_t) (rfactor1[0] * pow2_21); i_rfactor[1] = (uint16_t) (rfactor1[1] * pow2_21); i_rfactor[2] = (uint16_t) (rfactor1[2] * pow2_23); @@ -1039,7 +1036,7 @@ static void i4_adm_csf(AdmBuffer *buf, int scale, int w, int h, int stride, const float rfactor1[3] = { 1.0f / factor1, 1.0f / factor1, 1.0f / factor2 }; //i_rfactor in fixed-point - const double pow2_32 = pow(2, 32); + const double pow2_32 = 4294967296.0; /* 2^32 */ const uint32_t i_rfactor[3] = { (uint32_t)(rfactor1[0] * pow2_32), (uint32_t)(rfactor1[1] * pow2_32), (uint32_t)(rfactor1[2] * pow2_32) }; @@ -1167,10 +1164,13 @@ static float adm_csf_den_scale(const adm_dwt_band_t *src, int w, int h, * after cubing 18bits are to shifted * Hence final shift is 18-shift_accum */ - double shift_csf = pow(2, 
(18 - shift_accum)); - double csf_h = (double)(accum_h / shift_csf) * pow(rfactor[0], 3); - double csf_v = (double)(accum_v / shift_csf) * pow(rfactor[1], 3); - double csf_d = (double)(accum_d / shift_csf) * pow(rfactor[2], 3); + double shift_csf = ldexp(1.0, (int)(18 - shift_accum)); + double rfactor0_cubed = (double)rfactor[0] * rfactor[0] * rfactor[0]; + double rfactor1_cubed = (double)rfactor[1] * rfactor[1] * rfactor[1]; + double rfactor2_cubed = (double)rfactor[2] * rfactor[2] * rfactor[2]; + double csf_h = (double)(accum_h / shift_csf) * rfactor0_cubed; + double csf_v = (double)(accum_v / shift_csf) * rfactor1_cubed; + double csf_d = (double)(accum_d / shift_csf) * rfactor2_cubed; float powf_add = powf((bottom - top) * (right - left) / 32.0f, 1.0f / 3.0f); float den_scale_h = powf(csf_h, 1.0f / 3.0f) + powf_add; @@ -1206,9 +1206,9 @@ static float adm_csf_den_s123(const i4_adm_dwt_band_t *src, int scale, int w, in const int bottom = h - top; uint32_t shift_cub = (uint32_t)ceil(log2(right - left)); - uint32_t add_shift_cub = (uint32_t)pow(2, (shift_cub - 1)); + uint32_t add_shift_cub = 1u << (shift_cub - 1); uint32_t shift_accum = (uint32_t)ceil(log2(bottom - top)); - uint32_t add_shift_accum = (uint32_t)pow(2, (shift_accum - 1)); + uint32_t add_shift_accum = 1u << (shift_accum - 1); int32_t *src_h = src->band_h + top * src_stride; int32_t *src_v = src->band_v + top * src_stride; @@ -1250,10 +1250,13 @@ static float adm_csf_den_s123(const i4_adm_dwt_band_t *src, int scale, int w, in * All the results are converted to floating-point to calculate the scores * For all scales the final shift is 3*shifts from dwt - total shifts done here */ - double shift_csf = pow(2, (accum_convert_float[scale - 1] - shift_accum - shift_cub)); - double csf_h = (double)(accum_h / shift_csf) * pow(rfactor[0], 3); - double csf_v = (double)(accum_v / shift_csf) * pow(rfactor[1], 3); - double csf_d = (double)(accum_d / shift_csf) * pow(rfactor[2], 3); + double shift_csf = ldexp(1.0, 
(int)(accum_convert_float[scale - 1] - shift_accum - shift_cub)); + double rfactor0_cubed = (double)rfactor[0] * rfactor[0] * rfactor[0]; + double rfactor1_cubed = (double)rfactor[1] * rfactor[1] * rfactor[1]; + double rfactor2_cubed = (double)rfactor[2] * rfactor[2] * rfactor[2]; + double csf_h = (double)(accum_h / shift_csf) * rfactor0_cubed; + double csf_v = (double)(accum_v / shift_csf) * rfactor1_cubed; + double csf_d = (double)(accum_d / shift_csf) * rfactor2_cubed; float powf_add = powf((bottom - top) * (right - left) / 32.0f, 1.0f / 3.0f); float den_scale_h = powf(csf_h, 1.0f / 3.0f) + powf_add; @@ -1291,8 +1294,8 @@ static float adm_cm(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_stri i_rfactor[2] = 49417; } else { - const double pow2_21 = pow(2, 21); - const double pow2_23 = pow(2, 23); + const double pow2_21 = 2097152.0; /* 2^21 */ + const double pow2_23 = 8388608.0; /* 2^23 */ i_rfactor[0] = (uint16_t) (rfactor1[0] * pow2_21); i_rfactor[1] = (uint16_t) (rfactor1[1] * pow2_21); i_rfactor[2] = (uint16_t) (rfactor1[2] * pow2_23); @@ -1306,16 +1309,16 @@ static float adm_cm(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_stri const int32_t add_shift_xdsq = 536870912; const uint32_t shift_xhcub = (uint32_t)ceil(log2(w) - 4); - const uint32_t add_shift_xhcub = (uint32_t)pow(2, (shift_xhcub - 1)); + const uint32_t add_shift_xhcub = 1u << (shift_xhcub - 1); const uint32_t shift_xvcub = (uint32_t)ceil(log2(w) - 4); - const uint32_t add_shift_xvcub = (uint32_t)pow(2, (shift_xvcub - 1)); + const uint32_t add_shift_xvcub = 1u << (shift_xvcub - 1); const uint32_t shift_xdcub = (uint32_t)ceil(log2(w) - 3); - const uint32_t add_shift_xdcub = (uint32_t)pow(2, (shift_xdcub - 1)); + const uint32_t add_shift_xdcub = 1u << (shift_xdcub - 1); const uint32_t shift_inner_accum = (uint32_t)ceil(log2(h)); - const uint32_t add_shift_inner_accum = (uint32_t)pow(2, (shift_inner_accum - 1)); + const uint32_t add_shift_inner_accum = 1u << (shift_inner_accum - 1); 
const int32_t shift_xhsub = 10; const int32_t shift_xvsub = 10; @@ -1630,9 +1633,9 @@ static float adm_cm(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_stri * => after cubing (6+23)*3=87 after squaring shifted by 30 * hence pending is 57-shift's done based on width and height */ - float f_accum_h = (float)(accum_h / pow(2, (52 - shift_xhcub - shift_inner_accum))); - float f_accum_v = (float)(accum_v / pow(2, (52 - shift_xvcub - shift_inner_accum))); - float f_accum_d = (float)(accum_d / pow(2, (57 - shift_xdcub - shift_inner_accum))); + float f_accum_h = (float)(accum_h / ldexp(1.0, (int)(52 - shift_xhcub - shift_inner_accum))); + float f_accum_v = (float)(accum_v / ldexp(1.0, (int)(52 - shift_xvcub - shift_inner_accum))); + float f_accum_d = (float)(accum_d / ldexp(1.0, (int)(57 - shift_xdcub - shift_inner_accum))); float num_scale_h = powf(f_accum_h, 1.0f / 3.0f) + powf((bottom - top) * (right - left) / 32.0f, 1.0f / 3.0f); @@ -1657,9 +1660,9 @@ static float i4_adm_cm(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_s float factor2 = dwt_quant_step(&dwt_7_9_YCbCr_threshold[0], scale, 2, adm_norm_view_dist, adm_ref_display_height); float rfactor1[3] = { 1.0f / factor1, 1.0f / factor1, 1.0f / factor2 }; - const uint32_t rfactor[3] = { (uint32_t)(rfactor1[0] * pow(2, 32)), - (uint32_t)(rfactor1[1] * pow(2, 32)), - (uint32_t)(rfactor1[2] * pow(2, 32)) }; + const uint32_t rfactor[3] = { (uint32_t)(rfactor1[0] * 4294967296.0), + (uint32_t)(rfactor1[1] * 4294967296.0), + (uint32_t)(rfactor1[2] * 4294967296.0) }; const uint32_t shift_dst[3] = { 28, 28, 28 }; const uint32_t shift_flt[3] = { 32, 32, 32 }; @@ -1672,14 +1675,14 @@ static float i4_adm_cm(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_s } uint32_t shift_cub = (uint32_t)ceil(log2(w)); - uint32_t add_shift_cub = (uint32_t)pow(2, (shift_cub - 1)); + uint32_t add_shift_cub = 1u << (shift_cub - 1); uint32_t shift_inner_accum = (uint32_t)ceil(log2(h)); - uint32_t add_shift_inner_accum = 
(uint32_t)pow(2, (shift_inner_accum - 1)); + uint32_t add_shift_inner_accum = 1u << (shift_inner_accum - 1); - float final_shift[3] = { pow(2,(45 - shift_cub - shift_inner_accum)), - pow(2,(39 - shift_cub - shift_inner_accum)), - pow(2,(36 - shift_cub - shift_inner_accum)) }; + float final_shift[3] = { ldexp(1.0f, (int)(45 - shift_cub - shift_inner_accum)), + ldexp(1.0f, (int)(39 - shift_cub - shift_inner_accum)), + ldexp(1.0f, (int)(36 - shift_cub - shift_inner_accum)) }; const int32_t shift_sq = 30; const int32_t add_shift_sq = 536870912; //2^29 diff --git a/libvmaf/src/feature/integer_motion.c b/libvmaf/src/feature/integer_motion.c index 58eef5be9..b63fc17c6 100644 --- a/libvmaf/src/feature/integer_motion.c +++ b/libvmaf/src/feature/integer_motion.c @@ -122,7 +122,7 @@ y_convolution_16(void *src, uint16_t *dst, unsigned width, const unsigned radius = filter_width / 2; const unsigned top_edge = vmaf_ceiln(radius, 1); const unsigned bottom_edge = vmaf_floorn(height - (filter_width - radius), 1); - const unsigned add_before_shift = (int) pow(2, (inp_size_bits - 1)); + const unsigned add_before_shift = 1u << (inp_size_bits - 1); const unsigned shift_var = inp_size_bits; uint16_t *src_p = (uint16_t*) src + (top_edge - radius) * src_stride; @@ -189,7 +189,7 @@ y_convolution_8(void *src, uint16_t *dst, unsigned width, const unsigned top_edge = vmaf_ceiln(radius, 1); const unsigned bottom_edge = vmaf_floorn(height - (filter_width - radius), 1); const unsigned shift_var = 8; - const unsigned add_before_shift = (int) pow(2, (shift_var - 1)); + const unsigned add_before_shift = 1u << (shift_var - 1); for (unsigned i = 0; i < top_edge; i++) { for (unsigned j = 0; j < width; ++j) { @@ -241,6 +241,16 @@ static void sad_c(VmafPicture *pic_a, VmafPicture *pic_b, uint64_t *sad) } } +#if ARCH_X86 +static void sad_avx2_wrapper(VmafPicture *pic_a, VmafPicture *pic_b, uint64_t *sad) +{ + sad_16_avx2(pic_a->data[0], pic_b->data[0], + pic_a->w[0], pic_a->h[0], + pic_a->stride[0] / 2, 
pic_b->stride[0] / 2, + sad); +} +#endif + static int extract_force_zero(VmafFeatureExtractor *fex, VmafPicture *ref_pic, VmafPicture *ref_pic_90, VmafPicture *dist_pic, VmafPicture *dist_pic_90, @@ -304,18 +314,19 @@ static int init(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt, s->y_convolution = bpc == 8 ? y_convolution_8 : y_convolution_16; s->x_convolution = x_convolution_16; + s->sad = sad_c; #if ARCH_X86 unsigned flags = vmaf_get_cpu_flags(); - if (flags & VMAF_X86_CPU_FLAG_AVX2) + if (flags & VMAF_X86_CPU_FLAG_AVX2) { s->x_convolution = x_convolution_16_avx2; + s->sad = sad_avx2_wrapper; + } #if HAVE_AVX512 if (flags & VMAF_X86_CPU_FLAG_AVX512) s->x_convolution = x_convolution_16_avx512; #endif #endif - - s->sad = sad_c; s->score = 0.; return 0; diff --git a/libvmaf/src/feature/integer_psnr.c b/libvmaf/src/feature/integer_psnr.c index 18b5b9205..0525cacbe 100644 --- a/libvmaf/src/feature/integer_psnr.c +++ b/libvmaf/src/feature/integer_psnr.c @@ -23,10 +23,15 @@ #include #include +#include "cpu.h" #include "feature_collector.h" #include "feature_extractor.h" #include "opt.h" +#if ARCH_X86 +#include "x86/psnr_avx2.h" +#endif + typedef struct PsnrState { bool enable_chroma; bool enable_mse; @@ -35,6 +40,10 @@ typedef struct PsnrState { uint32_t peak; double psnr_max[3]; double min_sse; + void (*compute_sse_8)(const uint8_t *ref, const uint8_t *dis, + unsigned w, unsigned h, + ptrdiff_t stride_ref, ptrdiff_t stride_dis, + uint64_t *sse); struct { uint64_t sse[3]; uint64_t n_pixels[3]; @@ -83,6 +92,25 @@ static const VmafOption options[] = { }; +static void psnr_sse_8_c(const uint8_t *ref, const uint8_t *dis, + unsigned w, unsigned h, + ptrdiff_t stride_ref, ptrdiff_t stride_dis, + uint64_t *sse) +{ + uint64_t total_sse = 0; + for (unsigned i = 0; i < h; i++) { + uint32_t sse_inner = 0; + for (unsigned j = 0; j < w; j++) { + const int16_t e = ref[j] - dis[j]; + sse_inner += e * e; + } + total_sse += sse_inner; + ref += stride_ref; + dis += stride_dis; 
+ } + *sse = total_sse; +} + static int init(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt, unsigned bpc, unsigned w, unsigned h) { @@ -92,11 +120,18 @@ static int init(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt, if (pix_fmt == VMAF_PIX_FMT_YUV400P) s->enable_chroma = false; + s->compute_sse_8 = psnr_sse_8_c; +#if ARCH_X86 + unsigned flags = vmaf_get_cpu_flags(); + if (flags & VMAF_X86_CPU_FLAG_AVX2) + s->compute_sse_8 = psnr_sse_8_avx2; +#endif + for (unsigned i = 0; i < 3; i++) { if (s->min_sse != 0.0) { const int ss_hor = pix_fmt != VMAF_PIX_FMT_YUV444P; const int ss_ver = pix_fmt == VMAF_PIX_FMT_YUV420P; - const double mse = s->min_sse / + const double mse = s->min_sse / (((i && ss_hor) ? w / 2 : w) * ((i && ss_ver) ? h / 2 : h)); s->psnr_max[i] = ceil(10. * log10(s->peak * s->peak / mse)); } else { @@ -123,20 +158,12 @@ static int psnr(VmafPicture *ref_pic, VmafPicture *dist_pic, int err = 0; for (unsigned p = 0; p < n; p++) { - uint8_t *ref = ref_pic->data[p]; - uint8_t *dis = dist_pic->data[p]; - uint64_t sse = 0; - for (unsigned i = 0; i < ref_pic->h[p]; i++) { - uint32_t sse_inner = 0; - for (unsigned j = 0; j < ref_pic->w[p]; j++) { - const int16_t e = ref[j] - dis[j]; - sse_inner += e * e; - } - sse += sse_inner; - ref += ref_pic->stride[p]; - dis += dist_pic->stride[p]; - } + uint64_t sse = 0; + s->compute_sse_8(ref_pic->data[p], dist_pic->data[p], + ref_pic->w[p], ref_pic->h[p], + ref_pic->stride[p], dist_pic->stride[p], + &sse); if (s->enable_apsnr) { s->apsnr.sse[p] += sse; diff --git a/libvmaf/src/feature/integer_vif.c b/libvmaf/src/feature/integer_vif.c index fd6a82317..a9e8b8720 100644 --- a/libvmaf/src/feature/integer_vif.c +++ b/libvmaf/src/feature/integer_vif.c @@ -301,34 +301,19 @@ void vif_statistic_8(struct VifPublicState *s, float *num, float *den, unsigned sigma2_sq = MAX(sigma2_sq, 0); if (sigma1_sq >= sigma_nsq) { - /** - * log values are taken from the look-up table generated by - * log_generate() function which is called in 
integer_combo_threadfunc - * den_val in float is log2(1 + sigma1_sq/2) - * here it is converted to equivalent of log2(2+sigma1_sq) - log2(2) i.e log2(2*65536+sigma1_sq) - 17 - * multiplied by 2048 as log_value = log2(i)*2048 i=16384 to 65535 generated using log_value - * x because best 16 bits are taken - */ accum_den_log += log2_32(log2_table, sigma_nsq + sigma1_sq) - 2048 * 17; if (sigma12 > 0 && sigma2_sq > 0) { - /** - * In floating-point numerator = log2((1.0f + (g * g * sigma1_sq)/(sv_sq + sigma_nsq)) - * - * In Fixed-point the above is converted to - * numerator = log2((sv_sq + sigma_nsq)+(g * g * sigma1_sq))- log2(sv_sq + sigma_nsq) - */ - - const double eps = 65536 * 1.0e-10; - double g = sigma12 / (sigma1_sq + eps); // this epsilon can go away + double g = (double)sigma12 / sigma1_sq; int32_t sv_sq = sigma2_sq - g * sigma12; sv_sq = (uint32_t)(MAX(sv_sq, 0)); g = MIN(g, vif_enhn_gain_limit); + double g_sq = g * g; uint32_t numer1 = (sv_sq + sigma_nsq); - int64_t numer1_tmp = (int64_t)((g * g * sigma1_sq)) + numer1; //numerator + int64_t numer1_tmp = (int64_t)(g_sq * sigma1_sq) + numer1; accum_num_log += log2_64(log2_table, numer1_tmp) - log2_64(log2_table, numer1); } } @@ -444,34 +429,19 @@ void vif_statistic_16(struct VifPublicState *s, float *num, float *den, unsigned sigma2_sq = MAX(sigma2_sq, 0); if (sigma1_sq >= sigma_nsq) { - /** - * log values are taken from the look-up table generated by - * log_generate() function which is called in integer_combo_threadfunc - * den_val in float is log2(1 + sigma1_sq/2) - * here it is converted to equivalent of log2(2+sigma1_sq) - log2(2) i.e log2(2*65536+sigma1_sq) - 17 - * multiplied by 2048 as log_value = log2(i)*2048 i=16384 to 65535 generated using log_value - * x because best 16 bits are taken - */ accum_den_log += log2_32(log2_table, sigma_nsq + sigma1_sq) - 2048 * 17; if (sigma12 > 0 && sigma2_sq > 0) { - /** - * In floating-point numerator = log2((1.0f + (g * g * sigma1_sq)/(sv_sq + sigma_nsq)) - * - * In 
Fixed-point the above is converted to - * numerator = log2((sv_sq + sigma_nsq)+(g * g * sigma1_sq))- log2(sv_sq + sigma_nsq) - */ - - const double eps = 65536 * 1.0e-10; - double g = sigma12 / (sigma1_sq + eps); // this epsilon can go away + double g = (double)sigma12 / sigma1_sq; int32_t sv_sq = sigma2_sq - g * sigma12; sv_sq = (uint32_t)(MAX(sv_sq, 0)); g = MIN(g, vif_enhn_gain_limit); + double g_sq = g * g; uint32_t numer1 = (sv_sq + sigma_nsq); - int64_t numer1_tmp = (int64_t)((g * g * sigma1_sq)) + numer1; //numerator + int64_t numer1_tmp = (int64_t)(g_sq * sigma1_sq) + numer1; accum_num_log += log2_64(log2_table, numer1_tmp) - log2_64(log2_table, numer1); } } @@ -535,34 +505,19 @@ VifResiduals vif_compute_line_residuals(VifPublicState *s, unsigned from, sigma2_sq = MAX(sigma2_sq, 0); if (sigma1_sq >= sigma_nsq) { - /** - * log values are taken from the look-up table generated by - * log_generate() function which is called in integer_combo_threadfunc - * den_val in float is log2(1 + sigma1_sq/2) - * here it is converted to equivalent of log2(2+sigma1_sq) - log2(2) i.e log2(2*65536+sigma1_sq) - 17 - * multiplied by 2048 as log_value = log2(i)*2048 i=16384 to 65535 generated using log_value - * x because best 16 bits are taken - */ residuals.accum_den_log += log2_32(log2_table, sigma_nsq + sigma1_sq) - 2048 * 17; if (sigma12 > 0 && sigma2_sq > 0) { - /** - * In floating-point numerator = log2((1.0f + (g * g * sigma1_sq)/(sv_sq + sigma_nsq)) - * - * In Fixed-point the above is converted to - * numerator = log2((sv_sq + sigma_nsq)+(g * g * sigma1_sq))- log2(sv_sq + sigma_nsq) - */ - - const double eps = 65536 * 1.0e-10; - double g = sigma12 / (sigma1_sq + eps); // this epsilon can go away + double g = (double)sigma12 / sigma1_sq; int32_t sv_sq = sigma2_sq - g * sigma12; sv_sq = (uint32_t)(MAX(sv_sq, 0)); g = MIN(g, vif_enhn_gain_limit); + double g_sq = g * g; uint32_t numer1 = (sv_sq + sigma_nsq); - int64_t numer1_tmp = (int64_t)((g * g * sigma1_sq)) + numer1; 
//numerator + int64_t numer1_tmp = (int64_t)(g_sq * sigma1_sq) + numer1; residuals.accum_num_log += log2_64(log2_table, numer1_tmp) - log2_64(log2_table, numer1); } } diff --git a/libvmaf/src/feature/x86/motion_avx2.c b/libvmaf/src/feature/x86/motion_avx2.c index f8491f36c..118709616 100644 --- a/libvmaf/src/feature/x86/motion_avx2.c +++ b/libvmaf/src/feature/x86/motion_avx2.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "feature/integer_motion.h" #include "feature/common/alignment.h" @@ -132,3 +133,57 @@ void x_convolution_16_avx2(const uint16_t *src, uint16_t *dst, unsigned width, } } } + +void sad_16_avx2(const uint16_t *a, const uint16_t *b, + unsigned w, unsigned h, + ptrdiff_t stride_a, ptrdiff_t stride_b, + uint64_t *sad) +{ + __m256i acc = _mm256_setzero_si256(); + uint64_t total_sad = 0; + + for (unsigned i = 0; i < h; i++) { + unsigned j = 0; + __m256i row_acc = _mm256_setzero_si256(); + + /* Process 16 uint16_t elements at a time */ + for (; j + 16 <= w; j += 16) { + __m256i va = _mm256_loadu_si256((const __m256i *)(a + j)); + __m256i vb = _mm256_loadu_si256((const __m256i *)(b + j)); + /* Compute absolute difference: max(a-b, b-a) for unsigned */ + __m256i diff1 = _mm256_subs_epu16(va, vb); + __m256i diff2 = _mm256_subs_epu16(vb, va); + __m256i absdiff = _mm256_or_si256(diff1, diff2); + /* Widen to 32-bit and accumulate */ + __m256i lo = _mm256_unpacklo_epi16(absdiff, _mm256_setzero_si256()); + __m256i hi = _mm256_unpackhi_epi16(absdiff, _mm256_setzero_si256()); + row_acc = _mm256_add_epi32(row_acc, lo); + row_acc = _mm256_add_epi32(row_acc, hi); + } + + /* Reduce row_acc to scalar (collapse 8x32-bit to 64-bit) */ + __m256i row_acc_lo = _mm256_unpacklo_epi32(row_acc, _mm256_setzero_si256()); + __m256i row_acc_hi = _mm256_unpackhi_epi32(row_acc, _mm256_setzero_si256()); + acc = _mm256_add_epi64(acc, row_acc_lo); + acc = _mm256_add_epi64(acc, row_acc_hi); + + /* Scalar tail */ + uint32_t inner_sad = 0; + for (; j < w; j++) { + inner_sad 
+= abs(a[j] - b[j]); + } + total_sad += inner_sad; + + a += stride_a; + b += stride_b; + } + + /* Horizontal sum of acc (4x64-bit) */ + __m128i lo128 = _mm256_castsi256_si128(acc); + __m128i hi128 = _mm256_extracti128_si256(acc, 1); + __m128i sum128 = _mm_add_epi64(lo128, hi128); + __m128i sum64 = _mm_add_epi64(sum128, _mm_srli_si128(sum128, 8)); + total_sad += (uint64_t)_mm_extract_epi64(sum64, 0); + + *sad = total_sad; +} diff --git a/libvmaf/src/feature/x86/motion_avx2.h b/libvmaf/src/feature/x86/motion_avx2.h index 0b0973b3d..ed97efcb9 100644 --- a/libvmaf/src/feature/x86/motion_avx2.h +++ b/libvmaf/src/feature/x86/motion_avx2.h @@ -25,4 +25,9 @@ void x_convolution_16_avx2(const uint16_t *src, uint16_t *dst, unsigned width, unsigned height, ptrdiff_t src_stride, ptrdiff_t dst_stride); +void sad_16_avx2(const uint16_t *a, const uint16_t *b, + unsigned w, unsigned h, + ptrdiff_t stride_a, ptrdiff_t stride_b, + uint64_t *sad); + #endif /* X86_AVX2_MOTION_H_ */ diff --git a/libvmaf/src/feature/x86/psnr_avx2.c b/libvmaf/src/feature/x86/psnr_avx2.c new file mode 100644 index 000000000..fcd58b3b1 --- /dev/null +++ b/libvmaf/src/feature/x86/psnr_avx2.c @@ -0,0 +1,87 @@ +/** + * + * Copyright 2016-2020 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 + * + */ + +#include <immintrin.h> +#include <stdint.h> +#include <stddef.h> + +void psnr_sse_8_avx2(const uint8_t *ref, const uint8_t *dis, + unsigned w, unsigned h, + ptrdiff_t stride_ref, ptrdiff_t stride_dis, + uint64_t *sse) +{ + __m256i acc = _mm256_setzero_si256(); + uint64_t total_sse = 0; + + for (unsigned i = 0; i < h; i++) { + unsigned j = 0; + __m256i row_acc = _mm256_setzero_si256(); + + /* Process 32 uint8_t elements at a time */ + for (; j + 32 <= w; j += 32) { + __m256i r = _mm256_loadu_si256((const __m256i *)(ref + j)); + __m256i d = _mm256_loadu_si256((const __m256i *)(dis + j)); + + /* Zero-extend low 16 bytes to 16-bit */ + __m256i r_lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r)); + __m256i d_lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(d)); + + /* Zero-extend high 16 bytes to 16-bit */ + __m256i r_hi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(r, 1)); + __m256i d_hi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(d, 1)); + + /* Compute signed differences */ + __m256i diff_lo = _mm256_sub_epi16(r_lo, d_lo); + __m256i diff_hi = _mm256_sub_epi16(r_hi, d_hi); + + /* Square and sum adjacent pairs: int16*int16 -> int32 */ + __m256i sq_lo = _mm256_madd_epi16(diff_lo, diff_lo); + __m256i sq_hi = _mm256_madd_epi16(diff_hi, diff_hi); + + /* Accumulate into 32-bit row accumulator */ + row_acc = _mm256_add_epi32(row_acc, sq_lo); + row_acc = _mm256_add_epi32(row_acc, sq_hi); + } + + /* Widen row_acc from 32-bit to 64-bit and add to main accumulator */ + __m256i row_lo = _mm256_unpacklo_epi32(row_acc, _mm256_setzero_si256()); + __m256i row_hi = _mm256_unpackhi_epi32(row_acc, _mm256_setzero_si256()); + acc = _mm256_add_epi64(acc, row_lo); + acc = _mm256_add_epi64(acc, row_hi); + + /* Scalar tail */ + uint32_t sse_inner = 0; + for (; j < w; j++) { + const int16_t e = ref[j] - dis[j]; + sse_inner += e * e; + } + total_sse += sse_inner; + + ref += stride_ref; + dis += stride_dis; + } + + /* Horizontal sum of acc (4x64-bit) */ + __m128i lo128 = _mm256_castsi256_si128(acc); + __m128i 
hi128 = _mm256_extracti128_si256(acc, 1); + __m128i sum128 = _mm_add_epi64(lo128, hi128); + __m128i sum64 = _mm_add_epi64(sum128, _mm_srli_si128(sum128, 8)); + total_sse += (uint64_t)_mm_extract_epi64(sum64, 0); + + *sse = total_sse; +} diff --git a/libvmaf/src/feature/x86/psnr_avx2.h b/libvmaf/src/feature/x86/psnr_avx2.h new file mode 100644 index 000000000..ee2f6065e --- /dev/null +++ b/libvmaf/src/feature/x86/psnr_avx2.h @@ -0,0 +1,30 @@ +/** + * + * Copyright 2016-2020 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 + * + */ + +#ifndef PSNR_AVX2_H_ +#define PSNR_AVX2_H_ + +#include <stdint.h> +#include <stddef.h> + +void psnr_sse_8_avx2(const uint8_t *ref, const uint8_t *dis, + unsigned w, unsigned h, + ptrdiff_t stride_ref, ptrdiff_t stride_dis, + uint64_t *sse); + +#endif /* PSNR_AVX2_H_ */ diff --git a/libvmaf/src/meson.build b/libvmaf/src/meson.build index 80a34bc4f..6563bddd2 100644 --- a/libvmaf/src/meson.build +++ b/libvmaf/src/meson.build @@ -242,6 +242,7 @@ if is_asm_enabled feature_src_dir + 'x86/vif_avx2.c', feature_src_dir + 'x86/adm_avx2.c', feature_src_dir + 'x86/cambi_avx2.c', + feature_src_dir + 'x86/psnr_avx2.c', ] x86_avx2_static_lib = static_library( diff --git a/libvmaf/src/model.c b/libvmaf/src/model.c index ab623a9fd..02dab9e42 100644 --- a/libvmaf/src/model.c +++ b/libvmaf/src/model.c @@ -191,6 +191,7 @@ void vmaf_model_destroy(VmafModel *model) svm_free_and_destroy_model(&(model->svm)); for (unsigned i = 0; i < model->n_features; i++) { free(model->feature[i].name); + free(model->feature[i].name_with_opt); vmaf_dictionary_free(&model->feature[i].opts_dict); } free(model->feature); diff --git a/libvmaf/src/model.h b/libvmaf/src/model.h index e0a41953b..f1778a972 100644 --- a/libvmaf/src/model.h +++ b/libvmaf/src/model.h @@ -41,6 +41,7 @@ typedef struct { char *name; double slope, intercept; VmafDictionary *opts_dict; + char *name_with_opt; /* cached feature name with options, lazily populated */ } VmafModelFeature; typedef struct { diff --git a/libvmaf/src/predict.c b/libvmaf/src/predict.c index 9926de15b..ea94b8634 100644 --- a/libvmaf/src/predict.c +++ b/libvmaf/src/predict.c @@ -237,49 +237,66 @@ int vmaf_predict_score_at_index(VmafModel *model, int err = 0; - struct svm_node *node = malloc(sizeof(*node) * (model->n_features + 1)); - if (!node) return -ENOMEM; + /* Use stack allocation for typical model sizes, heap for large ones */ + struct svm_node node_buf[32]; + struct svm_node *node; + if (model->n_features + 1 <= 32) { + node = node_buf; + } else { + node = 
 malloc(sizeof(*node) * (model->n_features + 1)); + if (!node) return -ENOMEM; + } for (unsigned i = 0; i < model->n_features; i++) { - VmafFeatureExtractor *fex = - vmaf_get_feature_extractor_by_feature_name(model->feature[i].name, 0); - - if (!fex) { - vmaf_log(VMAF_LOG_LEVEL_ERROR, - "vmaf_predict_score_at_index(): no feature extractor " - "providing feature '%s'\n", model->feature[i].name); - err = -EINVAL; - goto free_node; - } + /* Use cached feature name if available, otherwise generate it */ + const char *feature_name = model->feature[i].name_with_opt; + char *feature_name_alloc = NULL; - VmafDictionary *opts_dict = NULL; - if (model->feature[i].opts_dict) { - err = vmaf_dictionary_copy(&model->feature[i].opts_dict, &opts_dict); - if (err) return err; - } + if (!feature_name) { + VmafFeatureExtractor *fex = + vmaf_get_feature_extractor_by_feature_name(model->feature[i].name, 0); + + if (!fex) { + vmaf_log(VMAF_LOG_LEVEL_ERROR, + "vmaf_predict_score_at_index(): no feature extractor " + "providing feature '%s'\n", model->feature[i].name); + err = -EINVAL; + goto free_node; + } - VmafFeatureExtractorContext *fex_ctx; - err = vmaf_feature_extractor_context_create(&fex_ctx, fex, opts_dict); - if (err) { - vmaf_log(VMAF_LOG_LEVEL_ERROR, - "vmaf_predict_score_at_index(): could not generate " - "feature extractor context\n"); - vmaf_dictionary_free(&opts_dict); - return err; - } + VmafDictionary *opts_dict = NULL; + if (model->feature[i].opts_dict) { + err = vmaf_dictionary_copy(&model->feature[i].opts_dict, &opts_dict); + if (err) goto free_node; + } - char *feature_name = - vmaf_feature_name_from_options(model->feature[i].name, - fex_ctx->fex->options, fex_ctx->fex->priv); + VmafFeatureExtractorContext *fex_ctx; + err = vmaf_feature_extractor_context_create(&fex_ctx, fex, opts_dict); + if (err) { + vmaf_log(VMAF_LOG_LEVEL_ERROR, + "vmaf_predict_score_at_index(): could not generate " + "feature extractor context\n"); + 
vmaf_dictionary_free(&opts_dict); + goto free_node; + } - vmaf_feature_extractor_context_destroy(fex_ctx); + feature_name_alloc = + vmaf_feature_name_from_options(model->feature[i].name, + fex_ctx->fex->options, fex_ctx->fex->priv); - if (!feature_name) { - vmaf_log(VMAF_LOG_LEVEL_ERROR, - "vmaf_predict_score_at_index(): could not generate " - "feature name\n"); - err = -ENOMEM; - goto free_node; + vmaf_feature_extractor_context_destroy(fex_ctx); + + if (!feature_name_alloc) { + vmaf_log(VMAF_LOG_LEVEL_ERROR, + "vmaf_predict_score_at_index(): could not generate " + "feature name\n"); + err = -ENOMEM; + goto free_node; + } + feature_name = feature_name_alloc; + /* Cache for future calls */ + model->feature[i].name_with_opt = feature_name_alloc; + feature_name_alloc = NULL; /* ownership transferred */ } double feature_score; @@ -293,10 +310,10 @@ int vmaf_predict_score_at_index(VmafModel *model, "vmaf_predict_score_at_index(): no feature '%s' " "at index %d\n", feature_name, index); } - free(feature_name); + free(feature_name_alloc); goto free_node; } - free(feature_name); + free(feature_name_alloc); err = normalize(model, model->feature[i].slope, model->feature[i].intercept, &feature_score); @@ -327,7 +344,8 @@ int vmaf_predict_score_at_index(VmafModel *model, *vmaf_score = prediction; free_node: - free(node); + if (node != node_buf) + free(node); return err; } diff --git a/libvmaf/src/thread_pool.c b/libvmaf/src/thread_pool.c index 55d027aed..43c3ad79c 100644 --- a/libvmaf/src/thread_pool.c +++ b/libvmaf/src/thread_pool.c @@ -22,9 +22,12 @@ #include #include +#define JOB_INLINE_DATA_SIZE 64 + typedef struct VmafThreadPoolJob { void (*func)(void *data); void *data; + char inline_data[JOB_INLINE_DATA_SIZE]; struct VmafThreadPoolJob *next; } VmafThreadPoolJob; @@ -38,6 +41,7 @@ typedef struct VmafTreadPool { unsigned n_threads; unsigned n_working; bool stop; + VmafThreadPoolJob *free_jobs; } VmafThreadPool; static VmafThreadPoolJob 
*vmaf_thread_pool_fetch_job(VmafThreadPool *pool) @@ -55,10 +59,23 @@ static VmafThreadPoolJob *vmaf_thread_pool_fetch_job(VmafThreadPool *pool) return job; } +static void vmaf_thread_pool_job_recycle(VmafThreadPool *pool, + VmafThreadPoolJob *job) +{ + if (!job) return; + if (job->data && job->data != job->inline_data) + free(job->data); + job->data = NULL; + job->func = NULL; + job->next = pool->free_jobs; + pool->free_jobs = job; +} + static void vmaf_thread_pool_job_destroy(VmafThreadPoolJob *job) { if (!job) return; - if (job->data) free(job->data); + if (job->data && job->data != job->inline_data) + free(job->data); free(job); } @@ -76,10 +93,11 @@ static void *vmaf_thread_pool_runner(void *p) pthread_mutex_unlock(&(pool->queue.lock)); if (job) { job->func(job->data); - vmaf_thread_pool_job_destroy(job); } pthread_mutex_lock(&(pool->queue.lock)); pool->n_working--; + if (job) + vmaf_thread_pool_job_recycle(pool, job); if (!pool->stop && pool->n_working == 0 && !pool->queue.head) pthread_cond_signal(&(pool->working)); pthread_mutex_unlock(&(pool->queue.lock)); @@ -121,18 +139,39 @@ int vmaf_thread_pool_enqueue(VmafThreadPool *pool, void (*func)(void *data), if (!pool) return -EINVAL; if (!func) return -EINVAL; - VmafThreadPoolJob *job = malloc(sizeof(*job)); - if (!job) return -ENOMEM; + pthread_mutex_lock(&(pool->queue.lock)); + + /* Try to reuse a job from the free list */ + VmafThreadPoolJob *job = pool->free_jobs; + if (job) { + pool->free_jobs = job->next; + } else { + job = malloc(sizeof(*job)); + if (!job) { + pthread_mutex_unlock(&(pool->queue.lock)); + return -ENOMEM; + } + } + memset(job, 0, sizeof(*job)); job->func = func; if (data) { - job->data = malloc(data_sz); - if (!job->data) goto free_job; - memcpy(job->data, data, data_sz); + if (data_sz <= JOB_INLINE_DATA_SIZE) { + memcpy(job->inline_data, data, data_sz); + job->data = job->inline_data; + } else { + job->data = malloc(data_sz); + if (!job->data) { + /* Return job to free list */ + 
job->next = pool->free_jobs; + pool->free_jobs = job; + pthread_mutex_unlock(&(pool->queue.lock)); + return -ENOMEM; + } + memcpy(job->data, data, data_sz); + } } - pthread_mutex_lock(&(pool->queue.lock)); - if (!pool->queue.head) { pool->queue.head = job; pool->queue.tail = pool->queue.head; @@ -141,14 +180,10 @@ int vmaf_thread_pool_enqueue(VmafThreadPool *pool, void (*func)(void *data), pool->queue.tail = job; } - pthread_cond_broadcast(&(pool->queue.empty)); + pthread_cond_signal(&(pool->queue.empty)); pthread_mutex_unlock(&(pool->queue.lock)); return 0; - -free_job: - free(job); - return -ENOMEM; } int vmaf_thread_pool_wait(VmafThreadPool *pool) @@ -178,6 +213,15 @@ int vmaf_thread_pool_destroy(VmafThreadPool *pool) pthread_cond_broadcast(&(pool->queue.empty)); pthread_mutex_unlock(&(pool->queue.lock)); vmaf_thread_pool_wait(pool); + + /* Free all pooled job objects */ + job = pool->free_jobs; + while (job) { + VmafThreadPoolJob *next_job = job->next; + free(job); + job = next_job; + } + pthread_mutex_destroy(&(pool->queue.lock)); pthread_cond_destroy(&(pool->queue.empty)); pthread_cond_destroy(&(pool->working)); diff --git a/libvmaf/test/meson.build b/libvmaf/test/meson.build index 1fe4d422c..a5575c385 100644 --- a/libvmaf/test/meson.build +++ b/libvmaf/test/meson.build @@ -177,3 +177,12 @@ test('test_cli_parse', test_cli_parse) test('test_psnr', test_psnr) test('test_framesync', test_framesync) test('test_propagate_metadata', test_propagate_metadata) + +test_perf_optimizations = executable('test_perf_optimizations', + ['test.c', 'test_perf_optimizations.c'], + include_directories : [libvmaf_inc, test_inc, include_directories('../src/'), include_directories('../src/feature/')], + link_with : get_option('default_library') == 'both' ? 
libvmaf.get_static_lib() : libvmaf, + dependencies : [math_lib, stdatomic_dependency, thread_lib, cuda_dependency], +) + +test('test_perf_optimizations', test_perf_optimizations) diff --git a/libvmaf/test/test_perf_optimizations.c b/libvmaf/test/test_perf_optimizations.c new file mode 100644 index 000000000..4b48d3786 --- /dev/null +++ b/libvmaf/test/test_perf_optimizations.c @@ -0,0 +1,570 @@ +/** + * + * Copyright 2016-2020 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include +#include + +#include "test.h" +#include "thread_pool.h" +#include "feature/feature_collector.h" +#include "feature/feature_extractor.h" +#include "predict.h" +#include "libvmaf/libvmaf.h" +#include "libvmaf/picture.h" +#include "libvmaf/model.h" + +/* ======================================================================== + * Helper utilities + * ======================================================================== */ + +#define EPS 0.00001 + +static int almost_equal(double a, double b) { + double diff = a > b ? 
a - b : b - a; + return diff < EPS; +} + +/* ======================================================================== + * Thread pool tests (thundering herd fix, signal vs broadcast) + * ======================================================================== */ + +static volatile int tp_counter = 0; +static pthread_mutex_t tp_counter_lock = PTHREAD_MUTEX_INITIALIZER; + +static void increment_counter(void *data) +{ + (void)data; + pthread_mutex_lock(&tp_counter_lock); + tp_counter++; + pthread_mutex_unlock(&tp_counter_lock); +} + +static char *test_thread_pool_many_jobs() +{ + int err; + VmafThreadPool *pool; + const unsigned n_threads = 4; + const unsigned n_jobs = 1000; + + tp_counter = 0; + + err = vmaf_thread_pool_create(&pool, n_threads); + mu_assert("problem creating thread pool", !err); + + for (unsigned i = 0; i < n_jobs; i++) { + err = vmaf_thread_pool_enqueue(pool, increment_counter, NULL, 0); + mu_assert("problem enqueuing job", !err); + } + + err = vmaf_thread_pool_wait(pool); + mu_assert("problem waiting for thread pool", !err); + + mu_assert("not all jobs completed (thundering herd regression)", + tp_counter == (int)n_jobs); + + err = vmaf_thread_pool_destroy(pool); + mu_assert("problem destroying thread pool", !err); + + return NULL; +} + +static void accumulate_value(void *data) +{ + int *val = data; + pthread_mutex_lock(&tp_counter_lock); + tp_counter += *val; + pthread_mutex_unlock(&tp_counter_lock); +} + +static char *test_thread_pool_data_passing() +{ + int err; + VmafThreadPool *pool; + + tp_counter = 0; + + err = vmaf_thread_pool_create(&pool, 2); + mu_assert("problem creating thread pool", !err); + + for (int i = 1; i <= 100; i++) { + err = vmaf_thread_pool_enqueue(pool, accumulate_value, &i, sizeof(i)); + mu_assert("problem enqueuing job with data", !err); + } + + err = vmaf_thread_pool_wait(pool); + mu_assert("problem waiting for thread pool", !err); + + /* Sum of 1..100 = 5050 */ + mu_assert("data passing through thread pool failed", + 
tp_counter == 5050); + + err = vmaf_thread_pool_destroy(pool); + mu_assert("problem destroying thread pool", !err); + + return NULL; +} + +/* ======================================================================== + * Feature collector tests (capacity 512, realloc behavior) + * ======================================================================== */ + +static char *test_feature_vector_capacity_512() +{ + int err; + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("problem during vmaf_feature_collector_init", !err); + + /* Append 500 scores to a single feature - should not realloc since + * initial capacity is now 512 */ + for (unsigned i = 0; i < 500; i++) { + err = vmaf_feature_collector_append(fc, "test_feature", (double)i, i); + mu_assert("problem during vmaf_feature_collector_append", !err); + } + + /* Verify all scores are retrievable */ + double score; + err = vmaf_feature_collector_get_score(fc, "test_feature", &score, 0); + mu_assert("could not get score at index 0", !err); + mu_assert("score at index 0 incorrect", almost_equal(score, 0.0)); + + err = vmaf_feature_collector_get_score(fc, "test_feature", &score, 499); + mu_assert("could not get score at index 499", !err); + mu_assert("score at index 499 incorrect", almost_equal(score, 499.0)); + + /* Now append beyond 512 - should trigger realloc */ + for (unsigned i = 500; i < 600; i++) { + err = vmaf_feature_collector_append(fc, "test_feature", (double)i, i); + mu_assert("problem appending beyond initial capacity", !err); + } + + err = vmaf_feature_collector_get_score(fc, "test_feature", &score, 599); + mu_assert("could not get score after realloc", !err); + mu_assert("score after realloc incorrect", almost_equal(score, 599.0)); + + vmaf_feature_collector_destroy(fc); + return NULL; +} + +static char *test_feature_collector_multiple_features() +{ + int err; + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("problem during init", !err); + + 
/* Add 20 different features - tests collector capacity growth */ + char name[64]; + for (unsigned f = 0; f < 20; f++) { + snprintf(name, sizeof(name), "feature_%u", f); + for (unsigned i = 0; i < 10; i++) { + err = vmaf_feature_collector_append(fc, name, (double)(f * 10 + i), i); + mu_assert("problem during append", !err); + } + } + + /* Verify retrieval from various features */ + double score; + err = vmaf_feature_collector_get_score(fc, "feature_0", &score, 5); + mu_assert("get_score failed for feature_0", !err); + mu_assert("wrong score for feature_0[5]", almost_equal(score, 5.0)); + + err = vmaf_feature_collector_get_score(fc, "feature_19", &score, 9); + mu_assert("get_score failed for feature_19", !err); + mu_assert("wrong score for feature_19[9]", almost_equal(score, 199.0)); + + vmaf_feature_collector_destroy(fc); + return NULL; +} + +/* ======================================================================== + * Predict tests (stack-allocated SVM nodes, cached feature names) + * ======================================================================== */ + +static char *test_predict_score_consistency() +{ + int err; + + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("problem during vmaf_feature_collector_init", !err); + + VmafModel *model; + VmafModelConfig cfg = { + .name = "vmaf", + .flags = VMAF_MODEL_FLAGS_DEFAULT, + }; + err = vmaf_model_load(&model, &cfg, "vmaf_v0.6.1"); + mu_assert("problem during vmaf_model_load", !err); + + /* Insert known feature scores */ + for (unsigned i = 0; i < model->n_features; i++) { + err = vmaf_feature_collector_append(fc, model->feature[i].name, 60., 0); + mu_assert("problem during feature append", !err); + } + + /* Predict twice to test caching of name_with_opt */ + double score1 = 0., score2 = 0.; + err = vmaf_predict_score_at_index(model, fc, 0, &score1, true, false, 0); + mu_assert("first prediction failed", !err); + mu_assert("first prediction out of range", score1 > 0. 
&& score1 <= 100.); + + /* Insert scores at index 1 too */ + for (unsigned i = 0; i < model->n_features; i++) { + err = vmaf_feature_collector_append(fc, model->feature[i].name, 60., 1); + mu_assert("problem during feature append at index 1", !err); + } + + err = vmaf_predict_score_at_index(model, fc, 1, &score2, true, false, 0); + mu_assert("second prediction failed", !err); + + /* Same inputs should give same score */ + mu_assert("cached prediction differs from first", + almost_equal(score1, score2)); + + vmaf_model_destroy(model); + vmaf_feature_collector_destroy(fc); + return NULL; +} + +/* ======================================================================== + * Feature extractor correctness tests (PSNR, VIF, Motion, ADM) + * ======================================================================== */ + +static int alloc_picture_8b(VmafPicture *pic, unsigned w, unsigned h, + uint8_t fill_value) +{ + int err = vmaf_picture_alloc(pic, VMAF_PIX_FMT_YUV400P, 8, w, h); + if (err) return err; + uint8_t *data = pic->data[0]; + for (unsigned i = 0; i < h; i++) { + memset(data, fill_value, w); + data += pic->stride[0]; + } + return 0; +} + +static char *test_psnr_identical_frames() +{ + VmafPicture ref, dist; + int err = 0; + + err |= alloc_picture_8b(&ref, 64, 64, 128); + err |= alloc_picture_8b(&dist, 64, 64, 128); + mu_assert("picture alloc failed", !err); + + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("feature collector init failed", !err); + + VmafFeatureExtractor *fex = + vmaf_get_feature_extractor_by_name("psnr"); + mu_assert("could not find psnr feature extractor", fex); + + VmafFeatureExtractorContext *fex_ctx; + err = vmaf_feature_extractor_context_create(&fex_ctx, fex, NULL); + mu_assert("fex context create failed", !err); + + err = vmaf_feature_extractor_context_extract(fex_ctx, &ref, NULL, + &dist, NULL, 0, fc); + mu_assert("psnr extract failed", !err); + + double psnr_y; + err = vmaf_feature_collector_get_score(fc, 
"psnr_y", &psnr_y, 0); + mu_assert("get psnr_y score failed", !err); + + /* Identical frames should give maximum PSNR */ + mu_assert("identical frames should have max PSNR", + psnr_y >= 60.0); + + vmaf_feature_extractor_context_close(fex_ctx); + vmaf_feature_extractor_context_destroy(fex_ctx); + vmaf_feature_collector_destroy(fc); + vmaf_picture_unref(&ref); + vmaf_picture_unref(&dist); + + return NULL; +} + +static char *test_psnr_different_frames() +{ + VmafPicture ref, dist; + int err = 0; + + err |= alloc_picture_8b(&ref, 64, 64, 0); + err |= alloc_picture_8b(&dist, 64, 64, 255); + mu_assert("picture alloc failed", !err); + + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("feature collector init failed", !err); + + VmafFeatureExtractor *fex = + vmaf_get_feature_extractor_by_name("psnr"); + mu_assert("could not find psnr feature extractor", fex); + + VmafFeatureExtractorContext *fex_ctx; + err = vmaf_feature_extractor_context_create(&fex_ctx, fex, NULL); + mu_assert("fex context create failed", !err); + + err = vmaf_feature_extractor_context_extract(fex_ctx, &ref, NULL, + &dist, NULL, 0, fc); + mu_assert("psnr extract failed", !err); + + double psnr_y; + err = vmaf_feature_collector_get_score(fc, "psnr_y", &psnr_y, 0); + mu_assert("get psnr_y score failed", !err); + + /* Max difference (0 vs 255) should give very low PSNR */ + mu_assert("max difference should have very low PSNR", psnr_y < 10.0); + + vmaf_feature_extractor_context_close(fex_ctx); + vmaf_feature_extractor_context_destroy(fex_ctx); + vmaf_feature_collector_destroy(fc); + vmaf_picture_unref(&ref); + vmaf_picture_unref(&dist); + + return NULL; +} + +static char *test_vif_identical_frames() +{ + VmafPicture ref, dist; + int err = 0; + + err |= alloc_picture_8b(&ref, 64, 64, 128); + err |= alloc_picture_8b(&dist, 64, 64, 128); + mu_assert("picture alloc failed", !err); + + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("feature collector 
init failed", !err); + + VmafFeatureExtractor *fex = + vmaf_get_feature_extractor_by_name("vif"); + mu_assert("could not find vif feature extractor", fex); + + VmafFeatureExtractorContext *fex_ctx; + err = vmaf_feature_extractor_context_create(&fex_ctx, fex, NULL); + mu_assert("vif fex context create failed", !err); + + err = vmaf_feature_extractor_context_extract(fex_ctx, &ref, NULL, + &dist, NULL, 0, fc); + mu_assert("vif extract failed", !err); + + double vif_s0; + err = vmaf_feature_collector_get_score(fc, + "VMAF_integer_feature_vif_scale0_score", &vif_s0, 0); + mu_assert("get vif scale0 score failed", !err); + + /* Identical frames should give VIF = 1.0 */ + mu_assert("identical frames should have VIF ~1.0", + vif_s0 > 0.99 && vif_s0 < 1.01); + + vmaf_feature_extractor_context_close(fex_ctx); + vmaf_feature_extractor_context_destroy(fex_ctx); + vmaf_feature_collector_destroy(fc); + vmaf_picture_unref(&ref); + vmaf_picture_unref(&dist); + + return NULL; +} + +static char *test_motion_identical_frames() +{ + VmafPicture ref; + int err = 0; + + err = alloc_picture_8b(&ref, 64, 64, 100); + mu_assert("picture alloc failed", !err); + + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("feature collector init failed", !err); + + VmafFeatureExtractor *fex = + vmaf_get_feature_extractor_by_name("motion"); + mu_assert("could not find motion feature extractor", fex); + + VmafFeatureExtractorContext *fex_ctx; + err = vmaf_feature_extractor_context_create(&fex_ctx, fex, NULL); + mu_assert("motion fex context create failed", !err); + + /* Feed the same frame twice (index 0 and 1) */ + err = vmaf_feature_extractor_context_extract(fex_ctx, &ref, NULL, + &ref, NULL, 0, fc); + mu_assert("motion extract failed (frame 0)", !err); + + err = vmaf_feature_extractor_context_extract(fex_ctx, &ref, NULL, + &ref, NULL, 1, fc); + mu_assert("motion extract failed (frame 1)", !err); + + /* Motion of identical frames should be 0 */ + double motion_score; + err 
= vmaf_feature_collector_get_score(fc, + "VMAF_integer_feature_motion2_score", &motion_score, 0); + mu_assert("get motion2 score failed", !err); + mu_assert("identical frames should have motion score 0", + almost_equal(motion_score, 0.0)); + + vmaf_feature_extractor_context_close(fex_ctx); + vmaf_feature_extractor_context_destroy(fex_ctx); + vmaf_feature_collector_destroy(fc); + vmaf_picture_unref(&ref); + + return NULL; +} + +static char *test_adm_identical_frames() +{ + VmafPicture ref, dist; + int err = 0; + + err |= alloc_picture_8b(&ref, 64, 64, 128); + err |= alloc_picture_8b(&dist, 64, 64, 128); + mu_assert("picture alloc failed", !err); + + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("feature collector init failed", !err); + + VmafFeatureExtractor *fex = + vmaf_get_feature_extractor_by_name("adm"); + mu_assert("could not find adm feature extractor", fex); + + VmafFeatureExtractorContext *fex_ctx; + err = vmaf_feature_extractor_context_create(&fex_ctx, fex, NULL); + mu_assert("adm fex context create failed", !err); + + err = vmaf_feature_extractor_context_extract(fex_ctx, &ref, NULL, + &dist, NULL, 0, fc); + mu_assert("adm extract failed", !err); + + double adm2; + err = vmaf_feature_collector_get_score(fc, + "VMAF_integer_feature_adm2_score", &adm2, 0); + mu_assert("get adm2 score failed", !err); + + /* Identical frames should give ADM2 = 1.0 */ + mu_assert("identical frames should have ADM2 ~1.0", + adm2 > 0.99 && adm2 < 1.01); + + vmaf_feature_extractor_context_close(fex_ctx); + vmaf_feature_extractor_context_destroy(fex_ctx); + vmaf_feature_collector_destroy(fc); + vmaf_picture_unref(&ref); + vmaf_picture_unref(&dist); + + return NULL; +} + +/* ======================================================================== + * End-to-end VMAF score test + * ======================================================================== */ + +static char *test_vmaf_score_range() +{ + int err; + VmafConfiguration vmaf_cfg = { 0 }; + 
VmafContext *vmaf; + err = vmaf_init(&vmaf, vmaf_cfg); + mu_assert("vmaf_init failed", !err); + + VmafModelConfig model_cfg = { .name = "vmaf" }; + VmafModel *model; + err = vmaf_model_load(&model, &model_cfg, "vmaf_v0.6.1"); + mu_assert("model load failed", !err); + + err = vmaf_use_features_from_model(vmaf, model); + mu_assert("use features from model failed", !err); + + /* Create identical ref/dist pictures */ + VmafPicture ref, dist; + err = vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, 8, 64, 64); + mu_assert("ref alloc failed", !err); + err = vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, 8, 64, 64); + mu_assert("dist alloc failed", !err); + + /* Fill with a pattern */ + for (int p = 0; p < 3; p++) { + uint8_t *rdata = ref.data[p]; + uint8_t *ddata = dist.data[p]; + for (unsigned i = 0; i < ref.h[p]; i++) { + for (unsigned j = 0; j < ref.w[p]; j++) { + rdata[j] = (uint8_t)((i + j) & 0xFF); + ddata[j] = (uint8_t)((i + j) & 0xFF); + } + rdata += ref.stride[p]; + ddata += dist.stride[p]; + } + } + + err = vmaf_read_pictures(vmaf, &ref, &dist, 0); + mu_assert("read pictures failed", !err); + + /* Flush */ + err = vmaf_read_pictures(vmaf, NULL, NULL, 0); + mu_assert("flush failed", !err); + + double vmaf_score; + err = vmaf_score_at_index(vmaf, model, &vmaf_score, 0); + mu_assert("score at index failed", !err); + + /* Identical frames should give VMAF near 100 */ + mu_assert("VMAF score for identical frames should be high (> 90)", + vmaf_score > 90.0); + mu_assert("VMAF score should not exceed 100", vmaf_score <= 100.0); + + vmaf_model_destroy(model); + vmaf_close(vmaf); + + return NULL; +} + +/* ======================================================================== + * Test runner + * ======================================================================== */ + +char *run_tests() +{ + /* Thread pool tests */ + mu_run_test(test_thread_pool_many_jobs); + mu_run_test(test_thread_pool_data_passing); + + /* Feature collector tests */ + 
mu_run_test(test_feature_vector_capacity_512); + mu_run_test(test_feature_collector_multiple_features); + + /* Predict tests */ + mu_run_test(test_predict_score_consistency); + + /* Feature extractor correctness tests */ + mu_run_test(test_psnr_identical_frames); + mu_run_test(test_psnr_different_frames); + mu_run_test(test_vif_identical_frames); + mu_run_test(test_motion_identical_frames); + mu_run_test(test_adm_identical_frames); + + /* End-to-end test */ + mu_run_test(test_vmaf_score_range); + + return NULL; +}