diff --git a/libvmaf/src/feature/common/convolution.c b/libvmaf/src/feature/common/convolution.c index e74f51dc4..9cc4e3edf 100644 --- a/libvmaf/src/feature/common/convolution.c +++ b/libvmaf/src/feature/common/convolution.c @@ -32,20 +32,24 @@ void convolution_x_c_s(const float *filter, int filter_width, const float *src, int borders_right = vmaf_floorn(width - (filter_width - radius), step); for (int i = 0; i < height; ++i) { + const float *src_row = src + i * src_stride; + float *dst_row = dst + i * dst_stride; + for (int j = 0; j < borders_left; j += step) { - dst[i * dst_stride + j / step] = convolution_edge_s(true, filter, filter_width, src, width, height, src_stride, i, j); + dst_row[j / step] = convolution_edge_s(true, filter, filter_width, src, width, height, src_stride, i, j); } for (int j = borders_left; j < borders_right; j += step) { + const float *src_ptr = src_row + j - radius; float accum = 0; for (int k = 0; k < filter_width; ++k) { - accum += filter[k] * src[i * src_stride + j - radius + k]; + accum += filter[k] * src_ptr[k]; } - dst[i * dst_stride + j / step] = accum; + dst_row[j / step] = accum; } for (int j = borders_right; j < width; j += step) { - dst[i * dst_stride + j / step] = convolution_edge_s(true, filter, filter_width, src, width, height, src_stride, i, j); + dst_row[j / step] = convolution_edge_s(true, filter, filter_width, src, width, height, src_stride, i, j); } } } @@ -62,12 +66,13 @@ void convolution_y_c_s(const float *filter, int filter_width, const float *src, } } for (int i = borders_top; i < borders_bottom; i += step) { + float *dst_row = dst + (i / step) * dst_stride; for (int j = 0; j < width; ++j) { float accum = 0; for (int k = 0; k < filter_width; ++k) { accum += filter[k] * src[(i - radius + k) * src_stride + j]; } - dst[(i / step) * dst_stride + j] = accum; + dst_row[j] = accum; } } for (int i = borders_bottom; i < height; i += step) { diff --git a/libvmaf/src/feature/feature_collector.c 
b/libvmaf/src/feature/feature_collector.c index 2eb2ec643..7dd531c00 100644 --- a/libvmaf/src/feature/feature_collector.c +++ b/libvmaf/src/feature/feature_collector.c @@ -152,7 +152,7 @@ static int feature_vector_init(FeatureVector **const feature_vector, fv->name = malloc(strlen(name) + 1); if (!fv->name) goto free_fv; strcpy(fv->name, name); - fv->capacity = 8; + fv->capacity = 512; fv->score = malloc(sizeof(fv->score[0]) * fv->capacity); if (!fv->score) goto free_name; memset(fv->score, 0, sizeof(fv->score[0]) * fv->capacity); diff --git a/libvmaf/src/feature/integer_adm.c b/libvmaf/src/feature/integer_adm.c index 269dbb5e8..1b9c04c56 100644 --- a/libvmaf/src/feature/integer_adm.c +++ b/libvmaf/src/feature/integer_adm.c @@ -723,12 +723,13 @@ static void adm_decouple(AdmBuffer *buf, int w, int h, int stride, t_mag_sq = (int64_t)th * th + (int64_t)tv * tv; /** - * angle_flag is calculated in floating-point by converting fixed-point variables back to - * floating-point + * angle_flag is calculated using fixed-point arithmetic. + * The 4096.0 divisors cancel out in the squared comparison, + * so we can compare directly in the integer domain. 
*/ - int angle_flag = (((float)ot_dp / 4096.0) >= 0.0f) && - (((float)ot_dp / 4096.0) * ((float)ot_dp / 4096.0) >= - cos_1deg_sq * ((float)o_mag_sq / 4096.0) * ((float)t_mag_sq / 4096.0)); + int angle_flag = (ot_dp >= 0) && + ((double)ot_dp * ot_dp >= + cos_1deg_sq * (double)o_mag_sq * t_mag_sq); /** * Division th/oh is carried using lookup table and converted to multiplication @@ -753,18 +754,16 @@ static void adm_decouple(AdmBuffer *buf, int w, int h, int stride, rst_v = ((kv * ov) + 16384) >> 15; rst_d = ((kd * od) + 16384) >> 15; - const float rst_h_f = ((float)kh / 32768) * ((float)oh / 64); - const float rst_v_f = ((float)kv / 32768) * ((float)ov / 64); - const float rst_d_f = ((float)kd / 32768) * ((float)od / 64); + /* Sign of rst_*_f = sign(k) * sign(o), but k is clamped >= 0, + * so the sign is determined solely by oh/ov/od. */ + if (angle_flag && (kh > 0) && (oh > 0)) rst_h = MIN((rst_h * adm_enhn_gain_limit), th); + if (angle_flag && (kh > 0) && (oh < 0)) rst_h = MAX((rst_h * adm_enhn_gain_limit), th); - if (angle_flag && (rst_h_f > 0.)) rst_h = MIN((rst_h * adm_enhn_gain_limit), th); - if (angle_flag && (rst_h_f < 0.)) rst_h = MAX((rst_h * adm_enhn_gain_limit), th); + if (angle_flag && (kv > 0) && (ov > 0)) rst_v = MIN(rst_v * adm_enhn_gain_limit, tv); + if (angle_flag && (kv > 0) && (ov < 0)) rst_v = MAX(rst_v * adm_enhn_gain_limit, tv); - if (angle_flag && (rst_v_f > 0.)) rst_v = MIN(rst_v * adm_enhn_gain_limit, tv); - if (angle_flag && (rst_v_f < 0.)) rst_v = MAX(rst_v * adm_enhn_gain_limit, tv); - - if (angle_flag && (rst_d_f > 0.)) rst_d = MIN(rst_d * adm_enhn_gain_limit, td); - if (angle_flag && (rst_d_f < 0.)) rst_d = MAX(rst_d * adm_enhn_gain_limit, td); + if (angle_flag && (kd > 0) && (od > 0)) rst_d = MIN(rst_d * adm_enhn_gain_limit, td); + if (angle_flag && (kd > 0) && (od < 0)) rst_d = MAX(rst_d * adm_enhn_gain_limit, td); r->band_h[i * stride + j] = rst_h; r->band_v[i * stride + j] = rst_v; @@ -854,9 +853,9 @@ static void 
adm_decouple_s123(AdmBuffer *buf, int w, int h, int stride, o_mag_sq = (int64_t)oh * oh + (int64_t)ov * ov; t_mag_sq = (int64_t)th * th + (int64_t)tv * tv; - int angle_flag = (((float)ot_dp / 4096.0) >= 0.0f) && - (((float)ot_dp / 4096.0) * ((float)ot_dp / 4096.0) >= - cos_1deg_sq * ((float)o_mag_sq / 4096.0) * ((float)t_mag_sq / 4096.0)); + int angle_flag = (ot_dp >= 0) && + ((double)ot_dp * ot_dp >= + cos_1deg_sq * (double)o_mag_sq * t_mag_sq); /** * Division th/oh is carried using lookup table and converted to multiplication @@ -905,18 +904,16 @@ static void adm_decouple_s123(AdmBuffer *buf, int w, int h, int stride, rst_v = ((kv * ov) + 16384) >> 15; rst_d = ((kd * od) + 16384) >> 15; - const float rst_h_f = ((float)kh / 32768) * ((float)oh / 64); - const float rst_v_f = ((float)kv / 32768) * ((float)ov / 64); - const float rst_d_f = ((float)kd / 32768) * ((float)od / 64); - - if (angle_flag && (rst_h_f > 0.)) rst_h = MIN((rst_h * adm_enhn_gain_limit), th); - if (angle_flag && (rst_h_f < 0.)) rst_h = MAX((rst_h * adm_enhn_gain_limit), th); + /* Sign of rst_*_f = sign(k) * sign(o), but k is clamped >= 0, + * so the sign is determined solely by oh/ov/od. 
*/ + if (angle_flag && (kh > 0) && (oh > 0)) rst_h = MIN((rst_h * adm_enhn_gain_limit), th); + if (angle_flag && (kh > 0) && (oh < 0)) rst_h = MAX((rst_h * adm_enhn_gain_limit), th); - if (angle_flag && (rst_v_f > 0.)) rst_v = MIN(rst_v * adm_enhn_gain_limit, tv); - if (angle_flag && (rst_v_f < 0.)) rst_v = MAX(rst_v * adm_enhn_gain_limit, tv); + if (angle_flag && (kv > 0) && (ov > 0)) rst_v = MIN(rst_v * adm_enhn_gain_limit, tv); + if (angle_flag && (kv > 0) && (ov < 0)) rst_v = MAX(rst_v * adm_enhn_gain_limit, tv); - if (angle_flag && (rst_d_f > 0.)) rst_d = MIN(rst_d * adm_enhn_gain_limit, td); - if (angle_flag && (rst_d_f < 0.)) rst_d = MAX(rst_d * adm_enhn_gain_limit, td); + if (angle_flag && (kd > 0) && (od > 0)) rst_d = MIN(rst_d * adm_enhn_gain_limit, td); + if (angle_flag && (kd > 0) && (od < 0)) rst_d = MAX(rst_d * adm_enhn_gain_limit, td); r->band_h[i * stride + j] = rst_h; r->band_v[i * stride + j] = rst_v; @@ -961,8 +958,8 @@ static void adm_csf(AdmBuffer *buf, int w, int h, int stride, i_rfactor[2] = 49417; } else { - const double pow2_21 = pow(2, 21); - const double pow2_23 = pow(2, 23); + const double pow2_21 = 2097152.0; /* 2^21 */ + const double pow2_23 = 8388608.0; /* 2^23 */ i_rfactor[0] = (uint16_t) (rfactor1[0] * pow2_21); i_rfactor[1] = (uint16_t) (rfactor1[1] * pow2_21); i_rfactor[2] = (uint16_t) (rfactor1[2] * pow2_23); @@ -1039,7 +1036,7 @@ static void i4_adm_csf(AdmBuffer *buf, int scale, int w, int h, int stride, const float rfactor1[3] = { 1.0f / factor1, 1.0f / factor1, 1.0f / factor2 }; //i_rfactor in fixed-point - const double pow2_32 = pow(2, 32); + const double pow2_32 = 4294967296.0; /* 2^32 */ const uint32_t i_rfactor[3] = { (uint32_t)(rfactor1[0] * pow2_32), (uint32_t)(rfactor1[1] * pow2_32), (uint32_t)(rfactor1[2] * pow2_32) }; @@ -1167,10 +1164,13 @@ static float adm_csf_den_scale(const adm_dwt_band_t *src, int w, int h, * after cubing 18bits are to shifted * Hence final shift is 18-shift_accum */ - double shift_csf = pow(2, 
(18 - shift_accum)); - double csf_h = (double)(accum_h / shift_csf) * pow(rfactor[0], 3); - double csf_v = (double)(accum_v / shift_csf) * pow(rfactor[1], 3); - double csf_d = (double)(accum_d / shift_csf) * pow(rfactor[2], 3); + double shift_csf = ldexp(1.0, (int)(18 - shift_accum)); + double rfactor0_cubed = (double)rfactor[0] * rfactor[0] * rfactor[0]; + double rfactor1_cubed = (double)rfactor[1] * rfactor[1] * rfactor[1]; + double rfactor2_cubed = (double)rfactor[2] * rfactor[2] * rfactor[2]; + double csf_h = (double)(accum_h / shift_csf) * rfactor0_cubed; + double csf_v = (double)(accum_v / shift_csf) * rfactor1_cubed; + double csf_d = (double)(accum_d / shift_csf) * rfactor2_cubed; float powf_add = powf((bottom - top) * (right - left) / 32.0f, 1.0f / 3.0f); float den_scale_h = powf(csf_h, 1.0f / 3.0f) + powf_add; @@ -1206,9 +1206,9 @@ static float adm_csf_den_s123(const i4_adm_dwt_band_t *src, int scale, int w, in const int bottom = h - top; uint32_t shift_cub = (uint32_t)ceil(log2(right - left)); - uint32_t add_shift_cub = (uint32_t)pow(2, (shift_cub - 1)); + uint32_t add_shift_cub = 1u << (shift_cub - 1); uint32_t shift_accum = (uint32_t)ceil(log2(bottom - top)); - uint32_t add_shift_accum = (uint32_t)pow(2, (shift_accum - 1)); + uint32_t add_shift_accum = 1u << (shift_accum - 1); int32_t *src_h = src->band_h + top * src_stride; int32_t *src_v = src->band_v + top * src_stride; @@ -1250,10 +1250,13 @@ static float adm_csf_den_s123(const i4_adm_dwt_band_t *src, int scale, int w, in * All the results are converted to floating-point to calculate the scores * For all scales the final shift is 3*shifts from dwt - total shifts done here */ - double shift_csf = pow(2, (accum_convert_float[scale - 1] - shift_accum - shift_cub)); - double csf_h = (double)(accum_h / shift_csf) * pow(rfactor[0], 3); - double csf_v = (double)(accum_v / shift_csf) * pow(rfactor[1], 3); - double csf_d = (double)(accum_d / shift_csf) * pow(rfactor[2], 3); + double shift_csf = ldexp(1.0, 
(int)(accum_convert_float[scale - 1] - shift_accum - shift_cub)); + double rfactor0_cubed = (double)rfactor[0] * rfactor[0] * rfactor[0]; + double rfactor1_cubed = (double)rfactor[1] * rfactor[1] * rfactor[1]; + double rfactor2_cubed = (double)rfactor[2] * rfactor[2] * rfactor[2]; + double csf_h = (double)(accum_h / shift_csf) * rfactor0_cubed; + double csf_v = (double)(accum_v / shift_csf) * rfactor1_cubed; + double csf_d = (double)(accum_d / shift_csf) * rfactor2_cubed; float powf_add = powf((bottom - top) * (right - left) / 32.0f, 1.0f / 3.0f); float den_scale_h = powf(csf_h, 1.0f / 3.0f) + powf_add; @@ -1291,8 +1294,8 @@ static float adm_cm(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_stri i_rfactor[2] = 49417; } else { - const double pow2_21 = pow(2, 21); - const double pow2_23 = pow(2, 23); + const double pow2_21 = 2097152.0; /* 2^21 */ + const double pow2_23 = 8388608.0; /* 2^23 */ i_rfactor[0] = (uint16_t) (rfactor1[0] * pow2_21); i_rfactor[1] = (uint16_t) (rfactor1[1] * pow2_21); i_rfactor[2] = (uint16_t) (rfactor1[2] * pow2_23); @@ -1306,16 +1309,16 @@ static float adm_cm(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_stri const int32_t add_shift_xdsq = 536870912; const uint32_t shift_xhcub = (uint32_t)ceil(log2(w) - 4); - const uint32_t add_shift_xhcub = (uint32_t)pow(2, (shift_xhcub - 1)); + const uint32_t add_shift_xhcub = 1u << (shift_xhcub - 1); const uint32_t shift_xvcub = (uint32_t)ceil(log2(w) - 4); - const uint32_t add_shift_xvcub = (uint32_t)pow(2, (shift_xvcub - 1)); + const uint32_t add_shift_xvcub = 1u << (shift_xvcub - 1); const uint32_t shift_xdcub = (uint32_t)ceil(log2(w) - 3); - const uint32_t add_shift_xdcub = (uint32_t)pow(2, (shift_xdcub - 1)); + const uint32_t add_shift_xdcub = 1u << (shift_xdcub - 1); const uint32_t shift_inner_accum = (uint32_t)ceil(log2(h)); - const uint32_t add_shift_inner_accum = (uint32_t)pow(2, (shift_inner_accum - 1)); + const uint32_t add_shift_inner_accum = 1u << (shift_inner_accum - 1); 
const int32_t shift_xhsub = 10; const int32_t shift_xvsub = 10; @@ -1630,9 +1633,9 @@ static float adm_cm(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_stri * => after cubing (6+23)*3=87 after squaring shifted by 30 * hence pending is 57-shift's done based on width and height */ - float f_accum_h = (float)(accum_h / pow(2, (52 - shift_xhcub - shift_inner_accum))); - float f_accum_v = (float)(accum_v / pow(2, (52 - shift_xvcub - shift_inner_accum))); - float f_accum_d = (float)(accum_d / pow(2, (57 - shift_xdcub - shift_inner_accum))); + float f_accum_h = (float)(accum_h / ldexp(1.0, (int)(52 - shift_xhcub - shift_inner_accum))); + float f_accum_v = (float)(accum_v / ldexp(1.0, (int)(52 - shift_xvcub - shift_inner_accum))); + float f_accum_d = (float)(accum_d / ldexp(1.0, (int)(57 - shift_xdcub - shift_inner_accum))); float num_scale_h = powf(f_accum_h, 1.0f / 3.0f) + powf((bottom - top) * (right - left) / 32.0f, 1.0f / 3.0f); @@ -1657,9 +1660,9 @@ static float i4_adm_cm(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_s float factor2 = dwt_quant_step(&dwt_7_9_YCbCr_threshold[0], scale, 2, adm_norm_view_dist, adm_ref_display_height); float rfactor1[3] = { 1.0f / factor1, 1.0f / factor1, 1.0f / factor2 }; - const uint32_t rfactor[3] = { (uint32_t)(rfactor1[0] * pow(2, 32)), - (uint32_t)(rfactor1[1] * pow(2, 32)), - (uint32_t)(rfactor1[2] * pow(2, 32)) }; + const uint32_t rfactor[3] = { (uint32_t)(rfactor1[0] * 4294967296.0), + (uint32_t)(rfactor1[1] * 4294967296.0), + (uint32_t)(rfactor1[2] * 4294967296.0) }; const uint32_t shift_dst[3] = { 28, 28, 28 }; const uint32_t shift_flt[3] = { 32, 32, 32 }; @@ -1672,14 +1675,14 @@ static float i4_adm_cm(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_s } uint32_t shift_cub = (uint32_t)ceil(log2(w)); - uint32_t add_shift_cub = (uint32_t)pow(2, (shift_cub - 1)); + uint32_t add_shift_cub = 1u << (shift_cub - 1); uint32_t shift_inner_accum = (uint32_t)ceil(log2(h)); - uint32_t add_shift_inner_accum = 
(uint32_t)pow(2, (shift_inner_accum - 1)); + uint32_t add_shift_inner_accum = 1u << (shift_inner_accum - 1); - float final_shift[3] = { pow(2,(45 - shift_cub - shift_inner_accum)), - pow(2,(39 - shift_cub - shift_inner_accum)), - pow(2,(36 - shift_cub - shift_inner_accum)) }; + float final_shift[3] = { ldexp(1.0f, (int)(45 - shift_cub - shift_inner_accum)), + ldexp(1.0f, (int)(39 - shift_cub - shift_inner_accum)), + ldexp(1.0f, (int)(36 - shift_cub - shift_inner_accum)) }; const int32_t shift_sq = 30; const int32_t add_shift_sq = 536870912; //2^29 diff --git a/libvmaf/src/feature/integer_motion.c b/libvmaf/src/feature/integer_motion.c index 58eef5be9..b63fc17c6 100644 --- a/libvmaf/src/feature/integer_motion.c +++ b/libvmaf/src/feature/integer_motion.c @@ -122,7 +122,7 @@ y_convolution_16(void *src, uint16_t *dst, unsigned width, const unsigned radius = filter_width / 2; const unsigned top_edge = vmaf_ceiln(radius, 1); const unsigned bottom_edge = vmaf_floorn(height - (filter_width - radius), 1); - const unsigned add_before_shift = (int) pow(2, (inp_size_bits - 1)); + const unsigned add_before_shift = 1u << (inp_size_bits - 1); const unsigned shift_var = inp_size_bits; uint16_t *src_p = (uint16_t*) src + (top_edge - radius) * src_stride; @@ -189,7 +189,7 @@ y_convolution_8(void *src, uint16_t *dst, unsigned width, const unsigned top_edge = vmaf_ceiln(radius, 1); const unsigned bottom_edge = vmaf_floorn(height - (filter_width - radius), 1); const unsigned shift_var = 8; - const unsigned add_before_shift = (int) pow(2, (shift_var - 1)); + const unsigned add_before_shift = 1u << (shift_var - 1); for (unsigned i = 0; i < top_edge; i++) { for (unsigned j = 0; j < width; ++j) { @@ -241,6 +241,16 @@ static void sad_c(VmafPicture *pic_a, VmafPicture *pic_b, uint64_t *sad) } } +#if ARCH_X86 +static void sad_avx2_wrapper(VmafPicture *pic_a, VmafPicture *pic_b, uint64_t *sad) +{ + sad_16_avx2(pic_a->data[0], pic_b->data[0], + pic_a->w[0], pic_a->h[0], + pic_a->stride[0] / 2, 
pic_b->stride[0] / 2, + sad); +} +#endif + static int extract_force_zero(VmafFeatureExtractor *fex, VmafPicture *ref_pic, VmafPicture *ref_pic_90, VmafPicture *dist_pic, VmafPicture *dist_pic_90, @@ -304,18 +314,19 @@ static int init(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt, s->y_convolution = bpc == 8 ? y_convolution_8 : y_convolution_16; s->x_convolution = x_convolution_16; + s->sad = sad_c; #if ARCH_X86 unsigned flags = vmaf_get_cpu_flags(); - if (flags & VMAF_X86_CPU_FLAG_AVX2) + if (flags & VMAF_X86_CPU_FLAG_AVX2) { s->x_convolution = x_convolution_16_avx2; + s->sad = sad_avx2_wrapper; + } #if HAVE_AVX512 if (flags & VMAF_X86_CPU_FLAG_AVX512) s->x_convolution = x_convolution_16_avx512; #endif #endif - - s->sad = sad_c; s->score = 0.; return 0; diff --git a/libvmaf/src/feature/integer_psnr.c b/libvmaf/src/feature/integer_psnr.c index 18b5b9205..0525cacbe 100644 --- a/libvmaf/src/feature/integer_psnr.c +++ b/libvmaf/src/feature/integer_psnr.c @@ -23,10 +23,15 @@ #include #include +#include "cpu.h" #include "feature_collector.h" #include "feature_extractor.h" #include "opt.h" +#if ARCH_X86 +#include "x86/psnr_avx2.h" +#endif + typedef struct PsnrState { bool enable_chroma; bool enable_mse; @@ -35,6 +40,10 @@ typedef struct PsnrState { uint32_t peak; double psnr_max[3]; double min_sse; + void (*compute_sse_8)(const uint8_t *ref, const uint8_t *dis, + unsigned w, unsigned h, + ptrdiff_t stride_ref, ptrdiff_t stride_dis, + uint64_t *sse); struct { uint64_t sse[3]; uint64_t n_pixels[3]; @@ -83,6 +92,25 @@ static const VmafOption options[] = { }; +static void psnr_sse_8_c(const uint8_t *ref, const uint8_t *dis, + unsigned w, unsigned h, + ptrdiff_t stride_ref, ptrdiff_t stride_dis, + uint64_t *sse) +{ + uint64_t total_sse = 0; + for (unsigned i = 0; i < h; i++) { + uint32_t sse_inner = 0; + for (unsigned j = 0; j < w; j++) { + const int16_t e = ref[j] - dis[j]; + sse_inner += e * e; + } + total_sse += sse_inner; + ref += stride_ref; + dis += stride_dis; 
+ } + *sse = total_sse; +} + static int init(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt, unsigned bpc, unsigned w, unsigned h) { @@ -92,11 +120,18 @@ static int init(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt, if (pix_fmt == VMAF_PIX_FMT_YUV400P) s->enable_chroma = false; + s->compute_sse_8 = psnr_sse_8_c; +#if ARCH_X86 + unsigned flags = vmaf_get_cpu_flags(); + if (flags & VMAF_X86_CPU_FLAG_AVX2) + s->compute_sse_8 = psnr_sse_8_avx2; +#endif + for (unsigned i = 0; i < 3; i++) { if (s->min_sse != 0.0) { const int ss_hor = pix_fmt != VMAF_PIX_FMT_YUV444P; const int ss_ver = pix_fmt == VMAF_PIX_FMT_YUV420P; - const double mse = s->min_sse / + const double mse = s->min_sse / (((i && ss_hor) ? w / 2 : w) * ((i && ss_ver) ? h / 2 : h)); s->psnr_max[i] = ceil(10. * log10(s->peak * s->peak / mse)); } else { @@ -123,20 +158,12 @@ static int psnr(VmafPicture *ref_pic, VmafPicture *dist_pic, int err = 0; for (unsigned p = 0; p < n; p++) { - uint8_t *ref = ref_pic->data[p]; - uint8_t *dis = dist_pic->data[p]; - uint64_t sse = 0; - for (unsigned i = 0; i < ref_pic->h[p]; i++) { - uint32_t sse_inner = 0; - for (unsigned j = 0; j < ref_pic->w[p]; j++) { - const int16_t e = ref[j] - dis[j]; - sse_inner += e * e; - } - sse += sse_inner; - ref += ref_pic->stride[p]; - dis += dist_pic->stride[p]; - } + uint64_t sse = 0; + s->compute_sse_8(ref_pic->data[p], dist_pic->data[p], + ref_pic->w[p], ref_pic->h[p], + ref_pic->stride[p], dist_pic->stride[p], + &sse); if (s->enable_apsnr) { s->apsnr.sse[p] += sse; diff --git a/libvmaf/src/feature/integer_vif.c b/libvmaf/src/feature/integer_vif.c index fd6a82317..a9e8b8720 100644 --- a/libvmaf/src/feature/integer_vif.c +++ b/libvmaf/src/feature/integer_vif.c @@ -301,34 +301,19 @@ void vif_statistic_8(struct VifPublicState *s, float *num, float *den, unsigned sigma2_sq = MAX(sigma2_sq, 0); if (sigma1_sq >= sigma_nsq) { - /** - * log values are taken from the look-up table generated by - * log_generate() function which is called in 
integer_combo_threadfunc - * den_val in float is log2(1 + sigma1_sq/2) - * here it is converted to equivalent of log2(2+sigma1_sq) - log2(2) i.e log2(2*65536+sigma1_sq) - 17 - * multiplied by 2048 as log_value = log2(i)*2048 i=16384 to 65535 generated using log_value - * x because best 16 bits are taken - */ accum_den_log += log2_32(log2_table, sigma_nsq + sigma1_sq) - 2048 * 17; if (sigma12 > 0 && sigma2_sq > 0) { - /** - * In floating-point numerator = log2((1.0f + (g * g * sigma1_sq)/(sv_sq + sigma_nsq)) - * - * In Fixed-point the above is converted to - * numerator = log2((sv_sq + sigma_nsq)+(g * g * sigma1_sq))- log2(sv_sq + sigma_nsq) - */ - - const double eps = 65536 * 1.0e-10; - double g = sigma12 / (sigma1_sq + eps); // this epsilon can go away + double g = (double)sigma12 / sigma1_sq; int32_t sv_sq = sigma2_sq - g * sigma12; sv_sq = (uint32_t)(MAX(sv_sq, 0)); g = MIN(g, vif_enhn_gain_limit); + double g_sq = g * g; uint32_t numer1 = (sv_sq + sigma_nsq); - int64_t numer1_tmp = (int64_t)((g * g * sigma1_sq)) + numer1; //numerator + int64_t numer1_tmp = (int64_t)(g_sq * sigma1_sq) + numer1; accum_num_log += log2_64(log2_table, numer1_tmp) - log2_64(log2_table, numer1); } } @@ -444,34 +429,19 @@ void vif_statistic_16(struct VifPublicState *s, float *num, float *den, unsigned sigma2_sq = MAX(sigma2_sq, 0); if (sigma1_sq >= sigma_nsq) { - /** - * log values are taken from the look-up table generated by - * log_generate() function which is called in integer_combo_threadfunc - * den_val in float is log2(1 + sigma1_sq/2) - * here it is converted to equivalent of log2(2+sigma1_sq) - log2(2) i.e log2(2*65536+sigma1_sq) - 17 - * multiplied by 2048 as log_value = log2(i)*2048 i=16384 to 65535 generated using log_value - * x because best 16 bits are taken - */ accum_den_log += log2_32(log2_table, sigma_nsq + sigma1_sq) - 2048 * 17; if (sigma12 > 0 && sigma2_sq > 0) { - /** - * In floating-point numerator = log2((1.0f + (g * g * sigma1_sq)/(sv_sq + sigma_nsq)) - * - * In 
Fixed-point the above is converted to - * numerator = log2((sv_sq + sigma_nsq)+(g * g * sigma1_sq))- log2(sv_sq + sigma_nsq) - */ - - const double eps = 65536 * 1.0e-10; - double g = sigma12 / (sigma1_sq + eps); // this epsilon can go away + double g = (double)sigma12 / sigma1_sq; int32_t sv_sq = sigma2_sq - g * sigma12; sv_sq = (uint32_t)(MAX(sv_sq, 0)); g = MIN(g, vif_enhn_gain_limit); + double g_sq = g * g; uint32_t numer1 = (sv_sq + sigma_nsq); - int64_t numer1_tmp = (int64_t)((g * g * sigma1_sq)) + numer1; //numerator + int64_t numer1_tmp = (int64_t)(g_sq * sigma1_sq) + numer1; accum_num_log += log2_64(log2_table, numer1_tmp) - log2_64(log2_table, numer1); } } @@ -535,34 +505,19 @@ VifResiduals vif_compute_line_residuals(VifPublicState *s, unsigned from, sigma2_sq = MAX(sigma2_sq, 0); if (sigma1_sq >= sigma_nsq) { - /** - * log values are taken from the look-up table generated by - * log_generate() function which is called in integer_combo_threadfunc - * den_val in float is log2(1 + sigma1_sq/2) - * here it is converted to equivalent of log2(2+sigma1_sq) - log2(2) i.e log2(2*65536+sigma1_sq) - 17 - * multiplied by 2048 as log_value = log2(i)*2048 i=16384 to 65535 generated using log_value - * x because best 16 bits are taken - */ residuals.accum_den_log += log2_32(log2_table, sigma_nsq + sigma1_sq) - 2048 * 17; if (sigma12 > 0 && sigma2_sq > 0) { - /** - * In floating-point numerator = log2((1.0f + (g * g * sigma1_sq)/(sv_sq + sigma_nsq)) - * - * In Fixed-point the above is converted to - * numerator = log2((sv_sq + sigma_nsq)+(g * g * sigma1_sq))- log2(sv_sq + sigma_nsq) - */ - - const double eps = 65536 * 1.0e-10; - double g = sigma12 / (sigma1_sq + eps); // this epsilon can go away + double g = (double)sigma12 / sigma1_sq; int32_t sv_sq = sigma2_sq - g * sigma12; sv_sq = (uint32_t)(MAX(sv_sq, 0)); g = MIN(g, vif_enhn_gain_limit); + double g_sq = g * g; uint32_t numer1 = (sv_sq + sigma_nsq); - int64_t numer1_tmp = (int64_t)((g * g * sigma1_sq)) + numer1; 
//numerator + int64_t numer1_tmp = (int64_t)(g_sq * sigma1_sq) + numer1; residuals.accum_num_log += log2_64(log2_table, numer1_tmp) - log2_64(log2_table, numer1); } } diff --git a/libvmaf/src/feature/x86/motion_avx2.c b/libvmaf/src/feature/x86/motion_avx2.c index f8491f36c..118709616 100644 --- a/libvmaf/src/feature/x86/motion_avx2.c +++ b/libvmaf/src/feature/x86/motion_avx2.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "feature/integer_motion.h" #include "feature/common/alignment.h" @@ -132,3 +133,57 @@ void x_convolution_16_avx2(const uint16_t *src, uint16_t *dst, unsigned width, } } } + +void sad_16_avx2(const uint16_t *a, const uint16_t *b, + unsigned w, unsigned h, + ptrdiff_t stride_a, ptrdiff_t stride_b, + uint64_t *sad) +{ + __m256i acc = _mm256_setzero_si256(); + uint64_t total_sad = 0; + + for (unsigned i = 0; i < h; i++) { + unsigned j = 0; + __m256i row_acc = _mm256_setzero_si256(); + + /* Process 16 uint16_t elements at a time */ + for (; j + 16 <= w; j += 16) { + __m256i va = _mm256_loadu_si256((const __m256i *)(a + j)); + __m256i vb = _mm256_loadu_si256((const __m256i *)(b + j)); + /* Compute absolute difference: max(a-b, b-a) for unsigned */ + __m256i diff1 = _mm256_subs_epu16(va, vb); + __m256i diff2 = _mm256_subs_epu16(vb, va); + __m256i absdiff = _mm256_or_si256(diff1, diff2); + /* Widen to 32-bit and accumulate */ + __m256i lo = _mm256_unpacklo_epi16(absdiff, _mm256_setzero_si256()); + __m256i hi = _mm256_unpackhi_epi16(absdiff, _mm256_setzero_si256()); + row_acc = _mm256_add_epi32(row_acc, lo); + row_acc = _mm256_add_epi32(row_acc, hi); + } + + /* Reduce row_acc to scalar (collapse 8x32-bit to 64-bit) */ + __m256i row_acc_lo = _mm256_unpacklo_epi32(row_acc, _mm256_setzero_si256()); + __m256i row_acc_hi = _mm256_unpackhi_epi32(row_acc, _mm256_setzero_si256()); + acc = _mm256_add_epi64(acc, row_acc_lo); + acc = _mm256_add_epi64(acc, row_acc_hi); + + /* Scalar tail */ + uint32_t inner_sad = 0; + for (; j < w; j++) { + inner_sad 
+= abs(a[j] - b[j]); + } + total_sad += inner_sad; + + a += stride_a; + b += stride_b; + } + + /* Horizontal sum of acc (4x64-bit) */ + __m128i lo128 = _mm256_castsi256_si128(acc); + __m128i hi128 = _mm256_extracti128_si256(acc, 1); + __m128i sum128 = _mm_add_epi64(lo128, hi128); + __m128i sum64 = _mm_add_epi64(sum128, _mm_srli_si128(sum128, 8)); + total_sad += (uint64_t)_mm_extract_epi64(sum64, 0); + + *sad = total_sad; +} diff --git a/libvmaf/src/feature/x86/motion_avx2.h b/libvmaf/src/feature/x86/motion_avx2.h index 0b0973b3d..ed97efcb9 100644 --- a/libvmaf/src/feature/x86/motion_avx2.h +++ b/libvmaf/src/feature/x86/motion_avx2.h @@ -25,4 +25,9 @@ void x_convolution_16_avx2(const uint16_t *src, uint16_t *dst, unsigned width, unsigned height, ptrdiff_t src_stride, ptrdiff_t dst_stride); +void sad_16_avx2(const uint16_t *a, const uint16_t *b, + unsigned w, unsigned h, + ptrdiff_t stride_a, ptrdiff_t stride_b, + uint64_t *sad); + #endif /* X86_AVX2_MOTION_H_ */ diff --git a/libvmaf/src/feature/x86/psnr_avx2.c b/libvmaf/src/feature/x86/psnr_avx2.c new file mode 100644 index 000000000..fcd58b3b1 --- /dev/null +++ b/libvmaf/src/feature/x86/psnr_avx2.c @@ -0,0 +1,87 @@ +/** + * + * Copyright 2016-2020 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 + * + */ + +#include <immintrin.h> +#include <stdint.h> +#include <stddef.h> + +void psnr_sse_8_avx2(const uint8_t *ref, const uint8_t *dis, + unsigned w, unsigned h, + ptrdiff_t stride_ref, ptrdiff_t stride_dis, + uint64_t *sse) +{ + __m256i acc = _mm256_setzero_si256(); + uint64_t total_sse = 0; + + for (unsigned i = 0; i < h; i++) { + unsigned j = 0; + __m256i row_acc = _mm256_setzero_si256(); + + /* Process 32 uint8_t elements at a time */ + for (; j + 32 <= w; j += 32) { + __m256i r = _mm256_loadu_si256((const __m256i *)(ref + j)); + __m256i d = _mm256_loadu_si256((const __m256i *)(dis + j)); + + /* Zero-extend low 16 bytes to 16-bit */ + __m256i r_lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r)); + __m256i d_lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(d)); + + /* Zero-extend high 16 bytes to 16-bit */ + __m256i r_hi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(r, 1)); + __m256i d_hi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(d, 1)); + + /* Compute signed differences */ + __m256i diff_lo = _mm256_sub_epi16(r_lo, d_lo); + __m256i diff_hi = _mm256_sub_epi16(r_hi, d_hi); + + /* Square and sum adjacent pairs: int16*int16 -> int32 */ + __m256i sq_lo = _mm256_madd_epi16(diff_lo, diff_lo); + __m256i sq_hi = _mm256_madd_epi16(diff_hi, diff_hi); + + /* Accumulate into 32-bit row accumulator */ + row_acc = _mm256_add_epi32(row_acc, sq_lo); + row_acc = _mm256_add_epi32(row_acc, sq_hi); + } + + /* Widen row_acc from 32-bit to 64-bit and add to main accumulator */ + __m256i row_lo = _mm256_unpacklo_epi32(row_acc, _mm256_setzero_si256()); + __m256i row_hi = _mm256_unpackhi_epi32(row_acc, _mm256_setzero_si256()); + acc = _mm256_add_epi64(acc, row_lo); + acc = _mm256_add_epi64(acc, row_hi); + + /* Scalar tail */ + uint32_t sse_inner = 0; + for (; j < w; j++) { + const int16_t e = ref[j] - dis[j]; + sse_inner += e * e; + } + total_sse += sse_inner; + + ref += stride_ref; + dis += stride_dis; + } + + /* Horizontal sum of acc (4x64-bit) */ + __m128i lo128 = _mm256_castsi256_si128(acc); + __m128i 
hi128 = _mm256_extracti128_si256(acc, 1); + __m128i sum128 = _mm_add_epi64(lo128, hi128); + __m128i sum64 = _mm_add_epi64(sum128, _mm_srli_si128(sum128, 8)); + total_sse += (uint64_t)_mm_extract_epi64(sum64, 0); + + *sse = total_sse; +} diff --git a/libvmaf/src/feature/x86/psnr_avx2.h b/libvmaf/src/feature/x86/psnr_avx2.h new file mode 100644 index 000000000..ee2f6065e --- /dev/null +++ b/libvmaf/src/feature/x86/psnr_avx2.h @@ -0,0 +1,30 @@ +/** + * + * Copyright 2016-2020 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 + * + */ + +#ifndef PSNR_AVX2_H_ +#define PSNR_AVX2_H_ + +#include <stdint.h> +#include <stddef.h> + +void psnr_sse_8_avx2(const uint8_t *ref, const uint8_t *dis, + unsigned w, unsigned h, + ptrdiff_t stride_ref, ptrdiff_t stride_dis, + uint64_t *sse); + +#endif /* PSNR_AVX2_H_ */ diff --git a/libvmaf/src/meson.build b/libvmaf/src/meson.build index 80a34bc4f..6563bddd2 100644 --- a/libvmaf/src/meson.build +++ b/libvmaf/src/meson.build @@ -242,6 +242,7 @@ if is_asm_enabled feature_src_dir + 'x86/vif_avx2.c', feature_src_dir + 'x86/adm_avx2.c', feature_src_dir + 'x86/cambi_avx2.c', + feature_src_dir + 'x86/psnr_avx2.c', ] x86_avx2_static_lib = static_library( diff --git a/libvmaf/src/model.c b/libvmaf/src/model.c index ab623a9fd..02dab9e42 100644 --- a/libvmaf/src/model.c +++ b/libvmaf/src/model.c @@ -191,6 +191,7 @@ void vmaf_model_destroy(VmafModel *model) svm_free_and_destroy_model(&(model->svm)); for (unsigned i = 0; i < model->n_features; i++) { free(model->feature[i].name); + free(model->feature[i].name_with_opt); vmaf_dictionary_free(&model->feature[i].opts_dict); } free(model->feature); diff --git a/libvmaf/src/model.h b/libvmaf/src/model.h index e0a41953b..f1778a972 100644 --- a/libvmaf/src/model.h +++ b/libvmaf/src/model.h @@ -41,6 +41,7 @@ typedef struct { char *name; double slope, intercept; VmafDictionary *opts_dict; + char *name_with_opt; /* cached feature name with options, lazily populated */ } VmafModelFeature; typedef struct { diff --git a/libvmaf/src/predict.c b/libvmaf/src/predict.c index 9926de15b..ea94b8634 100644 --- a/libvmaf/src/predict.c +++ b/libvmaf/src/predict.c @@ -237,49 +237,66 @@ int vmaf_predict_score_at_index(VmafModel *model, int err = 0; - struct svm_node *node = malloc(sizeof(*node) * (model->n_features + 1)); - if (!node) return -ENOMEM; + /* Use stack allocation for typical model sizes, heap for large ones */ + struct svm_node node_buf[32]; + struct svm_node *node; + if (model->n_features + 1 <= 32) { + node = node_buf; + } else { + node = 
 malloc(sizeof(*node) * (model->n_features + 1)); + if (!node) return -ENOMEM; + } for (unsigned i = 0; i < model->n_features; i++) { - VmafFeatureExtractor *fex = - vmaf_get_feature_extractor_by_feature_name(model->feature[i].name, 0); - - if (!fex) { - vmaf_log(VMAF_LOG_LEVEL_ERROR, - "vmaf_predict_score_at_index(): no feature extractor " - "providing feature '%s'\n", model->feature[i].name); - err = -EINVAL; - goto free_node; - } + /* Use cached feature name if available, otherwise generate it */ + const char *feature_name = model->feature[i].name_with_opt; + char *feature_name_alloc = NULL; - VmafDictionary *opts_dict = NULL; - if (model->feature[i].opts_dict) { - err = vmaf_dictionary_copy(&model->feature[i].opts_dict, &opts_dict); - if (err) return err; - } + if (!feature_name) { + VmafFeatureExtractor *fex = + vmaf_get_feature_extractor_by_feature_name(model->feature[i].name, 0); + + if (!fex) { + vmaf_log(VMAF_LOG_LEVEL_ERROR, + "vmaf_predict_score_at_index(): no feature extractor " + "providing feature '%s'\n", model->feature[i].name); + err = -EINVAL; + goto free_node; + } - VmafFeatureExtractorContext *fex_ctx; - err = vmaf_feature_extractor_context_create(&fex_ctx, fex, opts_dict); - if (err) { - vmaf_log(VMAF_LOG_LEVEL_ERROR, - "vmaf_predict_score_at_index(): could not generate " - "feature extractor context\n"); - vmaf_dictionary_free(&opts_dict); - return err; - } + VmafDictionary *opts_dict = NULL; + if (model->feature[i].opts_dict) { + err = vmaf_dictionary_copy(&model->feature[i].opts_dict, &opts_dict); + if (err) goto free_node; + } - char *feature_name = - vmaf_feature_name_from_options(model->feature[i].name, - fex_ctx->fex->options, fex_ctx->fex->priv); + VmafFeatureExtractorContext *fex_ctx; + err = vmaf_feature_extractor_context_create(&fex_ctx, fex, opts_dict); + if (err) { + vmaf_log(VMAF_LOG_LEVEL_ERROR, + "vmaf_predict_score_at_index(): could not generate " + "feature extractor context\n"); + 
vmaf_dictionary_free(&opts_dict); + goto free_node; + } - vmaf_feature_extractor_context_destroy(fex_ctx); + feature_name_alloc = + vmaf_feature_name_from_options(model->feature[i].name, + fex_ctx->fex->options, fex_ctx->fex->priv); - if (!feature_name) { - vmaf_log(VMAF_LOG_LEVEL_ERROR, - "vmaf_predict_score_at_index(): could not generate " - "feature name\n"); - err = -ENOMEM; - goto free_node; + vmaf_feature_extractor_context_destroy(fex_ctx); + + if (!feature_name_alloc) { + vmaf_log(VMAF_LOG_LEVEL_ERROR, + "vmaf_predict_score_at_index(): could not generate " + "feature name\n"); + err = -ENOMEM; + goto free_node; + } + feature_name = feature_name_alloc; + /* Cache for future calls */ + model->feature[i].name_with_opt = feature_name_alloc; + feature_name_alloc = NULL; /* ownership transferred */ } double feature_score; @@ -293,10 +310,10 @@ int vmaf_predict_score_at_index(VmafModel *model, "vmaf_predict_score_at_index(): no feature '%s' " "at index %d\n", feature_name, index); } - free(feature_name); + free(feature_name_alloc); goto free_node; } - free(feature_name); + free(feature_name_alloc); err = normalize(model, model->feature[i].slope, model->feature[i].intercept, &feature_score); @@ -327,7 +344,8 @@ int vmaf_predict_score_at_index(VmafModel *model, *vmaf_score = prediction; free_node: - free(node); + if (node != node_buf) + free(node); return err; } diff --git a/libvmaf/src/thread_pool.c b/libvmaf/src/thread_pool.c index 55d027aed..43c3ad79c 100644 --- a/libvmaf/src/thread_pool.c +++ b/libvmaf/src/thread_pool.c @@ -22,9 +22,12 @@ #include #include +#define JOB_INLINE_DATA_SIZE 64 + typedef struct VmafThreadPoolJob { void (*func)(void *data); void *data; + char inline_data[JOB_INLINE_DATA_SIZE]; struct VmafThreadPoolJob *next; } VmafThreadPoolJob; @@ -38,6 +41,7 @@ typedef struct VmafTreadPool { unsigned n_threads; unsigned n_working; bool stop; + VmafThreadPoolJob *free_jobs; } VmafThreadPool; static VmafThreadPoolJob 
*vmaf_thread_pool_fetch_job(VmafThreadPool *pool) @@ -55,10 +59,23 @@ static VmafThreadPoolJob *vmaf_thread_pool_fetch_job(VmafThreadPool *pool) return job; } +static void vmaf_thread_pool_job_recycle(VmafThreadPool *pool, + VmafThreadPoolJob *job) +{ + if (!job) return; + if (job->data && job->data != job->inline_data) + free(job->data); + job->data = NULL; + job->func = NULL; + job->next = pool->free_jobs; + pool->free_jobs = job; +} + static void vmaf_thread_pool_job_destroy(VmafThreadPoolJob *job) { if (!job) return; - if (job->data) free(job->data); + if (job->data && job->data != job->inline_data) + free(job->data); free(job); } @@ -76,10 +93,11 @@ static void *vmaf_thread_pool_runner(void *p) pthread_mutex_unlock(&(pool->queue.lock)); if (job) { job->func(job->data); - vmaf_thread_pool_job_destroy(job); } pthread_mutex_lock(&(pool->queue.lock)); pool->n_working--; + if (job) + vmaf_thread_pool_job_recycle(pool, job); if (!pool->stop && pool->n_working == 0 && !pool->queue.head) pthread_cond_signal(&(pool->working)); pthread_mutex_unlock(&(pool->queue.lock)); @@ -121,18 +139,39 @@ int vmaf_thread_pool_enqueue(VmafThreadPool *pool, void (*func)(void *data), if (!pool) return -EINVAL; if (!func) return -EINVAL; - VmafThreadPoolJob *job = malloc(sizeof(*job)); - if (!job) return -ENOMEM; + pthread_mutex_lock(&(pool->queue.lock)); + + /* Try to reuse a job from the free list */ + VmafThreadPoolJob *job = pool->free_jobs; + if (job) { + pool->free_jobs = job->next; + } else { + job = malloc(sizeof(*job)); + if (!job) { + pthread_mutex_unlock(&(pool->queue.lock)); + return -ENOMEM; + } + } + memset(job, 0, sizeof(*job)); job->func = func; if (data) { - job->data = malloc(data_sz); - if (!job->data) goto free_job; - memcpy(job->data, data, data_sz); + if (data_sz <= JOB_INLINE_DATA_SIZE) { + memcpy(job->inline_data, data, data_sz); + job->data = job->inline_data; + } else { + job->data = malloc(data_sz); + if (!job->data) { + /* Return job to free list */ + 
job->next = pool->free_jobs; + pool->free_jobs = job; + pthread_mutex_unlock(&(pool->queue.lock)); + return -ENOMEM; + } + memcpy(job->data, data, data_sz); + } } - pthread_mutex_lock(&(pool->queue.lock)); - if (!pool->queue.head) { pool->queue.head = job; pool->queue.tail = pool->queue.head; @@ -141,14 +180,10 @@ int vmaf_thread_pool_enqueue(VmafThreadPool *pool, void (*func)(void *data), pool->queue.tail = job; } - pthread_cond_broadcast(&(pool->queue.empty)); + pthread_cond_signal(&(pool->queue.empty)); pthread_mutex_unlock(&(pool->queue.lock)); return 0; - -free_job: - free(job); - return -ENOMEM; } int vmaf_thread_pool_wait(VmafThreadPool *pool) @@ -178,6 +213,15 @@ int vmaf_thread_pool_destroy(VmafThreadPool *pool) pthread_cond_broadcast(&(pool->queue.empty)); pthread_mutex_unlock(&(pool->queue.lock)); vmaf_thread_pool_wait(pool); + + /* Free all pooled job objects */ + job = pool->free_jobs; + while (job) { + VmafThreadPoolJob *next_job = job->next; + free(job); + job = next_job; + } + pthread_mutex_destroy(&(pool->queue.lock)); pthread_cond_destroy(&(pool->queue.empty)); pthread_cond_destroy(&(pool->working)); diff --git a/libvmaf/test/meson.build b/libvmaf/test/meson.build index 1fe4d422c..a5575c385 100644 --- a/libvmaf/test/meson.build +++ b/libvmaf/test/meson.build @@ -177,3 +177,12 @@ test('test_cli_parse', test_cli_parse) test('test_psnr', test_psnr) test('test_framesync', test_framesync) test('test_propagate_metadata', test_propagate_metadata) + +test_perf_optimizations = executable('test_perf_optimizations', + ['test.c', 'test_perf_optimizations.c'], + include_directories : [libvmaf_inc, test_inc, include_directories('../src/'), include_directories('../src/feature/')], + link_with : get_option('default_library') == 'both' ? 
libvmaf.get_static_lib() : libvmaf, + dependencies : [math_lib, stdatomic_dependency, thread_lib, cuda_dependency], +) + +test('test_perf_optimizations', test_perf_optimizations) diff --git a/libvmaf/test/test_perf_optimizations.c b/libvmaf/test/test_perf_optimizations.c new file mode 100644 index 000000000..4b48d3786 --- /dev/null +++ b/libvmaf/test/test_perf_optimizations.c @@ -0,0 +1,570 @@ +/** + * + * Copyright 2016-2020 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include +#include + +#include "test.h" +#include "thread_pool.h" +#include "feature/feature_collector.h" +#include "feature/feature_extractor.h" +#include "predict.h" +#include "libvmaf/libvmaf.h" +#include "libvmaf/picture.h" +#include "libvmaf/model.h" + +/* ======================================================================== + * Helper utilities + * ======================================================================== */ + +#define EPS 0.00001 + +static int almost_equal(double a, double b) { + double diff = a > b ? 
a - b : b - a; + return diff < EPS; +} + +/* ======================================================================== + * Thread pool tests (thundering herd fix, signal vs broadcast) + * ======================================================================== */ + +static volatile int tp_counter = 0; +static pthread_mutex_t tp_counter_lock = PTHREAD_MUTEX_INITIALIZER; + +static void increment_counter(void *data) +{ + (void)data; + pthread_mutex_lock(&tp_counter_lock); + tp_counter++; + pthread_mutex_unlock(&tp_counter_lock); +} + +static char *test_thread_pool_many_jobs() +{ + int err; + VmafThreadPool *pool; + const unsigned n_threads = 4; + const unsigned n_jobs = 1000; + + tp_counter = 0; + + err = vmaf_thread_pool_create(&pool, n_threads); + mu_assert("problem creating thread pool", !err); + + for (unsigned i = 0; i < n_jobs; i++) { + err = vmaf_thread_pool_enqueue(pool, increment_counter, NULL, 0); + mu_assert("problem enqueuing job", !err); + } + + err = vmaf_thread_pool_wait(pool); + mu_assert("problem waiting for thread pool", !err); + + mu_assert("not all jobs completed (thundering herd regression)", + tp_counter == (int)n_jobs); + + err = vmaf_thread_pool_destroy(pool); + mu_assert("problem destroying thread pool", !err); + + return NULL; +} + +static void accumulate_value(void *data) +{ + int *val = data; + pthread_mutex_lock(&tp_counter_lock); + tp_counter += *val; + pthread_mutex_unlock(&tp_counter_lock); +} + +static char *test_thread_pool_data_passing() +{ + int err; + VmafThreadPool *pool; + + tp_counter = 0; + + err = vmaf_thread_pool_create(&pool, 2); + mu_assert("problem creating thread pool", !err); + + for (int i = 1; i <= 100; i++) { + err = vmaf_thread_pool_enqueue(pool, accumulate_value, &i, sizeof(i)); + mu_assert("problem enqueuing job with data", !err); + } + + err = vmaf_thread_pool_wait(pool); + mu_assert("problem waiting for thread pool", !err); + + /* Sum of 1..100 = 5050 */ + mu_assert("data passing through thread pool failed", + 
tp_counter == 5050); + + err = vmaf_thread_pool_destroy(pool); + mu_assert("problem destroying thread pool", !err); + + return NULL; +} + +/* ======================================================================== + * Feature collector tests (capacity 512, realloc behavior) + * ======================================================================== */ + +static char *test_feature_vector_capacity_512() +{ + int err; + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("problem during vmaf_feature_collector_init", !err); + + /* Append 500 scores to a single feature - should not realloc since + * initial capacity is now 512 */ + for (unsigned i = 0; i < 500; i++) { + err = vmaf_feature_collector_append(fc, "test_feature", (double)i, i); + mu_assert("problem during vmaf_feature_collector_append", !err); + } + + /* Verify all scores are retrievable */ + double score; + err = vmaf_feature_collector_get_score(fc, "test_feature", &score, 0); + mu_assert("could not get score at index 0", !err); + mu_assert("score at index 0 incorrect", almost_equal(score, 0.0)); + + err = vmaf_feature_collector_get_score(fc, "test_feature", &score, 499); + mu_assert("could not get score at index 499", !err); + mu_assert("score at index 499 incorrect", almost_equal(score, 499.0)); + + /* Now append beyond 512 - should trigger realloc */ + for (unsigned i = 500; i < 600; i++) { + err = vmaf_feature_collector_append(fc, "test_feature", (double)i, i); + mu_assert("problem appending beyond initial capacity", !err); + } + + err = vmaf_feature_collector_get_score(fc, "test_feature", &score, 599); + mu_assert("could not get score after realloc", !err); + mu_assert("score after realloc incorrect", almost_equal(score, 599.0)); + + vmaf_feature_collector_destroy(fc); + return NULL; +} + +static char *test_feature_collector_multiple_features() +{ + int err; + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("problem during init", !err); + + 
/* Add 20 different features - tests collector capacity growth */ + char name[64]; + for (unsigned f = 0; f < 20; f++) { + snprintf(name, sizeof(name), "feature_%u", f); + for (unsigned i = 0; i < 10; i++) { + err = vmaf_feature_collector_append(fc, name, (double)(f * 10 + i), i); + mu_assert("problem during append", !err); + } + } + + /* Verify retrieval from various features */ + double score; + err = vmaf_feature_collector_get_score(fc, "feature_0", &score, 5); + mu_assert("get_score failed for feature_0", !err); + mu_assert("wrong score for feature_0[5]", almost_equal(score, 5.0)); + + err = vmaf_feature_collector_get_score(fc, "feature_19", &score, 9); + mu_assert("get_score failed for feature_19", !err); + mu_assert("wrong score for feature_19[9]", almost_equal(score, 199.0)); + + vmaf_feature_collector_destroy(fc); + return NULL; +} + +/* ======================================================================== + * Predict tests (stack-allocated SVM nodes, cached feature names) + * ======================================================================== */ + +static char *test_predict_score_consistency() +{ + int err; + + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("problem during vmaf_feature_collector_init", !err); + + VmafModel *model; + VmafModelConfig cfg = { + .name = "vmaf", + .flags = VMAF_MODEL_FLAGS_DEFAULT, + }; + err = vmaf_model_load(&model, &cfg, "vmaf_v0.6.1"); + mu_assert("problem during vmaf_model_load", !err); + + /* Insert known feature scores */ + for (unsigned i = 0; i < model->n_features; i++) { + err = vmaf_feature_collector_append(fc, model->feature[i].name, 60., 0); + mu_assert("problem during feature append", !err); + } + + /* Predict twice to test caching of name_with_opt */ + double score1 = 0., score2 = 0.; + err = vmaf_predict_score_at_index(model, fc, 0, &score1, true, false, 0); + mu_assert("first prediction failed", !err); + mu_assert("first prediction out of range", score1 > 0. 
&& score1 <= 100.); + + /* Insert scores at index 1 too */ + for (unsigned i = 0; i < model->n_features; i++) { + err = vmaf_feature_collector_append(fc, model->feature[i].name, 60., 1); + mu_assert("problem during feature append at index 1", !err); + } + + err = vmaf_predict_score_at_index(model, fc, 1, &score2, true, false, 0); + mu_assert("second prediction failed", !err); + + /* Same inputs should give same score */ + mu_assert("cached prediction differs from first", + almost_equal(score1, score2)); + + vmaf_model_destroy(model); + vmaf_feature_collector_destroy(fc); + return NULL; +} + +/* ======================================================================== + * Feature extractor correctness tests (PSNR, VIF, Motion, ADM) + * ======================================================================== */ + +static int alloc_picture_8b(VmafPicture *pic, unsigned w, unsigned h, + uint8_t fill_value) +{ + int err = vmaf_picture_alloc(pic, VMAF_PIX_FMT_YUV400P, 8, w, h); + if (err) return err; + uint8_t *data = pic->data[0]; + for (unsigned i = 0; i < h; i++) { + memset(data, fill_value, w); + data += pic->stride[0]; + } + return 0; +} + +static char *test_psnr_identical_frames() +{ + VmafPicture ref, dist; + int err = 0; + + err |= alloc_picture_8b(&ref, 64, 64, 128); + err |= alloc_picture_8b(&dist, 64, 64, 128); + mu_assert("picture alloc failed", !err); + + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("feature collector init failed", !err); + + VmafFeatureExtractor *fex = + vmaf_get_feature_extractor_by_name("psnr"); + mu_assert("could not find psnr feature extractor", fex); + + VmafFeatureExtractorContext *fex_ctx; + err = vmaf_feature_extractor_context_create(&fex_ctx, fex, NULL); + mu_assert("fex context create failed", !err); + + err = vmaf_feature_extractor_context_extract(fex_ctx, &ref, NULL, + &dist, NULL, 0, fc); + mu_assert("psnr extract failed", !err); + + double psnr_y; + err = vmaf_feature_collector_get_score(fc, 
"psnr_y", &psnr_y, 0); + mu_assert("get psnr_y score failed", !err); + + /* Identical frames should give maximum PSNR */ + mu_assert("identical frames should have max PSNR", + psnr_y >= 60.0); + + vmaf_feature_extractor_context_close(fex_ctx); + vmaf_feature_extractor_context_destroy(fex_ctx); + vmaf_feature_collector_destroy(fc); + vmaf_picture_unref(&ref); + vmaf_picture_unref(&dist); + + return NULL; +} + +static char *test_psnr_different_frames() +{ + VmafPicture ref, dist; + int err = 0; + + err |= alloc_picture_8b(&ref, 64, 64, 0); + err |= alloc_picture_8b(&dist, 64, 64, 255); + mu_assert("picture alloc failed", !err); + + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("feature collector init failed", !err); + + VmafFeatureExtractor *fex = + vmaf_get_feature_extractor_by_name("psnr"); + mu_assert("could not find psnr feature extractor", fex); + + VmafFeatureExtractorContext *fex_ctx; + err = vmaf_feature_extractor_context_create(&fex_ctx, fex, NULL); + mu_assert("fex context create failed", !err); + + err = vmaf_feature_extractor_context_extract(fex_ctx, &ref, NULL, + &dist, NULL, 0, fc); + mu_assert("psnr extract failed", !err); + + double psnr_y; + err = vmaf_feature_collector_get_score(fc, "psnr_y", &psnr_y, 0); + mu_assert("get psnr_y score failed", !err); + + /* Max difference (0 vs 255) should give very low PSNR */ + mu_assert("max difference should have very low PSNR", psnr_y < 10.0); + + vmaf_feature_extractor_context_close(fex_ctx); + vmaf_feature_extractor_context_destroy(fex_ctx); + vmaf_feature_collector_destroy(fc); + vmaf_picture_unref(&ref); + vmaf_picture_unref(&dist); + + return NULL; +} + +static char *test_vif_identical_frames() +{ + VmafPicture ref, dist; + int err = 0; + + err |= alloc_picture_8b(&ref, 64, 64, 128); + err |= alloc_picture_8b(&dist, 64, 64, 128); + mu_assert("picture alloc failed", !err); + + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("feature collector 
init failed", !err); + + VmafFeatureExtractor *fex = + vmaf_get_feature_extractor_by_name("vif"); + mu_assert("could not find vif feature extractor", fex); + + VmafFeatureExtractorContext *fex_ctx; + err = vmaf_feature_extractor_context_create(&fex_ctx, fex, NULL); + mu_assert("vif fex context create failed", !err); + + err = vmaf_feature_extractor_context_extract(fex_ctx, &ref, NULL, + &dist, NULL, 0, fc); + mu_assert("vif extract failed", !err); + + double vif_s0; + err = vmaf_feature_collector_get_score(fc, + "VMAF_integer_feature_vif_scale0_score", &vif_s0, 0); + mu_assert("get vif scale0 score failed", !err); + + /* Identical frames should give VIF = 1.0 */ + mu_assert("identical frames should have VIF ~1.0", + vif_s0 > 0.99 && vif_s0 < 1.01); + + vmaf_feature_extractor_context_close(fex_ctx); + vmaf_feature_extractor_context_destroy(fex_ctx); + vmaf_feature_collector_destroy(fc); + vmaf_picture_unref(&ref); + vmaf_picture_unref(&dist); + + return NULL; +} + +static char *test_motion_identical_frames() +{ + VmafPicture ref; + int err = 0; + + err = alloc_picture_8b(&ref, 64, 64, 100); + mu_assert("picture alloc failed", !err); + + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("feature collector init failed", !err); + + VmafFeatureExtractor *fex = + vmaf_get_feature_extractor_by_name("motion"); + mu_assert("could not find motion feature extractor", fex); + + VmafFeatureExtractorContext *fex_ctx; + err = vmaf_feature_extractor_context_create(&fex_ctx, fex, NULL); + mu_assert("motion fex context create failed", !err); + + /* Feed the same frame twice (index 0 and 1) */ + err = vmaf_feature_extractor_context_extract(fex_ctx, &ref, NULL, + &ref, NULL, 0, fc); + mu_assert("motion extract failed (frame 0)", !err); + + err = vmaf_feature_extractor_context_extract(fex_ctx, &ref, NULL, + &ref, NULL, 1, fc); + mu_assert("motion extract failed (frame 1)", !err); + + /* Motion of identical frames should be 0 */ + double motion_score; + err 
= vmaf_feature_collector_get_score(fc, + "VMAF_integer_feature_motion2_score", &motion_score, 0); + mu_assert("get motion2 score failed", !err); + mu_assert("identical frames should have motion score 0", + almost_equal(motion_score, 0.0)); + + vmaf_feature_extractor_context_close(fex_ctx); + vmaf_feature_extractor_context_destroy(fex_ctx); + vmaf_feature_collector_destroy(fc); + vmaf_picture_unref(&ref); + + return NULL; +} + +static char *test_adm_identical_frames() +{ + VmafPicture ref, dist; + int err = 0; + + err |= alloc_picture_8b(&ref, 64, 64, 128); + err |= alloc_picture_8b(&dist, 64, 64, 128); + mu_assert("picture alloc failed", !err); + + VmafFeatureCollector *fc; + err = vmaf_feature_collector_init(&fc); + mu_assert("feature collector init failed", !err); + + VmafFeatureExtractor *fex = + vmaf_get_feature_extractor_by_name("adm"); + mu_assert("could not find adm feature extractor", fex); + + VmafFeatureExtractorContext *fex_ctx; + err = vmaf_feature_extractor_context_create(&fex_ctx, fex, NULL); + mu_assert("adm fex context create failed", !err); + + err = vmaf_feature_extractor_context_extract(fex_ctx, &ref, NULL, + &dist, NULL, 0, fc); + mu_assert("adm extract failed", !err); + + double adm2; + err = vmaf_feature_collector_get_score(fc, + "VMAF_integer_feature_adm2_score", &adm2, 0); + mu_assert("get adm2 score failed", !err); + + /* Identical frames should give ADM2 = 1.0 */ + mu_assert("identical frames should have ADM2 ~1.0", + adm2 > 0.99 && adm2 < 1.01); + + vmaf_feature_extractor_context_close(fex_ctx); + vmaf_feature_extractor_context_destroy(fex_ctx); + vmaf_feature_collector_destroy(fc); + vmaf_picture_unref(&ref); + vmaf_picture_unref(&dist); + + return NULL; +} + +/* ======================================================================== + * End-to-end VMAF score test + * ======================================================================== */ + +static char *test_vmaf_score_range() +{ + int err; + VmafConfiguration vmaf_cfg = { 0 }; + 
VmafContext *vmaf; + err = vmaf_init(&vmaf, vmaf_cfg); + mu_assert("vmaf_init failed", !err); + + VmafModelConfig model_cfg = { .name = "vmaf" }; + VmafModel *model; + err = vmaf_model_load(&model, &model_cfg, "vmaf_v0.6.1"); + mu_assert("model load failed", !err); + + err = vmaf_use_features_from_model(vmaf, model); + mu_assert("use features from model failed", !err); + + /* Create identical ref/dist pictures */ + VmafPicture ref, dist; + err = vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, 8, 64, 64); + mu_assert("ref alloc failed", !err); + err = vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, 8, 64, 64); + mu_assert("dist alloc failed", !err); + + /* Fill with a pattern */ + for (int p = 0; p < 3; p++) { + uint8_t *rdata = ref.data[p]; + uint8_t *ddata = dist.data[p]; + for (unsigned i = 0; i < ref.h[p]; i++) { + for (unsigned j = 0; j < ref.w[p]; j++) { + rdata[j] = (uint8_t)((i + j) & 0xFF); + ddata[j] = (uint8_t)((i + j) & 0xFF); + } + rdata += ref.stride[p]; + ddata += dist.stride[p]; + } + } + + err = vmaf_read_pictures(vmaf, &ref, &dist, 0); + mu_assert("read pictures failed", !err); + + /* Flush */ + err = vmaf_read_pictures(vmaf, NULL, NULL, 0); + mu_assert("flush failed", !err); + + double vmaf_score; + err = vmaf_score_at_index(vmaf, model, &vmaf_score, 0); + mu_assert("score at index failed", !err); + + /* Identical frames should give VMAF near 100 */ + mu_assert("VMAF score for identical frames should be high (> 90)", + vmaf_score > 90.0); + mu_assert("VMAF score should not exceed 100", vmaf_score <= 100.0); + + vmaf_model_destroy(model); + vmaf_close(vmaf); + + return NULL; +} + +/* ======================================================================== + * Test runner + * ======================================================================== */ + +char *run_tests() +{ + /* Thread pool tests */ + mu_run_test(test_thread_pool_many_jobs); + mu_run_test(test_thread_pool_data_passing); + + /* Feature collector tests */ + 
mu_run_test(test_feature_vector_capacity_512); + mu_run_test(test_feature_collector_multiple_features); + + /* Predict tests */ + mu_run_test(test_predict_score_consistency); + + /* Feature extractor correctness tests */ + mu_run_test(test_psnr_identical_frames); + mu_run_test(test_psnr_different_frames); + mu_run_test(test_vif_identical_frames); + mu_run_test(test_motion_identical_frames); + mu_run_test(test_adm_identical_frames); + + /* End-to-end test */ + mu_run_test(test_vmaf_score_range); + + return NULL; +}