diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 68613ec967dcc11e6c3dd9e7567b867006f74d7f..f52dccbf4e1df126e8bd365e82bc32f71fc1d270 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -714,6 +714,9 @@ specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
 add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
 specialize qw/vp9_subtract_block/, "$sse2_x86inc";
 
+add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+specialize qw/vp9_quantize_fp/, "$ssse3_x86_64";
+
 add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
 
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 61d9d5d1e11d4d89f5ef5bb3864cd0d12ae91c19..ab7991e052f3e12e582b8107708e18f6b308a6ac 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -35,6 +35,7 @@ struct macroblock_plane {
 
   // Quantizer setings
   int16_t *quant_fp;
+  int16_t *round_fp;
   int16_t *quant;
   int16_t *quant_shift;
   int16_t *zbin;
@@ -110,6 +111,9 @@ struct macroblock {
   int use_lp32x32fdct;
   int skip_encode;
 
+  // use fast quantization process
+  int quant_fp;
+
   // skip forward transform and quantization
   int skip_txfm;
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 94143d92f9ae79e7f46733ffa9ce955eb3870832..dab3ff74574b1a0a630cf7e17314d8b7a31aeb86 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3074,6 +3074,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   init_encode_frame_mb_context(cpi);
   set_prev_mi(cm);
 
+  x->quant_fp = cpi->sf.use_quant_fp;
   x->skip_txfm = 0;
   if (sf->use_nonrd_pick_mode) {
     // Initialize internal buffer pointers for rtc coding, where non-RD
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 0961f3b5f77e0258603d3aa0dbd9feca4743b209..d97226e49bc8938c2acfbb8e6b7dd268bc2beb83 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -306,6 +306,56 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  int i, j;
+  const int16_t *src_diff;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+
+  switch (tx_size) {
+    case TX_32X32:
+      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+                           p->quant, p->quant_shift, qcoeff, dqcoeff,
+                           pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                           scan_order->iscan);
+      break;
+    case TX_16X16:
+      vp9_fdct16x16(src_diff, coeff, diff_stride);
+      vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                      pd->dequant, p->zbin_extra, eob,
+                      scan_order->scan, scan_order->iscan);
+      break;
+    case TX_8X8:
+      vp9_fdct8x8(src_diff, coeff, diff_stride);
+      vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                      pd->dequant, p->zbin_extra, eob,
+                      scan_order->scan, scan_order->iscan);
+      break;
+    case TX_4X4:
+      x->fwd_txm4x4(src_diff, coeff, diff_stride);
+      vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                      pd->dequant, p->zbin_extra, eob,
+                      scan_order->scan, scan_order->iscan);
+      break;
+    default:
+      assert(0);
+  }
+}
+
+void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
+                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
   int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -424,11 +474,15 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
 
   if (x->skip_txfm == 0) {
     // full forward transform and quantization
-    if (!x->skip_recode)
-      vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+    if (!x->skip_recode) {
+      if (x->quant_fp)
+        vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
+      else
+        vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+    }
   } else if (x->skip_txfm == 2) {
     // fast path forward transform and quantization
-    vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
+    vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
   } else {
     // skip forward transform
     p->eobs[block] = 0;
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 3196c99203b02d4a38c3d6ecea5cdeebbb1dea22..0b8c3d2b04a7ca76bba39bad43b595eca5b596d1 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -24,6 +24,8 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
+                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index f817bcc311a8b0405a578e493c716fa60dd09f73..1846da9f50636a67f2939c367fc511ded2024730 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -42,9 +42,9 @@ void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
 }
 
 void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
-                          const int16_t *round_ptr, const int16_t quant,
-                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
-                          const int16_t dequant_ptr, uint16_t *eob_ptr) {
+                           const int16_t *round_ptr, const int16_t quant,
+                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
   int eob = -1;
 
   if (!skip_block) {
@@ -63,6 +63,47 @@ void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
   *eob_ptr = eob + 1;
 }
 
+void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
+                       int skip_block,
+                       const int16_t *zbin_ptr, const int16_t *round_ptr,
+                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                       int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                       const int16_t *dequant_ptr,
+                       int zbin_oq_value, uint16_t *eob_ptr,
+                       const int16_t *scan, const int16_t *iscan) {
+  int i, eob = -1;
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)zbin_oq_value;
+  (void)iscan;
+
+  vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+  vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+
+  if (!skip_block) {
+    // Quantization pass: All coefficients with index >= zero_flag are
+    // skippable. Note: zero_flag can be zero.
+    for (i = 0; i < count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+      if (tmp)
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
                       int skip_block,
                       const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -207,11 +248,16 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
     const int qrounding_factor = q == 0 ? 64 : 48;
 
     for (i = 0; i < 2; ++i) {
+      int qrounding_factor_fp = i == 0 ? 48 : 42;
+      if (q == 0)
+        qrounding_factor_fp = 64;
+
       // y
       quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q) : vp9_ac_quant(q, 0);
       invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i],
                    quant);
       quants->y_quant_fp[q][i] = (1 << 16) / quant;
+      quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
       quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
       quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->y_dequant[q][i] = quant;
@@ -222,6 +268,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
       invert_quant(&quants->uv_quant[q][i], &quants->uv_quant_shift[q][i],
                    quant);
       quants->uv_quant_fp[q][i] = (1 << 16) / quant;
+      quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
       quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
       quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->uv_dequant[q][i] = quant;
@@ -240,6 +287,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
     for (i = 2; i < 8; i++) {
       quants->y_quant[q][i] = quants->y_quant[q][1];
       quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
+      quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
       quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
       quants->y_zbin[q][i] = quants->y_zbin[q][1];
       quants->y_round[q][i] = quants->y_round[q][1];
@@ -247,6 +295,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
 
       quants->uv_quant[q][i] = quants->uv_quant[q][1];
       quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
+      quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1];
       quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
       quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
       quants->uv_round[q][i] = quants->uv_round[q][1];
@@ -276,6 +325,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   // Y
   x->plane[0].quant = quants->y_quant[qindex];
   x->plane[0].quant_fp = quants->y_quant_fp[qindex];
+  x->plane[0].round_fp = quants->y_round_fp[qindex];
   x->plane[0].quant_shift = quants->y_quant_shift[qindex];
   x->plane[0].zbin = quants->y_zbin[qindex];
   x->plane[0].round = quants->y_round[qindex];
@@ -286,6 +336,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   for (i = 1; i < 3; i++) {
     x->plane[i].quant = quants->uv_quant[qindex];
     x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
+    x->plane[i].round_fp = quants->uv_round_fp[qindex];
     x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
     x->plane[i].zbin = quants->uv_zbin[qindex];
     x->plane[i].round = quants->uv_round[qindex];
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 0e90462ebf46480a232999db24ec0a68faddfee6..24e4491910a056c1cbaad9c59c9b0bf7ffe67bcb 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -28,6 +28,8 @@ typedef struct {
   // if we want to deprecate the current use of y_quant.
   DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
   DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]);
 
   DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
   DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 811187057aae5998f756d52238e70df61d451d71..f25091b06ebdcad3f59d2c2487d44c0f30a93982 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -282,6 +282,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
     sf->elevate_newmv_thresh = 2000;
   }
   if (speed >= 7) {
+    sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
     sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
     sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
         800 : 300;
@@ -318,6 +319,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->use_lp32x32fdct = 0;
   sf->adaptive_motion_search = 0;
   sf->adaptive_pred_interp_filter = 0;
+  sf->use_quant_fp = 0;
   sf->reference_masking = 0;
   sf->partition_search_type = SEARCH_PARTITION;
   sf->less_rectangular_check = 0;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index f6d631184f64233139989e14ba5d2e0b841bcf57..74ec8c43c59f9a07a8e831a055c1369f1f2b06d9 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -284,6 +284,9 @@ typedef struct SPEED_FEATURES {
   // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
   int adaptive_pred_interp_filter;
 
+  // Fast quantization process path
+  int use_quant_fp;
+
   // Search through variable block partition types in non-RD mode decision
   // encoding process for RTC.
   int partition_check;
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index 48ccef8ccfb3045c20812dd14ff7c7cef96ee5cb..62da8659c28da43987b951e9245acbb8cd79834d 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -217,3 +217,185 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 INIT_XMM ssse3
 QUANTIZE_FN b, 7
 QUANTIZE_FN b_32x32, 7
+
+%macro QUANTIZE_FP 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+                                eob, scan, iscan
+  cmp          dword skipm, 0
+  jne          .blank
+
+  ; actual quantize loop - setup pointers, rounders, etc.
+  movifnidn    coeffq, coeffmp
+  movifnidn    ncoeffq, ncoeffmp
+  mov          r2, dequantmp
+  movifnidn    zbinq, zbinmp
+  movifnidn    roundq, roundmp
+  movifnidn    quantq, quantmp
+  mova         m1, [roundq]                     ; m1 = round
+  mova         m2, [quantq]                     ; m2 = quant
+%ifidn %1, b_32x32
+; TODO(jingning) to be continued with 32x32 quantization process
+  pcmpeqw      m5, m5
+  psrlw        m5, 15
+  paddw        m0, m5
+  paddw        m1, m5
+  psrlw        m0, 1                            ; m0 = (m0 + 1) / 2
+  psrlw        m1, 1                            ; m1 = (m1 + 1) / 2
+%endif
+  mova         m3, [r2q]                        ; m3 = dequant
+  mov          r3, qcoeffmp
+  mov          r4, dqcoeffmp
+  mov          r5, iscanmp
+%ifidn %1, b_32x32
+  psllw        m4, 1
+%endif
+  pxor         m5, m5                           ; m5 = dedicated zero
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
+  lea          coeffq, [ coeffq+ncoeffq*2]
+  lea          iscanq, [ iscanq+ncoeffq*2]
+  lea          qcoeffq, [ qcoeffq+ncoeffq*2]
+  lea          dqcoeffq, [dqcoeffq+ncoeffq*2]
+  neg          ncoeffq
+
+  ; get DC and first 15 AC coeffs
+  mova         m9, [ coeffq+ncoeffq*2+ 0]       ; m9 = c[i]
+  mova         m10, [ coeffq+ncoeffq*2+16]      ; m10 = c[i]
+  pabsw        m6, m9                           ; m6 = abs(m9)
+  pabsw        m11, m10                         ; m11 = abs(m10)
+  pcmpeqw      m7, m7
+  pcmpeqw      m12, m12
+
+  paddsw       m6, m1                           ; m6 += round
+  punpckhqdq   m1, m1
+  paddsw       m11, m1                          ; m11 += round
+  pmulhw       m8, m6, m2                       ; m8 = m6*q>>16
+  punpckhqdq   m2, m2
+  pmulhw       m13, m11, m2                     ; m13 = m11*q>>16
+  psignw       m8, m9                           ; m8 = reinsert sign
+  psignw       m13, m10                         ; m13 = reinsert sign
+  mova         [qcoeffq+ncoeffq*2+ 0], m8
+  mova         [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw        m8, m8
+  pabsw        m13, m13
+%endif
+  pmullw       m8, m3                           ; dqc[i] = qc[i] * q
+  punpckhqdq   m3, m3
+  pmullw       m13, m3                          ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw        m8, 1
+  psrlw        m13, 1
+  psignw       m8, m9
+  psignw       m13, m10
+%endif
+  mova         [dqcoeffq+ncoeffq*2+ 0], m8
+  mova         [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw      m8, m5                           ; m8 = c[i] == 0
+  pcmpeqw      m13, m5                          ; m13 = c[i] == 0
+  mova         m6, [ iscanq+ncoeffq*2+ 0]       ; m6 = scan[i]
+  mova         m11, [ iscanq+ncoeffq*2+16]      ; m11 = scan[i]
+  psubw        m6, m7                           ; m6 = scan[i] + 1
+  psubw        m11, m12                         ; m11 = scan[i] + 1
+  pandn        m8, m6                           ; m8 = max(eob)
+  pandn        m13, m11                         ; m13 = max(eob)
+  pmaxsw       m8, m13
+  add          ncoeffq, mmsize
+  jz           .accumulate_eob
+
+.ac_only_loop:
+  mova         m9, [ coeffq+ncoeffq*2+ 0]       ; m9 = c[i]
+  mova         m10, [ coeffq+ncoeffq*2+16]      ; m10 = c[i]
+  pabsw        m6, m9                           ; m6 = abs(m9)
+  pabsw        m11, m10                         ; m11 = abs(m10)
+  pcmpeqw      m7, m7
+  pcmpeqw      m12, m12
+%ifidn %1, b_32x32
+  pmovmskb     r6, m7
+  pmovmskb     r2, m12
+  or           r6, r2
+  jz           .skip_iter
+%endif
+  paddsw       m6, m1                           ; m6 += round
+  paddsw       m11, m1                          ; m11 += round
+  pmulhw       m14, m6, m2                      ; m14 = m6*q>>16
+  pmulhw       m13, m11, m2                     ; m13 = m11*q>>16
+  psignw       m14, m9                          ; m14 = reinsert sign
+  psignw       m13, m10                         ; m13 = reinsert sign
+  mova         [qcoeffq+ncoeffq*2+ 0], m14
+  mova         [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw        m14, m14
+  pabsw        m13, m13
+%endif
+  pmullw       m14, m3                          ; dqc[i] = qc[i] * q
+  pmullw       m13, m3                          ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw        m14, 1
+  psrlw        m13, 1
+  psignw       m14, m9
+  psignw       m13, m10
+%endif
+  mova         [dqcoeffq+ncoeffq*2+ 0], m14
+  mova         [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw      m14, m5                          ; m14 = c[i] == 0
+  pcmpeqw      m13, m5                          ; m13 = c[i] == 0
+  mova         m6, [ iscanq+ncoeffq*2+ 0]       ; m6 = scan[i]
+  mova         m11, [ iscanq+ncoeffq*2+16]      ; m11 = scan[i]
+  psubw        m6, m7                           ; m6 = scan[i] + 1
+  psubw        m11, m12                         ; m11 = scan[i] + 1
+  pandn        m14, m6                          ; m14 = max(eob)
+  pandn        m13, m11                         ; m13 = max(eob)
+  pmaxsw       m8, m14
+  pmaxsw       m8, m13
+  add          ncoeffq, mmsize
+  jl           .ac_only_loop
+
+%ifidn %1, b_32x32
+  jmp          .accumulate_eob
+.skip_iter:
+  mova         [qcoeffq+ncoeffq*2+ 0], m5
+  mova         [qcoeffq+ncoeffq*2+16], m5
+  mova         [dqcoeffq+ncoeffq*2+ 0], m5
+  mova         [dqcoeffq+ncoeffq*2+16], m5
+  add          ncoeffq, mmsize
+  jl           .ac_only_loop
+%endif
+
+.accumulate_eob:
+  ; horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov          r2, eobmp
+  pshufd       m7, m8, 0xe
+  pmaxsw       m8, m7
+  pshuflw      m7, m8, 0xe
+  pmaxsw       m8, m7
+  pshuflw      m7, m8, 0x1
+  pmaxsw       m8, m7
+  pextrw       r6, m8, 0
+  mov          [r2], r6
+  RET
+
+  ; skip-block, i.e. just write all zeroes
+.blank:
+  mov          r0, dqcoeffmp
+  movifnidn    ncoeffq, ncoeffmp
+  mov          r2, qcoeffmp
+  mov          r3, eobmp
+  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+  lea          dqcoeffq, [dqcoeffq+ncoeffq*2]
+  lea          qcoeffq, [ qcoeffq+ncoeffq*2]
+  neg          ncoeffq
+  pxor         m7, m7
+.blank_loop:
+  mova         [dqcoeffq+ncoeffq*2+ 0], m7
+  mova         [dqcoeffq+ncoeffq*2+16], m7
+  mova         [qcoeffq+ncoeffq*2+ 0], m7
+  mova         [qcoeffq+ncoeffq*2+16], m7
+  add          ncoeffq, mmsize
+  jl           .blank_loop
+  mov          word [eobq], 0
+  RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FP fp, 7
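
Note on the fast path (not part of the patch): vp9_quantize_fp_c drops the zero-bin (zbin) and quant_shift stages used by vp9_quantize_b_c and keeps only the round/multiply/shift-by-16 step, which is why the SSSE3 QUANTIZE_FP macro above loads only the round and quant vectors. The standalone C sketch below mirrors that per-coefficient arithmetic together with the round_fp/quant_fp table setup described in vp9_init_quantizer; the concrete q values, the toy 4-entry scan, and the helper names (quantize_fp_sketch, clamp) are illustrative assumptions, not code taken from libvpx.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative clamp helper, mirroring vpx's clamp(). */
static int clamp(int value, int low, int high) {
  return value < low ? low : (value > high ? high : value);
}

/* Sketch of the per-coefficient work done by vp9_quantize_fp_c: add the
 * rounding term, multiply by the inverted quantizer and keep the high 16
 * bits. Index 0 of round/quant/dequant applies to DC, index 1 to all AC. */
static void quantize_fp_sketch(const int16_t *coeff, int count,
                               const int16_t *scan,
                               const int16_t round[2], const int16_t quant[2],
                               const int16_t dequant[2],
                               int16_t *qcoeff, int16_t *dqcoeff,
                               uint16_t *eob) {
  int i, last_nz = -1;
  memset(qcoeff, 0, count * sizeof(*qcoeff));
  memset(dqcoeff, 0, count * sizeof(*dqcoeff));
  for (i = 0; i < count; i++) {
    const int rc = scan[i];
    const int c = coeff[rc];
    const int sign = c >> 31;                       /* 0 or -1 */
    const int abs_c = (c ^ sign) - sign;            /* |c| */
    int tmp = clamp(abs_c + round[rc != 0], INT16_MIN, INT16_MAX);
    tmp = (tmp * quant[rc != 0]) >> 16;             /* quant ~= (1 << 16) / q */
    qcoeff[rc] = (int16_t)((tmp ^ sign) - sign);    /* reapply the sign */
    dqcoeff[rc] = (int16_t)(qcoeff[rc] * dequant[rc != 0]);
    if (tmp)
      last_nz = i;                                  /* last nonzero in scan order */
  }
  *eob = (uint16_t)(last_nz + 1);
}

int main(void) {
  /* Hypothetical dequant step sizes; in the encoder these come from
   * vp9_dc_quant()/vp9_ac_quant() and the tables are built per q index by
   * vp9_init_quantizer(): quant_fp = (1 << 16) / q and
   * round_fp = (48 * q) >> 7 for DC, (42 * q) >> 7 for AC (64 when q == 0). */
  const int q_dc = 40, q_ac = 45;
  const int16_t quant[2] = { (1 << 16) / q_dc, (1 << 16) / q_ac };
  const int16_t round[2] = { (48 * q_dc) >> 7, (42 * q_ac) >> 7 };
  const int16_t dequant[2] = { q_dc, q_ac };
  const int16_t scan[4] = { 0, 1, 2, 3 };           /* toy scan, not a vp9 table */
  const int16_t coeff[4] = { 120, -63, 30, 2 };
  int16_t qcoeff[4], dqcoeff[4];
  uint16_t eob;
  int i;

  quantize_fp_sketch(coeff, 4, scan, round, quant, dequant,
                     qcoeff, dqcoeff, &eob);
  for (i = 0; i < 4; i++)
    printf("coeff %4d -> qcoeff %3d, dqcoeff %4d\n",
           coeff[i], qcoeff[i], dqcoeff[i]);
  printf("eob = %u\n", eob);
  return 0;
}

With these made-up values the sketch reports eob = 2. In the encoder the same arithmetic runs with the real scan tables and with the round_fp/quant_fp pointers installed by vp9_init_plane_quantizers, and the path is only switched on by the speed-7 RTC preset on non-key frames (sf->use_quant_fp).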