Commit 82fd084b authored by Yaowu Xu's avatar Yaowu Xu Committed by Gerrit Code Review

Merge "Re-design quantization process"

parents bd756699 9ac2f663
......@@ -714,6 +714,9 @@ specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
specialize qw/vp9_subtract_block/, "$sse2_x86inc";
add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp/, "$ssse3_x86_64";
add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
......
......@@ -35,6 +35,7 @@ struct macroblock_plane {
// Quantizer setings
int16_t *quant_fp;
int16_t *round_fp;
int16_t *quant;
int16_t *quant_shift;
int16_t *zbin;
......@@ -110,6 +111,9 @@ struct macroblock {
int use_lp32x32fdct;
int skip_encode;
// use fast quantization process
int quant_fp;
// skip forward transform and quantization
int skip_txfm;
......
......@@ -3074,6 +3074,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
init_encode_frame_mb_context(cpi);
set_prev_mi(cm);
x->quant_fp = cpi->sf.use_quant_fp;
x->skip_txfm = 0;
if (sf->use_nonrd_pick_mode) {
// Initialize internal buffer pointers for rtc coding, where non-RD
......
......@@ -306,6 +306,56 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint16_t *const eob = &p->eobs[block];
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
int i, j;
const int16_t *src_diff;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
switch (tx_size) {
case TX_32X32:
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan_order->scan,
scan_order->iscan);
break;
case TX_16X16:
vp9_fdct16x16(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
vp9_fdct8x8(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob,
scan_order->scan, scan_order->iscan);
break;
default:
assert(0);
}
}
void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
......@@ -424,11 +474,15 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
if (x->skip_txfm == 0) {
// full forward transform and quantization
if (!x->skip_recode)
vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
if (!x->skip_recode) {
if (x->quant_fp)
vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
else
vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
}
} else if (x->skip_txfm == 2) {
// fast path forward transform and quantization
vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
} else {
// skip forward transform
p->eobs[block] = 0;
......
......@@ -24,6 +24,8 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
......
......@@ -42,9 +42,9 @@ void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
}
void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
const int16_t *round_ptr, const int16_t quant,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
int eob = -1;
if (!skip_block) {
......@@ -63,6 +63,47 @@ void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
*eob_ptr = eob + 1;
}
void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, eob = -1;
// TODO(jingning) Decide the need of these arguments after the
// quantization process is completed.
(void)zbin_ptr;
(void)quant_shift_ptr;
(void)zbin_oq_value;
(void)iscan;
vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
if (!skip_block) {
// Quantization pass: All coefficients with index >= zero_flag are
// skippable. Note: zero_flag can be zero.
for (i = 0; i < count; i++) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
tmp = (tmp * quant_ptr[rc != 0]) >> 16;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
if (tmp)
eob = i;
}
}
*eob_ptr = eob + 1;
}
void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
......@@ -207,11 +248,16 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
const int qrounding_factor = q == 0 ? 64 : 48;
for (i = 0; i < 2; ++i) {
int qrounding_factor_fp = i == 0 ? 48 : 42;
if (q == 0)
qrounding_factor_fp = 64;
// y
quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
: vp9_ac_quant(q, 0);
invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
quants->y_quant_fp[q][i] = (1 << 16) / quant;
quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
cm->y_dequant[q][i] = quant;
......@@ -222,6 +268,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
invert_quant(&quants->uv_quant[q][i],
&quants->uv_quant_shift[q][i], quant);
quants->uv_quant_fp[q][i] = (1 << 16) / quant;
quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
cm->uv_dequant[q][i] = quant;
......@@ -240,6 +287,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
for (i = 2; i < 8; i++) {
quants->y_quant[q][i] = quants->y_quant[q][1];
quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
quants->y_zbin[q][i] = quants->y_zbin[q][1];
quants->y_round[q][i] = quants->y_round[q][1];
......@@ -247,6 +295,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
quants->uv_quant[q][i] = quants->uv_quant[q][1];
quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1];
quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
quants->uv_round[q][i] = quants->uv_round[q][1];
......@@ -276,6 +325,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
// Y
x->plane[0].quant = quants->y_quant[qindex];
x->plane[0].quant_fp = quants->y_quant_fp[qindex];
x->plane[0].round_fp = quants->y_round_fp[qindex];
x->plane[0].quant_shift = quants->y_quant_shift[qindex];
x->plane[0].zbin = quants->y_zbin[qindex];
x->plane[0].round = quants->y_round[qindex];
......@@ -286,6 +336,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
for (i = 1; i < 3; i++) {
x->plane[i].quant = quants->uv_quant[qindex];
x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
x->plane[i].round_fp = quants->uv_round_fp[qindex];
x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
x->plane[i].zbin = quants->uv_zbin[qindex];
x->plane[i].round = quants->uv_round[qindex];
......
......@@ -28,6 +28,8 @@ typedef struct {
// if we want to deprecate the current use of y_quant.
DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
......
......@@ -282,6 +282,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
sf->elevate_newmv_thresh = 2000;
}
if (speed >= 7) {
sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
800 : 300;
......@@ -318,6 +319,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->use_lp32x32fdct = 0;
sf->adaptive_motion_search = 0;
sf->adaptive_pred_interp_filter = 0;
sf->use_quant_fp = 0;
sf->reference_masking = 0;
sf->partition_search_type = SEARCH_PARTITION;
sf->less_rectangular_check = 0;
......
......@@ -284,6 +284,9 @@ typedef struct SPEED_FEATURES {
// was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
int adaptive_pred_interp_filter;
// Fast quantization process path
int use_quant_fp;
// Search through variable block partition types in non-RD mode decision
// encoding process for RTC.
int partition_check;
......
......@@ -217,3 +217,185 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
INIT_XMM ssse3
QUANTIZE_FN b, 7
QUANTIZE_FN b_32x32, 7
%macro QUANTIZE_FP 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
shift, qcoeff, dqcoeff, dequant, zbin_oq, \
eob, scan, iscan
cmp dword skipm, 0
jne .blank
; actual quantize loop - setup pointers, rounders, etc.
movifnidn coeffq, coeffmp
movifnidn ncoeffq, ncoeffmp
mov r2, dequantmp
movifnidn zbinq, zbinmp
movifnidn roundq, roundmp
movifnidn quantq, quantmp
mova m1, [roundq] ; m1 = round
mova m2, [quantq] ; m2 = quant
%ifidn %1, b_32x32
; TODO(jingning) to be continued with 32x32 quantization process
pcmpeqw m5, m5
psrlw m5, 15
paddw m0, m5
paddw m1, m5
psrlw m0, 1 ; m0 = (m0 + 1) / 2
psrlw m1, 1 ; m1 = (m1 + 1) / 2
%endif
mova m3, [r2q] ; m3 = dequant
mov r3, qcoeffmp
mov r4, dqcoeffmp
mov r5, iscanmp
%ifidn %1, b_32x32
psllw m4, 1
%endif
pxor m5, m5 ; m5 = dedicated zero
DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
lea coeffq, [ coeffq+ncoeffq*2]
lea iscanq, [ iscanq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2]
lea dqcoeffq, [dqcoeffq+ncoeffq*2]
neg ncoeffq
; get DC and first 15 AC coeffs
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
pcmpeqw m7, m7
pcmpeqw m12, m12
paddsw m6, m1 ; m6 += round
punpckhqdq m1, m1
paddsw m11, m1 ; m11 += round
pmulhw m8, m6, m2 ; m8 = m6*q>>16
punpckhqdq m2, m2
pmulhw m13, m11, m2 ; m13 = m11*q>>16
psignw m8, m9 ; m8 = reinsert sign
psignw m13, m10 ; m13 = reinsert sign
mova [qcoeffq+ncoeffq*2+ 0], m8
mova [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
pabsw m8, m8
pabsw m13, m13
%endif
pmullw m8, m3 ; dqc[i] = qc[i] * q
punpckhqdq m3, m3
pmullw m13, m3 ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
psrlw m8, 1
psrlw m13, 1
psignw m8, m9
psignw m13, m10
%endif
mova [dqcoeffq+ncoeffq*2+ 0], m8
mova [dqcoeffq+ncoeffq*2+16], m13
pcmpeqw m8, m5 ; m8 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0
mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
psubw m6, m7 ; m6 = scan[i] + 1
psubw m11, m12 ; m11 = scan[i] + 1
pandn m8, m6 ; m8 = max(eob)
pandn m13, m11 ; m13 = max(eob)
pmaxsw m8, m13
add ncoeffq, mmsize
jz .accumulate_eob
.ac_only_loop:
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
pcmpeqw m7, m7
pcmpeqw m12, m12
%ifidn %1, b_32x32
pmovmskb r6, m7
pmovmskb r2, m12
or r6, r2
jz .skip_iter
%endif
paddsw m6, m1 ; m6 += round
paddsw m11, m1 ; m11 += round
pmulhw m14, m6, m2 ; m14 = m6*q>>16
pmulhw m13, m11, m2 ; m13 = m11*q>>16
psignw m14, m9 ; m14 = reinsert sign
psignw m13, m10 ; m13 = reinsert sign
mova [qcoeffq+ncoeffq*2+ 0], m14
mova [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
pabsw m14, m14
pabsw m13, m13
%endif
pmullw m14, m3 ; dqc[i] = qc[i] * q
pmullw m13, m3 ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
psrlw m14, 1
psrlw m13, 1
psignw m14, m9
psignw m13, m10
%endif
mova [dqcoeffq+ncoeffq*2+ 0], m14
mova [dqcoeffq+ncoeffq*2+16], m13
pcmpeqw m14, m5 ; m14 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0
mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
psubw m6, m7 ; m6 = scan[i] + 1
psubw m11, m12 ; m11 = scan[i] + 1
pandn m14, m6 ; m14 = max(eob)
pandn m13, m11 ; m13 = max(eob)
pmaxsw m8, m14
pmaxsw m8, m13
add ncoeffq, mmsize
jl .ac_only_loop
%ifidn %1, b_32x32
jmp .accumulate_eob
.skip_iter:
mova [qcoeffq+ncoeffq*2+ 0], m5
mova [qcoeffq+ncoeffq*2+16], m5
mova [dqcoeffq+ncoeffq*2+ 0], m5
mova [dqcoeffq+ncoeffq*2+16], m5
add ncoeffq, mmsize
jl .ac_only_loop
%endif
.accumulate_eob:
; horizontally accumulate/max eobs and write into [eob] memory pointer
mov r2, eobmp
pshufd m7, m8, 0xe
pmaxsw m8, m7
pshuflw m7, m8, 0xe
pmaxsw m8, m7
pshuflw m7, m8, 0x1
pmaxsw m8, m7
pextrw r6, m8, 0
mov [r2], r6
RET
; skip-block, i.e. just write all zeroes
.blank:
mov r0, dqcoeffmp
movifnidn ncoeffq, ncoeffmp
mov r2, qcoeffmp
mov r3, eobmp
DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
lea dqcoeffq, [dqcoeffq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2]
neg ncoeffq
pxor m7, m7
.blank_loop:
mova [dqcoeffq+ncoeffq*2+ 0], m7
mova [dqcoeffq+ncoeffq*2+16], m7
mova [qcoeffq+ncoeffq*2+ 0], m7
mova [qcoeffq+ncoeffq*2+16], m7
add ncoeffq, mmsize
jl .blank_loop
mov word [eobq], 0
RET
%endmacro
INIT_XMM ssse3
QUANTIZE_FP fp, 7
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment