Commit c8defcfd authored by Ronald S. Bultje
Browse files

Update quantize SSSE3 SIMD to cover 32x32 transform case also.

Encode time of bus (speed 0), 50 frames @ 1500 kbps, goes from 2min14.4s to
2min10.1s, i.e. a 3.2% reduction in overall encode time.

Change-Id: I3699580e74ec26c7d24e03681bc47ba25ee1ee87
parent 7353ceab
......@@ -569,6 +569,9 @@ specialize vp9_subtract_block sse2
# Block quantizer entry points. vp9_quantize_b and vp9_quantize_b_32x32 are
# declared with identical argument lists, and each is given a specialization
# via $ssse3_x86_64 (presumably resolves to the SSSE3 build on x86-64 only;
# confirm where that variable is defined).
prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
specialize vp9_quantize_b $ssse3_x86_64
prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
specialize vp9_quantize_b_32x32 $ssse3_x86_64
#
# Structured Similarity (SSIM)
#
......
......@@ -85,18 +85,19 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
}
// This function works well for large transform size.
static void quantize_sparse(int16_t *coeff_ptr, intptr_t n_coeffs,
void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
int16_t *zbin_ptr, int16_t *round_ptr,
int16_t *quant_ptr, int16_t *quant_shift_ptr,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
int16_t *dequant_ptr, int zbin_oq_value,
uint16_t *eob_ptr, const int16_t *scan,
int *idx_arr) {
const int16_t *iscan) {
int i, rc, eob;
int zbins[2], nzbins[2], zbin;
int x, y, z, sz;
int idx = 0;
int idx_arr[1024];
vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
......@@ -179,20 +180,18 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
// Call different quantization for different transform size.
if (n_coeffs >= 1024) {
// Save index of picked coefficient in pre-scan pass.
int idx_arr[1024];
quantize_sparse(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
n_coeffs, mb->skip_block,
mb->plane[plane].zbin,
mb->plane[plane].round,
mb->plane[plane].quant,
mb->plane[plane].quant_shift,
BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
xd->plane[plane].dequant,
mb->plane[plane].zbin_extra,
&xd->plane[plane].eobs[block],
scan, idx_arr);
vp9_quantize_b_32x32(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
n_coeffs, mb->skip_block,
mb->plane[plane].zbin,
mb->plane[plane].round,
mb->plane[plane].quant,
mb->plane[plane].quant_shift,
BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
xd->plane[plane].dequant,
mb->plane[plane].zbin_extra,
&xd->plane[plane].eobs[block],
scan, iscan);
}
else {
vp9_quantize_b(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
......
......@@ -15,10 +15,10 @@ pw_1: times 8 dw 1
SECTION .text
INIT_XMM ssse3
cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
shift, qcoeff, dqcoeff, dequant, zbin_oq, \
eob, scan, iscan
%macro QUANTIZE_FN 1
cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
shift, qcoeff, dqcoeff, dequant, zbin_oq, \
eob, scan, iscan
cmp dword skipm, 0
jne .blank
......@@ -57,6 +57,10 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
%ifidn %1, b_32x32
paddw m6, m6
paddw m11, m11
%endif
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
punpckhqdq m0, m0
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
......@@ -77,9 +81,19 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
pand m13, m12
mova [qcoeffq+ncoeffq*2+ 0], m8
mova [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
pabsw m8, m8
pabsw m13, m13
%endif
pmullw m8, m3 ; dqc[i] = qc[i] * q
punpckhqdq m3, m3
pmullw m13, m3 ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
psrlw m8, 1
psrlw m13, 1
psignw m8, m9
psignw m13, m10
%endif
mova [dqcoeffq+ncoeffq*2+ 0], m8
mova [dqcoeffq+ncoeffq*2+16], m13
pcmpeqw m8, m5 ; m8 = c[i] == 0
......@@ -99,6 +113,10 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
%ifidn %1, b_32x32
paddw m6, m6
paddw m11, m11
%endif
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
paddw m6, m1 ; m6 += round
......@@ -115,8 +133,18 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
pand m13, m12
mova [qcoeffq+ncoeffq*2+ 0], m14
mova [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
pabsw m14, m14
pabsw m13, m13
%endif
pmullw m14, m3 ; dqc[i] = qc[i] * q
pmullw m13, m3 ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
psrlw m14, 1
psrlw m13, 1
psignw m14, m9
psignw m13, m10
%endif
mova [dqcoeffq+ncoeffq*2+ 0], m14
mova [dqcoeffq+ncoeffq*2+16], m13
pcmpeqw m14, m5 ; m14 = c[i] == 0
......@@ -163,3 +191,8 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
jl .blank_loop
mov word [eobq], 0
RET
%endmacro
INIT_XMM ssse3
QUANTIZE_FN b
QUANTIZE_FN b_32x32
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment