Commit ce3f4ade authored by Debargha Mukherjee's avatar Debargha Mukherjee Committed by Gerrit Code Review

Merge "SSSE3 optimisation for quantize in high bit depth"

parents 7266bedc 37c68efe
......@@ -849,10 +849,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vpx_quantize_b sse2/;
specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vpx_quantize_b_32x32/;
specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vpx_highbd_quantize_b sse2/;
......
......@@ -53,15 +53,31 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
%endif
pxor m5, m5 ; m5 = dedicated zero
DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
%if CONFIG_VP9_HIGHBITDEPTH
lea coeffq, [ coeffq+ncoeffq*4]
lea qcoeffq, [ qcoeffq+ncoeffq*4]
lea dqcoeffq, [dqcoeffq+ncoeffq*4]
%else
lea coeffq, [ coeffq+ncoeffq*2]
lea iscanq, [ iscanq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2]
lea dqcoeffq, [dqcoeffq+ncoeffq*2]
%endif
lea iscanq, [ iscanq+ncoeffq*2]
neg ncoeffq
; get DC and first 15 AC coeffs
%if CONFIG_VP9_HIGHBITDEPTH
; coeff stored as 32bit numbers & require 16bit numbers
mova m9, [ coeffq+ncoeffq*4+ 0]
mova m6, [ coeffq+ncoeffq*4+16]
mova m10, [ coeffq+ncoeffq*4+32]
mova m11, [ coeffq+ncoeffq*4+48]
packssdw m9, m6 ; m9 = c[i]
packssdw m10, m11 ; m10 = c[i]
%else
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
%endif
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
......@@ -82,8 +98,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psignw m13, m10 ; m13 = reinsert sign
pand m8, m7
pand m13, m12
%if CONFIG_VP9_HIGHBITDEPTH
; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
mova m11, m8
mova m6, m8
pcmpgtw m5, m8
punpcklwd m11, m5
punpckhwd m6, m5
mova [qcoeffq+ncoeffq*4+ 0], m11
mova [qcoeffq+ncoeffq*4+16], m6
pxor m5, m5
mova m11, m13
mova m6, m13
pcmpgtw m5, m13
punpcklwd m11, m5
punpckhwd m6, m5
mova [qcoeffq+ncoeffq*4+32], m11
mova [qcoeffq+ncoeffq*4+48], m6
pxor m5, m5 ; reset m5 to zero register
%else
mova [qcoeffq+ncoeffq*2+ 0], m8
mova [qcoeffq+ncoeffq*2+16], m13
%endif
%ifidn %1, b_32x32
pabsw m8, m8
pabsw m13, m13
......@@ -97,8 +133,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psignw m8, m9
psignw m13, m10
%endif
%if CONFIG_VP9_HIGHBITDEPTH
; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
mova m11, m8
mova m6, m8
pcmpgtw m5, m8
punpcklwd m11, m5
punpckhwd m6, m5
mova [dqcoeffq+ncoeffq*4+ 0], m11
mova [dqcoeffq+ncoeffq*4+16], m6
pxor m5, m5
mova m11, m13
mova m6, m13
pcmpgtw m5, m13
punpcklwd m11, m5
punpckhwd m6, m5
mova [dqcoeffq+ncoeffq*4+32], m11
mova [dqcoeffq+ncoeffq*4+48], m6
pxor m5, m5 ; reset m5 to zero register
%else
mova [dqcoeffq+ncoeffq*2+ 0], m8
mova [dqcoeffq+ncoeffq*2+16], m13
%endif
pcmpeqw m8, m5 ; m8 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0
mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
......@@ -112,8 +168,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
jz .accumulate_eob
.ac_only_loop:
%if CONFIG_VP9_HIGHBITDEPTH
; pack coeff from 32bit to 16bit array
mova m9, [ coeffq+ncoeffq*4+ 0]
mova m6, [ coeffq+ncoeffq*4+16]
mova m10, [ coeffq+ncoeffq*4+32]
mova m11, [ coeffq+ncoeffq*4+48]
packssdw m9, m6 ; m9 = c[i]
packssdw m10, m11 ; m10 = c[i]
%else
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
%endif
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
......@@ -136,8 +202,29 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psignw m13, m10 ; m13 = reinsert sign
pand m14, m7
pand m13, m12
%if CONFIG_VP9_HIGHBITDEPTH
; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
pxor m11, m11
mova m11, m14
mova m6, m14
pcmpgtw m5, m14
punpcklwd m11, m5
punpckhwd m6, m5
mova [qcoeffq+ncoeffq*4+ 0], m11
mova [qcoeffq+ncoeffq*4+16], m6
pxor m5, m5
mova m11, m13
mova m6, m13
pcmpgtw m5, m13
punpcklwd m11, m5
punpckhwd m6, m5
mova [qcoeffq+ncoeffq*4+32], m11
mova [qcoeffq+ncoeffq*4+48], m6
pxor m5, m5 ; reset m5 to zero register
%else
mova [qcoeffq+ncoeffq*2+ 0], m14
mova [qcoeffq+ncoeffq*2+16], m13
%endif
%ifidn %1, b_32x32
pabsw m14, m14
pabsw m13, m13
......@@ -150,8 +237,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psignw m14, m9
psignw m13, m10
%endif
%if CONFIG_VP9_HIGHBITDEPTH
; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
mova m11, m14
mova m6, m14
pcmpgtw m5, m14
punpcklwd m11, m5
punpckhwd m6, m5
mova [dqcoeffq+ncoeffq*4+ 0], m11
mova [dqcoeffq+ncoeffq*4+16], m6
pxor m5, m5
mova m11, m13
mova m6, m13
pcmpgtw m5, m13
punpcklwd m11, m5
punpckhwd m6, m5
mova [dqcoeffq+ncoeffq*4+32], m11
mova [dqcoeffq+ncoeffq*4+48], m6
pxor m5, m5
%else
mova [dqcoeffq+ncoeffq*2+ 0], m14
mova [dqcoeffq+ncoeffq*2+16], m13
%endif
pcmpeqw m14, m5 ; m14 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0
mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
......@@ -168,10 +275,21 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
%ifidn %1, b_32x32
jmp .accumulate_eob
.skip_iter:
%if CONFIG_VP9_HIGHBITDEPTH
mova [qcoeffq+ncoeffq*4+ 0], m5
mova [qcoeffq+ncoeffq*4+16], m5
mova [qcoeffq+ncoeffq*4+32], m5
mova [qcoeffq+ncoeffq*4+48], m5
mova [dqcoeffq+ncoeffq*4+ 0], m5
mova [dqcoeffq+ncoeffq*4+16], m5
mova [dqcoeffq+ncoeffq*4+32], m5
mova [dqcoeffq+ncoeffq*4+48], m5
%else
mova [qcoeffq+ncoeffq*2+ 0], m5
mova [qcoeffq+ncoeffq*2+16], m5
mova [dqcoeffq+ncoeffq*2+ 0], m5
mova [dqcoeffq+ncoeffq*2+16], m5
%endif
add ncoeffq, mmsize
jl .ac_only_loop
%endif
......@@ -196,15 +314,31 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
mov r2, qcoeffmp
mov r3, eobmp
DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
%if CONFIG_VP9_HIGHBITDEPTH
lea dqcoeffq, [dqcoeffq+ncoeffq*4]
lea qcoeffq, [ qcoeffq+ncoeffq*4]
%else
lea dqcoeffq, [dqcoeffq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2]
%endif
neg ncoeffq
pxor m7, m7
.blank_loop:
%if CONFIG_VP9_HIGHBITDEPTH
mova [dqcoeffq+ncoeffq*4+ 0], m7
mova [dqcoeffq+ncoeffq*4+16], m7
mova [dqcoeffq+ncoeffq*4+32], m7
mova [dqcoeffq+ncoeffq*4+48], m7
mova [qcoeffq+ncoeffq*4+ 0], m7
mova [qcoeffq+ncoeffq*4+16], m7
mova [qcoeffq+ncoeffq*4+32], m7
mova [qcoeffq+ncoeffq*4+48], m7
%else
mova [dqcoeffq+ncoeffq*2+ 0], m7
mova [dqcoeffq+ncoeffq*2+16], m7
mova [qcoeffq+ncoeffq*2+ 0], m7
mova [qcoeffq+ncoeffq*2+16], m7
%endif
add ncoeffq, mmsize
jl .blank_loop
mov word [eobq], 0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment