Commit ce3f4ade, authored by Debargha Mukherjee, committed by Gerrit Code Review

Merge "SSSE3 optimisation for quantize in high bit depth"

parents 7266bedc 37c68efe
...@@ -849,10 +849,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { ...@@ -849,10 +849,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) { if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vpx_quantize_b sse2/; specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vpx_quantize_b_32x32/; specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vpx_highbd_quantize_b sse2/; specialize qw/vpx_highbd_quantize_b sse2/;
......
...@@ -53,15 +53,31 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ ...@@ -53,15 +53,31 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
%endif %endif
pxor m5, m5 ; m5 = dedicated zero pxor m5, m5 ; m5 = dedicated zero
DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
%if CONFIG_VP9_HIGHBITDEPTH
lea coeffq, [ coeffq+ncoeffq*4]
lea qcoeffq, [ qcoeffq+ncoeffq*4]
lea dqcoeffq, [dqcoeffq+ncoeffq*4]
%else
lea coeffq, [ coeffq+ncoeffq*2] lea coeffq, [ coeffq+ncoeffq*2]
lea iscanq, [ iscanq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2] lea qcoeffq, [ qcoeffq+ncoeffq*2]
lea dqcoeffq, [dqcoeffq+ncoeffq*2] lea dqcoeffq, [dqcoeffq+ncoeffq*2]
%endif
lea iscanq, [ iscanq+ncoeffq*2]
neg ncoeffq neg ncoeffq
; get DC and first 15 AC coeffs ; get DC and first 15 AC coeffs
%if CONFIG_VP9_HIGHBITDEPTH
; coeff is stored as 32bit numbers but 16bit numbers are required, so pack them
mova m9, [ coeffq+ncoeffq*4+ 0]
mova m6, [ coeffq+ncoeffq*4+16]
mova m10, [ coeffq+ncoeffq*4+32]
mova m11, [ coeffq+ncoeffq*4+48]
packssdw m9, m6 ; m9 = c[i]
packssdw m10, m11 ; m10 = c[i]
%else
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
%endif
pabsw m6, m9 ; m6 = abs(m9) pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10) pabsw m11, m10 ; m11 = abs(m10)
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
...@@ -82,8 +98,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ ...@@ -82,8 +98,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psignw m13, m10 ; m13 = reinsert sign psignw m13, m10 ; m13 = reinsert sign
pand m8, m7 pand m8, m7
pand m13, m12 pand m13, m12
%if CONFIG_VP9_HIGHBITDEPTH
; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
mova m11, m8
mova m6, m8
pcmpgtw m5, m8
punpcklwd m11, m5
punpckhwd m6, m5
mova [qcoeffq+ncoeffq*4+ 0], m11
mova [qcoeffq+ncoeffq*4+16], m6
pxor m5, m5
mova m11, m13
mova m6, m13
pcmpgtw m5, m13
punpcklwd m11, m5
punpckhwd m6, m5
mova [qcoeffq+ncoeffq*4+32], m11
mova [qcoeffq+ncoeffq*4+48], m6
pxor m5, m5 ; reset m5 to zero register
%else
mova [qcoeffq+ncoeffq*2+ 0], m8 mova [qcoeffq+ncoeffq*2+ 0], m8
mova [qcoeffq+ncoeffq*2+16], m13 mova [qcoeffq+ncoeffq*2+16], m13
%endif
%ifidn %1, b_32x32 %ifidn %1, b_32x32
pabsw m8, m8 pabsw m8, m8
pabsw m13, m13 pabsw m13, m13
...@@ -97,8 +133,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ ...@@ -97,8 +133,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psignw m8, m9 psignw m8, m9
psignw m13, m10 psignw m13, m10
%endif %endif
%if CONFIG_VP9_HIGHBITDEPTH
; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
mova m11, m8
mova m6, m8
pcmpgtw m5, m8
punpcklwd m11, m5
punpckhwd m6, m5
mova [dqcoeffq+ncoeffq*4+ 0], m11
mova [dqcoeffq+ncoeffq*4+16], m6
pxor m5, m5
mova m11, m13
mova m6, m13
pcmpgtw m5, m13
punpcklwd m11, m5
punpckhwd m6, m5
mova [dqcoeffq+ncoeffq*4+32], m11
mova [dqcoeffq+ncoeffq*4+48], m6
pxor m5, m5 ; reset m5 to zero register
%else
mova [dqcoeffq+ncoeffq*2+ 0], m8 mova [dqcoeffq+ncoeffq*2+ 0], m8
mova [dqcoeffq+ncoeffq*2+16], m13 mova [dqcoeffq+ncoeffq*2+16], m13
%endif
pcmpeqw m8, m5 ; m8 = c[i] == 0 pcmpeqw m8, m5 ; m8 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0
mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
...@@ -112,8 +168,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ ...@@ -112,8 +168,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
jz .accumulate_eob jz .accumulate_eob
.ac_only_loop: .ac_only_loop:
%if CONFIG_VP9_HIGHBITDEPTH
; pack coeff from 32bit to 16bit array
mova m9, [ coeffq+ncoeffq*4+ 0]
mova m6, [ coeffq+ncoeffq*4+16]
mova m10, [ coeffq+ncoeffq*4+32]
mova m11, [ coeffq+ncoeffq*4+48]
packssdw m9, m6 ; m9 = c[i]
packssdw m10, m11 ; m10 = c[i]
%else
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
%endif
pabsw m6, m9 ; m6 = abs(m9) pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10) pabsw m11, m10 ; m11 = abs(m10)
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
...@@ -136,8 +202,29 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ ...@@ -136,8 +202,29 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psignw m13, m10 ; m13 = reinsert sign psignw m13, m10 ; m13 = reinsert sign
pand m14, m7 pand m14, m7
pand m13, m12 pand m13, m12
%if CONFIG_VP9_HIGHBITDEPTH
; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
pxor m11, m11
mova m11, m14
mova m6, m14
pcmpgtw m5, m14
punpcklwd m11, m5
punpckhwd m6, m5
mova [qcoeffq+ncoeffq*4+ 0], m11
mova [qcoeffq+ncoeffq*4+16], m6
pxor m5, m5
mova m11, m13
mova m6, m13
pcmpgtw m5, m13
punpcklwd m11, m5
punpckhwd m6, m5
mova [qcoeffq+ncoeffq*4+32], m11
mova [qcoeffq+ncoeffq*4+48], m6
pxor m5, m5 ; reset m5 to zero register
%else
mova [qcoeffq+ncoeffq*2+ 0], m14 mova [qcoeffq+ncoeffq*2+ 0], m14
mova [qcoeffq+ncoeffq*2+16], m13 mova [qcoeffq+ncoeffq*2+16], m13
%endif
%ifidn %1, b_32x32 %ifidn %1, b_32x32
pabsw m14, m14 pabsw m14, m14
pabsw m13, m13 pabsw m13, m13
...@@ -150,8 +237,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ ...@@ -150,8 +237,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psignw m14, m9 psignw m14, m9
psignw m13, m10 psignw m13, m10
%endif %endif
%if CONFIG_VP9_HIGHBITDEPTH
; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
mova m11, m14
mova m6, m14
pcmpgtw m5, m14
punpcklwd m11, m5
punpckhwd m6, m5
mova [dqcoeffq+ncoeffq*4+ 0], m11
mova [dqcoeffq+ncoeffq*4+16], m6
pxor m5, m5
mova m11, m13
mova m6, m13
pcmpgtw m5, m13
punpcklwd m11, m5
punpckhwd m6, m5
mova [dqcoeffq+ncoeffq*4+32], m11
mova [dqcoeffq+ncoeffq*4+48], m6
pxor m5, m5
%else
mova [dqcoeffq+ncoeffq*2+ 0], m14 mova [dqcoeffq+ncoeffq*2+ 0], m14
mova [dqcoeffq+ncoeffq*2+16], m13 mova [dqcoeffq+ncoeffq*2+16], m13
%endif
pcmpeqw m14, m5 ; m14 = c[i] == 0 pcmpeqw m14, m5 ; m14 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0
mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
...@@ -168,10 +275,21 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ ...@@ -168,10 +275,21 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
%ifidn %1, b_32x32 %ifidn %1, b_32x32
jmp .accumulate_eob jmp .accumulate_eob
.skip_iter: .skip_iter:
%if CONFIG_VP9_HIGHBITDEPTH
mova [qcoeffq+ncoeffq*4+ 0], m5
mova [qcoeffq+ncoeffq*4+16], m5
mova [qcoeffq+ncoeffq*4+32], m5
mova [qcoeffq+ncoeffq*4+48], m5
mova [dqcoeffq+ncoeffq*4+ 0], m5
mova [dqcoeffq+ncoeffq*4+16], m5
mova [dqcoeffq+ncoeffq*4+32], m5
mova [dqcoeffq+ncoeffq*4+48], m5
%else
mova [qcoeffq+ncoeffq*2+ 0], m5 mova [qcoeffq+ncoeffq*2+ 0], m5
mova [qcoeffq+ncoeffq*2+16], m5 mova [qcoeffq+ncoeffq*2+16], m5
mova [dqcoeffq+ncoeffq*2+ 0], m5 mova [dqcoeffq+ncoeffq*2+ 0], m5
mova [dqcoeffq+ncoeffq*2+16], m5 mova [dqcoeffq+ncoeffq*2+16], m5
%endif
add ncoeffq, mmsize add ncoeffq, mmsize
jl .ac_only_loop jl .ac_only_loop
%endif %endif
...@@ -196,15 +314,31 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ ...@@ -196,15 +314,31 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
mov r2, qcoeffmp mov r2, qcoeffmp
mov r3, eobmp mov r3, eobmp
DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
%if CONFIG_VP9_HIGHBITDEPTH
lea dqcoeffq, [dqcoeffq+ncoeffq*4]
lea qcoeffq, [ qcoeffq+ncoeffq*4]
%else
lea dqcoeffq, [dqcoeffq+ncoeffq*2] lea dqcoeffq, [dqcoeffq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2] lea qcoeffq, [ qcoeffq+ncoeffq*2]
%endif
neg ncoeffq neg ncoeffq
pxor m7, m7 pxor m7, m7
.blank_loop: .blank_loop:
%if CONFIG_VP9_HIGHBITDEPTH
mova [dqcoeffq+ncoeffq*4+ 0], m7
mova [dqcoeffq+ncoeffq*4+16], m7
mova [dqcoeffq+ncoeffq*4+32], m7
mova [dqcoeffq+ncoeffq*4+48], m7
mova [qcoeffq+ncoeffq*4+ 0], m7
mova [qcoeffq+ncoeffq*4+16], m7
mova [qcoeffq+ncoeffq*4+32], m7
mova [qcoeffq+ncoeffq*4+48], m7
%else
mova [dqcoeffq+ncoeffq*2+ 0], m7 mova [dqcoeffq+ncoeffq*2+ 0], m7
mova [dqcoeffq+ncoeffq*2+16], m7 mova [dqcoeffq+ncoeffq*2+16], m7
mova [qcoeffq+ncoeffq*2+ 0], m7 mova [qcoeffq+ncoeffq*2+ 0], m7
mova [qcoeffq+ncoeffq*2+16], m7 mova [qcoeffq+ncoeffq*2+16], m7
%endif
add ncoeffq, mmsize add ncoeffq, mmsize
jl .blank_loop jl .blank_loop
mov word [eobq], 0 mov word [eobq], 0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment