Commit 9bf89689, authored by Linfeng Zhang and committed by James Zern

Upgrade fwht4x4_mmx() to fwht4x4_sse2() (from libvpx)

Cherry-pick of libvpx commit af7fb17c, "Upgrade fwht4x4_mmx() to
fwht4x4_sse2() for vp9 and vp10".

A function-level timing test shows about a 27% time saving on
a Xeon E5-2680 v2 desktop.

Rename dct_sse2.c to dct_intrin_sse2.c to avoid duplicate basenames.

Change-Id: I2c504130099af8f0ccc07da0dacef2464197b0ac
parent fa0076a7
......@@ -90,7 +90,7 @@ AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/highbd_block_error_intrin_sse2.c
endif
ifeq ($(CONFIG_USE_X86INC),yes)
AV1_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/error_sse2.asm
endif
......@@ -100,7 +100,7 @@ AV1_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3_x86_64.asm
endif
endif
AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.c
AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c
AV1_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/error_intrin_avx2.c
......
......@@ -227,7 +227,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
specialize qw/av1_fht16x16 sse2/;
add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fwht4x4/, "$mmx_x86inc";
specialize qw/av1_fwht4x4/, "$sse2_x86inc";
if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct4x4/;
......@@ -342,7 +342,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
specialize qw/av1_fht16x16 sse2 msa/;
add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fwht4x4 msa/, "$mmx_x86inc";
specialize qw/av1_fwht4x4 msa/, "$sse2_x86inc";
if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct4x4/;
......
......@@ -9,8 +9,6 @@
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
;
%define private_prefix av1
%include "third_party/x86inc/x86inc.asm"
......@@ -35,22 +33,18 @@ SECTION .text
%endmacro
%macro TRANSPOSE_4X4 0
movq m4, m0
movq m5, m2
punpcklwd m4, m1
punpckhwd m0, m1
punpcklwd m5, m3
punpckhwd m2, m3
movq m1, m4
movq m3, m0
punpckldq m1, m5
punpckhdq m4, m5
punpckldq m3, m2
punpckhdq m0, m2
SWAP 2, 3, 0, 1, 4
; 00 01 02 03
; 10 11 12 13
; 20 21 22 23
; 30 31 32 33
punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13
punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33
mova m1, m0
punpckldq m0, m2 ; 00 10 20 30 01 11 21 31
punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33
%endmacro
INIT_MMX mmx
INIT_XMM sse2
cglobal fwht4x4, 3, 4, 8, input, output, stride
lea r3q, [inputq + strideq*4]
movq m0, [inputq] ;a1
......@@ -60,48 +54,34 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
TRANSFORM_COLS
TRANSPOSE_4X4
SWAP 1, 2
psrldq m1, m0, 8
psrldq m3, m2, 8
TRANSFORM_COLS
TRANSPOSE_4X4
psllw m0, 2
psllw m1, 2
psllw m2, 2
psllw m3, 2
%if CONFIG_AOM_HIGHBITDEPTH
pxor m4, m4
pxor m5, m5
pcmpgtw m4, m0
pcmpgtw m5, m1
movq m6, m0
movq m7, m1
punpcklwd m0, m4
punpcklwd m1, m5
punpckhwd m6, m4
punpckhwd m7, m5
movq [outputq], m0
movq [outputq + 8], m6
movq [outputq + 16], m1
movq [outputq + 24], m7
pxor m4, m4
pxor m5, m5
pcmpgtw m4, m2
pcmpgtw m5, m3
movq m6, m2
movq m7, m3
punpcklwd m2, m4
punpcklwd m3, m5
punpckhwd m6, m4
punpckhwd m7, m5
movq [outputq + 32], m2
movq [outputq + 40], m6
movq [outputq + 48], m3
movq [outputq + 56], m7
; sign extension
mova m2, m0
mova m3, m1
punpcklwd m0, m0
punpcklwd m1, m1
punpckhwd m2, m2
punpckhwd m3, m3
psrad m0, 16
psrad m1, 16
psrad m2, 16
psrad m3, 16
mova [outputq], m0
mova [outputq + 16], m2
mova [outputq + 32], m1
mova [outputq + 48], m3
%else
movq [outputq], m0
movq [outputq + 8], m1
movq [outputq + 16], m2
movq [outputq + 24], m3
mova [outputq], m0
mova [outputq + 16], m1
%endif
RET
......@@ -453,20 +453,12 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon, 3, AOM_BITS_8)));
#endif // HAVE_NEON && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_AOM_HIGHBITDEPTH && \
!CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(MMX, Trans4x4WHT,
::testing::Values(make_tuple(&av1_fwht4x4_mmx,
&aom_iwht4x4_16_add_c, 0,
AOM_BITS_8)));
#endif
#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_AOM_HIGHBITDEPTH && \
!CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(SSE2, Trans4x4WHT,
::testing::Values(make_tuple(&av1_fwht4x4_c,
&aom_iwht4x4_16_add_sse2,
0, AOM_BITS_8)));
#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4WHT,
::testing::Values(
make_tuple(&av1_fwht4x4_sse2, &aom_iwht4x4_16_add_c, 0, AOM_BITS_8),
make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_sse2, 0, AOM_BITS_8)));
#endif
#if HAVE_SSE2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment