Commit 2ab7b9a6 authored by Linfeng Zhang, committed by Gerrit Code Review

Merge "Upgrade fwht4x4_mmx() to fwht4x4_sse2() for vp9 and vp10."

parents 0ba9b299 af7fb17c
......@@ -487,19 +487,11 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \
!CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
MMX, Trans4x4WHT,
::testing::Values(
make_tuple(&vp9_fwht4x4_mmx, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
#endif
#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \
!CONFIG_EMULATE_HARDWARE
#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4WHT,
::testing::Values(
make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));
#endif
......
......@@ -398,7 +398,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp10_fht16x16 sse2/;
add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fwht4x4/, "$mmx_x86inc";
specialize qw/vp10_fwht4x4/, "$sse2_x86inc";
} else {
add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp10_fht4x4 sse2 msa/;
......@@ -410,7 +410,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp10_fht16x16 sse2 msa/;
add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
specialize qw/vp10_fwht4x4 msa/, "$sse2_x86inc";
}
# Inverse transform
......
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
; Copyright (c) 2016 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
......@@ -32,22 +32,18 @@ SECTION .text
%endmacro
; TRANSPOSE_4X4 (takes no macro arguments): transposes a 4x4 block of 16-bit
; words held one row per register in m0..m3 (see the lane diagram below).
; NOTE(review): this listing appears to be a diff with its +/- markers
; stripped. The instructions up to and including the SWAP line look like the
; removed MMX body, and the lane-annotated punpck sequence after it looks like
; the SSE2 replacement -- confirm against the checked-in file.
%macro TRANSPOSE_4X4 0
; --- presumably the former MMX implementation (uses m4/m5 as scratch) ---
movq m4, m0
movq m5, m2
punpcklwd m4, m1
punpckhwd m0, m1
punpcklwd m5, m3
punpckhwd m2, m3
movq m1, m4
movq m3, m0
punpckldq m1, m5
punpckhdq m4, m5
punpckldq m3, m2
punpckhdq m0, m2
SWAP 2, 3, 0, 1, 4
; --- presumably the SSE2 implementation ---
; Input layout, one row of four words in the low half of each register
; (m0 = row 0 ... m3 = row 3):
; 00 01 02 03
; 10 11 12 13
; 20 21 22 23
; 30 31 32 33
punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13
punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33
; Copy the row-0/1 interleave so both the low and the high dword unpack
; can be taken from it.
mova m1, m0
punpckldq m0, m2 ; 00 10 20 30 01 11 21 31
punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33
; Result per the lane comments: m0 holds transposed rows 0 and 1, m1 holds
; transposed rows 2 and 3; m2 still holds the row-2/3 word interleave.
%endmacro
INIT_MMX mmx
INIT_XMM sse2
cglobal fwht4x4, 3, 4, 8, input, output, stride
lea r3q, [inputq + strideq*4]
movq m0, [inputq] ;a1
......@@ -57,48 +53,34 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
TRANSFORM_COLS
TRANSPOSE_4X4
SWAP 1, 2
psrldq m1, m0, 8
psrldq m3, m2, 8
TRANSFORM_COLS
TRANSPOSE_4X4
psllw m0, 2
psllw m1, 2
psllw m2, 2
psllw m3, 2
%if CONFIG_VP9_HIGHBITDEPTH
pxor m4, m4
pxor m5, m5
pcmpgtw m4, m0
pcmpgtw m5, m1
movq m6, m0
movq m7, m1
punpcklwd m0, m4
punpcklwd m1, m5
punpckhwd m6, m4
punpckhwd m7, m5
movq [outputq], m0
movq [outputq + 8], m6
movq [outputq + 16], m1
movq [outputq + 24], m7
pxor m4, m4
pxor m5, m5
pcmpgtw m4, m2
pcmpgtw m5, m3
movq m6, m2
movq m7, m3
punpcklwd m2, m4
punpcklwd m3, m5
punpckhwd m6, m4
punpckhwd m7, m5
movq [outputq + 32], m2
movq [outputq + 40], m6
movq [outputq + 48], m3
movq [outputq + 56], m7
; sign extension
mova m2, m0
mova m3, m1
punpcklwd m0, m0
punpcklwd m1, m1
punpckhwd m2, m2
punpckhwd m3, m3
psrad m0, 16
psrad m1, 16
psrad m2, 16
psrad m3, 16
mova [outputq], m0
mova [outputq + 16], m2
mova [outputq + 32], m1
mova [outputq + 48], m3
%else
movq [outputq], m0
movq [outputq + 8], m1
movq [outputq + 16], m2
movq [outputq + 24], m3
mova [outputq], m0
mova [outputq + 16], m1
%endif
RET
......@@ -93,7 +93,7 @@ VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/highbd_block_error_intrin_sse2.c
endif
ifeq ($(CONFIG_USE_X86INC),yes)
VP10_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/error_sse2.asm
endif
......@@ -103,7 +103,7 @@ VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3_x86_64.asm
endif
endif
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.c
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c
VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
......
......@@ -245,7 +245,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_fht16x16 sse2/;
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
specialize qw/vp9_fwht4x4/, "$sse2_x86inc";
} else {
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht4x4 sse2 msa/;
......@@ -257,7 +257,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_fht16x16 sse2 msa/;
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fwht4x4 msa/, "$mmx_x86inc";
specialize qw/vp9_fwht4x4 msa/, "$sse2_x86inc";
}
#
......
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
; Copyright (c) 2016 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
......@@ -32,23 +32,20 @@ SECTION .text
%endmacro
; TRANSPOSE_4X4 (takes no macro arguments): transposes a 4x4 block of 16-bit
; words held one row per register in m0..m3 (see the lane diagram below).
; NOTE(review): this listing appears to be a diff with its +/- markers
; stripped. The instructions up to and including the SWAP line look like the
; removed MMX body, and the lane-annotated punpck sequence after it looks like
; the SSE2 replacement -- confirm against the checked-in file.
%macro TRANSPOSE_4X4 0
; --- presumably the former MMX implementation (uses m4/m5 as scratch) ---
movq m4, m0
movq m5, m2
punpcklwd m4, m1
punpckhwd m0, m1
punpcklwd m5, m3
punpckhwd m2, m3
movq m1, m4
movq m3, m0
punpckldq m1, m5
punpckhdq m4, m5
punpckldq m3, m2
punpckhdq m0, m2
SWAP 2, 3, 0, 1, 4
; --- presumably the SSE2 implementation ---
; Input layout, one row of four words in the low half of each register
; (m0 = row 0 ... m3 = row 3):
; 00 01 02 03
; 10 11 12 13
; 20 21 22 23
; 30 31 32 33
punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13
punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33
; Copy the row-0/1 interleave so both the low and the high dword unpack
; can be taken from it.
mova m1, m0
punpckldq m0, m2 ; 00 10 20 30 01 11 21 31
punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33
; Result per the lane comments: m0 holds transposed rows 0 and 1, m1 holds
; transposed rows 2 and 3; m2 still holds the row-2/3 word interleave.
%endmacro
INIT_MMX mmx
INIT_XMM sse2
cglobal fwht4x4, 3, 4, 8, input, output, stride
; TODO(linfeng): The duplication with vp10 should be resolved.
lea r3q, [inputq + strideq*4]
movq m0, [inputq] ;a1
movq m1, [inputq + strideq*2] ;b1
......@@ -57,48 +54,34 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
TRANSFORM_COLS
TRANSPOSE_4X4
SWAP 1, 2
psrldq m1, m0, 8
psrldq m3, m2, 8
TRANSFORM_COLS
TRANSPOSE_4X4
psllw m0, 2
psllw m1, 2
psllw m2, 2
psllw m3, 2
%if CONFIG_VP9_HIGHBITDEPTH
pxor m4, m4
pxor m5, m5
pcmpgtw m4, m0
pcmpgtw m5, m1
movq m6, m0
movq m7, m1
punpcklwd m0, m4
punpcklwd m1, m5
punpckhwd m6, m4
punpckhwd m7, m5
movq [outputq], m0
movq [outputq + 8], m6
movq [outputq + 16], m1
movq [outputq + 24], m7
pxor m4, m4
pxor m5, m5
pcmpgtw m4, m2
pcmpgtw m5, m3
movq m6, m2
movq m7, m3
punpcklwd m2, m4
punpcklwd m3, m5
punpckhwd m6, m4
punpckhwd m7, m5
movq [outputq + 32], m2
movq [outputq + 40], m6
movq [outputq + 48], m3
movq [outputq + 56], m7
; sign extension
mova m2, m0
mova m3, m1
punpcklwd m0, m0
punpcklwd m1, m1
punpckhwd m2, m2
punpckhwd m3, m3
psrad m0, 16
psrad m1, 16
psrad m2, 16
psrad m3, 16
mova [outputq], m0
mova [outputq + 16], m2
mova [outputq + 32], m1
mova [outputq + 48], m3
%else
movq [outputq], m0
movq [outputq + 8], m1
movq [outputq + 16], m2
movq [outputq + 24], m3
mova [outputq], m0
mova [outputq + 16], m1
%endif
RET
......@@ -101,7 +101,7 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
endif
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
......@@ -116,7 +116,7 @@ VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
endif
endif
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment