Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
2ab7b9a6
Commit
2ab7b9a6
authored
May 27, 2016
by
Linfeng Zhang
Committed by
Gerrit Code Review
May 27, 2016
Browse files
Merge "Upgrade fwht4x4_mmx() to fwht4x4_sse2() for vp9 and vp10."
parents
0ba9b299
af7fb17c
Changes
9
Hide whitespace changes
Inline
Side-by-side
test/fdct4x4_test.cc
View file @
2ab7b9a6
...
...
@@ -487,19 +487,11 @@ INSTANTIATE_TEST_CASE_P(
make_tuple
(
&
vp9_fht4x4_c
,
&
vp9_iht4x4_16_add_neon
,
3
,
VPX_BITS_8
)));
#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \
!CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P
(
MMX
,
Trans4x4WHT
,
::
testing
::
Values
(
make_tuple
(
&
vp9_fwht4x4_mmx
,
&
vpx_iwht4x4_16_add_c
,
0
,
VPX_BITS_8
)));
#endif
#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \
!CONFIG_EMULATE_HARDWARE
#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P
(
SSE2
,
Trans4x4WHT
,
::
testing
::
Values
(
make_tuple
(
&
vp9_fwht4x4_sse2
,
&
vpx_iwht4x4_16_add_c
,
0
,
VPX_BITS_8
),
make_tuple
(
&
vp9_fwht4x4_c
,
&
vpx_iwht4x4_16_add_sse2
,
0
,
VPX_BITS_8
)));
#endif
...
...
vp10/common/vp10_rtcd_defs.pl
View file @
2ab7b9a6
...
...
@@ -398,7 +398,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize
qw/vp10_fht16x16 sse2/
;
add_proto
qw/void vp10_fwht4x4/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp10_fwht4x4/
,
"
$
mmx
_x86inc
";
specialize
qw/vp10_fwht4x4/
,
"
$
sse2
_x86inc
";
}
else
{
add_proto
qw/void vp10_fht4x4/
,
"
const int16_t *input, tran_low_t *output, int stride, int tx_type
";
specialize
qw/vp10_fht4x4 sse2 msa/
;
...
...
@@ -410,7 +410,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize
qw/vp10_fht16x16 sse2 msa/
;
add_proto
qw/void vp10_fwht4x4/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp10_fwht4x4 msa/
,
"
$
mmx
_x86inc
";
specialize
qw/vp10_fwht4x4 msa/
,
"
$
sse2
_x86inc
";
}
# Inverse transform
...
...
vp10/encoder/x86/dct_sse2.c
→
vp10/encoder/x86/dct_
intrin_
sse2.c
View file @
2ab7b9a6
File moved
vp10/encoder/x86/dct_
mmx
.asm
→
vp10/encoder/x86/dct_
sse2
.asm
View file @
2ab7b9a6
;
; Copyright (c) 201
4
The WebM project authors. All Rights Reserved.
; Copyright (c) 201
6
The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
...
...
@@ -32,22 +32,18 @@ SECTION .text
%endmacro
%macro TRANSPOSE_4X4 0
movq
m4
,
m0
movq
m5
,
m2
punpcklwd
m4
,
m1
punpckhwd
m0
,
m1
punpcklwd
m5
,
m3
punpckhwd
m2
,
m3
movq
m1
,
m4
movq
m3
,
m0
punpckldq
m1
,
m5
punpckhdq
m4
,
m5
punpckldq
m3
,
m2
punpckhdq
m0
,
m2
SWAP
2
,
3
,
0
,
1
,
4
; 00 01 02 03
; 10 11 12 13
; 20 21 22 23
; 30 31 32 33
punpcklwd
m0
,
m1
; 00 10 01 11 02 12 03 13
punpcklwd
m2
,
m3
; 20 30 21 31 22 32 23 33
mova
m1
,
m0
punpckldq
m0
,
m2
; 00 10 20 30 01 11 21 31
punpckhdq
m1
,
m2
; 02 12 22 32 03 13 23 33
%endmacro
INIT_MM
X
mmx
INIT_
X
MM
ss
e2
cglobal
fwht4x4
,
3
,
4
,
8
,
input
,
output
,
stride
lea
r3q
,
[
inputq
+
strideq
*
4
]
movq
m0
,
[
inputq
]
;a1
...
...
@@ -57,48 +53,34 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
TRANSFORM_COLS
TRANSPOSE_4X4
SWAP
1
,
2
psrldq
m1
,
m0
,
8
psrldq
m3
,
m2
,
8
TRANSFORM_COLS
TRANSPOSE_4X4
psllw
m0
,
2
psllw
m1
,
2
psllw
m2
,
2
psllw
m3
,
2
%if CONFIG_VP9_HIGHBITDEPTH
pxor
m4
,
m4
pxor
m5
,
m5
pcmpgtw
m4
,
m0
pcmpgtw
m5
,
m1
movq
m6
,
m0
movq
m7
,
m1
punpcklwd
m0
,
m4
punpcklwd
m1
,
m5
punpckhwd
m6
,
m4
punpckhwd
m7
,
m5
movq
[
outputq
],
m0
movq
[
outputq
+
8
],
m6
movq
[
outputq
+
16
],
m1
movq
[
outputq
+
24
],
m7
pxor
m4
,
m4
pxor
m5
,
m5
pcmpgtw
m4
,
m2
pcmpgtw
m5
,
m3
movq
m6
,
m2
movq
m7
,
m3
punpcklwd
m2
,
m4
punpcklwd
m3
,
m5
punpckhwd
m6
,
m4
punpckhwd
m7
,
m5
movq
[
outputq
+
32
],
m2
movq
[
outputq
+
40
],
m6
movq
[
outputq
+
48
],
m3
movq
[
outputq
+
56
],
m7
; sign extension
mova
m2
,
m0
mova
m3
,
m1
punpcklwd
m0
,
m0
punpcklwd
m1
,
m1
punpckhwd
m2
,
m2
punpckhwd
m3
,
m3
psrad
m0
,
16
psrad
m1
,
16
psrad
m2
,
16
psrad
m3
,
16
mova
[
outputq
],
m0
mova
[
outputq
+
16
],
m2
mova
[
outputq
+
32
],
m1
mova
[
outputq
+
48
],
m3
%else
movq
[
outputq
],
m0
movq
[
outputq
+
8
],
m1
movq
[
outputq
+
16
],
m2
movq
[
outputq
+
24
],
m3
mova
[
outputq
],
m0
mova
[
outputq
+
16
],
m1
%endif
RET
vp10/vp10cx.mk
View file @
2ab7b9a6
...
...
@@ -93,7 +93,7 @@ VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/highbd_block_error_intrin_sse2.c
endif
ifeq
($(CONFIG_USE_X86INC),yes)
VP10_CX_SRCS-$(HAVE_
MMX
)
+=
encoder/x86/dct_
mmx
.asm
VP10_CX_SRCS-$(HAVE_
SSE2
)
+=
encoder/x86/dct_
sse2
.asm
VP10_CX_SRCS-$(HAVE_SSE2)
+=
encoder/x86/error_sse2.asm
endif
...
...
@@ -103,7 +103,7 @@ VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3_x86_64.asm
endif
endif
VP10_CX_SRCS-$(HAVE_SSE2)
+=
encoder/x86/dct_sse2.c
VP10_CX_SRCS-$(HAVE_SSE2)
+=
encoder/x86/dct_
intrin_
sse2.c
VP10_CX_SRCS-$(HAVE_SSSE3)
+=
encoder/x86/dct_ssse3.c
ifeq
($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
...
...
vp9/common/vp9_rtcd_defs.pl
View file @
2ab7b9a6
...
...
@@ -245,7 +245,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize
qw/vp9_fht16x16 sse2/
;
add_proto
qw/void vp9_fwht4x4/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fwht4x4/
,
"
$
mmx
_x86inc
";
specialize
qw/vp9_fwht4x4/
,
"
$
sse2
_x86inc
";
}
else
{
add_proto
qw/void vp9_fht4x4/
,
"
const int16_t *input, tran_low_t *output, int stride, int tx_type
";
specialize
qw/vp9_fht4x4 sse2 msa/
;
...
...
@@ -257,7 +257,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize
qw/vp9_fht16x16 sse2 msa/
;
add_proto
qw/void vp9_fwht4x4/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fwht4x4 msa/
,
"
$
mmx
_x86inc
";
specialize
qw/vp9_fwht4x4 msa/
,
"
$
sse2
_x86inc
";
}
#
...
...
vp9/encoder/x86/vp9_dct_sse2.c
→
vp9/encoder/x86/vp9_dct_
intrin_
sse2.c
View file @
2ab7b9a6
File moved
vp9/encoder/x86/vp9_dct_
mmx
.asm
→
vp9/encoder/x86/vp9_dct_
sse2
.asm
View file @
2ab7b9a6
;
; Copyright (c) 201
4
The WebM project authors. All Rights Reserved.
; Copyright (c) 201
6
The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
...
...
@@ -32,23 +32,20 @@ SECTION .text
%endmacro
%macro TRANSPOSE_4X4 0
movq
m4
,
m0
movq
m5
,
m2
punpcklwd
m4
,
m1
punpckhwd
m0
,
m1
punpcklwd
m5
,
m3
punpckhwd
m2
,
m3
movq
m1
,
m4
movq
m3
,
m0
punpckldq
m1
,
m5
punpckhdq
m4
,
m5
punpckldq
m3
,
m2
punpckhdq
m0
,
m2
SWAP
2
,
3
,
0
,
1
,
4
; 00 01 02 03
; 10 11 12 13
; 20 21 22 23
; 30 31 32 33
punpcklwd
m0
,
m1
; 00 10 01 11 02 12 03 13
punpcklwd
m2
,
m3
; 20 30 21 31 22 32 23 33
mova
m1
,
m0
punpckldq
m0
,
m2
; 00 10 20 30 01 11 21 31
punpckhdq
m1
,
m2
; 02 12 22 32 03 13 23 33
%endmacro
INIT_MM
X
mmx
INIT_
X
MM
ss
e2
cglobal
fwht4x4
,
3
,
4
,
8
,
input
,
output
,
stride
; TODO(linfeng): The duplication with vp10 should be resolved.
lea
r3q
,
[
inputq
+
strideq
*
4
]
movq
m0
,
[
inputq
]
;a1
movq
m1
,
[
inputq
+
strideq
*
2
]
;b1
...
...
@@ -57,48 +54,34 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
TRANSFORM_COLS
TRANSPOSE_4X4
SWAP
1
,
2
psrldq
m1
,
m0
,
8
psrldq
m3
,
m2
,
8
TRANSFORM_COLS
TRANSPOSE_4X4
psllw
m0
,
2
psllw
m1
,
2
psllw
m2
,
2
psllw
m3
,
2
%if CONFIG_VP9_HIGHBITDEPTH
pxor
m4
,
m4
pxor
m5
,
m5
pcmpgtw
m4
,
m0
pcmpgtw
m5
,
m1
movq
m6
,
m0
movq
m7
,
m1
punpcklwd
m0
,
m4
punpcklwd
m1
,
m5
punpckhwd
m6
,
m4
punpckhwd
m7
,
m5
movq
[
outputq
],
m0
movq
[
outputq
+
8
],
m6
movq
[
outputq
+
16
],
m1
movq
[
outputq
+
24
],
m7
pxor
m4
,
m4
pxor
m5
,
m5
pcmpgtw
m4
,
m2
pcmpgtw
m5
,
m3
movq
m6
,
m2
movq
m7
,
m3
punpcklwd
m2
,
m4
punpcklwd
m3
,
m5
punpckhwd
m6
,
m4
punpckhwd
m7
,
m5
movq
[
outputq
+
32
],
m2
movq
[
outputq
+
40
],
m6
movq
[
outputq
+
48
],
m3
movq
[
outputq
+
56
],
m7
; sign extension
mova
m2
,
m0
mova
m3
,
m1
punpcklwd
m0
,
m0
punpcklwd
m1
,
m1
punpckhwd
m2
,
m2
punpckhwd
m3
,
m3
psrad
m0
,
16
psrad
m1
,
16
psrad
m2
,
16
psrad
m3
,
16
mova
[
outputq
],
m0
mova
[
outputq
+
16
],
m2
mova
[
outputq
+
32
],
m1
mova
[
outputq
+
48
],
m3
%else
movq
[
outputq
],
m0
movq
[
outputq
+
8
],
m1
movq
[
outputq
+
16
],
m2
movq
[
outputq
+
24
],
m3
mova
[
outputq
],
m0
mova
[
outputq
+
16
],
m1
%endif
RET
vp9/vp9cx.mk
View file @
2ab7b9a6
...
...
@@ -101,7 +101,7 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
endif
ifeq
($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_
MMX
)
+=
encoder/x86/vp9_dct_
mmx
.asm
VP9_CX_SRCS-$(HAVE_
SSE2
)
+=
encoder/x86/vp9_dct_
sse2
.asm
ifeq
($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2)
+=
encoder/x86/vp9_highbd_error_sse2.asm
VP9_CX_SRCS-$(HAVE_AVX)
+=
encoder/x86/vp9_highbd_error_avx.asm
...
...
@@ -116,7 +116,7 @@ VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
endif
endif
VP9_CX_SRCS-$(HAVE_SSE2)
+=
encoder/x86/vp9_dct_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2)
+=
encoder/x86/vp9_dct_
intrin_
sse2.c
VP9_CX_SRCS-$(HAVE_SSSE3)
+=
encoder/x86/vp9_dct_ssse3.c
ifneq
($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSSE3)
+=
encoder/x86/vp9_frame_scale_ssse3.c
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment