Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
10
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
Xiph.Org
aom-rav1e
Commits
80bc1d1d
Commit
80bc1d1d
authored
Mar 28, 2016
by
Yi Luo
Committed by
Gerrit Code Review
Mar 28, 2016
Browse files
Options
Browse Files
Download
Plain Diff
Merge "8x8/16x16 HT types V_DCT to H_FLIPADST SSE2 optimization" into nextgenv2
parents
9859dde4
770bf715
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
433 additions
and
12 deletions
+433
-12
test/vp10_fht16x16_test.cc
test/vp10_fht16x16_test.cc
+67
-0
test/vp10_fht8x8_test.cc
test/vp10_fht8x8_test.cc
+67
-0
vp10/encoder/hybrid_fwd_txfm.c
vp10/encoder/hybrid_fwd_txfm.c
+2
-11
vp10/encoder/x86/dct_sse2.c
vp10/encoder/x86/dct_sse2.c
+297
-1
No files found.
test/vp10_fht16x16_test.cc
View file @
80bc1d1d
...
...
@@ -70,6 +70,61 @@ TEST_P(VP10Trans16x16HT, CoeffCheck) {
RunCoeffCheck
();
}
#if CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
// Speed test: exercise the C implementation of the extended 16x16 hybrid
// transforms (tx types V_DCT through H_FLIPADST) over many random blocks.
// No output check is performed; the test only measures throughput.
TEST(VP10Trans16x16HTSpeedTest, C_version) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 20000;
  const int bit_depth = 8;
  const int mask = (1 << bit_depth) - 1;
  const int num_coeffs = 256;  // 16 * 16
  const int stride = 16;
  int16_t *input = new int16_t[num_coeffs];
  tran_low_t *output = new tran_low_t[num_coeffs];

  for (int i = 0; i < count_test_block; ++i) {
    // Residual-like signed 8-bit random samples.
    for (int j = 0; j < num_coeffs; ++j) {
      input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask);
    }
    for (int tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
      vp10_fht16x16_c(input, output, stride, tx_type);
    }
  }

  delete[] input;
  delete[] output;
}
#endif // CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
#if HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
// Speed test: exercise the SSE2 implementation of the extended 16x16 hybrid
// transforms (tx types V_DCT through H_FLIPADST) over many random blocks.
// Buffers are 16-byte aligned as required by the SIMD loads/stores.
TEST(VP10Trans16x16HTSpeedTest, SSE2_version) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 20000;
  const int bit_depth = 8;
  const int mask = (1 << bit_depth) - 1;
  const int num_coeffs = 256;  // 16 * 16
  const int stride = 16;
  int16_t *input = reinterpret_cast<int16_t *>(
      vpx_memalign(16, sizeof(int16_t) * num_coeffs));
  tran_low_t *output = reinterpret_cast<tran_low_t *>(
      vpx_memalign(16, sizeof(tran_low_t) * num_coeffs));

  for (int i = 0; i < count_test_block; ++i) {
    // Residual-like signed 8-bit random samples.
    for (int j = 0; j < num_coeffs; ++j) {
      input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask);
    }
    for (int tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
      vp10_fht16x16_sse2(input, output, stride, tx_type);
    }
  }

  vpx_free(input);
  vpx_free(output);
}
#endif // HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
using
std
::
tr1
::
make_tuple
;
#if HAVE_SSE2
...
...
@@ -103,6 +158,18 @@ INSTANTIATE_TEST_CASE_P(
make_tuple
(
&
vp10_fht16x16_sse2
,
&
vp10_iht16x16_256_add_sse2
,
7
,
VPX_BITS_8
,
256
),
make_tuple
(
&
vp10_fht16x16_sse2
,
&
vp10_iht16x16_256_add_sse2
,
8
,
VPX_BITS_8
,
256
),
make_tuple
(
&
vp10_fht16x16_sse2
,
&
vp10_iht16x16_256_add_sse2
,
10
,
VPX_BITS_8
,
256
),
make_tuple
(
&
vp10_fht16x16_sse2
,
&
vp10_iht16x16_256_add_sse2
,
11
,
VPX_BITS_8
,
256
),
make_tuple
(
&
vp10_fht16x16_sse2
,
&
vp10_iht16x16_256_add_sse2
,
12
,
VPX_BITS_8
,
256
),
make_tuple
(
&
vp10_fht16x16_sse2
,
&
vp10_iht16x16_256_add_sse2
,
13
,
VPX_BITS_8
,
256
),
make_tuple
(
&
vp10_fht16x16_sse2
,
&
vp10_iht16x16_256_add_sse2
,
14
,
VPX_BITS_8
,
256
),
make_tuple
(
&
vp10_fht16x16_sse2
,
&
vp10_iht16x16_256_add_sse2
,
15
,
VPX_BITS_8
,
256
)));
#endif // !CONFIG_EXT_TX
#endif // HAVE_SSE2
...
...
test/vp10_fht8x8_test.cc
View file @
80bc1d1d
...
...
@@ -69,6 +69,61 @@ TEST_P(VP10Trans8x8HT, CoeffCheck) {
RunCoeffCheck
();
}
#if CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
// Speed test: exercise the C implementation of the extended 8x8 hybrid
// transforms (tx types V_DCT through H_FLIPADST) over many random blocks.
// No output check is performed; the test only measures throughput.
TEST(VP10Trans8x8HTSpeedTest, C_version) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 20000;
  const int bit_depth = 8;
  const int mask = (1 << bit_depth) - 1;
  const int num_coeffs = 64;  // 8 * 8
  const int stride = 8;
  int16_t *input = new int16_t[num_coeffs];
  tran_low_t *output = new tran_low_t[num_coeffs];

  for (int i = 0; i < count_test_block; ++i) {
    // Residual-like signed 8-bit random samples.
    for (int j = 0; j < num_coeffs; ++j) {
      input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask);
    }
    for (int tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
      vp10_fht8x8_c(input, output, stride, tx_type);
    }
  }

  delete[] input;
  delete[] output;
}
#endif // CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
#if HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
// Speed test: exercise the SSE2 implementation of the extended 8x8 hybrid
// transforms (tx types V_DCT through H_FLIPADST) over many random blocks.
// Buffers are 16-byte aligned as required by the SIMD loads/stores.
TEST(VP10Trans8x8HTSpeedTest, SSE2_version) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 20000;
  const int bit_depth = 8;
  const int mask = (1 << bit_depth) - 1;
  const int num_coeffs = 64;  // 8 * 8
  const int stride = 8;
  int16_t *input = reinterpret_cast<int16_t *>(
      vpx_memalign(16, sizeof(int16_t) * num_coeffs));
  tran_low_t *output = reinterpret_cast<tran_low_t *>(
      vpx_memalign(16, sizeof(tran_low_t) * num_coeffs));

  for (int i = 0; i < count_test_block; ++i) {
    // Residual-like signed 8-bit random samples.
    for (int j = 0; j < num_coeffs; ++j) {
      input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask);
    }
    for (int tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
      vp10_fht8x8_sse2(input, output, stride, tx_type);
    }
  }

  vpx_free(input);
  vpx_free(output);
}
#endif // HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
using
std
::
tr1
::
make_tuple
;
#if HAVE_SSE2
...
...
@@ -102,6 +157,18 @@ INSTANTIATE_TEST_CASE_P(
make_tuple
(
&
vp10_fht8x8_sse2
,
&
vp10_iht8x8_64_add_sse2
,
7
,
VPX_BITS_8
,
64
),
make_tuple
(
&
vp10_fht8x8_sse2
,
&
vp10_iht8x8_64_add_sse2
,
8
,
VPX_BITS_8
,
64
),
make_tuple
(
&
vp10_fht8x8_sse2
,
&
vp10_iht8x8_64_add_sse2
,
10
,
VPX_BITS_8
,
64
),
make_tuple
(
&
vp10_fht8x8_sse2
,
&
vp10_iht8x8_64_add_sse2
,
11
,
VPX_BITS_8
,
64
),
make_tuple
(
&
vp10_fht8x8_sse2
,
&
vp10_iht8x8_64_add_sse2
,
12
,
VPX_BITS_8
,
64
),
make_tuple
(
&
vp10_fht8x8_sse2
,
&
vp10_iht8x8_64_add_sse2
,
13
,
VPX_BITS_8
,
64
),
make_tuple
(
&
vp10_fht8x8_sse2
,
&
vp10_iht8x8_64_add_sse2
,
14
,
VPX_BITS_8
,
64
),
make_tuple
(
&
vp10_fht8x8_sse2
,
&
vp10_iht8x8_64_add_sse2
,
15
,
VPX_BITS_8
,
64
)));
#endif // !CONFIG_EXT_TX
#endif // HAVE_SSE2
...
...
vp10/encoder/hybrid_fwd_txfm.c
View file @
80bc1d1d
...
...
@@ -54,8 +54,6 @@ void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
case
FLIPADST_FLIPADST
:
case
ADST_FLIPADST
:
case
FLIPADST_ADST
:
vp10_fht4x4
(
src_diff
,
coeff
,
diff_stride
,
tx_type
);
break
;
case
V_DCT
:
case
H_DCT
:
case
V_ADST
:
...
...
@@ -70,7 +68,6 @@ void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
#endif // CONFIG_EXT_TX
default:
assert
(
0
);
break
;
}
}
...
...
@@ -93,15 +90,13 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
case
FLIPADST_FLIPADST
:
case
ADST_FLIPADST
:
case
FLIPADST_ADST
:
vp10_fht8x8
(
src_diff
,
coeff
,
diff_stride
,
tx_type
);
break
;
case
V_DCT
:
case
H_DCT
:
case
V_ADST
:
case
H_ADST
:
case
V_FLIPADST
:
case
H_FLIPADST
:
vp10_fht8x8
_c
(
src_diff
,
coeff
,
diff_stride
,
tx_type
);
vp10_fht8x8
(
src_diff
,
coeff
,
diff_stride
,
tx_type
);
break
;
case
IDTX
:
vp10_fwd_idtx_c
(
src_diff
,
coeff
,
diff_stride
,
8
,
tx_type
);
...
...
@@ -109,7 +104,6 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
#endif // CONFIG_EXT_TX
default:
assert
(
0
);
break
;
}
}
...
...
@@ -132,15 +126,13 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
case
FLIPADST_FLIPADST
:
case
ADST_FLIPADST
:
case
FLIPADST_ADST
:
vp10_fht16x16
(
src_diff
,
coeff
,
diff_stride
,
tx_type
);
break
;
case
V_DCT
:
case
H_DCT
:
case
V_ADST
:
case
H_ADST
:
case
V_FLIPADST
:
case
H_FLIPADST
:
vp10_fht16x16
_c
(
src_diff
,
coeff
,
diff_stride
,
tx_type
);
vp10_fht16x16
(
src_diff
,
coeff
,
diff_stride
,
tx_type
);
break
;
case
IDTX
:
vp10_fwd_idtx_c
(
src_diff
,
coeff
,
diff_stride
,
16
,
tx_type
);
...
...
@@ -148,7 +140,6 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
#endif // CONFIG_EXT_TX
default:
assert
(
0
);
break
;
}
}
...
...
vp10/encoder/x86/dct_sse2.c
View file @
80bc1d1d
...
...
@@ -1280,6 +1280,21 @@ static void fadst8_sse2(__m128i *in) {
array_transpose_8x8
(
in
,
in
);
}
#if CONFIG_EXT_TX
// Forward identity transform for one 8x8 tile: scale every sample by 2
// (left shift by 1) and transpose so the companion 1-D transform can run
// over the other dimension.
static void fidtx8_sse2(__m128i *in) {
  int i;
  for (i = 0; i < 8; ++i) {
    in[i] = _mm_slli_epi16(in[i], 1);
  }
  array_transpose_8x8(in, in);
}
#endif // CONFIG_EXT_TX
void
vp10_fht8x8_sse2
(
const
int16_t
*
input
,
tran_low_t
*
output
,
int
stride
,
int
tx_type
)
{
__m128i
in
[
8
];
...
...
@@ -1345,10 +1360,51 @@ void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
right_shift_8x8
(
in
,
1
);
write_buffer_8x8
(
output
,
in
,
8
);
break
;
case
V_DCT
:
load_buffer_8x8
(
input
,
in
,
stride
,
0
,
0
);
fdct8_sse2
(
in
);
fidtx8_sse2
(
in
);
right_shift_8x8
(
in
,
1
);
write_buffer_8x8
(
output
,
in
,
8
);
break
;
case
H_DCT
:
load_buffer_8x8
(
input
,
in
,
stride
,
0
,
0
);
fidtx8_sse2
(
in
);
fdct8_sse2
(
in
);
right_shift_8x8
(
in
,
1
);
write_buffer_8x8
(
output
,
in
,
8
);
break
;
case
V_ADST
:
load_buffer_8x8
(
input
,
in
,
stride
,
0
,
0
);
fadst8_sse2
(
in
);
fidtx8_sse2
(
in
);
right_shift_8x8
(
in
,
1
);
write_buffer_8x8
(
output
,
in
,
8
);
break
;
case
H_ADST
:
load_buffer_8x8
(
input
,
in
,
stride
,
0
,
0
);
fidtx8_sse2
(
in
);
fadst8_sse2
(
in
);
right_shift_8x8
(
in
,
1
);
write_buffer_8x8
(
output
,
in
,
8
);
break
;
case
V_FLIPADST
:
load_buffer_8x8
(
input
,
in
,
stride
,
1
,
0
);
fadst8_sse2
(
in
);
fidtx8_sse2
(
in
);
right_shift_8x8
(
in
,
1
);
write_buffer_8x8
(
output
,
in
,
8
);
break
;
case
H_FLIPADST
:
load_buffer_8x8
(
input
,
in
,
stride
,
0
,
1
);
fidtx8_sse2
(
in
);
fadst8_sse2
(
in
);
right_shift_8x8
(
in
,
1
);
write_buffer_8x8
(
output
,
in
,
8
);
break
;
#endif // CONFIG_EXT_TX
default:
assert
(
0
);
break
;
}
}
...
...
@@ -2226,6 +2282,204 @@ static void fadst16_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16
(
in0
,
in1
);
}
#if CONFIG_EXT_TX
// Forward identity transform for an 8-column slice of a 16x16 block.
// Each 16-bit row register is scaled by 2 (shift) and then by Sqrt2 with
// rounding: widen to 32 bits against zero, multiply via madd, add the DCT
// rounding constant, shift back by DCT_CONST_BITS, and repack with
// saturation.  The per-element sequence of intrinsics is identical to the
// fully unrolled form; only the loop structure differs.
static void fidtx16_8col(__m128i *in) {
  const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
  const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i lo, hi;
  int i;

  for (i = 0; i < 16; ++i) {
    // Scale by 2 first, as in the scalar reference implementation.
    in[i] = _mm_slli_epi16(in[i], 1);

    // Widen to 32-bit lanes (zero in the odd 16-bit slots makes madd act
    // as a plain 16x16->32 multiply by Sqrt2).
    lo = _mm_unpacklo_epi16(in[i], k__zero_epi16);
    hi = _mm_unpackhi_epi16(in[i], k__zero_epi16);
    lo = _mm_madd_epi16(lo, k__sqrt2_epi16);
    hi = _mm_madd_epi16(hi, k__sqrt2_epi16);

    // Round and descale.
    lo = _mm_add_epi32(lo, k__DCT_CONST_ROUNDING);
    hi = _mm_add_epi32(hi, k__DCT_CONST_ROUNDING);
    lo = _mm_srai_epi32(lo, DCT_CONST_BITS);
    hi = _mm_srai_epi32(hi, DCT_CONST_BITS);

    // Narrow back to 16 bits with signed saturation.
    in[i] = _mm_packs_epi32(lo, hi);
  }
}
// Forward identity transform for a full 16x16 block held as two 8-column
// halves.  Applies the column transform to each half, then transposes so
// the paired 1-D transform can process the other dimension.
static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
  fidtx16_8col(in0);
  fidtx16_8col(in1);
  array_transpose_16x16(in0, in1);
}
#endif // CONFIG_EXT_TX
void
vp10_fht16x16_sse2
(
const
int16_t
*
input
,
tran_low_t
*
output
,
int
stride
,
int
tx_type
)
{
__m128i
in0
[
16
],
in1
[
16
];
...
...
@@ -2291,6 +2545,48 @@ void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output,
fadst16_sse2
(
in0
,
in1
);
write_buffer_16x16
(
output
,
in0
,
in1
,
16
);
break
;
case
V_DCT
:
load_buffer_16x16
(
input
,
in0
,
in1
,
stride
,
0
,
0
);
fdct16_sse2
(
in0
,
in1
);
right_shift_16x16
(
in0
,
in1
);
fidtx16_sse2
(
in0
,
in1
);
write_buffer_16x16
(
output
,
in0
,
in1
,
16
);
break
;
case
H_DCT
:
load_buffer_16x16
(
input
,
in0
,
in1
,
stride
,
0
,
0
);
fidtx16_sse2
(
in0
,
in1
);
right_shift_16x16
(
in0
,
in1
);
fdct16_sse2
(
in0
,
in1
);
write_buffer_16x16
(
output
,
in0
,
in1
,
16
);
break
;
case
V_ADST
:
load_buffer_16x16
(
input
,
in0
,
in1
,
stride
,
0
,
0
);
fadst16_sse2
(
in0
,
in1
);
right_shift_16x16
(
in0
,
in1
);
fidtx16_sse2
(
in0
,
in1
);
write_buffer_16x16
(
output
,
in0
,
in1
,
16
);
break
;
case
H_ADST
:
load_buffer_16x16
(
input
,
in0
,
in1
,
stride
,
0
,
0
);
fidtx16_sse2
(
in0
,
in1
);
right_shift_16x16
(
in0
,
in1
);
fadst16_sse2
(
in0
,
in1
);
write_buffer_16x16
(
output
,
in0
,
in1
,
16
);
break
;
case
V_FLIPADST
:
load_buffer_16x16
(
input
,
in0
,
in1
,
stride
,
1
,
0
);
fadst16_sse2
(
in0
,
in1
);
right_shift_16x16
(
in0
,
in1
);
fidtx16_sse2
(
in0
,
in1
);
write_buffer_16x16
(
output
,
in0
,
in1
,
16
);
break
;
case
H_FLIPADST
:
load_buffer_16x16
(
input
,
in0
,
in1
,
stride
,
0
,
1
);
fidtx16_sse2
(
in0
,
in1
);
right_shift_16x16
(
in0
,
in1
);
fadst16_sse2
(
in0
,
in1
);
write_buffer_16x16
(
output
,
in0
,
in1
,
16
);
break
;
#endif // CONFIG_EXT_TX
default:
assert
(
0
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment