Xiph.Org / aom-rav1e / Commits

Commit 8e80f422
Authored Sep 09, 2016 by Debargha Mukherjee
Committed by Gerrit Code Review, Sep 09, 2016

Merge "Add SSE2 versions of av1_fht8x16 and av1_fht16x8" into nextgenv2

Parents: 19a06bcc, 1a800f65
Showing 8 changed files with 639 additions and 164 deletions.
aom_dsp/x86/synonyms.h              +2    -2
av1/common/av1_rtcd_defs.pl         +73   -154
av1/encoder/dct.c                   +8    -6
av1/encoder/x86/dct_intrin_sse2.c   +361  -1
test/av1_fht16x8_test.cc            +95   -0
test/av1_fht8x16_test.cc            +95   -0
test/test.mk                        +4    -0
test/transform_test_base.h          +1    -1

aom_dsp/x86/synonyms.h

@@ -68,13 +68,13 @@ static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
}

static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
-  const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1));
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
  return _mm_srli_epi32(v_tmp_d, bits);
}

static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
-  const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1));
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
  const __m128i v_tmp_d =
      _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
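
Not part of the diff: a scalar model of the signed rounding helper above, written to make the bias change easier to see. The two bias expressions are identical for bits >= 1; the only behavioural difference is bits == 0, where (1 << bits) >> 1 evaluates to 0 while 1 << (bits - 1) would be a shift by a negative amount. The helper name below is ad hoc.

#include <stdint.h>

/* Scalar model of xx_roundn_epi32 for a single lane.
 * With bits == 0 the bias is ((1 << 0) >> 1) == 0 rather than the result of
 * an undefined shift by -1. */
static int32_t roundn_epi32_scalar(int32_t v, int bits) {
  const int32_t bias = (1 << bits) >> 1;
  const int32_t sign = v >> 31;      /* -1 for negative v, 0 otherwise */
  return (v + bias + sign) >> bits;  /* assumes arithmetic right shift,
                                        which _mm_srai_epi32 makes explicit */
}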

av1/common/av1_rtcd_defs.pl

@@ -51,7 +51,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
}

#
-# dct
+# Inverse dct
#
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
  # Note as optimized versions of these functions are added we need to add a check to ensure

@@ -368,10 +368,22 @@ if (aom_config("CONFIG_AOM_QM") eq "yes") {
# fdct functions
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
  add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht4x4 sse2/;
  add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
  specialize qw/av1_fwht4x4/;
  add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht8x8 sse2/;
  add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht16x16 sse2/;
  add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht32x32/;
  if (aom_config("CONFIG_EXT_TX") eq "yes") {
    add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
    specialize qw/av1_fht4x8/;

@@ -379,56 +391,84 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
    specialize qw/av1_fht8x4/;
    add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-   specialize qw/av1_fht8x16/;
+   specialize qw/av1_fht8x16 sse2/;
    add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-   specialize qw/av1_fht16x8/;
+   specialize qw/av1_fht16x8 sse2/;
    add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
    specialize qw/av1_fht16x32/;
    add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
    specialize qw/av1_fht32x16/;
  }
  add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht8x8 sse2/;
  if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
  add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
  specialize qw/av1_fdct4x4/;
  add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht16x16 sse2/;
  add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
  specialize qw/av1_fdct4x4_1/;
  add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht32x32/;
  add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
  specialize qw/av1_fdct8x8/;
  add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
  specialize qw/av1_fwht4x4/;
  if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct4x4/;
    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8_1/;
    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16/;
    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16_1/;
    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32/;
    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct4x4_1/;
    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_rd/;
    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8/;
    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_1/;
  } else {
    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct4x4 sse2/;
    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct4x4_1 sse2/;
    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8_1/;
    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8 sse2/;
    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16/;
    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8_1 sse2/;
    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16_1/;
    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16 sse2/;
    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32/;
    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16_1 sse2/;
    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_rd/;
    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32 sse2/;
    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_1/;
    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_rd sse2/;
    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_1 sse2/;
  }
  if (aom_config("CONFIG_AOM_HIGHBITDEPTH") ne "yes") {
    if (aom_config("CONFIG_EXT_TX") ne "yes") {
      specialize qw/av1_fht4x4 msa/;
      specialize qw/av1_fht8x8 msa/;
      specialize qw/av1_fht16x16 msa/;
    }
  }
  if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
    if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
      add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
      specialize qw/av1_highbd_fdct4x4/;

@@ -453,33 +493,6 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
    add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_highbd_fdct32x32_1/;
  } else {
-   add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-   specialize qw/av1_fdct4x4 sse2/;
-   add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-   specialize qw/av1_fdct4x4_1 sse2/;
-   add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-   specialize qw/av1_fdct8x8 sse2/;
-   add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-   specialize qw/av1_fdct8x8_1 sse2/;
-   add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-   specialize qw/av1_fdct16x16 sse2/;
-   add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-   specialize qw/av1_fdct16x16_1 sse2/;
-   add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-   specialize qw/av1_fdct32x32 sse2/;
-   add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-   specialize qw/av1_fdct32x32_rd sse2/;
-   add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-   specialize qw/av1_fdct32x32_1 sse2/;
    add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_highbd_fdct4x4 sse2/;

@@ -504,100 +517,6 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
    add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_highbd_fdct32x32_1/;
  }
}
-else {
-  add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht4x4 sse2/;
-  add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht4x8/;
-  add_proto qw/void av1_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht8x4/;
-  add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht8x16/;
-  add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht16x8/;
-  add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht16x32/;
-  add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht32x16/;
-  add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht8x8 sse2/;
-  add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht16x16 sse2/;
-  if (aom_config("CONFIG_EXT_TX") ne "yes") {
-    specialize qw/av1_fht4x4 msa/;
-    specialize qw/av1_fht8x8 msa/;
-    specialize qw/av1_fht16x16 msa/;
-  }
-  add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht32x32/;
-  add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fwht4x4/;
-  if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
-    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct4x4/;
-    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct4x4_1/;
-    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct8x8/;
-    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct8x8_1/;
-    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct16x16/;
-    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct16x16_1/;
-    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct32x32/;
-    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct32x32_rd/;
-    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct32x32_1/;
-  } else {
-    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct4x4 sse2/;
-    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct4x4_1 sse2/;
-    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct8x8 sse2/;
-    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct8x8_1 sse2/;
-    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct16x16 sse2/;
-    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct16x16_1 sse2/;
-    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct32x32 sse2/;
-    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct32x32_rd sse2/;
-    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fdct32x32_1 sse2/;
-  }
-}
add_proto qw/void av1_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type";
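
Not part of the diff: a rough sketch of what an add_proto/specialize pair in av1_rtcd_defs.pl amounts to at runtime. The real dispatch table is generated into av1_rtcd.h by the build system; everything below other than the av1_fht8x16_c/_sse2 prototypes (whose signature comes from the add_proto line above) is made up for illustration, and tran_low_t stands in for the aom_dsp type.

#include <stdint.h>

typedef int32_t tran_low_t;  /* stand-in for the libaom definition */

void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
                   int tx_type);
void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                      int tx_type);

typedef void (*fht8x16_fn)(const int16_t *input, tran_low_t *output,
                           int stride, int tx_type);

/* "specialize qw/av1_fht8x16 sse2/" means: prefer the SSE2 kernel when the
 * CPU reports SSE2 support, otherwise fall back to the C reference. */
static fht8x16_fn pick_av1_fht8x16(int have_sse2) {
  return have_sse2 ? av1_fht8x16_sse2 : av1_fht8x16_c;
}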

av1/encoder/dct.c

@@ -1311,8 +1311,8 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
  // Columns
  for (i = 0; i < n; ++i) {
    for (j = 0; j < n2; ++j)
-     temp_in[j] =
-         (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
+     temp_in[j] = ROUND_POWER_OF_TWO_SIGNED(input[j * stride + i] * 4 * Sqrt2,
+                                            DCT_CONST_BITS);
    ht.cols(temp_in, temp_out);
    for (j = 0; j < n2; ++j) out[j * n + i] = temp_out[j];
  }

@@ -1321,7 +1321,8 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
  for (i = 0; i < n2; ++i) {
    for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
    ht.rows(temp_in, temp_out);
-   for (j = 0; j < n; ++j) output[j + i * n] = (temp_out[j] + 1) >> 2;
+   for (j = 0; j < n; ++j)
+     output[j + i * n] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
  }
  // Note: overall scale factor of transform is 8 times unitary
}
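
Both hunks in this file replace rounding that treated negative values differently with sign-symmetric rounding: the column pass switches from fdct_round_shift() to ROUND_POWER_OF_TWO_SIGNED(), and the output pass adds a (temp_out[j] < 0) term to the final shift, presumably so the C reference rounds the same way as the new SSE2 path. Not from the patch: a minimal sketch of the effect of the second change, assuming arithmetic right shift of negatives (which the codec code relies on).

#include <stdio.h>

static int shift2_old(int v) { return (v + 1) >> 2; }
static int shift2_new(int v) { return (v + 1 + (v < 0)) >> 2; }

int main(void) {
  /* +6 maps to 1 under both rules, but -6 maps to -2 with the old rule and
   * to -1 with the new one, so magnitudes now round symmetrically. */
  printf("%d %d\n", shift2_old(6), shift2_new(6));    /* prints: 1 1 */
  printf("%d %d\n", shift2_old(-6), shift2_new(-6));  /* prints: -2 -1 */
  return 0;
}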

@@ -1358,8 +1359,8 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
  // Columns
  for (i = 0; i < n2; ++i) {
    for (j = 0; j < n; ++j)
-     temp_in[j] =
-         (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
+     temp_in[j] = ROUND_POWER_OF_TWO_SIGNED(input[j * stride + i] * 4 * Sqrt2,
+                                            DCT_CONST_BITS);
    ht.cols(temp_in, temp_out);
    for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
  }

@@ -1368,7 +1369,8 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
  for (i = 0; i < n; ++i) {
    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
    ht.rows(temp_in, temp_out);
-   for (j = 0; j < n2; ++j) output[j + i * n2] = (temp_out[j] + 1) >> 2;
+   for (j = 0; j < n2; ++j)
+     output[j + i * n2] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
  }
  // Note: overall scale factor of transform is 8 times unitary
}

av1/encoder/x86/dct_intrin_sse2.c

@@ -12,10 +12,11 @@
#include <assert.h>
#include <emmintrin.h>  // SSE2

-#include "./av1_rtcd.h"
#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/fwd_txfm_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h"

@@ -2584,3 +2585,362 @@ void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
    default: assert(0); break;
  }
}

#if CONFIG_EXT_TX
static INLINE void scale_sqrt2_8x8(__m128i *in) {
  // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)'
  // for each element
  const __m128i v_scale_w = _mm_set1_epi16(Sqrt2);

  const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
  const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
  const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
  const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
  const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
  const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
  const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
  const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
  const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w);
  const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w);
  const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w);
  const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w);
  const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w);
  const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w);
  const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w);
  const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w);

  const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
  const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
  const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
  const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
  const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
  const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
  const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
  const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
  const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w);
  const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w);
  const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w);
  const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w);
  const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w);
  const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w);
  const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w);
  const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w);

  in[0] = _mm_packs_epi32(xx_roundn_epi32(v_p0a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p0b_d, DCT_CONST_BITS));
  in[1] = _mm_packs_epi32(xx_roundn_epi32(v_p1a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p1b_d, DCT_CONST_BITS));
  in[2] = _mm_packs_epi32(xx_roundn_epi32(v_p2a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p2b_d, DCT_CONST_BITS));
  in[3] = _mm_packs_epi32(xx_roundn_epi32(v_p3a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p3b_d, DCT_CONST_BITS));
  in[4] = _mm_packs_epi32(xx_roundn_epi32(v_p4a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p4b_d, DCT_CONST_BITS));
  in[5] = _mm_packs_epi32(xx_roundn_epi32(v_p5a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p5b_d, DCT_CONST_BITS));
  in[6] = _mm_packs_epi32(xx_roundn_epi32(v_p6a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p6b_d, DCT_CONST_BITS));
  in[7] = _mm_packs_epi32(xx_roundn_epi32(v_p7a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p7b_d, DCT_CONST_BITS));
}
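
Not part of the diff: a scalar model of what scale_sqrt2_8x8 does to one 16-bit coefficient. _mm_mullo_epi16 and _mm_mulhi_epi16 give the low and high halves of each 16x16-bit product, the unpacks interleave them into full 32-bit products, xx_roundn_epi32 rounds down by DCT_CONST_BITS, and _mm_packs_epi32 saturates back to 16 bits. The helper name is ad hoc; Sqrt2 and DCT_CONST_BITS are the libaom fixed-point constants from aom_dsp/txfm_common.h.

#include <stdint.h>

/* e.g. scale_sqrt2_lane(x, Sqrt2, DCT_CONST_BITS) with libaom's constants. */
static int16_t scale_sqrt2_lane(int16_t v, int16_t sqrt2_fixed, int bits) {
  const int32_t prod = (int32_t)v * sqrt2_fixed;  /* mullo/mulhi + unpack */
  const int32_t bias = (1 << bits) >> 1;
  const int32_t sign = prod >> 31;                /* assumes arithmetic shift */
  int32_t r = (prod + bias + sign) >> bits;       /* xx_roundn_epi32 */
  if (r > INT16_MAX) r = INT16_MAX;               /* _mm_packs_epi32 saturation */
  if (r < INT16_MIN) r = INT16_MIN;
  return (int16_t)r;
}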

static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in,
                                    int stride, int flipud, int fliplr) {
  // Load 2 8x8 blocks
  const int16_t *t = input;
  const int16_t *b = input + 8 * stride;

  if (flipud) {
    const int16_t *const tmp = t;
    t = b;
    b = tmp;
  }

  load_buffer_8x8(t, in, stride, flipud, fliplr);
  scale_sqrt2_8x8(in);
  load_buffer_8x8(b, in + 8, stride, flipud, fliplr);
  scale_sqrt2_8x8(in + 8);
}

void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                      int tx_type) {
  __m128i in[16];
  __m128i *const t = in;      // Alias to top 8x8 sub block
  __m128i *const b = in + 8;  // Alias to bottom 8x8 sub block

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x16(input, in, stride, 0, 0);
      fdct16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fdct8_sse2(t);
      fdct8_sse2(b);
      break;
    case ADST_DCT:
      load_buffer_8x16(input, in, stride, 0, 0);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fdct8_sse2(t);
      fdct8_sse2(b);
      break;
    case DCT_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      fdct16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
    case ADST_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      load_buffer_8x16(input, in, stride, 1, 0);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fdct8_sse2(t);
      fdct8_sse2(b);
      break;
    case DCT_FLIPADST:
      load_buffer_8x16(input, in, stride, 0, 1);
      fdct16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x16(input, in, stride, 1, 1);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
    case ADST_FLIPADST:
      load_buffer_8x16(input, in, stride, 0, 1);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
    case FLIPADST_ADST:
      load_buffer_8x16(input, in, stride, 1, 0);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
    case IDTX:
      load_buffer_8x16(input, in, stride, 0, 0);
      fidtx16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fidtx8_sse2(t);
      fidtx8_sse2(b);
      break;
    case V_DCT:
      load_buffer_8x16(input, in, stride, 0, 0);
      fdct16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fidtx8_sse2(t);
      fidtx8_sse2(b);
      break;
    case H_DCT:
      load_buffer_8x16(input, in, stride, 0, 0);
      fidtx16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fdct8_sse2(t);
      fdct8_sse2(b);
      break;
    case V_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fidtx8_sse2(t);
      fidtx8_sse2(b);
      break;
    case H_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      fidtx16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
    case V_FLIPADST:
      load_buffer_8x16(input, in, stride, 1, 0);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fidtx8_sse2(t);
      fidtx8_sse2(b);
      break;
    case H_FLIPADST:
      load_buffer_8x16(input, in, stride, 0, 1);
      fidtx16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
#endif  // CONFIG_EXT_TX
    default: assert(0); break;
  }
  right_shift_8x8(t, 2);
  right_shift_8x8(b, 2);
  write_buffer_8x8(output, t, 8);
  write_buffer_8x8(output + 64, b, 8);
}
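
The commit's new test files (test/av1_fht8x16_test.cc and test/av1_fht16x8_test.cc, built on test/transform_test_base.h) check that the SSE2 kernel matches the C reference bit for bit. Not from the patch: a rough standalone sketch of the same idea, with assumptions called out in the comments (tran_low_t stands in for the libaom type, and tx_type 0 is taken to be DCT_DCT per the av1 TX_TYPE enum).

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef int32_t tran_low_t;  /* stand-in for the libaom definition */

void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
                   int tx_type);
void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                      int tx_type);

int main(void) {
  int16_t input[8 * 16];
  tran_low_t ref[8 * 16], simd[8 * 16];
  for (int iter = 0; iter < 1000; ++iter) {
    for (int i = 0; i < 8 * 16; ++i)
      input[i] = (int16_t)((rand() % 512) - 256);  /* small residual-like values */
    av1_fht8x16_c(input, ref, 8, /*tx_type=*/0);
    av1_fht8x16_sse2(input, simd, 8, /*tx_type=*/0);
    if (memcmp(ref, simd, sizeof(ref)) != 0) {
      printf("mismatch at iteration %d\n", iter);
      return 1;
    }
  }
  printf("ok\n");
  return 0;
}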

static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in,
                                    int stride, int flipud, int fliplr) {
  // Load 2 8x8 blocks
  const int16_t *l = input;
  const int16_t *r = input + 8;

  if (fliplr) {
    const int16_t *const tmp = l;
    l = r;
    r = tmp;
  }

  // load first 8 columns
  load_buffer_8x8(l, in, stride, flipud, fliplr);
  scale_sqrt2_8x8(in);
  load_buffer_8x8(r, in + 8, stride, flipud, fliplr);
  scale_sqrt2_8x8(in + 8);