Xiph.Org / aom-rav1e
Commit 1a800f65
Authored Sep 02, 2016 by Geza Lore; committed by Debargha Mukherjee, Sep 09, 2016
Add SSE2 versions of av1_fht8x16 and av1_fht16x8

Encoder speedup ~2% with ext-tx + rect-tx.

Change-Id: Id56ddf102a887de31d181bde6d8ef8c4f03da945
Parent: e51ee021
Changes: 8
aom_dsp/x86/synonyms.h
@@ -68,13 +68,13 @@ static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
}

static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
-  const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1));
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
  return _mm_srli_epi32(v_tmp_d, bits);
}

static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
-  const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1));
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
  const __m128i v_tmp_d =
      _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
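The bias rewrite above looks aimed at the bits == 0 case: 1 << (bits - 1) would shift by a negative amount (undefined behaviour in C), while (1 << bits) >> 1 simply degenerates to a zero bias and leaves the value unchanged after the shift. A minimal scalar sketch of what the two helpers compute, under the assumption that they keep the semantics shown above (the names here are illustrative, not from the tree):

#include <stdint.h>
#include <stdio.h>

/* Scalar equivalent of xx_roundn_epu32: unsigned round-to-nearest shift. */
static uint32_t roundn_u32(uint32_t v, int bits) {
  const uint32_t bias = (1u << bits) >> 1;  /* 0 when bits == 0, no UB */
  return (v + bias) >> bits;
}

/* Scalar equivalent of xx_roundn_epi32 (for bits >= 1): adding the sign bit
 * makes ties round away from zero for negative values too. */
static int32_t roundn_s32(int32_t v, int bits) {
  const int32_t bias = (1 << bits) >> 1;
  const int32_t sign = v >> 31;  /* -1 if v is negative, 0 otherwise */
  return (v + bias + sign) >> bits;
}

int main(void) {
  printf("%u %d %d\n", roundn_u32(6, 2), roundn_s32(6, 2), roundn_s32(-6, 2));
  /* prints: 2 2 -2 -- the signed result mirrors the positive one */
  return 0;
}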
av1/common/av1_rtcd_defs.pl
@@ -51,7 +51,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
}

#
-# dct
+# Inverse dct
#
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
  # Note as optimized versions of these functions are added we need to add a check to ensure
@@ -368,10 +368,22 @@ if (aom_config("CONFIG_AOM_QM") eq "yes") {
# fdct functions
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht4x4 sse2/;
+  add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/av1_fht4x4 sse2/;

  add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
  specialize qw/av1_fwht4x4/;

  add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht8x8 sse2/;

  add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht16x16 sse2/;

  add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht32x32/;

  if (aom_config("CONFIG_EXT_TX") eq "yes") {
    add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
    specialize qw/av1_fht4x8/;
@@ -379,56 +391,84 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
    specialize qw/av1_fht8x4/;

    add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-    specialize qw/av1_fht8x16/;
+    specialize qw/av1_fht8x16 sse2/;

    add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-    specialize qw/av1_fht16x8/;
+    specialize qw/av1_fht16x8 sse2/;

    add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
    specialize qw/av1_fht16x32/;

    add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
    specialize qw/av1_fht32x16/;
  }

  add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht8x8 sse2/;

  if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
  add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
  specialize qw/av1_fdct4x4/;
  add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht16x16 sse2/;
  add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
  specialize qw/av1_fdct4x4_1/;
  add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht32x32/;
  add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
  specialize qw/av1_fdct8x8/;
  add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
  specialize qw/av1_fwht4x4/;

  if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct4x4/;
    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8_1/;
    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16/;
    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16_1/;
    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32/;
    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct4x4_1/;
    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_rd/;
    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8/;
    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_1/;
  } else {
    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct4x4 sse2/;
    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct4x4_1 sse2/;
    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8_1/;
    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8 sse2/;
    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16/;
    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8_1 sse2/;
    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16_1/;
    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16 sse2/;
    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32/;
    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16_1 sse2/;
    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_rd/;
    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32 sse2/;
    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_1/;
    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_rd sse2/;
    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_1 sse2/;
  }

if (aom_config("CONFIG_AOM_HIGHBITDEPTH") ne "yes") {
  if (aom_config("CONFIG_EXT_TX") ne "yes") {
    specialize qw/av1_fht4x4 msa/;
    specialize qw/av1_fht8x8 msa/;
    specialize qw/av1_fht16x16 msa/;
  }
}

if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
  if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
    add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_highbd_fdct4x4/;
@@ -453,33 +493,6 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
    add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_highbd_fdct32x32_1/;
  } else {
    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct4x4 sse2/;
    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct4x4_1 sse2/;
    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8 sse2/;
    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8_1 sse2/;
    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16 sse2/;
    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16_1 sse2/;
    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32 sse2/;
    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_rd sse2/;
    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_1 sse2/;
    add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_highbd_fdct4x4 sse2/;
@@ -504,100 +517,6 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
    add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_highbd_fdct32x32_1/;
  }
} else {
  add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht4x4 sse2/;
  add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht4x8/;
  add_proto qw/void av1_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht8x4/;
  add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht8x16/;
  add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht16x8/;
  add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht16x32/;
  add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht32x16/;
  add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht8x8 sse2/;
  add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht16x16 sse2/;

  if (aom_config("CONFIG_EXT_TX") ne "yes") {
    specialize qw/av1_fht4x4 msa/;
    specialize qw/av1_fht8x8 msa/;
    specialize qw/av1_fht16x16 msa/;
  }

  add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht32x32/;
  add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
  specialize qw/av1_fwht4x4/;

  if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct4x4/;
    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct4x4_1/;
    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8/;
    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8_1/;
    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16/;
    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16_1/;
    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32/;
    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_rd/;
    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_1/;
  } else {
    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct4x4 sse2/;
    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct4x4_1 sse2/;
    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8 sse2/;
    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct8x8_1 sse2/;
    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16 sse2/;
    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct16x16_1 sse2/;
    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32 sse2/;
    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_rd sse2/;
    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/av1_fdct32x32_1 sse2/;
  }
}

add_proto qw/void av1_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type";
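For readers unfamiliar with this file: add_proto declares the C prototype of a kernel and specialize lists the architecture-specific versions (here the new sse2 entries for av1_fht8x16 and av1_fht16x8) that the RTCD (run-time CPU detection) generator can dispatch to. A self-contained C sketch of roughly the dispatch pattern the generated av1_rtcd.h sets up; the stub bodies and the cpu_has_sse2 flag are illustrative, not the real generated code:

#include <stdint.h>
#include <stdio.h>

typedef int32_t tran_low_t;  /* stand-in for the libaom type */

/* Stand-ins for the C reference and SSE2 builds of the same kernel. */
static void av1_fht8x16_c(const int16_t *input, tran_low_t *output,
                          int stride, int tx_type) {
  (void)input; (void)output; (void)stride; (void)tx_type;
  puts("C version");
}
static void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output,
                             int stride, int tx_type) {
  (void)input; (void)output; (void)stride; (void)tx_type;
  puts("SSE2 version");
}

/* Roughly what the generated header provides when runtime CPU detection is
 * enabled: one pointer per kernel plus a setup routine that retargets it. */
static void (*av1_fht8x16)(const int16_t *, tran_low_t *, int, int) =
    av1_fht8x16_c;

static void setup_rtcd(int cpu_has_sse2) {
  if (cpu_has_sse2) av1_fht8x16 = av1_fht8x16_sse2;
}

int main(void) {
  setup_rtcd(1);            /* pretend SSE2 was detected */
  av1_fht8x16(0, 0, 0, 0);  /* dispatches to the SSE2 stub */
  return 0;
}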
av1/encoder/dct.c
@@ -1311,8 +1311,8 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
  // Columns
  for (i = 0; i < n; ++i) {
    for (j = 0; j < n2; ++j)
-      temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
+      temp_in[j] = ROUND_POWER_OF_TWO_SIGNED(input[j * stride + i] * 4 * Sqrt2, DCT_CONST_BITS);
    ht.cols(temp_in, temp_out);
    for (j = 0; j < n2; ++j) out[j * n + i] = temp_out[j];
  }
@@ -1321,7 +1321,8 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
  for (i = 0; i < n2; ++i) {
    for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
    ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j) output[j + i * n] = (temp_out[j] + 1) >> 2;
+    for (j = 0; j < n; ++j)
+      output[j + i * n] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
  }
  // Note: overall scale factor of transform is 8 times unitary
}
@@ -1358,8 +1359,8 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
  // Columns
  for (i = 0; i < n2; ++i) {
    for (j = 0; j < n; ++j)
-      temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
+      temp_in[j] = ROUND_POWER_OF_TWO_SIGNED(input[j * stride + i] * 4 * Sqrt2, DCT_CONST_BITS);
    ht.cols(temp_in, temp_out);
    for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
  }
@@ -1368,7 +1369,8 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
  for (i = 0; i < n; ++i) {
    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
    ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j) output[j + i * n2] = (temp_out[j] + 1) >> 2;
+    for (j = 0; j < n2; ++j)
+      output[j + i * n2] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
  }
  // Note: overall scale factor of transform is 8 times unitary
}
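Both C reference functions above move from fdct_round_shift and a plain (x + 1) >> 2 to rounding that is symmetric about zero, presumably so the scalar model stays bit-exact with the new SSE2 paths (which round through xx_roundn_epi32 and the SSE2 right-shift helpers). A small self-contained sketch of the difference, with ROUND_POWER_OF_TWO_SIGNED paraphrased from libaom's common headers and arbitrary example values:

#include <stdio.h>

/* Round-to-nearest right shift; for negative values the +half bias makes
 * ties round toward +infinity (what fdct_round_shift does). */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Signed variant (paraphrased from the libaom headers): round the magnitude,
 * so ties go away from zero for negative inputs too. */
#define ROUND_POWER_OF_TWO_SIGNED(value, n)           \
  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
                 : ROUND_POWER_OF_TWO((value), (n)))

int main(void) {
  const int x = -3;
  printf("%d\n", ROUND_POWER_OF_TWO(x, 1));        /* -1: not the mirror of +3 -> 2 */
  printf("%d\n", ROUND_POWER_OF_TWO_SIGNED(x, 1)); /* -2: mirrors +3 -> 2 */

  /* Final down-shift of the 8x16/16x8 transforms: adding (y < 0) makes the
   * result an odd function of y, matching the SSE2 right-shift helper. */
  const int y = -6;
  printf("%d\n", (y + 1) >> 2);            /* -2 */
  printf("%d\n", (y + 1 + (y < 0)) >> 2);  /* -1, mirrors +6 -> 1 */
  return 0;
}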
av1/encoder/x86/dct_intrin_sse2.c
@@ -12,10 +12,11 @@
#include <assert.h>
#include <emmintrin.h>  // SSE2

-#include "./av1_rtcd.h"
#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/fwd_txfm_sse2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h"
@@ -2584,3 +2585,362 @@ void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
    default:
      assert(0);
      break;
  }
}

#if CONFIG_EXT_TX
static INLINE void scale_sqrt2_8x8(__m128i *in) {
  // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)'
  // for each element
  const __m128i v_scale_w = _mm_set1_epi16(Sqrt2);

  const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
  const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
  const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
  const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
  const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
  const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
  const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
  const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
  const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w);
  const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w);
  const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w);
  const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w);
  const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w);
  const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w);
  const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w);
  const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w);

  const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
  const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
  const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
  const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
  const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
  const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
  const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
  const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
  const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w);
  const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w);
  const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w);
  const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w);
  const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w);
  const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w);
  const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w);
  const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w);

  in[0] = _mm_packs_epi32(xx_roundn_epi32(v_p0a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p0b_d, DCT_CONST_BITS));
  in[1] = _mm_packs_epi32(xx_roundn_epi32(v_p1a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p1b_d, DCT_CONST_BITS));
  in[2] = _mm_packs_epi32(xx_roundn_epi32(v_p2a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p2b_d, DCT_CONST_BITS));
  in[3] = _mm_packs_epi32(xx_roundn_epi32(v_p3a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p3b_d, DCT_CONST_BITS));
  in[4] = _mm_packs_epi32(xx_roundn_epi32(v_p4a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p4b_d, DCT_CONST_BITS));
  in[5] = _mm_packs_epi32(xx_roundn_epi32(v_p5a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p5b_d, DCT_CONST_BITS));
  in[6] = _mm_packs_epi32(xx_roundn_epi32(v_p6a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p6b_d, DCT_CONST_BITS));
  in[7] = _mm_packs_epi32(xx_roundn_epi32(v_p7a_d, DCT_CONST_BITS),
                          xx_roundn_epi32(v_p7b_d, DCT_CONST_BITS));
}
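SSE2 has no 16x16-to-32-bit widening multiply, so scale_sqrt2_8x8 above rebuilds each 32-bit product from _mm_mullo_epi16/_mm_mulhi_epi16, interleaves the halves with _mm_unpacklo/hi_epi16, rounds the 32-bit lanes with xx_roundn_epi32, and packs back to 16 bits. A minimal single-register sketch of the same idiom, standalone and with the signed rounding written inline rather than through the synonyms.h helper (the constants in main are only illustrative):

#include <emmintrin.h>  // SSE2
#include <stdint.h>
#include <stdio.h>

// Multiply eight int16 lanes by a 16-bit constant, then round the exact
// 32-bit products down by 'bits' (round to nearest, ties away from zero).
static __m128i scale_round_epi16(__m128i v, int16_t scale, int bits) {
  const __m128i k = _mm_set1_epi16(scale);
  const __m128i lo = _mm_mullo_epi16(v, k);       // low 16 bits of each product
  const __m128i hi = _mm_mulhi_epi16(v, k);       // high 16 bits (signed)
  const __m128i p0 = _mm_unpacklo_epi16(lo, hi);  // lanes 0..3 as int32
  const __m128i p1 = _mm_unpackhi_epi16(lo, hi);  // lanes 4..7 as int32
  const __m128i bias = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i r0 = _mm_srai_epi32(
      _mm_add_epi32(_mm_add_epi32(p0, bias), _mm_srai_epi32(p0, 31)), bits);
  const __m128i r1 = _mm_srai_epi32(
      _mm_add_epi32(_mm_add_epi32(p1, bias), _mm_srai_epi32(p1, 31)), bits);
  return _mm_packs_epi32(r0, r1);  // saturating pack back to int16
}

int main(void) {
  const __m128i v = _mm_setr_epi16(1, 2, 3, -4, 100, -100, 1000, -1000);
  // 5793 is roughly sqrt(2) in Q12, so rounding by 12 bits multiplies by ~1.414.
  const __m128i r = scale_round_epi16(v, 5793, 12);
  int16_t out[8];
  _mm_storeu_si128((__m128i *)out, r);
  for (int i = 0; i < 8; ++i) printf("%d ", out[i]);
  printf("\n");  // expected: 1 3 4 -6 141 -141 1414 -1414
  return 0;
}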
static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in,
                                    int stride, int flipud, int fliplr) {
  // Load 2 8x8 blocks
  const int16_t *t = input;
  const int16_t *b = input + 8 * stride;

  if (flipud) {
    const int16_t *const tmp = t;
    t = b;
    b = tmp;
  }

  load_buffer_8x8(t, in, stride, flipud, fliplr);
  scale_sqrt2_8x8(in);
  load_buffer_8x8(b, in + 8, stride, flipud, fliplr);
  scale_sqrt2_8x8(in + 8);
}
void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                      int tx_type) {
  __m128i in[16];

  __m128i *const t = in;      // Alias to top 8x8 sub block
  __m128i *const b = in + 8;  // Alias to bottom 8x8 sub block

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x16(input, in, stride, 0, 0);
      fdct16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fdct8_sse2(t);
      fdct8_sse2(b);
      break;
    case ADST_DCT:
      load_buffer_8x16(input, in, stride, 0, 0);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fdct8_sse2(t);
      fdct8_sse2(b);
      break;
    case DCT_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      fdct16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
    case ADST_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      load_buffer_8x16(input, in, stride, 1, 0);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fdct8_sse2(t);
      fdct8_sse2(b);
      break;
    case DCT_FLIPADST:
      load_buffer_8x16(input, in, stride, 0, 1);
      fdct16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x16(input, in, stride, 1, 1);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
    case ADST_FLIPADST:
      load_buffer_8x16(input, in, stride, 0, 1);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
    case FLIPADST_ADST:
      load_buffer_8x16(input, in, stride, 1, 0);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
    case IDTX:
      load_buffer_8x16(input, in, stride, 0, 0);
      fidtx16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fidtx8_sse2(t);
      fidtx8_sse2(b);
      break;
    case V_DCT:
      load_buffer_8x16(input, in, stride, 0, 0);
      fdct16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fidtx8_sse2(t);
      fidtx8_sse2(b);
      break;
    case H_DCT:
      load_buffer_8x16(input, in, stride, 0, 0);
      fidtx16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fdct8_sse2(t);
      fdct8_sse2(b);
      break;
    case V_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fidtx8_sse2(t);
      fidtx8_sse2(b);
      break;
    case H_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      fidtx16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
    case V_FLIPADST:
      load_buffer_8x16(input, in, stride, 1, 0);
      fadst16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fidtx8_sse2(t);
      fidtx8_sse2(b);
      break;
    case H_FLIPADST:
      load_buffer_8x16(input, in, stride, 0, 1);
      fidtx16_8col(in);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
#endif  // CONFIG_EXT_TX
    default:
      assert(0);
      break;
  }
  right_shift_8x8(t, 2);
  right_shift_8x8(b, 2);
  write_buffer_8x8(output, t, 8);
  write_buffer_8x8(output + 64, b, 8);
}
static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in,
                                    int stride, int flipud, int fliplr) {
  // Load 2 8x8 blocks
  const int16_t *l = input;
  const int16_t *r = input + 8;

  if (fliplr) {
    const int16_t *const tmp = l;
    l = r;
    r = tmp;
  }

  // load first 8 columns
  load_buffer_8x8(l, in, stride, flipud, fliplr);
  scale_sqrt2_8x8(in);