Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
609453e7
Commit
609453e7
authored
Oct 07, 2016
by
Debargha Mukherjee
Committed by
Gerrit Code Review
Oct 07, 2016
Browse files
Merge "Added sse2 inverse 8x16 and 16x8 transforms" into nextgenv2
parents
e4dc5f8d
1baecfeb
Changes
9
Hide whitespace changes
Inline
Side-by-side
aom_dsp/x86/inv_txfm_sse2.c
View file @
609453e7
...
...
@@ -1308,7 +1308,7 @@ void aom_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
}
}
static
void
iadst16_8col
(
__m128i
*
in
)
{
void
iadst16_8col
(
__m128i
*
in
)
{
// perform 16x16 1-D ADST for 8 columns
__m128i
s
[
16
],
x
[
16
],
u
[
32
],
v
[
32
];
const
__m128i
k__cospi_p01_p31
=
pair_set_epi16
(
cospi_1_64
,
cospi_31_64
);
...
...
@@ -1778,7 +1778,7 @@ static void iadst16_8col(__m128i *in) {
in
[
15
]
=
_mm_sub_epi16
(
kZero
,
s
[
1
]);
}
static
void
idct16_8col
(
__m128i
*
in
)
{
void
idct16_8col
(
__m128i
*
in
)
{
const
__m128i
k__cospi_p30_m02
=
pair_set_epi16
(
cospi_30_64
,
-
cospi_2_64
);
const
__m128i
k__cospi_p02_p30
=
pair_set_epi16
(
cospi_2_64
,
cospi_30_64
);
const
__m128i
k__cospi_p14_m18
=
pair_set_epi16
(
cospi_14_64
,
-
cospi_18_64
);
...
...
aom_dsp/x86/inv_txfm_sse2.h
View file @
609453e7
...
...
@@ -187,6 +187,8 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
RECON_AND_STORE
(
dest
+
15
*
stride
,
in
[
15
]);
}
void
iadst16_8col
(
__m128i
*
in
);
void
idct16_8col
(
__m128i
*
in
);
void
idct4_sse2
(
__m128i
*
in
);
void
idct8_sse2
(
__m128i
*
in
);
void
idct16_sse2
(
__m128i
*
in0
,
__m128i
*
in1
);
...
...
aom_dsp/x86/synonyms.h
View file @
609453e7
...
...
@@ -73,6 +73,14 @@ static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
return
_mm_srli_epi32
(
v_tmp_d
,
bits
);
}
// Rounding arithmetic right shift of four 32-bit lanes: adds half of
// (1 << bits) to every lane and then shifts right by 'bits' with sign
// extension. Lane-wise counterpart of ROUND_POWER_OF_TWO(v_val_d, bits).
static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
  const int half = (1 << bits) >> 1;
  return _mm_srai_epi32(_mm_add_epi32(v_val_d, _mm_set1_epi32(half)), bits);
}
// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
static
INLINE
__m128i
xx_roundn_epi32
(
__m128i
v_val_d
,
int
bits
)
{
const
__m128i
v_bias_d
=
_mm_set1_epi32
((
1
<<
bits
)
>>
1
);
const
__m128i
v_sign_d
=
_mm_srai_epi32
(
v_val_d
,
31
);
...
...
av1/common/av1_rtcd_defs.pl
View file @
609453e7
...
...
@@ -60,23 +60,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto
qw/void av1_iht4x4_16_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x4_16_add/
;
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
if
(
aom_config
("
CONFIG_EXT_TX
")
eq
"
yes
")
{
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
}
add_proto
qw/void av1_iht8x8_64_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x8_64_add/
;
...
...
@@ -87,23 +89,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto
qw/void av1_iht4x4_16_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x4_16_add sse2/
;
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
if
(
aom_config
("
CONFIG_EXT_TX
")
eq
"
yes
")
{
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add
sse2
/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add
sse2
/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
}
add_proto
qw/void av1_iht8x8_64_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x8_64_add sse2/
;
...
...
@@ -117,23 +121,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto
qw/void av1_iht4x4_16_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x4_16_add/
;
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
if
(
aom_config
("
CONFIG_EXT_TX
")
eq
"
yes
")
{
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
}
add_proto
qw/void av1_iht8x8_64_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x8_64_add/
;
...
...
@@ -144,23 +150,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto
qw/void av1_iht4x4_16_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x4_16_add sse2 neon dspr2/
;
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
if
(
aom_config
("
CONFIG_EXT_TX
")
eq
"
yes
")
{
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add
sse2
/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add
sse2
/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
}
add_proto
qw/void av1_iht8x8_64_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x8_64_add sse2 neon dspr2/
;
...
...
@@ -274,23 +282,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto
qw/void av1_highbd_iht4x4_16_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht4x4_16_add/
;
add_proto
qw/void av1_highbd_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht4x8_32_add/
;
if
(
aom_config
("
CONFIG_EXT_TX
")
eq
"
yes
")
{
add_proto
qw/void av1_highbd_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht4x8_32_add/
;
add_proto
qw/void av1_highbd_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht8x4_32_add/
;
add_proto
qw/void av1_highbd_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht8x4_32_add/
;
add_proto
qw/void av1_highbd_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht8x16_128_add/
;
add_proto
qw/void av1_highbd_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht8x16_128_add/
;
add_proto
qw/void av1_highbd_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht16x8_128_add/
;
add_proto
qw/void av1_highbd_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht16x8_128_add/
;
add_proto
qw/void av1_highbd_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht16x32_512_add/
;
add_proto
qw/void av1_highbd_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht16x32_512_add/
;
add_proto
qw/void av1_highbd_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht32x16_512_add/
;
add_proto
qw/void av1_highbd_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht32x16_512_add/
;
}
add_proto
qw/void av1_highbd_iht8x8_64_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht8x8_64_add/
;
...
...
av1/common/x86/idct_intrin_sse2.c
View file @
609453e7
...
...
@@ -11,6 +11,7 @@
#include
"./av1_rtcd.h"
#include
"aom_dsp/x86/inv_txfm_sse2.h"
#include
"aom_dsp/x86/synonyms.h"
#include
"aom_dsp/x86/txfm_common_sse2.h"
#include
"aom_ports/mem.h"
#include
"av1/common/enums.h"
...
...
@@ -303,3 +304,535 @@ void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
dest
+=
8
;
write_buffer_8x16
(
dest
,
in1
,
stride
);
}
#if CONFIG_EXT_TX
static
void
iidtx16_8col
(
__m128i
*
in
)
{
const
__m128i
k__zero_epi16
=
_mm_set1_epi16
((
int16_t
)
0
);
const
__m128i
k__sqrt2_epi16
=
_mm_set1_epi16
((
int16_t
)
Sqrt2
);
const
__m128i
k__DCT_CONST_ROUNDING
=
_mm_set1_epi32
(
DCT_CONST_ROUNDING
);
__m128i
v0
,
v1
,
v2
,
v3
,
v4
,
v5
,
v6
,
v7
;
__m128i
u0
,
u1
,
u2
,
u3
,
u4
,
u5
,
u6
,
u7
;
__m128i
x0
,
x1
,
x2
,
x3
,
x4
,
x5
,
x6
,
x7
;
__m128i
y0
,
y1
,
y2
,
y3
,
y4
,
y5
,
y6
,
y7
;
in
[
0
]
=
_mm_slli_epi16
(
in
[
0
],
1
);
in
[
1
]
=
_mm_slli_epi16
(
in
[
1
],
1
);
in
[
2
]
=
_mm_slli_epi16
(
in
[
2
],
1
);
in
[
3
]
=
_mm_slli_epi16
(
in
[
3
],
1
);
in
[
4
]
=
_mm_slli_epi16
(
in
[
4
],
1
);
in
[
5
]
=
_mm_slli_epi16
(
in
[
5
],
1
);
in
[
6
]
=
_mm_slli_epi16
(
in
[
6
],
1
);
in
[
7
]
=
_mm_slli_epi16
(
in
[
7
],
1
);
in
[
8
]
=
_mm_slli_epi16
(
in
[
8
],
1
);
in
[
9
]
=
_mm_slli_epi16
(
in
[
9
],
1
);
in
[
10
]
=
_mm_slli_epi16
(
in
[
10
],
1
);
in
[
11
]
=
_mm_slli_epi16
(
in
[
11
],
1
);
in
[
12
]
=
_mm_slli_epi16
(
in
[
12
],
1
);
in
[
13
]
=
_mm_slli_epi16
(
in
[
13
],
1
);
in
[
14
]
=
_mm_slli_epi16
(
in
[
14
],
1
);
in
[
15
]
=
_mm_slli_epi16
(
in
[
15
],
1
);
v0
=
_mm_unpacklo_epi16
(
in
[
0
],
k__zero_epi16
);
v1
=
_mm_unpacklo_epi16
(
in
[
1
],
k__zero_epi16
);
v2
=
_mm_unpacklo_epi16
(
in
[
2
],
k__zero_epi16
);
v3
=
_mm_unpacklo_epi16
(
in
[
3
],
k__zero_epi16
);
v4
=
_mm_unpacklo_epi16
(
in
[
4
],
k__zero_epi16
);
v5
=
_mm_unpacklo_epi16
(
in
[
5
],
k__zero_epi16
);
v6
=
_mm_unpacklo_epi16
(
in
[
6
],
k__zero_epi16
);
v7
=
_mm_unpacklo_epi16
(
in
[
7
],
k__zero_epi16
);
u0
=
_mm_unpacklo_epi16
(
in
[
8
],
k__zero_epi16
);
u1
=
_mm_unpacklo_epi16
(
in
[
9
],
k__zero_epi16
);
u2
=
_mm_unpacklo_epi16
(
in
[
10
],
k__zero_epi16
);
u3
=
_mm_unpacklo_epi16
(
in
[
11
],
k__zero_epi16
);
u4
=
_mm_unpacklo_epi16
(
in
[
12
],
k__zero_epi16
);
u5
=
_mm_unpacklo_epi16
(
in
[
13
],
k__zero_epi16
);
u6
=
_mm_unpacklo_epi16
(
in
[
14
],
k__zero_epi16
);
u7
=
_mm_unpacklo_epi16
(
in
[
15
],
k__zero_epi16
);
x0
=
_mm_unpackhi_epi16
(
in
[
0
],
k__zero_epi16
);
x1
=
_mm_unpackhi_epi16
(
in
[
1
],
k__zero_epi16
);
x2
=
_mm_unpackhi_epi16
(
in
[
2
],
k__zero_epi16
);
x3
=
_mm_unpackhi_epi16
(
in
[
3
],
k__zero_epi16
);
x4
=
_mm_unpackhi_epi16
(
in
[
4
],
k__zero_epi16
);
x5
=
_mm_unpackhi_epi16
(
in
[
5
],
k__zero_epi16
);
x6
=
_mm_unpackhi_epi16
(
in
[
6
],
k__zero_epi16
);
x7
=
_mm_unpackhi_epi16
(
in
[
7
],
k__zero_epi16
);
y0
=
_mm_unpackhi_epi16
(
in
[
8
],
k__zero_epi16
);
y1
=
_mm_unpackhi_epi16
(
in
[
9
],
k__zero_epi16
);
y2
=
_mm_unpackhi_epi16
(
in
[
10
],
k__zero_epi16
);
y3
=
_mm_unpackhi_epi16
(
in
[
11
],
k__zero_epi16
);
y4
=
_mm_unpackhi_epi16
(
in
[
12
],
k__zero_epi16
);
y5
=
_mm_unpackhi_epi16
(
in
[
13
],
k__zero_epi16
);
y6
=
_mm_unpackhi_epi16
(
in
[
14
],
k__zero_epi16
);
y7
=
_mm_unpackhi_epi16
(
in
[
15
],
k__zero_epi16
);
v0
=
_mm_madd_epi16
(
v0
,
k__sqrt2_epi16
);
v1
=
_mm_madd_epi16
(
v1
,
k__sqrt2_epi16
);
v2
=
_mm_madd_epi16
(
v2
,
k__sqrt2_epi16
);
v3
=
_mm_madd_epi16
(
v3
,
k__sqrt2_epi16
);
v4
=
_mm_madd_epi16
(
v4
,
k__sqrt2_epi16
);
v5
=
_mm_madd_epi16
(
v5
,
k__sqrt2_epi16
);
v6
=
_mm_madd_epi16
(
v6
,
k__sqrt2_epi16
);
v7
=
_mm_madd_epi16
(
v7
,
k__sqrt2_epi16
);
x0
=
_mm_madd_epi16
(
x0
,
k__sqrt2_epi16
);
x1
=
_mm_madd_epi16
(
x1
,
k__sqrt2_epi16
);
x2
=
_mm_madd_epi16
(
x2
,
k__sqrt2_epi16
);
x3
=
_mm_madd_epi16
(
x3
,
k__sqrt2_epi16
);
x4
=
_mm_madd_epi16
(
x4
,
k__sqrt2_epi16
);
x5
=
_mm_madd_epi16
(
x5
,
k__sqrt2_epi16
);
x6
=
_mm_madd_epi16
(
x6
,
k__sqrt2_epi16
);
x7
=
_mm_madd_epi16
(
x7
,
k__sqrt2_epi16
);
u0
=
_mm_madd_epi16
(
u0
,
k__sqrt2_epi16
);
u1
=
_mm_madd_epi16
(
u1
,
k__sqrt2_epi16
);
u2
=
_mm_madd_epi16
(
u2
,
k__sqrt2_epi16
);
u3
=
_mm_madd_epi16
(
u3
,
k__sqrt2_epi16
);
u4
=
_mm_madd_epi16
(
u4
,
k__sqrt2_epi16
);
u5
=
_mm_madd_epi16
(
u5
,
k__sqrt2_epi16
);
u6
=
_mm_madd_epi16
(
u6
,
k__sqrt2_epi16
);
u7
=
_mm_madd_epi16
(
u7
,
k__sqrt2_epi16
);
y0
=
_mm_madd_epi16
(
y0
,
k__sqrt2_epi16
);
y1
=
_mm_madd_epi16
(
y1
,
k__sqrt2_epi16
);
y2
=
_mm_madd_epi16
(
y2
,
k__sqrt2_epi16
);
y3
=
_mm_madd_epi16
(
y3
,
k__sqrt2_epi16
);
y4
=
_mm_madd_epi16
(
y4
,
k__sqrt2_epi16
);
y5
=
_mm_madd_epi16
(
y5
,
k__sqrt2_epi16
);
y6
=
_mm_madd_epi16
(
y6
,
k__sqrt2_epi16
);
y7
=
_mm_madd_epi16
(
y7
,
k__sqrt2_epi16
);
v0
=
_mm_add_epi32
(
v0
,
k__DCT_CONST_ROUNDING
);
v1
=
_mm_add_epi32
(
v1
,
k__DCT_CONST_ROUNDING
);
v2
=
_mm_add_epi32
(
v2
,
k__DCT_CONST_ROUNDING
);
v3
=
_mm_add_epi32
(
v3
,
k__DCT_CONST_ROUNDING
);
v4
=
_mm_add_epi32
(
v4
,
k__DCT_CONST_ROUNDING
);
v5
=
_mm_add_epi32
(
v5
,
k__DCT_CONST_ROUNDING
);
v6
=
_mm_add_epi32
(
v6
,
k__DCT_CONST_ROUNDING
);
v7
=
_mm_add_epi32
(
v7
,
k__DCT_CONST_ROUNDING
);
x0
=
_mm_add_epi32
(
x0
,
k__DCT_CONST_ROUNDING
);
x1
=
_mm_add_epi32
(
x1
,
k__DCT_CONST_ROUNDING
);
x2
=
_mm_add_epi32
(
x2
,
k__DCT_CONST_ROUNDING
);
x3
=
_mm_add_epi32
(
x3
,
k__DCT_CONST_ROUNDING
);
x4
=
_mm_add_epi32
(
x4
,
k__DCT_CONST_ROUNDING
);
x5
=
_mm_add_epi32
(
x5
,
k__DCT_CONST_ROUNDING
);
x6
=
_mm_add_epi32
(
x6
,
k__DCT_CONST_ROUNDING
);
x7
=
_mm_add_epi32
(
x7
,
k__DCT_CONST_ROUNDING
);
u0
=
_mm_add_epi32
(
u0
,
k__DCT_CONST_ROUNDING
);
u1
=
_mm_add_epi32
(
u1
,
k__DCT_CONST_ROUNDING
);
u2
=
_mm_add_epi32
(
u2
,
k__DCT_CONST_ROUNDING
);
u3
=
_mm_add_epi32
(
u3
,
k__DCT_CONST_ROUNDING
);
u4
=
_mm_add_epi32
(
u4
,
k__DCT_CONST_ROUNDING
);
u5
=
_mm_add_epi32
(
u5
,
k__DCT_CONST_ROUNDING
);
u6
=
_mm_add_epi32
(
u6
,
k__DCT_CONST_ROUNDING
);
u7
=
_mm_add_epi32
(
u7
,
k__DCT_CONST_ROUNDING
);
y0
=
_mm_add_epi32
(
y0
,
k__DCT_CONST_ROUNDING
);
y1
=
_mm_add_epi32
(
y1
,
k__DCT_CONST_ROUNDING
);
y2
=
_mm_add_epi32
(
y2
,
k__DCT_CONST_ROUNDING
);
y3
=
_mm_add_epi32
(
y3
,
k__DCT_CONST_ROUNDING
);
y4
=
_mm_add_epi32
(
y4
,
k__DCT_CONST_ROUNDING
);
y5
=
_mm_add_epi32
(
y5
,
k__DCT_CONST_ROUNDING
);
y6
=
_mm_add_epi32
(
y6
,
k__DCT_CONST_ROUNDING
);
y7
=
_mm_add_epi32
(
y7
,
k__DCT_CONST_ROUNDING
);
v0
=
_mm_srai_epi32
(
v0
,
DCT_CONST_BITS
);
v1
=
_mm_srai_epi32
(
v1
,
DCT_CONST_BITS
);
v2
=
_mm_srai_epi32
(
v2
,
DCT_CONST_BITS
);
v3
=
_mm_srai_epi32
(
v3
,
DCT_CONST_BITS
);
v4
=
_mm_srai_epi32
(
v4
,
DCT_CONST_BITS
);
v5
=
_mm_srai_epi32
(
v5
,
DCT_CONST_BITS
);
v6
=
_mm_srai_epi32
(
v6
,
DCT_CONST_BITS
);
v7
=
_mm_srai_epi32
(
v7
,
DCT_CONST_BITS
);
x0
=
_mm_srai_epi32
(
x0
,
DCT_CONST_BITS
);
x1
=
_mm_srai_epi32
(
x1
,
DCT_CONST_BITS
);
x2
=
_mm_srai_epi32
(
x2
,
DCT_CONST_BITS
);
x3
=
_mm_srai_epi32
(
x3
,
DCT_CONST_BITS
);
x4
=
_mm_srai_epi32
(
x4
,
DCT_CONST_BITS
);
x5
=
_mm_srai_epi32
(
x5
,
DCT_CONST_BITS
);
x6
=
_mm_srai_epi32
(
x6
,
DCT_CONST_BITS
);
x7
=
_mm_srai_epi32
(
x7
,
DCT_CONST_BITS
);
u0
=
_mm_srai_epi32
(
u0
,
DCT_CONST_BITS
);
u1
=
_mm_srai_epi32
(
u1
,
DCT_CONST_BITS
);
u2
=
_mm_srai_epi32
(
u2
,
DCT_CONST_BITS
);
u3
=
_mm_srai_epi32
(
u3
,
DCT_CONST_BITS
);
u4
=
_mm_srai_epi32
(
u4
,
DCT_CONST_BITS
);
u5
=
_mm_srai_epi32
(
u5
,
DCT_CONST_BITS
);
u6
=
_mm_srai_epi32
(
u6
,
DCT_CONST_BITS
);
u7
=
_mm_srai_epi32
(
u7
,
DCT_CONST_BITS
);
y0
=
_mm_srai_epi32
(
y0
,
DCT_CONST_BITS
);
y1
=
_mm_srai_epi32
(
y1
,
DCT_CONST_BITS
);
y2
=
_mm_srai_epi32
(
y2
,
DCT_CONST_BITS
);
y3
=
_mm_srai_epi32
(
y3
,
DCT_CONST_BITS
);
y4
=
_mm_srai_epi32
(
y4
,
DCT_CONST_BITS
);
y5
=
_mm_srai_epi32
(
y5
,
DCT_CONST_BITS
);
y6
=
_mm_srai_epi32
(
y6
,
DCT_CONST_BITS
);
y7
=
_mm_srai_epi32
(
y7
,
DCT_CONST_BITS
);
in
[
0
]
=
_mm_packs_epi32
(
v0
,
x0
);
in
[
1
]
=
_mm_packs_epi32
(
v1
,
x1
);
in
[
2
]
=
_mm_packs_epi32
(
v2
,
x2
);
in
[
3
]
=
_mm_packs_epi32
(
v3
,
x3
);
in
[
4
]
=
_mm_packs_epi32
(
v4
,
x4
);
in
[
5
]
=
_mm_packs_epi32
(
v5
,
x5
);
in
[
6
]
=
_mm_packs_epi32
(
v6
,
x6
);
in
[
7
]
=
_mm_packs_epi32
(
v7
,
x7
);
in
[
8
]
=
_mm_packs_epi32
(
u0
,
y0
);
in
[
9
]
=
_mm_packs_epi32
(
u1
,
y1
);
in
[
10
]
=
_mm_packs_epi32
(
u2
,
y2
);
in
[
11
]
=
_mm_packs_epi32
(
u3
,
y3
);
in
[
12
]
=
_mm_packs_epi32
(
u4
,
y4
);
in
[
13
]
=
_mm_packs_epi32
(
u5
,
y5
);
in
[
14
]
=
_mm_packs_epi32
(
u6
,
y6
);
in
[
15
]
=
_mm_packs_epi32
(
u7
,
y7
);
}
// In-place 8-point identity transform: doubles every 16-bit lane of
// in[0..7] with a left shift by one (wrapping within 16 bits).
static void iidtx8_sse2(__m128i *in) {
  int row;
  for (row = 0; row < 8; ++row) {
    in[row] = _mm_slli_epi16(in[row], 1);
  }
}
// Applies mm_reverse_epi16() to each of the eight vectors in[0..7], in place.
// NOTE(review): mm_reverse_epi16 is defined elsewhere; presumably it reverses
// the order of the eight 16-bit lanes, which would make this a left-right
// flip of an 8x8 block -- confirm against the helper.
static INLINE void flip_buffer_lr_8x8(__m128i *in) {
  int row;
  for (row = 0; row < 8; ++row) {
    in[row] = mm_reverse_epi16(in[row]);
  }
}
// Scales every 16-bit element of in[0..7] by Sqrt2 in place. Each element
// is widened to a signed 32-bit product, rounded and shifted right by
// DCT_CONST_BITS through xx_roundn_epi32_unsigned() (the
// ROUND_POWER_OF_TWO-style rounding), then saturated back to 16 bits.
static INLINE void scale_sqrt2_8x8(__m128i *in) {
  const __m128i v_scale_w = _mm_set1_epi16(Sqrt2);
  int row;

  for (row = 0; row < 8; ++row) {
    // Low and high 16-bit halves of the signed 32-bit products.
    const __m128i v_lo_w = _mm_mullo_epi16(in[row], v_scale_w);
    const __m128i v_hi_w = _mm_mulhi_epi16(in[row], v_scale_w);

    // Re-interleave the halves into full 32-bit products: lanes 0-3, 4-7.
    const __m128i v_a_d = _mm_unpacklo_epi16(v_lo_w, v_hi_w);
    const __m128i v_b_d = _mm_unpackhi_epi16(v_lo_w, v_hi_w);

    in[row] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_a_d, DCT_CONST_BITS),
                              xx_roundn_epi32_unsigned(v_b_d, DCT_CONST_BITS));
  }
}
void
av1_iht8x16_128_add_sse2
(
const
tran_low_t
*
input
,
uint8_t
*
dest
,
int
stride
,
int
tx_type
)
{
__m128i
in
[
16
];
in
[
0
]
=
load_input_data
(
input
+
0
*
8
);
in
[
1
]
=
load_input_data
(
input
+
1
*
8
);
in
[
2
]
=
load_input_data
(
input
+
2
*
8
);
in
[
3
]
=
load_input_data
(
input
+
3
*
8
);
in
[
4
]
=
load_input_data
(
input
+
4
*
8
);
in
[
5
]
=
load_input_data
(
input
+
5
*
8
);
in
[
6
]
=
load_input_data
(
input
+
6
*
8
);
in
[
7
]
=
load_input_data
(
input
+
7
*
8
);
in
[
8
]
=
load_input_data
(
input
+
8
*
8
);
in
[
9
]
=
load_input_data
(
input
+
9
*
8
);
in
[
10
]
=
load_input_data
(
input
+
10
*
8
);
in
[
11
]
=
load_input_data
(
input
+
11
*
8
);
in
[
12
]
=
load_input_data
(
input
+
12
*
8
);
in
[
13
]
=
load_input_data
(
input
+
13
*
8
);
in
[
14
]
=
load_input_data
(
input
+
14
*
8
);
in
[
15
]
=
load_input_data
(
input
+
15
*
8
);
// Row transform
switch
(
tx_type
)
{
case
DCT_DCT
:
case
ADST_DCT
:
case
FLIPADST_DCT
:
case
H_DCT
:
idct8_sse2
(
in
);
array_transpose_8x8
(
in
,
in
);
idct8_sse2
(
in
+
8
);
array_transpose_8x8
(
in
+
8
,
in
+
8
);
break
;
case
DCT_ADST
:
case
ADST_ADST
:
case
DCT_FLIPADST
:
case
FLIPADST_FLIPADST
:
case
ADST_FLIPADST
:
case
FLIPADST_ADST
:
case
H_ADST
:
case
H_FLIPADST
:
iadst8_sse2
(
in
);
array_transpose_8x8
(
in
,
in
);
iadst8_sse2
(
in
+
8
);
array_transpose_8x8
(
in
+
8
,
in
+
8
);
break
;
case
V_FLIPADST
:
case
V_ADST
:
case
V_DCT
:
case
IDTX
:
iidtx8_sse2
(
in
);
iidtx8_sse2
(
in
+
8
);
break
;
default:
assert
(
0
);
break
;
}
scale_sqrt2_8x8
(
in
);
scale_sqrt2_8x8
(
in
+
8
);
// Column transform
switch
(
tx_type
)
{
case
DCT_DCT
:
case
DCT_ADST
:
case
DCT_FLIPADST
:
case
V_DCT
:
idct16_8col
(
in
);
break
;
case
ADST_DCT
:
case
ADST_ADST
:
case
FLIPADST_ADST
:
case
ADST_FLIPADST
:
case