Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
1baecfeb
Commit
1baecfeb
authored
Sep 29, 2016
by
Peter de Rivaz
Committed by
Debargha Mukherjee
Oct 06, 2016
Browse files
Added sse2 inverse 8x16 and 16x8 transforms
Change-Id: I43628407b11e5c8e6af4df69f2acdc67ac827834
parent
71e4553c
Changes
9
Hide whitespace changes
Inline
Side-by-side
aom_dsp/x86/inv_txfm_sse2.c
View file @
1baecfeb
...
...
@@ -1308,7 +1308,7 @@ void aom_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
}
}
static
void
iadst16_8col
(
__m128i
*
in
)
{
void
iadst16_8col
(
__m128i
*
in
)
{
// perform 16x16 1-D ADST for 8 columns
__m128i
s
[
16
],
x
[
16
],
u
[
32
],
v
[
32
];
const
__m128i
k__cospi_p01_p31
=
pair_set_epi16
(
cospi_1_64
,
cospi_31_64
);
...
...
@@ -1778,7 +1778,7 @@ static void iadst16_8col(__m128i *in) {
in
[
15
]
=
_mm_sub_epi16
(
kZero
,
s
[
1
]);
}
static
void
idct16_8col
(
__m128i
*
in
)
{
void
idct16_8col
(
__m128i
*
in
)
{
const
__m128i
k__cospi_p30_m02
=
pair_set_epi16
(
cospi_30_64
,
-
cospi_2_64
);
const
__m128i
k__cospi_p02_p30
=
pair_set_epi16
(
cospi_2_64
,
cospi_30_64
);
const
__m128i
k__cospi_p14_m18
=
pair_set_epi16
(
cospi_14_64
,
-
cospi_18_64
);
...
...
aom_dsp/x86/inv_txfm_sse2.h
View file @
1baecfeb
...
...
@@ -187,6 +187,8 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
RECON_AND_STORE
(
dest
+
15
*
stride
,
in
[
15
]);
}
void
iadst16_8col
(
__m128i
*
in
);
void
idct16_8col
(
__m128i
*
in
);
void
idct4_sse2
(
__m128i
*
in
);
void
idct8_sse2
(
__m128i
*
in
);
void
idct16_sse2
(
__m128i
*
in0
,
__m128i
*
in1
);
...
...
aom_dsp/x86/synonyms.h
View file @
1baecfeb
...
...
@@ -73,6 +73,14 @@ static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
return
_mm_srli_epi32
(
v_tmp_d
,
bits
);
}
// Equivalent to ROUND_POWER_OF_TWO(v_val_d, bits) on each 32-bit lane:
// adds the bias (1 << bits) >> 1 and then shifts right arithmetically by
// 'bits'. No sign-dependent adjustment is applied to the bias.
static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
  const int half = (1 << bits) >> 1;
  const __m128i v_biased_d = _mm_add_epi32(v_val_d, _mm_set1_epi32(half));
  return _mm_srai_epi32(v_biased_d, bits);
}
// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
static
INLINE
__m128i
xx_roundn_epi32
(
__m128i
v_val_d
,
int
bits
)
{
const
__m128i
v_bias_d
=
_mm_set1_epi32
((
1
<<
bits
)
>>
1
);
const
__m128i
v_sign_d
=
_mm_srai_epi32
(
v_val_d
,
31
);
...
...
av1/common/av1_rtcd_defs.pl
View file @
1baecfeb
...
...
@@ -60,23 +60,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto
qw/void av1_iht4x4_16_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x4_16_add/
;
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
if
(
aom_config
("
CONFIG_EXT_TX
")
eq
"
yes
")
{
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
}
add_proto
qw/void av1_iht8x8_64_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x8_64_add/
;
...
...
@@ -87,23 +89,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto
qw/void av1_iht4x4_16_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x4_16_add sse2/
;
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
if
(
aom_config
("
CONFIG_EXT_TX
")
eq
"
yes
")
{
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add
sse2
/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add
sse2
/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
}
add_proto
qw/void av1_iht8x8_64_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x8_64_add sse2/
;
...
...
@@ -117,23 +121,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto
qw/void av1_iht4x4_16_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x4_16_add/
;
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
if
(
aom_config
("
CONFIG_EXT_TX
")
eq
"
yes
")
{
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
}
add_proto
qw/void av1_iht8x8_64_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x8_64_add/
;
...
...
@@ -144,23 +150,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto
qw/void av1_iht4x4_16_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x4_16_add sse2 neon dspr2/
;
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
if
(
aom_config
("
CONFIG_EXT_TX
")
eq
"
yes
")
{
add_proto
qw/void av1_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht4x8_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x4_32_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add/
;
add_proto
qw/void av1_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x16_128_add
sse2
/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add/
;
add_proto
qw/void av1_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x8_128_add
sse2
/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht16x32_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
add_proto
qw/void av1_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht32x16_512_add/
;
}
add_proto
qw/void av1_iht8x8_64_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type
";
specialize
qw/av1_iht8x8_64_add sse2 neon dspr2/
;
...
...
@@ -274,23 +282,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto
qw/void av1_highbd_iht4x4_16_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht4x4_16_add/
;
add_proto
qw/void av1_highbd_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht4x8_32_add/
;
if
(
aom_config
("
CONFIG_EXT_TX
")
eq
"
yes
")
{
add_proto
qw/void av1_highbd_iht4x8_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht4x8_32_add/
;
add_proto
qw/void av1_highbd_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht8x4_32_add/
;
add_proto
qw/void av1_highbd_iht8x4_32_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht8x4_32_add/
;
add_proto
qw/void av1_highbd_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht8x16_128_add/
;
add_proto
qw/void av1_highbd_iht8x16_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht8x16_128_add/
;
add_proto
qw/void av1_highbd_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht16x8_128_add/
;
add_proto
qw/void av1_highbd_iht16x8_128_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht16x8_128_add/
;
add_proto
qw/void av1_highbd_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht16x32_512_add/
;
add_proto
qw/void av1_highbd_iht16x32_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht16x32_512_add/
;
add_proto
qw/void av1_highbd_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht32x16_512_add/
;
add_proto
qw/void av1_highbd_iht32x16_512_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht32x16_512_add/
;
}
add_proto
qw/void av1_highbd_iht8x8_64_add/
,
"
const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd
";
specialize
qw/av1_highbd_iht8x8_64_add/
;
...
...
av1/common/x86/idct_intrin_sse2.c
View file @
1baecfeb
...
...
@@ -11,6 +11,7 @@
#include "./av1_rtcd.h"
#include "aom_dsp/x86/inv_txfm_sse2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h"
#include "av1/common/enums.h"
...
...
@@ -303,3 +304,535 @@ void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
dest
+=
8
;
write_buffer_8x16
(
dest
,
in1
,
stride
);
}
#if CONFIG_EXT_TX
static
void
iidtx16_8col
(
__m128i
*
in
)
{
const
__m128i
k__zero_epi16
=
_mm_set1_epi16
((
int16_t
)
0
);
const
__m128i
k__sqrt2_epi16
=
_mm_set1_epi16
((
int16_t
)
Sqrt2
);
const
__m128i
k__DCT_CONST_ROUNDING
=
_mm_set1_epi32
(
DCT_CONST_ROUNDING
);
__m128i
v0
,
v1
,
v2
,
v3
,
v4
,
v5
,
v6
,
v7
;
__m128i
u0
,
u1
,
u2
,
u3
,
u4
,
u5
,
u6
,
u7
;
__m128i
x0
,
x1
,
x2
,
x3
,
x4
,
x5
,
x6
,
x7
;
__m128i
y0
,
y1
,
y2
,
y3
,
y4
,
y5
,
y6
,
y7
;
in
[
0
]
=
_mm_slli_epi16
(
in
[
0
],
1
);
in
[
1
]
=
_mm_slli_epi16
(
in
[
1
],
1
);
in
[
2
]
=
_mm_slli_epi16
(
in
[
2
],
1
);
in
[
3
]
=
_mm_slli_epi16
(
in
[
3
],
1
);
in
[
4
]
=
_mm_slli_epi16
(
in
[
4
],
1
);
in
[
5
]
=
_mm_slli_epi16
(
in
[
5
],
1
);
in
[
6
]
=
_mm_slli_epi16
(
in
[
6
],
1
);
in
[
7
]
=
_mm_slli_epi16
(
in
[
7
],
1
);
in
[
8
]
=
_mm_slli_epi16
(
in
[
8
],
1
);
in
[
9
]
=
_mm_slli_epi16
(
in
[
9
],
1
);
in
[
10
]
=
_mm_slli_epi16
(
in
[
10
],
1
);
in
[
11
]
=
_mm_slli_epi16
(
in
[
11
],
1
);
in
[
12
]
=
_mm_slli_epi16
(
in
[
12
],
1
);
in
[
13
]
=
_mm_slli_epi16
(
in
[
13
],
1
);
in
[
14
]
=
_mm_slli_epi16
(
in
[
14
],
1
);
in
[
15
]
=
_mm_slli_epi16
(
in
[
15
],
1
);
v0
=
_mm_unpacklo_epi16
(
in
[
0
],
k__zero_epi16
);
v1
=
_mm_unpacklo_epi16
(
in
[
1
],
k__zero_epi16
);
v2
=
_mm_unpacklo_epi16
(
in
[
2
],
k__zero_epi16
);
v3
=
_mm_unpacklo_epi16
(
in
[
3
],
k__zero_epi16
);
v4
=
_mm_unpacklo_epi16
(
in
[
4
],
k__zero_epi16
);
v5
=
_mm_unpacklo_epi16
(
in
[
5
],
k__zero_epi16
);
v6
=
_mm_unpacklo_epi16
(
in
[
6
],
k__zero_epi16
);
v7
=
_mm_unpacklo_epi16
(
in
[
7
],
k__zero_epi16
);
u0
=
_mm_unpacklo_epi16
(
in
[
8
],
k__zero_epi16
);
u1
=
_mm_unpacklo_epi16
(
in
[
9
],
k__zero_epi16
);
u2
=
_mm_unpacklo_epi16
(
in
[
10
],
k__zero_epi16
);
u3
=
_mm_unpacklo_epi16
(
in
[
11
],
k__zero_epi16
);
u4
=
_mm_unpacklo_epi16
(
in
[
12
],
k__zero_epi16
);
u5
=
_mm_unpacklo_epi16
(
in
[
13
],
k__zero_epi16
);
u6
=
_mm_unpacklo_epi16
(
in
[
14
],
k__zero_epi16
);
u7
=
_mm_unpacklo_epi16
(
in
[
15
],
k__zero_epi16
);
x0
=
_mm_unpackhi_epi16
(
in
[
0
],
k__zero_epi16
);
x1
=
_mm_unpackhi_epi16
(
in
[
1
],
k__zero_epi16
);
x2
=
_mm_unpackhi_epi16
(
in
[
2
],
k__zero_epi16
);
x3
=
_mm_unpackhi_epi16
(
in
[
3
],
k__zero_epi16
);
x4
=
_mm_unpackhi_epi16
(
in
[
4
],
k__zero_epi16
);
x5
=
_mm_unpackhi_epi16
(
in
[
5
],
k__zero_epi16
);
x6
=
_mm_unpackhi_epi16
(
in
[
6
],
k__zero_epi16
);
x7
=
_mm_unpackhi_epi16
(
in
[
7
],
k__zero_epi16
);
y0
=
_mm_unpackhi_epi16
(
in
[
8
],
k__zero_epi16
);
y1
=
_mm_unpackhi_epi16
(
in
[
9
],
k__zero_epi16
);
y2
=
_mm_unpackhi_epi16
(
in
[
10
],
k__zero_epi16
);
y3
=
_mm_unpackhi_epi16
(
in
[
11
],
k__zero_epi16
);
y4
=
_mm_unpackhi_epi16
(
in
[
12
],
k__zero_epi16
);
y5
=
_mm_unpackhi_epi16
(
in
[
13
],
k__zero_epi16
);
y6
=
_mm_unpackhi_epi16
(
in
[
14
],
k__zero_epi16
);
y7
=
_mm_unpackhi_epi16
(
in
[
15
],
k__zero_epi16
);
v0
=
_mm_madd_epi16
(
v0
,
k__sqrt2_epi16
);
v1
=
_mm_madd_epi16
(
v1
,
k__sqrt2_epi16
);
v2
=
_mm_madd_epi16
(
v2
,
k__sqrt2_epi16
);
v3
=
_mm_madd_epi16
(
v3
,
k__sqrt2_epi16
);
v4
=
_mm_madd_epi16
(
v4
,
k__sqrt2_epi16
);
v5
=
_mm_madd_epi16
(
v5
,
k__sqrt2_epi16
);
v6
=
_mm_madd_epi16
(
v6
,
k__sqrt2_epi16
);
v7
=
_mm_madd_epi16
(
v7
,
k__sqrt2_epi16
);
x0
=
_mm_madd_epi16
(
x0
,
k__sqrt2_epi16
);
x1
=
_mm_madd_epi16
(
x1
,
k__sqrt2_epi16
);
x2
=
_mm_madd_epi16
(
x2
,
k__sqrt2_epi16
);
x3
=
_mm_madd_epi16
(
x3
,
k__sqrt2_epi16
);
x4
=
_mm_madd_epi16
(
x4
,
k__sqrt2_epi16
);
x5
=
_mm_madd_epi16
(
x5
,
k__sqrt2_epi16
);
x6
=
_mm_madd_epi16
(
x6
,
k__sqrt2_epi16
);
x7
=
_mm_madd_epi16
(
x7
,
k__sqrt2_epi16
);
u0
=
_mm_madd_epi16
(
u0
,
k__sqrt2_epi16
);
u1
=
_mm_madd_epi16
(
u1
,
k__sqrt2_epi16
);
u2
=
_mm_madd_epi16
(
u2
,
k__sqrt2_epi16
);
u3
=
_mm_madd_epi16
(
u3
,
k__sqrt2_epi16
);
u4
=
_mm_madd_epi16
(
u4
,
k__sqrt2_epi16
);
u5
=
_mm_madd_epi16
(
u5
,
k__sqrt2_epi16
);
u6
=
_mm_madd_epi16
(
u6
,
k__sqrt2_epi16
);
u7
=
_mm_madd_epi16
(
u7
,
k__sqrt2_epi16
);
y0
=
_mm_madd_epi16
(
y0
,
k__sqrt2_epi16
);
y1
=
_mm_madd_epi16
(
y1
,
k__sqrt2_epi16
);
y2
=
_mm_madd_epi16
(
y2
,
k__sqrt2_epi16
);
y3
=
_mm_madd_epi16
(
y3
,
k__sqrt2_epi16
);
y4
=
_mm_madd_epi16
(
y4
,
k__sqrt2_epi16
);
y5
=
_mm_madd_epi16
(
y5
,
k__sqrt2_epi16
);
y6
=
_mm_madd_epi16
(
y6
,
k__sqrt2_epi16
);
y7
=
_mm_madd_epi16
(
y7
,
k__sqrt2_epi16
);
v0
=
_mm_add_epi32
(
v0
,
k__DCT_CONST_ROUNDING
);
v1
=
_mm_add_epi32
(
v1
,
k__DCT_CONST_ROUNDING
);
v2
=
_mm_add_epi32
(
v2
,
k__DCT_CONST_ROUNDING
);
v3
=
_mm_add_epi32
(
v3
,
k__DCT_CONST_ROUNDING
);
v4
=
_mm_add_epi32
(
v4
,
k__DCT_CONST_ROUNDING
);
v5
=
_mm_add_epi32
(
v5
,
k__DCT_CONST_ROUNDING
);
v6
=
_mm_add_epi32
(
v6
,
k__DCT_CONST_ROUNDING
);
v7
=
_mm_add_epi32
(
v7
,
k__DCT_CONST_ROUNDING
);
x0
=
_mm_add_epi32
(
x0
,
k__DCT_CONST_ROUNDING
);
x1
=
_mm_add_epi32
(
x1
,
k__DCT_CONST_ROUNDING
);
x2
=
_mm_add_epi32
(
x2
,
k__DCT_CONST_ROUNDING
);
x3
=
_mm_add_epi32
(
x3
,
k__DCT_CONST_ROUNDING
);
x4
=
_mm_add_epi32
(
x4
,
k__DCT_CONST_ROUNDING
);
x5
=
_mm_add_epi32
(
x5
,
k__DCT_CONST_ROUNDING
);
x6
=
_mm_add_epi32
(
x6
,
k__DCT_CONST_ROUNDING
);
x7
=
_mm_add_epi32
(
x7
,
k__DCT_CONST_ROUNDING
);
u0
=
_mm_add_epi32
(
u0
,
k__DCT_CONST_ROUNDING
);
u1
=
_mm_add_epi32
(
u1
,
k__DCT_CONST_ROUNDING
);
u2
=
_mm_add_epi32
(
u2
,
k__DCT_CONST_ROUNDING
);
u3
=
_mm_add_epi32
(
u3
,
k__DCT_CONST_ROUNDING
);
u4
=
_mm_add_epi32
(
u4
,
k__DCT_CONST_ROUNDING
);
u5
=
_mm_add_epi32
(
u5
,
k__DCT_CONST_ROUNDING
);
u6
=
_mm_add_epi32
(
u6
,
k__DCT_CONST_ROUNDING
);
u7
=
_mm_add_epi32
(
u7
,
k__DCT_CONST_ROUNDING
);
y0
=
_mm_add_epi32
(
y0
,
k__DCT_CONST_ROUNDING
);
y1
=
_mm_add_epi32
(
y1
,
k__DCT_CONST_ROUNDING
);
y2
=
_mm_add_epi32
(
y2
,
k__DCT_CONST_ROUNDING
);
y3
=
_mm_add_epi32
(
y3
,
k__DCT_CONST_ROUNDING
);
y4
=
_mm_add_epi32
(
y4
,
k__DCT_CONST_ROUNDING
);
y5
=
_mm_add_epi32
(
y5
,
k__DCT_CONST_ROUNDING
);
y6
=
_mm_add_epi32
(
y6
,
k__DCT_CONST_ROUNDING
);
y7
=
_mm_add_epi32
(
y7
,
k__DCT_CONST_ROUNDING
);
v0
=
_mm_srai_epi32
(
v0
,
DCT_CONST_BITS
);
v1
=
_mm_srai_epi32
(
v1
,
DCT_CONST_BITS
);
v2
=
_mm_srai_epi32
(
v2
,
DCT_CONST_BITS
);
v3
=
_mm_srai_epi32
(
v3
,
DCT_CONST_BITS
);
v4
=
_mm_srai_epi32
(
v4
,
DCT_CONST_BITS
);
v5
=
_mm_srai_epi32
(
v5
,
DCT_CONST_BITS
);
v6
=
_mm_srai_epi32
(
v6
,
DCT_CONST_BITS
);
v7
=
_mm_srai_epi32
(
v7
,
DCT_CONST_BITS
);
x0
=
_mm_srai_epi32
(
x0
,
DCT_CONST_BITS
);
x1
=
_mm_srai_epi32
(
x1
,
DCT_CONST_BITS
);
x2
=
_mm_srai_epi32
(
x2
,
DCT_CONST_BITS
);
x3
=
_mm_srai_epi32
(
x3
,
DCT_CONST_BITS
);
x4
=
_mm_srai_epi32
(
x4
,
DCT_CONST_BITS
);
x5
=
_mm_srai_epi32
(
x5
,
DCT_CONST_BITS
);
x6
=
_mm_srai_epi32
(
x6
,
DCT_CONST_BITS
);
x7
=
_mm_srai_epi32
(
x7
,
DCT_CONST_BITS
);
u0
=
_mm_srai_epi32
(
u0
,
DCT_CONST_BITS
);
u1
=
_mm_srai_epi32
(
u1
,
DCT_CONST_BITS
);
u2
=
_mm_srai_epi32
(
u2
,
DCT_CONST_BITS
);
u3
=
_mm_srai_epi32
(
u3
,
DCT_CONST_BITS
);
u4
=
_mm_srai_epi32
(
u4
,
DCT_CONST_BITS
);
u5
=
_mm_srai_epi32
(
u5
,
DCT_CONST_BITS
);
u6
=
_mm_srai_epi32
(
u6
,
DCT_CONST_BITS
);
u7
=
_mm_srai_epi32
(
u7
,
DCT_CONST_BITS
);
y0
=
_mm_srai_epi32
(
y0
,
DCT_CONST_BITS
);
y1
=
_mm_srai_epi32
(
y1
,
DCT_CONST_BITS
);
y2
=
_mm_srai_epi32
(
y2
,
DCT_CONST_BITS
);
y3
=
_mm_srai_epi32
(
y3
,
DCT_CONST_BITS
);
y4
=
_mm_srai_epi32
(
y4
,
DCT_CONST_BITS
);
y5
=
_mm_srai_epi32
(
y5
,
DCT_CONST_BITS
);
y6
=
_mm_srai_epi32
(
y6
,
DCT_CONST_BITS
);
y7
=
_mm_srai_epi32
(
y7
,
DCT_CONST_BITS
);
in
[
0
]
=
_mm_packs_epi32
(
v0
,
x0
);
in
[
1
]
=
_mm_packs_epi32
(
v1
,
x1
);
in
[
2
]
=
_mm_packs_epi32
(
v2
,
x2
);
in
[
3
]
=
_mm_packs_epi32
(
v3
,
x3
);
in
[
4
]
=
_mm_packs_epi32
(
v4
,
x4
);
in
[
5
]
=
_mm_packs_epi32
(
v5
,
x5
);
in
[
6
]
=
_mm_packs_epi32
(
v6
,
x6
);
in
[
7
]
=
_mm_packs_epi32
(
v7
,
x7
);
in
[
8
]
=
_mm_packs_epi32
(
u0
,
y0
);
in
[
9
]
=
_mm_packs_epi32
(
u1
,
y1
);
in
[
10
]
=
_mm_packs_epi32
(
u2
,
y2
);
in
[
11
]
=
_mm_packs_epi32
(
u3
,
y3
);
in
[
12
]
=
_mm_packs_epi32
(
u4
,
y4
);
in
[
13
]
=
_mm_packs_epi32
(
u5
,
y5
);
in
[
14
]
=
_mm_packs_epi32
(
u6
,
y6
);
in
[
15
]
=
_mm_packs_epi32
(
u7
,
y7
);
}
// Doubles every 16-bit lane of the eight vectors in[0..7] in place, using
// a left shift by one bit (results wrap within 16 bits).
static void iidtx8_sse2(__m128i *in) {
  int i;
  for (i = 0; i < 8; ++i) {
    in[i] = _mm_slli_epi16(in[i], 1);
  }
}
// Replaces each of the eight vectors in[0..7] with the result of
// mm_reverse_epi16() applied to it. Judging by the helper's name this
// reverses the order of the 16-bit lanes, i.e. mirrors each row left-right;
// the helper is defined outside this file section, so confirm there.
static INLINE void flip_buffer_lr_8x8(__m128i *in) {
  int row;
  for (row = 0; row < 8; ++row) {
    in[row] = mm_reverse_epi16(in[row]);
  }
}
// Scales every 16-bit element of the eight vectors in[0..7] by Sqrt2, in
// place. For each vector:
//   - the full signed 32-bit products are formed by interleaving the low
//     (_mm_mullo_epi16) and high (_mm_mulhi_epi16) halves of the multiply,
//   - each product is rounded with
//     xx_roundn_epi32_unsigned(product, DCT_CONST_BITS), i.e. bias-and-shift
//     with no sign-dependent adjustment,
//   - the results are packed back to 16 bits with signed saturation.
static INLINE void scale_sqrt2_8x8(__m128i *in) {
  const __m128i v_sqrt2_w = _mm_set1_epi16(Sqrt2);
  int i;

  for (i = 0; i < 8; ++i) {
    const __m128i v_lo_w = _mm_mullo_epi16(in[i], v_sqrt2_w);
    const __m128i v_hi_w = _mm_mulhi_epi16(in[i], v_sqrt2_w);

    // Lanes 0-3 and lanes 4-7 of the 32-bit products.
    const __m128i v_prod_a_d = _mm_unpacklo_epi16(v_lo_w, v_hi_w);
    const __m128i v_prod_b_d = _mm_unpackhi_epi16(v_lo_w, v_hi_w);

    in[i] = _mm_packs_epi32(
        xx_roundn_epi32_unsigned(v_prod_a_d, DCT_CONST_BITS),
        xx_roundn_epi32_unsigned(v_prod_b_d, DCT_CONST_BITS));
  }
}
void
av1_iht8x16_128_add_sse2
(
const
tran_low_t
*
input
,
uint8_t
*
dest
,
int
stride
,
int
tx_type
)
{
__m128i
in
[
16
];
in
[
0
]
=
load_input_data
(
input
+
0
*
8
);
in
[
1
]
=
load_input_data
(
input
+
1
*
8
);
in
[
2
]
=
load_input_data
(
input
+
2
*
8
);
in
[
3
]
=
load_input_data
(
input
+
3
*
8
);
in
[
4
]
=
load_input_data
(
input
+
4
*
8
);
in
[
5
]
=
load_input_data
(
input
+
5
*
8
);
in
[
6
]
=
load_input_data
(
input
+
6
*
8
);
in
[
7
]
=
load_input_data
(
input
+
7
*
8
);
in
[
8
]
=
load_input_data
(
input
+
8
*
8
);
in
[
9
]
=
load_input_data
(
input
+
9
*
8
);
in
[
10
]
=
load_input_data
(
input
+
10
*
8
);
in
[
11
]
=
load_input_data
(
input
+
11
*
8
);
in
[
12
]
=
load_input_data
(
input
+
12
*
8
);
in
[
13
]
=
load_input_data
(
input
+
13
*
8
);
in
[
14
]
=
load_input_data
(
input
+
14
*
8
);
in
[
15
]
=
load_input_data
(
input
+
15
*
8
);
// Row transform
switch
(
tx_type
)
{
case
DCT_DCT
:
case
ADST_DCT
:
case
FLIPADST_DCT
:
case
H_DCT
:
idct8_sse2
(
in
);
array_transpose_8x8
(
in
,
in
);
idct8_sse2
(
in
+
8
);
array_transpose_8x8
(
in
+
8
,
in
+
8
);
break
;
case
DCT_ADST
:
case
ADST_ADST
:
case
DCT_FLIPADST
:
case
FLIPADST_FLIPADST
:
case
ADST_FLIPADST
:
case
FLIPADST_ADST
:
case
H_ADST
:
case
H_FLIPADST
:
iadst8_sse2
(
in
);
array_transpose_8x8
(
in
,
in
);
iadst8_sse2
(
in
+
8
);
array_transpose_8x8
(
in
+
8
,
in
+
8
);
break
;
case
V_FLIPADST
:
case
V_ADST
:
case
V_DCT
:
case
IDTX
:
iidtx8_sse2
(
in
);
iidtx8_sse2
(
in
+
8
);
break
;
default:
assert
(
0
);
break
;
}
scale_sqrt2_8x8
(
in
);
scale_sqrt2_8x8
(
in
+
8
);
// Column transform
switch
(
tx_type
)
{
case
DCT_DCT
:
case
DCT_ADST
:
case
DCT_FLIPADST
:
case
V_DCT
:
idct16_8col
(
in
);
break
;
case
ADST_DCT
:
case
ADST_ADST
:
case
FLIPADST_ADST
:
case
ADST_FLIPADST
: