Xiph.Org / aom-rav1e / Commits

Commit 7f6bf9c7, authored Nov 02, 2016 by Yi Luo, committed by Gerrit Code Review on Nov 02, 2016

Merge "Hybrid inverse transforms 16x16 AVX2 optimization" into nextgenv2

Parents: 9679464e 73172000
Changes: 9 files
aom_dsp/x86/txfm_common_avx2.h
...
...
@@ -14,6 +14,8 @@
#include <immintrin.h>
#include "aom_dsp/txfm_common.h"

#define pair256_set_epi16(a, b)                                            \
  _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
...
...
@@ -24,4 +26,179 @@
  _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
                   (int)(b), (int)(a))

static INLINE void mm256_reverse_epi16(__m256i *u) {
  const __m256i control = _mm256_set_epi16(
      0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
      0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
  __m256i v = _mm256_shuffle_epi8(*u, control);
  *u = _mm256_permute2x128_si256(v, v, 1);
}
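The shuffle constant reverses the eight 16-bit words inside each 128-bit lane, and the final permute swaps the two lanes, so mm256_reverse_epi16 ends up reversing all 16 words of the register. A scalar reference for testing, offered only as an illustrative sketch (not part of the patch):

#include <stdint.h>

// Illustrative scalar model of mm256_reverse_epi16: reverse the order of
// the 16 int16_t values held in one 256-bit register.
static void reverse_epi16_scalar(int16_t w[16]) {
  for (int lo = 0, hi = 15; lo < hi; ++lo, --hi) {
    int16_t tmp = w[lo];
    w[lo] = w[hi];
    w[hi] = tmp;
  }
}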
static INLINE void mm256_transpose_16x16(__m256i *in) {
  __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
  __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
  __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
  __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
  __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
  __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
  __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
  __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
  __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
  __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
  __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
  __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
  __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
  __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
  __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
  __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
// 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
// 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
// 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
// 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
// 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
// 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
// 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
// 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
// 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
// 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
// a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
// a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
// c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
// c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
// e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
// e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
  __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
  __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
  __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
  __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
  __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
  __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
  __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
  __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
  __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
  __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
  __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
  __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
  __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
  __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
  __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
  __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
// 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
// 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
// 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
// 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
// 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
// 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
// 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
// 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
// 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
// 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
// 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
// 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf
// c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
// c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
// c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd
// c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
  tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
  tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
  tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
  tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
  tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
  tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
  tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
  tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
  tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
  tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
  tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
  tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
  tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
  tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
  tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
  tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
// 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
// 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
// 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
// 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
// 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
// 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
// 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
// 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
// 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
// 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
// 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
// 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
// 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc
// 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
// 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe
// 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
  in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20);  // 0010 0000
  in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31);  // 0011 0001
  in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
  in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
  in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
  in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
  in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
  in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
  in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
  in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
  in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
  in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
  in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
  in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
  in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
  in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
}
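The three unpack rounds (epi16, epi32, epi64) plus the final 128-bit permutes amount to a full 16x16 transpose of 16-bit coefficients, with the interleaved comments tracking element positions after each round. A scalar reference, given here purely as an illustrative check (not part of the patch):

#include <stdint.h>

// Illustrative scalar model of mm256_transpose_16x16: each __m256i row holds
// 16 int16_t values, so the routine transposes a 16x16 matrix in place.
static void transpose_16x16_scalar(int16_t m[16][16]) {
  for (int r = 0; r < 16; ++r) {
    for (int c = r + 1; c < 16; ++c) {
      int16_t tmp = m[r][c];
      m[r][c] = m[c][r];
      m[c][r] = tmp;
    }
  }
}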
static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) {
  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
  __m256i y0 = _mm256_madd_epi16(a0, cospi);
  __m256i y1 = _mm256_madd_epi16(a1, cospi);

  y0 = _mm256_add_epi32(y0, dct_rounding);
  y1 = _mm256_add_epi32(y1, dct_rounding);

  y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
  y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
  return _mm256_packs_epi32(y0, y1);
}
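butter_fly expects a0 and a1 to carry interleaved (x, y) pairs from two input rows (as produced by unpacklo/unpackhi), so each _mm256_madd_epi16 evaluates x*c0 + y*c1 per pair before the usual DCT_CONST_BITS rounding and the saturating pack back to 16 bits. Per output element this is roughly the following scalar sketch (illustrative only; it assumes DCT_CONST_BITS and DCT_CONST_ROUNDING from aom_dsp/txfm_common.h):

#include <stdint.h>

// Illustrative scalar model of one butter_fly output lane: x and y are the
// two transform inputs, c0 and c1 the paired cosine constants.
static int16_t butterfly_scalar(int16_t x, int16_t y, int c0, int c1) {
  int32_t sum = (int32_t)x * c0 + (int32_t)y * c1;
  sum += DCT_CONST_ROUNDING;  // 1 << (DCT_CONST_BITS - 1)
  sum >>= DCT_CONST_BITS;
  // _mm256_packs_epi32 additionally saturates to the int16_t range.
  return (int16_t)sum;
}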
static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) {
  const __m256i zero = _mm256_setzero_si256();
  const __m256i sqrt2_epi16 = _mm256_set1_epi16(c);
  const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
  __m256i u0, u1;
  int i = 0;

  while (i < 16) {
    in[i] = _mm256_slli_epi16(in[i], 1);
    u0 = _mm256_unpacklo_epi16(zero, in[i]);
    u1 = _mm256_unpackhi_epi16(zero, in[i]);

    u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
    u1 = _mm256_madd_epi16(u1, sqrt2_epi16);

    u0 = _mm256_add_epi32(u0, dct_const_rounding);
    u1 = _mm256_add_epi32(u1, dct_const_rounding);

    u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
    u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
    in[i] = _mm256_packs_epi32(u0, u1);
    i++;
  }
}
#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H
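txfm_scaling16_avx2 doubles each coefficient (the slli by 1), multiplies it by the caller-supplied 16-bit constant c while keeping the intermediate in 32 bits (interleaving with zero before the madd), and then applies DCT_CONST_BITS rounding before packing back to 16 bits. Element-wise this amounts to the sketch below (illustrative only, same fixed-point conventions assumed):

#include <stdint.h>

// Illustrative scalar model of one lane of txfm_scaling16_avx2.
static int16_t scale_lane_scalar(int16_t in, int16_t c) {
  int16_t doubled = (int16_t)(in << 1);  // _mm256_slli_epi16(in, 1), 16-bit wrap
  int32_t v = (int32_t)doubled * c;      // madd against (0, c) pairs
  v += DCT_CONST_ROUNDING;
  v >>= DCT_CONST_BITS;
  // _mm256_packs_epi32 additionally saturates to the int16_t range.
  return (int16_t)v;
}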
av1/av1_common.mk
...
...
@@ -122,6 +122,8 @@ AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct8x8_msa.c
AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct16x16_msa.c
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c
AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/hybrid_inv_txfm_avx2.c

ifeq ($(CONFIG_AV1_ENCODER),yes)
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_txfm_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_dct32x32_impl_sse2.h
...
...
av1/common/av1_rtcd_defs.pl
...
...
@@ -114,7 +114,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
    specialize qw/av1_iht8x8_64_add sse2/;

    add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
    specialize qw/av1_iht16x16_256_add sse2/;
    specialize qw/av1_iht16x16_256_add sse2 avx2/;
  }
} else {
  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
...
...
@@ -175,7 +175,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
    specialize qw/av1_iht8x8_64_add sse2 neon dspr2/;

    add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
    specialize qw/av1_iht16x16_256_add sse2 dspr2/;
    specialize qw/av1_iht16x16_256_add sse2 avx2 dspr2/;

    if (aom_config("CONFIG_EXT_TX") ne "yes") {
      specialize qw/av1_iht4x4_16_add msa/;
...
...
av1/common/idct.c
...
...
@@ -984,17 +984,12 @@ void av1_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest, int stride,
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      av1_iht16x16_256_add(input, dest, stride, tx_type);
      break;
    case V_DCT:
    case H_DCT:
    case V_ADST:
    case H_ADST:
    case V_FLIPADST:
    case H_FLIPADST:
      // Use C version since DST only exists in C code
      av1_iht16x16_256_add_c(input, dest, stride, tx_type);
      break;
    case H_FLIPADST:
      av1_iht16x16_256_add(input, dest, stride, tx_type);
      break;
    case IDTX:
      inv_idtx_add_c(input, dest, stride, 16, tx_type);
      break;
#endif  // CONFIG_EXT_TX
    default: assert(0); break;
...
...
av1/common/x86/hybrid_inv_txfm_avx2.c
(new file, mode 100644)
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <immintrin.h>  // avx2

#include "./aom_config.h"
#include "./av1_rtcd.h"
#include "aom_dsp/x86/txfm_common_avx2.h"
static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
#if CONFIG_AOM_HIGHBITDEPTH
  *in = _mm256_setr_epi16(
      (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
      (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
      (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
      (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
      (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
      (int16_t)coeff[15]);
#else
  *in = _mm256_loadu_si256((const __m256i *)coeff);
#endif
}
static void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
  int i = 0;

  while (i < 16) {
    load_coeff(coeff + (i << 4), &in[i]);
    i += 1;
  }
}
static void recon_and_store(const __m256i *res, uint8_t *output) {
  const __m128i zero = _mm_setzero_si128();
  __m128i x = _mm_loadu_si128((__m128i const *)output);
  __m128i p0 = _mm_unpacklo_epi8(x, zero);
  __m128i p1 = _mm_unpackhi_epi8(x, zero);

  p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
  p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
  x = _mm_packus_epi16(p0, p1);
  _mm_storeu_si128((__m128i *)output, x);
}
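recon_and_store adds one row of sixteen 16-bit residuals to the sixteen destination pixels and writes them back, with _mm_packus_epi16 providing the clamp to [0, 255]. The scalar equivalent, again only as an illustrative sketch:

#include <stdint.h>

// Illustrative scalar model of recon_and_store for one 16-pixel row.
static void recon_row_scalar(const int16_t res[16], uint8_t output[16]) {
  for (int j = 0; j < 16; ++j) {
    int v = output[j] + res[j];
    output[j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // packus clamp
  }
}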
#define IDCT_ROUNDING_POS (6)
static void write_buffer_16x16(__m256i *in, const int stride, uint8_t *output) {
  const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
  int i = 0;

  while (i < 16) {
    in[i] = _mm256_add_epi16(in[i], rounding);
    in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
    recon_and_store(&in[i], output + i * stride);
    i += 1;
  }
}
static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
                                     const __m256i *c0, const __m256i *c1,
                                     __m256i *b0, __m256i *b1) {
  __m256i x0, x1;
  x0 = _mm256_unpacklo_epi16(*a0, *a1);
  x1 = _mm256_unpackhi_epi16(*a0, *a1);
  *b0 = butter_fly(x0, x1, *c0);
  *b1 = butter_fly(x0, x1, *c1);
}
static void idct16_avx2(__m256i *in) {
  const __m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
  const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64);
  const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
  const __m256i cospi_p18_p14 = pair256_set_epi16(cospi_18_64, cospi_14_64);
  const __m256i cospi_p22_m10 = pair256_set_epi16(cospi_22_64, -cospi_10_64);
  const __m256i cospi_p10_p22 = pair256_set_epi16(cospi_10_64, cospi_22_64);
  const __m256i cospi_p06_m26 = pair256_set_epi16(cospi_6_64, -cospi_26_64);
  const __m256i cospi_p26_p06 = pair256_set_epi16(cospi_26_64, cospi_6_64);
  const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
  const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
  const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
  const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
  const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
  const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
  __m256i t0, t1, t2, t3, t4, t5, t6, t7;

  // stage 1, (0-7)
  u0 = in[0];
  u1 = in[8];
  u2 = in[4];
  u3 = in[12];
  u4 = in[2];
  u5 = in[10];
  u6 = in[6];
  u7 = in[14];

  // stage 2, (0-7)
  // stage 3, (0-7)
  t0 = u0;
  t1 = u1;
  t2 = u2;
  t3 = u3;
  unpack_butter_fly(&u4, &u7, &cospi_p28_m04, &cospi_p04_p28, &t4, &t7);
  unpack_butter_fly(&u5, &u6, &cospi_p12_m20, &cospi_p20_p12, &t5, &t6);

  // stage 4, (0-7)
  unpack_butter_fly(&t0, &t1, &cospi_p16_p16, &cospi_p16_m16, &u0, &u1);
  unpack_butter_fly(&t2, &t3, &cospi_p24_m08, &cospi_p08_p24, &u2, &u3);
  u4 = _mm256_add_epi16(t4, t5);
  u5 = _mm256_sub_epi16(t4, t5);
  u6 = _mm256_sub_epi16(t7, t6);
  u7 = _mm256_add_epi16(t7, t6);

  // stage 5, (0-7)
  t0 = _mm256_add_epi16(u0, u3);
  t1 = _mm256_add_epi16(u1, u2);
  t2 = _mm256_sub_epi16(u1, u2);
  t3 = _mm256_sub_epi16(u0, u3);
  t4 = u4;
  t7 = u7;
  unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6);

  // stage 6, (0-7)
  u0 = _mm256_add_epi16(t0, t7);
  u1 = _mm256_add_epi16(t1, t6);
  u2 = _mm256_add_epi16(t2, t5);
  u3 = _mm256_add_epi16(t3, t4);
  u4 = _mm256_sub_epi16(t3, t4);
  u5 = _mm256_sub_epi16(t2, t5);
  u6 = _mm256_sub_epi16(t1, t6);
  u7 = _mm256_sub_epi16(t0, t7);

  // stage 1, (8-15)
  v0 = in[1];
  v1 = in[9];
  v2 = in[5];
  v3 = in[13];
  v4 = in[3];
  v5 = in[11];
  v6 = in[7];
  v7 = in[15];

  // stage 2, (8-15)
  unpack_butter_fly(&v0, &v7, &cospi_p30_m02, &cospi_p02_p30, &t0, &t7);
  unpack_butter_fly(&v1, &v6, &cospi_p14_m18, &cospi_p18_p14, &t1, &t6);
  unpack_butter_fly(&v2, &v5, &cospi_p22_m10, &cospi_p10_p22, &t2, &t5);
  unpack_butter_fly(&v3, &v4, &cospi_p06_m26, &cospi_p26_p06, &t3, &t4);

  // stage 3, (8-15)
  v0 = _mm256_add_epi16(t0, t1);
  v1 = _mm256_sub_epi16(t0, t1);
  v2 = _mm256_sub_epi16(t3, t2);
  v3 = _mm256_add_epi16(t2, t3);
  v4 = _mm256_add_epi16(t4, t5);
  v5 = _mm256_sub_epi16(t4, t5);
  v6 = _mm256_sub_epi16(t7, t6);
  v7 = _mm256_add_epi16(t6, t7);

  // stage 4, (8-15)
  t0 = v0;
  t7 = v7;
  t3 = v3;
  t4 = v4;
  unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
  unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);

  // stage 5, (8-15)
  v0 = _mm256_add_epi16(t0, t3);
  v1 = _mm256_add_epi16(t1, t2);
  v2 = _mm256_sub_epi16(t1, t2);
  v3 = _mm256_sub_epi16(t0, t3);
  v4 = _mm256_sub_epi16(t7, t4);
  v5 = _mm256_sub_epi16(t6, t5);
  v6 = _mm256_add_epi16(t6, t5);
  v7 = _mm256_add_epi16(t7, t4);

  // stage 6, (8-15)
  t0 = v0;
  t1 = v1;
  t6 = v6;
  t7 = v7;
  unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &t2, &t5);
  unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &t3, &t4);

  // stage 7
  in[0] = _mm256_add_epi16(u0, t7);
  in[1] = _mm256_add_epi16(u1, t6);
  in[2] = _mm256_add_epi16(u2, t5);
  in[3] = _mm256_add_epi16(u3, t4);
  in[4] = _mm256_add_epi16(u4, t3);
  in[5] = _mm256_add_epi16(u5, t2);
  in[6] = _mm256_add_epi16(u6, t1);
  in[7] = _mm256_add_epi16(u7, t0);
  in[8] = _mm256_sub_epi16(u7, t0);
  in[9] = _mm256_sub_epi16(u6, t1);
  in[10] = _mm256_sub_epi16(u5, t2);
  in[11] = _mm256_sub_epi16(u4, t3);
  in[12] = _mm256_sub_epi16(u3, t4);
  in[13] = _mm256_sub_epi16(u2, t5);
  in[14] = _mm256_sub_epi16(u1, t6);
  in[15] = _mm256_sub_epi16(u0, t7);
}
static void idct16(__m256i *in) {
  mm256_transpose_16x16(in);
  idct16_avx2(in);
}
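idct16 transposes the block and then runs the 1-D 16-point IDCT across the rows, so the 2-D transform falls out of applying it twice. The top-level AVX2 entry point (generated by the rtcd specialization as av1_iht16x16_256_add_avx2) is not part of this excerpt; the sketch below only illustrates, under that assumption, how the helpers defined above could be composed for the plain DCT_DCT case:

// Sketch only: a hypothetical DCT_DCT composition of the helpers above.
// The real dispatch over tx_type lives outside this excerpt.
static void idct16x16_add_sketch(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
  __m256i in[16];
  load_buffer_16x16(input, in);          // 16 rows of 16 coefficients
  idct16(in);                            // first 1-D pass
  idct16(in);                            // second 1-D pass
  write_buffer_16x16(in, stride, dest);  // round by 2^IDCT_ROUNDING_POS, add to dest
}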
static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1,
                                 const __m256i *c0, const __m256i *c1,
                                 __m256i *b) {
  __m256i x0, x1;
  x0 = _mm256_unpacklo_epi16(*a0, *a1);
  x1 = _mm256_unpackhi_epi16(*a0, *a1);
  b[0] = _mm256_madd_epi16(x0, *c0);
  b[1] = _mm256_madd_epi16(x1, *c0);
  b[2] = _mm256_madd_epi16(x0, *c1);
  b[3] = _mm256_madd_epi16(x1, *c1);
}
static INLINE void group_rounding(__m256i *a, int num) {
  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
  int i;
  for (i = 0; i < num; ++i) {
    a[i] = _mm256_add_epi32(a[i], dct_rounding);
    a[i] = _mm256_srai_epi32(a[i], DCT_CONST_BITS);
  }
}
static INLINE void add_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
  __m256i x[4];
  x[0] = _mm256_add_epi32(a[0], b[0]);
  x[1] = _mm256_add_epi32(a[1], b[1]);
  x[2] = _mm256_add_epi32(a[2], b[2]);
  x[3] = _mm256_add_epi32(a[3], b[3]);

  group_rounding(x, 4);

  out[0] = _mm256_packs_epi32(x[0], x[1]);
  out[1] = _mm256_packs_epi32(x[2], x[3]);
}
static INLINE void sub_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
  __m256i x[4];
  x[0] = _mm256_sub_epi32(a[0], b[0]);
  x[1] = _mm256_sub_epi32(a[1], b[1]);
  x[2] = _mm256_sub_epi32(a[2], b[2]);
  x[3] = _mm256_sub_epi32(a[3], b[3]);

  group_rounding(x, 4);

  out[0] = _mm256_packs_epi32(x[0], x[1]);
  out[1] = _mm256_packs_epi32(x[2], x[3]);
}
static INLINE void butterfly_rnd(__m256i *a, __m256i *out) {
  group_rounding(a, 4);
  out[0] = _mm256_packs_epi32(a[0], a[1]);
  out[1] = _mm256_packs_epi32(a[2], a[3]);
}
static void iadst16_avx2(__m256i *in) {
  const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64);
  const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
  const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64);
  const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
  const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64);
  const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
  const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64);
  const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
  const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64);
  const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
  const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
  const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
  const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64);
  const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
  const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64);
  const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
  const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
  const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
  const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
  const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
  const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64);
  const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64);
  const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
  const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
  const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64);
  const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64);
  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
  const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
  const __m256i zero = _mm256_setzero_si256();
  __m256i x[16], s[16];
  __m256i u[4], v[4];

  // stage 1
  butterfly_32b(&in[15], &in[0], &cospi_p01_p31, &cospi_p31_m01, u);
  butterfly_32b(&in[7], &in[8], &cospi_p17_p15, &cospi_p15_m17, v);
  add_rnd(u, v, &x[0]);
  sub_rnd(u, v, &x[8]);

  butterfly_32b(&in[13], &in[2], &cospi_p05_p27, &cospi_p27_m05, u);
  butterfly_32b(&in[5], &in[10], &cospi_p21_p11, &cospi_p11_m21, v);
  add_rnd(u, v, &x[2]);
  sub_rnd(u, v, &x[10]);

  butterfly_32b(&in[11], &in[4], &cospi_p09_p23, &cospi_p23_m09, u);
  butterfly_32b(&in[3], &in[12], &cospi_p25_p07, &cospi_p07_m25, v);
  add_rnd(u, v, &x[4]);
  sub_rnd(u, v, &x[12]);

  butterfly_32b(&in[9], &in[6], &cospi_p13_p19, &cospi_p19_m13, u);
  butterfly_32b(&in[1], &in[14], &cospi_p29_p03, &cospi_p03_m29, v);
  add_rnd(u, v, &x[6]);
  sub_rnd(u, v, &x[14]);

  // stage 2
  s[0] = _mm256_add_epi16(x[0], x[4]);
  s[1] = _mm256_add_epi16(x[1], x[5]);
  s[2] = _mm256_add_epi16(x[2], x[6]);
  s[3] = _mm256_add_epi16(x[3], x[7]);
  s[4] = _mm256_sub_epi16(x[0], x[4]);
  s[5] = _mm256_sub_epi16(x[1], x[5]);
  s[6] = _mm256_sub_epi16(x[2], x[6]);
  s[7] = _mm256_sub_epi16(x[3], x[7]);
  butterfly_32b(&x[8], &x[9], &cospi_p04_p28, &cospi_p28_m04, u);
  butterfly_32b(&x[12], &x[13], &cospi_m28_p04, &cospi_p04_p28, v);
  add_rnd(u, v, &s[8]);
  sub_rnd(u, v, &s[12]);

  butterfly_32b(&x[10