Commit 7f6bf9c7 authored by Yi Luo, committed by Gerrit Code Review

Merge "Hybrid inverse transforms 16x16 AVX2 optimization" into nextgenv2

parents 9679464e 73172000
@@ -14,6 +14,8 @@
#include <immintrin.h>
#include "aom_dsp/txfm_common.h"
#define pair256_set_epi16(a, b) \
_mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
@@ -24,4 +26,179 @@
_mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
(int)(b), (int)(a))
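// Reverse the order of the 16 int16_t elements in *u: the byte shuffle
// reverses the eight words inside each 128-bit lane, then the lane permute
// swaps the two lanes.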
static INLINE void mm256_reverse_epi16(__m256i *u) {
const __m256i control = _mm256_set_epi16(
0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
__m256i v = _mm256_shuffle_epi8(*u, control);
*u = _mm256_permute2x128_si256(v, v, 1);
}
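// Transpose a 16x16 block of int16_t held as 16 row registers in[0..15],
// using successive 16-, 32- and 64-bit unpacks followed by a final 128-bit
// lane permute. The hex maps below track where each (row, column) element
// lands after every round.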
static INLINE void mm256_transpose_16x16(__m256i *in) {
__m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
__m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
__m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
__m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
__m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
__m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
__m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
__m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
__m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
__m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
__m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
__m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
__m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
__m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
__m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
__m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
// 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
// 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
// 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
// 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
// 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
// 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
// 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
// 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
// 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
// 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
// a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
// a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
// c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
// c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
// e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
// e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
__m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
__m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
__m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
__m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
__m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
__m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
__m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
__m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
__m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
__m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
__m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
__m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
__m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
__m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
__m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
__m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
// 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
// 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
// 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
// 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
// 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
// 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
// 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
// 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
// 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
// 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
// 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
// 86 96 a6 b6 87 97 a7 b7 8e 9e ae be 8f 9f af bf
// c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
// c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
// c4 d4 e4 f4 c5 d5 e5 f5 cc dc ec fc cd dd ed fd
// c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
// 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
// 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
// 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
// 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
// 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
// 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
// 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
// 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
// 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
// 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
// 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
// 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
// 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ec fc
// 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
// 86 96 a6 b6 c6 d6 e6 f6 8e 9e ae be ce de ee fe
// 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
}
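// Butterfly helper: a0 and a1 hold 16-bit values interleaved as (x, y) pairs,
// cospi packs the two cosine constants. Each 32-bit madd result is rounded
// with DCT_CONST_ROUNDING, shifted by DCT_CONST_BITS, and the two halves are
// packed back to 16 bits with saturation.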
static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) {
const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
__m256i y0 = _mm256_madd_epi16(a0, cospi);
__m256i y1 = _mm256_madd_epi16(a1, cospi);
y0 = _mm256_add_epi32(y0, dct_rounding);
y1 = _mm256_add_epi32(y1, dct_rounding);
y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
return _mm256_packs_epi32(y0, y1);
}
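// Scale all 16 rows by the 16-bit constant c with DCT rounding:
// out = (2 * in * c + DCT_CONST_ROUNDING) >> DCT_CONST_BITS.
// Unpacking against zero places each input in the high half of a madd pair,
// so the multiply-add reduces to a plain widening multiply by c.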
static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) {
const __m256i zero = _mm256_setzero_si256();
const __m256i sqrt2_epi16 = _mm256_set1_epi16(c);
const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
__m256i u0, u1;
int i = 0;
while (i < 16) {
in[i] = _mm256_slli_epi16(in[i], 1);
u0 = _mm256_unpacklo_epi16(zero, in[i]);
u1 = _mm256_unpackhi_epi16(zero, in[i]);
u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
u1 = _mm256_madd_epi16(u1, sqrt2_epi16);
u0 = _mm256_add_epi32(u0, dct_const_rounding);
u1 = _mm256_add_epi32(u1, dct_const_rounding);
u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
in[i] = _mm256_packs_epi32(u0, u1);
i++;
}
}
#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H
@@ -122,6 +122,8 @@ AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct8x8_msa.c
AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct16x16_msa.c
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c
AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/hybrid_inv_txfm_avx2.c
ifeq ($(CONFIG_AV1_ENCODER),yes)
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_txfm_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_dct32x32_impl_sse2.h
@@ -114,7 +114,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
specialize qw/av1_iht8x8_64_add sse2/;
add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/av1_iht16x16_256_add sse2/;
specialize qw/av1_iht16x16_256_add sse2 avx2/;
}
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
@@ -175,7 +175,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
specialize qw/av1_iht8x8_64_add sse2 neon dspr2/;
add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/av1_iht16x16_256_add sse2 dspr2/;
specialize qw/av1_iht16x16_256_add sse2 avx2 dspr2/;
if (aom_config("CONFIG_EXT_TX") ne "yes") {
specialize qw/av1_iht4x4_16_add msa/;
@@ -984,17 +984,12 @@ void av1_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest, int stride,
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
av1_iht16x16_256_add(input, dest, stride, tx_type);
break;
case V_DCT:
case H_DCT:
case V_ADST:
case H_ADST:
case V_FLIPADST:
case H_FLIPADST:
// Use C version since DST only exists in C code
av1_iht16x16_256_add_c(input, dest, stride, tx_type);
break;
case H_FLIPADST: av1_iht16x16_256_add(input, dest, stride, tx_type); break;
case IDTX: inv_idtx_add_c(input, dest, stride, 16, tx_type); break;
#endif // CONFIG_EXT_TX
default: assert(0); break;
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <immintrin.h> // avx2
#include "./aom_config.h"
#include "./av1_rtcd.h"
#include "aom_dsp/x86/txfm_common_avx2.h"
static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
#if CONFIG_AOM_HIGHBITDEPTH
*in = _mm256_setr_epi16(
(int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
(int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
(int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
(int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
(int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
(int16_t)coeff[15]);
#else
*in = _mm256_loadu_si256((const __m256i *)coeff);
#endif
}
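// Load the full 16x16 coefficient block; rows are packed contiguously, so
// (i << 4) steps the source pointer by 16 coefficients per row.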
static void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
int i = 0;
while (i < 16) {
load_coeff(coeff + (i << 4), &in[i]);
i += 1;
}
}
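// Add one row of 16 reconstructed residuals (16-bit values in *res) to the
// 16 destination pixels at output, packing back to 8 bits with unsigned
// saturation on store.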
static void recon_and_store(const __m256i *res, uint8_t *output) {
const __m128i zero = _mm_setzero_si128();
__m128i x = _mm_loadu_si128((__m128i const *)output);
__m128i p0 = _mm_unpacklo_epi8(x, zero);
__m128i p1 = _mm_unpackhi_epi8(x, zero);
p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
x = _mm_packus_epi16(p0, p1);
_mm_storeu_si128((__m128i *)output, x);
}
#define IDCT_ROUNDING_POS (6)
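// Final rounding by 1 << (IDCT_ROUNDING_POS - 1), arithmetic shift by
// IDCT_ROUNDING_POS, and reconstruction of all 16 rows into the destination.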
static void write_buffer_16x16(__m256i *in, const int stride, uint8_t *output) {
const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
int i = 0;
while (i < 16) {
in[i] = _mm256_add_epi16(in[i], rounding);
in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
recon_and_store(&in[i], output + i * stride);
i += 1;
}
}
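// Interleave the two input rows and run the butterfly twice, once per
// cosine-constant pair, producing the two 16-bit output rows *b0 and *b1.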
static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
const __m256i *c0, const __m256i *c1,
__m256i *b0, __m256i *b1) {
__m256i x0, x1;
x0 = _mm256_unpacklo_epi16(*a0, *a1);
x1 = _mm256_unpackhi_epi16(*a0, *a1);
*b0 = butter_fly(x0, x1, *c0);
*b1 = butter_fly(x0, x1, *c1);
}
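// 16-point inverse DCT applied to all 16 columns at once: the even-index
// inputs feed the first half of the flow graph (u0-u7), the odd-index inputs
// feed the second half (v0-v7), and stage 7 combines the two halves.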
static void idct16_avx2(__m256i *in) {
const __m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64);
const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
const __m256i cospi_p18_p14 = pair256_set_epi16(cospi_18_64, cospi_14_64);
const __m256i cospi_p22_m10 = pair256_set_epi16(cospi_22_64, -cospi_10_64);
const __m256i cospi_p10_p22 = pair256_set_epi16(cospi_10_64, cospi_22_64);
const __m256i cospi_p06_m26 = pair256_set_epi16(cospi_6_64, -cospi_26_64);
const __m256i cospi_p26_p06 = pair256_set_epi16(cospi_26_64, cospi_6_64);
const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
__m256i u0, u1, u2, u3, u4, u5, u6, u7;
__m256i v0, v1, v2, v3, v4, v5, v6, v7;
__m256i t0, t1, t2, t3, t4, t5, t6, t7;
// stage 1, (0-7)
u0 = in[0];
u1 = in[8];
u2 = in[4];
u3 = in[12];
u4 = in[2];
u5 = in[10];
u6 = in[6];
u7 = in[14];
// stage 2, (0-7): pass-through for the even half
// stage 3, (0-7)
t0 = u0;
t1 = u1;
t2 = u2;
t3 = u3;
unpack_butter_fly(&u4, &u7, &cospi_p28_m04, &cospi_p04_p28, &t4, &t7);
unpack_butter_fly(&u5, &u6, &cospi_p12_m20, &cospi_p20_p12, &t5, &t6);
// stage 4, (0-7)
unpack_butter_fly(&t0, &t1, &cospi_p16_p16, &cospi_p16_m16, &u0, &u1);
unpack_butter_fly(&t2, &t3, &cospi_p24_m08, &cospi_p08_p24, &u2, &u3);
u4 = _mm256_add_epi16(t4, t5);
u5 = _mm256_sub_epi16(t4, t5);
u6 = _mm256_sub_epi16(t7, t6);
u7 = _mm256_add_epi16(t7, t6);
// stage 5, (0-7)
t0 = _mm256_add_epi16(u0, u3);
t1 = _mm256_add_epi16(u1, u2);
t2 = _mm256_sub_epi16(u1, u2);
t3 = _mm256_sub_epi16(u0, u3);
t4 = u4;
t7 = u7;
unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6);
// stage 6, (0-7)
u0 = _mm256_add_epi16(t0, t7);
u1 = _mm256_add_epi16(t1, t6);
u2 = _mm256_add_epi16(t2, t5);
u3 = _mm256_add_epi16(t3, t4);
u4 = _mm256_sub_epi16(t3, t4);
u5 = _mm256_sub_epi16(t2, t5);
u6 = _mm256_sub_epi16(t1, t6);
u7 = _mm256_sub_epi16(t0, t7);
// stage 1, (8-15)
v0 = in[1];
v1 = in[9];
v2 = in[5];
v3 = in[13];
v4 = in[3];
v5 = in[11];
v6 = in[7];
v7 = in[15];
// stage 2, (8-15)
unpack_butter_fly(&v0, &v7, &cospi_p30_m02, &cospi_p02_p30, &t0, &t7);
unpack_butter_fly(&v1, &v6, &cospi_p14_m18, &cospi_p18_p14, &t1, &t6);
unpack_butter_fly(&v2, &v5, &cospi_p22_m10, &cospi_p10_p22, &t2, &t5);
unpack_butter_fly(&v3, &v4, &cospi_p06_m26, &cospi_p26_p06, &t3, &t4);
// stage 3, (8-15)
v0 = _mm256_add_epi16(t0, t1);
v1 = _mm256_sub_epi16(t0, t1);
v2 = _mm256_sub_epi16(t3, t2);
v3 = _mm256_add_epi16(t2, t3);
v4 = _mm256_add_epi16(t4, t5);
v5 = _mm256_sub_epi16(t4, t5);
v6 = _mm256_sub_epi16(t7, t6);
v7 = _mm256_add_epi16(t6, t7);
// stage 4, (8-15)
t0 = v0;
t7 = v7;
t3 = v3;
t4 = v4;
unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);
// stage 5, (8-15)
v0 = _mm256_add_epi16(t0, t3);
v1 = _mm256_add_epi16(t1, t2);
v2 = _mm256_sub_epi16(t1, t2);
v3 = _mm256_sub_epi16(t0, t3);
v4 = _mm256_sub_epi16(t7, t4);
v5 = _mm256_sub_epi16(t6, t5);
v6 = _mm256_add_epi16(t6, t5);
v7 = _mm256_add_epi16(t7, t4);
// stage 6, (8-15)
t0 = v0;
t1 = v1;
t6 = v6;
t7 = v7;
unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &t2, &t5);
unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &t3, &t4);
// stage 7
in[0] = _mm256_add_epi16(u0, t7);
in[1] = _mm256_add_epi16(u1, t6);
in[2] = _mm256_add_epi16(u2, t5);
in[3] = _mm256_add_epi16(u3, t4);
in[4] = _mm256_add_epi16(u4, t3);
in[5] = _mm256_add_epi16(u5, t2);
in[6] = _mm256_add_epi16(u6, t1);
in[7] = _mm256_add_epi16(u7, t0);
in[8] = _mm256_sub_epi16(u7, t0);
in[9] = _mm256_sub_epi16(u6, t1);
in[10] = _mm256_sub_epi16(u5, t2);
in[11] = _mm256_sub_epi16(u4, t3);
in[12] = _mm256_sub_epi16(u3, t4);
in[13] = _mm256_sub_epi16(u2, t5);
in[14] = _mm256_sub_epi16(u1, t6);
in[15] = _mm256_sub_epi16(u0, t7);
}
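// Transpose the 16x16 block, then apply the vectorized 16-point IDCT to it.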
static void idct16(__m256i *in) {
mm256_transpose_16x16(in);
idct16_avx2(in);
}
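// Butterfly variant that keeps the four 32-bit madd results in b[0..3]
// without rounding or packing, so two butterflies can be added or subtracted
// at full precision before the DCT rounding is applied.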
static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1,
const __m256i *c0, const __m256i *c1,
__m256i *b) {
__m256i x0, x1;
x0 = _mm256_unpacklo_epi16(*a0, *a1);
x1 = _mm256_unpackhi_epi16(*a0, *a1);
b[0] = _mm256_madd_epi16(x0, *c0);
b[1] = _mm256_madd_epi16(x1, *c0);
b[2] = _mm256_madd_epi16(x0, *c1);
b[3] = _mm256_madd_epi16(x1, *c1);
}
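// Rounding helpers for the 32-bit butterfly results: group_rounding applies
// the DCT rounding and shift in place; add_rnd/sub_rnd combine two
// butterflies before rounding; butterfly_rnd rounds a single butterfly.
// add_rnd, sub_rnd and butterfly_rnd then pack the rounded 32-bit values
// back into two 16-bit rows.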
static INLINE void group_rounding(__m256i *a, int num) {
const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
int i;
for (i = 0; i < num; ++i) {
a[i] = _mm256_add_epi32(a[i], dct_rounding);
a[i] = _mm256_srai_epi32(a[i], DCT_CONST_BITS);
}
}
static INLINE void add_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
__m256i x[4];
x[0] = _mm256_add_epi32(a[0], b[0]);
x[1] = _mm256_add_epi32(a[1], b[1]);
x[2] = _mm256_add_epi32(a[2], b[2]);
x[3] = _mm256_add_epi32(a[3], b[3]);
group_rounding(x, 4);
out[0] = _mm256_packs_epi32(x[0], x[1]);
out[1] = _mm256_packs_epi32(x[2], x[3]);
}
static INLINE void sub_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
__m256i x[4];
x[0] = _mm256_sub_epi32(a[0], b[0]);
x[1] = _mm256_sub_epi32(a[1], b[1]);
x[2] = _mm256_sub_epi32(a[2], b[2]);
x[3] = _mm256_sub_epi32(a[3], b[3]);
group_rounding(x, 4);
out[0] = _mm256_packs_epi32(x[0], x[1]);
out[1] = _mm256_packs_epi32(x[2], x[3]);
}
static INLINE void butterfly_rnd(__m256i *a, __m256i *out) {
group_rounding(a, 4);
out[0] = _mm256_packs_epi32(a[0], a[1]);
out[1] = _mm256_packs_epi32(a[2], a[3]);
}
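// 16-point inverse ADST applied to all 16 columns at once, keeping the
// butterfly intermediates at 32-bit precision until rounding.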
static void iadst16_avx2(__m256i *in) {
const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64);
const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64);
const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64);
const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64);
const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64);
const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64);
const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64);
const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64);
const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64);
const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64);
const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64);
const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
const __m256i zero = _mm256_setzero_si256();
__m256i x[16], s[16];
__m256i u[4], v[4];
// stage 1
butterfly_32b(&in[15], &in[0], &cospi_p01_p31, &cospi_p31_m01, u);
butterfly_32b(&in[7], &in[8], &cospi_p17_p15, &cospi_p15_m17, v);
add_rnd(u, v, &x[0]);
sub_rnd(u, v, &x[8]);
butterfly_32b(&in[13], &in[2], &cospi_p05_p27, &cospi_p27_m05, u);
butterfly_32b(&in[5], &in[10], &cospi_p21_p11, &cospi_p11_m21, v);
add_rnd(u, v, &x[2]);
sub_rnd(u, v, &x[10]);
butterfly_32b(&in[11], &in[4], &cospi_p09_p23, &cospi_p23_m09, u);
butterfly_32b(&in[3], &in[12], &cospi_p25_p07, &cospi_p07_m25, v);
add_rnd(u, v, &x[4]);
sub_rnd(u, v, &x[12]);
butterfly_32b(&in[9], &in[6], &cospi_p13_p19, &cospi_p19_m13, u);
butterfly_32b(&in[1], &in[14], &cospi_p29_p03, &cospi_p03_m29, v);
add_rnd(u, v, &x[6]);
sub_rnd(u, v, &x[14]);
// stage 2
s[0] = _mm256_add_epi16(x[0], x[4]);
s[1] = _mm256_add_epi16(x[1], x[5]);
s[2] = _mm256_add_epi16(x[2], x[6]);
s[3] = _mm256_add_epi16(x[3], x[7]);
s[4] = _mm256_sub_epi16(x[0], x[4]);
s[5] = _mm256_sub_epi16(x[1], x[5]);
s[6] = _mm256_sub_epi16(x[2], x[6]);
s[7] = _mm256_sub_epi16(x[3], x[7]);
butterfly_32b(&x[8], &x[9], &cospi_p04_p28, &cospi_p28_m04, u);
butterfly_32b(&x[12], &x[13], &cospi_m28_p04, &cospi_p04_p28, v);
add_rnd(u, v, &s[8]);
sub_rnd(u, v, &s[12]);
butterfly_32b(&x[10], &x[11], &cospi_p20_p12, &cospi_p12_m20, u);