Commit 1637d424 authored by Angie Chiang

Implement sse2 inv 1d txfms

Change-Id: I9a42b75de3e623f6af325edbe91e299c0662f19c
parent af73d536
......@@ -156,7 +156,9 @@ set(AOM_AV1_ENCODER_SOURCES
"${AOM_ROOT}/av1/encoder/tokenize.h")
set(AOM_AV1_COMMON_INTRIN_SSE2
-"${AOM_ROOT}/av1/common/x86/idct_intrin_sse2.c")
+"${AOM_ROOT}/av1/common/x86/idct_intrin_sse2.c"
+"${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h"
+"${AOM_ROOT}/av1/common/x86/av1_inv_txfm_sse2.c")
set(AOM_AV1_COMMON_INTRIN_SSSE3
"${AOM_ROOT}/av1/common/x86/av1_convolve_ssse3.c")
......
/*
* Copyright (c) 2018, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "av1/common/x86/av1_txfm_sse2.h"
// 4-point butterfly network over SSE2 vectors (an inverse DCT, going by the
// function name).
//
//   input   - four vectors, read as input[0..3]. The saturating epi16
//             add/sub in the last stage treats each vector as packed signed
//             16-bit lanes.
//   output  - four vectors, written as output[0..3]. Every read of input
//             happens before the first write to output.
//   cos_bit - passed to cospi_arr() to obtain the weight table, and used to
//             build the rounding constant 1 << (cos_bit - 1); the shift is
//             only defined for cos_bit >= 1.
//
// NOTE(review): `__rounding` (and `cos_bit`) are never referenced by name in
// this body; presumably btf_16_sse2() picks them up from the enclosing scope,
// so the identifiers must not be renamed -- confirm against the macro.
void idct4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));

  // Weight pairs handed to btf_16_sse2().
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

  // stage 1: gather the inputs in bit-reversed index order.
  static const int kInOrder[4] = { 0, 2, 1, 3 };
  __m128i st1[4];
  for (int i = 0; i < 4; ++i) st1[i] = input[kInOrder[i]];

  // stage 2: two weighted butterflies.
  __m128i st2[4];
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, st1[0], st1[1], st2[0], st2[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, st1[2], st1[3], st2[2], st2[3]);

  // stage 3: mirrored saturating add/sub, (0,3) then (1,2).
  for (int lo = 0, hi = 3; lo < hi; ++lo, --hi) {
    output[lo] = _mm_adds_epi16(st2[lo], st2[hi]);
    output[hi] = _mm_subs_epi16(st2[lo], st2[hi]);
  }
}
// 8-point butterfly network over SSE2 vectors (an inverse DCT, going by the
// function name).
//
//   input   - eight vectors, read as input[0..7]. The saturating epi16
//             add/sub steps treat each vector as packed signed 16-bit lanes.
//   output  - eight vectors, written as output[0..7]. Every read of input
//             happens before the first write to output.
//   cos_bit - passed to cospi_arr() to obtain the weight table, and used to
//             build the rounding constant 1 << (cos_bit - 1); the shift is
//             only defined for cos_bit >= 1.
//
// NOTE(review): `__rounding` (and `cos_bit`) are never referenced by name in
// this body; presumably btf_16_sse2() picks them up from the enclosing scope,
// so the identifiers must not be renamed -- confirm against the macro.
void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));

  // Weight pairs handed to btf_16_sse2().
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: gather the inputs in bit-reversed index order.
  static const int kInOrder[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
  __m128i st1[8];
  for (int i = 0; i < 8; ++i) st1[i] = input[kInOrder[i]];

  // stage 2: lower half passes through, upper half is rotated.
  __m128i st2[8];
  for (int i = 0; i < 4; ++i) st2[i] = st1[i];
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, st1[4], st1[7], st2[4], st2[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, st1[5], st1[6], st2[5], st2[6]);

  // stage 3
  __m128i st3[8];
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, st2[0], st2[1], st3[0], st3[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, st2[2], st2[3], st3[2], st3[3]);
  st3[4] = _mm_adds_epi16(st2[4], st2[5]);
  st3[5] = _mm_subs_epi16(st2[4], st2[5]);
  st3[6] = _mm_subs_epi16(st2[7], st2[6]);
  st3[7] = _mm_adds_epi16(st2[6], st2[7]);

  // stage 4
  __m128i st4[8];
  for (int lo = 0, hi = 3; lo < hi; ++lo, --hi) {
    st4[lo] = _mm_adds_epi16(st3[lo], st3[hi]);
    st4[hi] = _mm_subs_epi16(st3[lo], st3[hi]);
  }
  st4[4] = st3[4];
  st4[7] = st3[7];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, st3[5], st3[6], st4[5], st4[6]);

  // stage 5: mirrored saturating add/sub, (0,7) (1,6) (2,5) (3,4).
  for (int lo = 0, hi = 7; lo < hi; ++lo, --hi) {
    output[lo] = _mm_adds_epi16(st4[lo], st4[hi]);
    output[hi] = _mm_subs_epi16(st4[lo], st4[hi]);
  }
}
// 16-point butterfly network over SSE2 vectors (an inverse DCT, going by the
// function name).
//
//   input   - sixteen vectors, read as input[0..15]. The saturating epi16
//             add/sub steps treat each vector as packed signed 16-bit lanes.
//   output  - sixteen vectors, written as output[0..15]. Every read of input
//             happens before the first write to output.
//   cos_bit - passed to cospi_arr() to obtain the weight table, and used to
//             build the rounding constant 1 << (cos_bit - 1); the shift is
//             only defined for cos_bit >= 1.
//
// NOTE(review): `__rounding` (and `cos_bit`) are never referenced by name in
// this body; presumably btf_16_sse2() picks them up from the enclosing scope,
// so the identifiers must not be renamed -- confirm against the macro.
void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));

  // Weight pairs handed to btf_16_sse2().
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: gather the inputs in bit-reversed index order.
  static const int kInOrder[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
                                    1, 9, 5, 13, 3, 11, 7, 15 };
  __m128i st1[16];
  for (int i = 0; i < 16; ++i) st1[i] = input[kInOrder[i]];

  // stage 2: lower half passes through, upper half is rotated.
  __m128i st2[16];
  for (int i = 0; i < 8; ++i) st2[i] = st1[i];
  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, st1[8], st1[15], st2[8], st2[15]);
  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, st1[9], st1[14], st2[9], st2[14]);
  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, st1[10], st1[13], st2[10], st2[13]);
  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, st1[11], st1[12], st2[11], st2[12]);

  // stage 3
  __m128i st3[16];
  for (int i = 0; i < 4; ++i) st3[i] = st2[i];
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, st2[4], st2[7], st3[4], st3[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, st2[5], st2[6], st3[5], st3[6]);
  // Groups of four starting at 8 and 12: (a+b, a-b, d-c, c+d).
  for (int b = 8; b < 16; b += 4) {
    st3[b + 0] = _mm_adds_epi16(st2[b + 0], st2[b + 1]);
    st3[b + 1] = _mm_subs_epi16(st2[b + 0], st2[b + 1]);
    st3[b + 2] = _mm_subs_epi16(st2[b + 3], st2[b + 2]);
    st3[b + 3] = _mm_adds_epi16(st2[b + 2], st2[b + 3]);
  }

  // stage 4
  __m128i st4[16];
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, st3[0], st3[1], st4[0], st4[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, st3[2], st3[3], st4[2], st4[3]);
  st4[4] = _mm_adds_epi16(st3[4], st3[5]);
  st4[5] = _mm_subs_epi16(st3[4], st3[5]);
  st4[6] = _mm_subs_epi16(st3[7], st3[6]);
  st4[7] = _mm_adds_epi16(st3[6], st3[7]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, st3[9], st3[14], st4[9], st4[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, st3[10], st3[13], st4[10], st4[13]);
  st4[8] = st3[8];
  st4[11] = st3[11];
  st4[12] = st3[12];
  st4[15] = st3[15];

  // stage 5
  __m128i st5[16];
  for (int lo = 0, hi = 3; lo < hi; ++lo, --hi) {
    st5[lo] = _mm_adds_epi16(st4[lo], st4[hi]);
    st5[hi] = _mm_subs_epi16(st4[lo], st4[hi]);
  }
  st5[4] = st4[4];
  st5[7] = st4[7];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, st4[5], st4[6], st5[5], st5[6]);
  for (int lo = 8, hi = 11; lo < hi; ++lo, --hi) {
    st5[lo] = _mm_adds_epi16(st4[lo], st4[hi]);
    st5[hi] = _mm_subs_epi16(st4[lo], st4[hi]);
  }
  // 12..15 use the opposite orientation: low slot gets (hi - lo).
  for (int lo = 12, hi = 15; lo < hi; ++lo, --hi) {
    st5[lo] = _mm_subs_epi16(st4[hi], st4[lo]);
    st5[hi] = _mm_adds_epi16(st4[lo], st4[hi]);
  }

  // stage 6
  __m128i st6[16];
  for (int lo = 0, hi = 7; lo < hi; ++lo, --hi) {
    st6[lo] = _mm_adds_epi16(st5[lo], st5[hi]);
    st6[hi] = _mm_subs_epi16(st5[lo], st5[hi]);
  }
  st6[8] = st5[8];
  st6[9] = st5[9];
  st6[14] = st5[14];
  st6[15] = st5[15];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, st5[10], st5[13], st6[10], st6[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, st5[11], st5[12], st6[11], st6[12]);

  // stage 7: mirrored saturating add/sub, (0,15) (1,14) ... (7,8).
  for (int lo = 0, hi = 15; lo < hi; ++lo, --hi) {
    output[lo] = _mm_adds_epi16(st6[lo], st6[hi]);
    output[hi] = _mm_subs_epi16(st6[lo], st6[hi]);
  }
}
// 32-point butterfly network over SSE2 vectors (an inverse DCT, going by the
// function name).
//
//   input   - thirty-two vectors, read as input[0..31]. The saturating epi16
//             add/sub steps treat each vector as packed signed 16-bit lanes.
//   output  - thirty-two vectors, written as output[0..31]. Every read of
//             input happens before the first write to output.
//   cos_bit - passed to cospi_arr() to obtain the weight table, and used to
//             build the rounding constant 1 << (cos_bit - 1); the shift is
//             only defined for cos_bit >= 1.
//
// Two add/sub shapes recur below:
//   "mirror"          : dst[lo] = src[lo] + src[hi]; dst[hi] = src[lo] - src[hi]
//   "flipped mirror"  : dst[lo] = src[hi] - src[lo]; dst[hi] = src[lo] + src[hi]
// with (lo, hi) walking inwards from both ends of an index range.
//
// NOTE(review): `__rounding` (and `cos_bit`) are never referenced by name in
// this body; presumably btf_16_sse2() picks them up from the enclosing scope,
// so the identifiers must not be renamed -- confirm against the macro.
void idct32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));

  // Weight pairs handed to btf_16_sse2().
  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: gather the inputs in bit-reversed index order.
  static const int kInOrder[32] = { 0, 16, 8,  24, 4, 20, 12, 28,
                                    2, 18, 10, 26, 6, 22, 14, 30,
                                    1, 17, 9,  25, 5, 21, 13, 29,
                                    3, 19, 11, 27, 7, 23, 15, 31 };
  __m128i st1[32];
  for (int i = 0; i < 32; ++i) st1[i] = input[kInOrder[i]];

  // stage 2: lower half passes through, upper half is rotated.
  __m128i st2[32];
  for (int i = 0; i < 16; ++i) st2[i] = st1[i];
  btf_16_sse2(cospi_p62_m02, cospi_p02_p62, st1[16], st1[31], st2[16], st2[31]);
  btf_16_sse2(cospi_p30_m34, cospi_p34_p30, st1[17], st1[30], st2[17], st2[30]);
  btf_16_sse2(cospi_p46_m18, cospi_p18_p46, st1[18], st1[29], st2[18], st2[29]);
  btf_16_sse2(cospi_p14_m50, cospi_p50_p14, st1[19], st1[28], st2[19], st2[28]);
  btf_16_sse2(cospi_p54_m10, cospi_p10_p54, st1[20], st1[27], st2[20], st2[27]);
  btf_16_sse2(cospi_p22_m42, cospi_p42_p22, st1[21], st1[26], st2[21], st2[26]);
  btf_16_sse2(cospi_p38_m26, cospi_p26_p38, st1[22], st1[25], st2[22], st2[25]);
  btf_16_sse2(cospi_p06_m58, cospi_p58_p06, st1[23], st1[24], st2[23], st2[24]);

  // stage 3
  __m128i st3[32];
  for (int i = 0; i < 8; ++i) st3[i] = st2[i];
  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, st2[8], st2[15], st3[8], st3[15]);
  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, st2[9], st2[14], st3[9], st3[14]);
  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, st2[10], st2[13], st3[10], st3[13]);
  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, st2[11], st2[12], st3[11], st3[12]);
  // Groups of four starting at 16, 20, 24, 28: (a+b, a-b, d-c, c+d).
  for (int b = 16; b < 32; b += 4) {
    st3[b + 0] = _mm_adds_epi16(st2[b + 0], st2[b + 1]);
    st3[b + 1] = _mm_subs_epi16(st2[b + 0], st2[b + 1]);
    st3[b + 2] = _mm_subs_epi16(st2[b + 3], st2[b + 2]);
    st3[b + 3] = _mm_adds_epi16(st2[b + 2], st2[b + 3]);
  }

  // stage 4
  __m128i st4[32];
  for (int i = 0; i < 4; ++i) st4[i] = st3[i];
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, st3[4], st3[7], st4[4], st4[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, st3[5], st3[6], st4[5], st4[6]);
  // Groups of four starting at 8 and 12: (a+b, a-b, d-c, c+d).
  for (int b = 8; b < 16; b += 4) {
    st4[b + 0] = _mm_adds_epi16(st3[b + 0], st3[b + 1]);
    st4[b + 1] = _mm_subs_epi16(st3[b + 0], st3[b + 1]);
    st4[b + 2] = _mm_subs_epi16(st3[b + 3], st3[b + 2]);
    st4[b + 3] = _mm_adds_epi16(st3[b + 2], st3[b + 3]);
  }
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, st3[17], st3[30], st4[17], st4[30]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, st3[18], st3[29], st4[18], st4[29]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, st3[21], st3[26], st4[21], st4[26]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, st3[22], st3[25], st4[22], st4[25]);
  // Untouched elements of the upper half.
  st4[16] = st3[16];
  st4[19] = st3[19];
  st4[20] = st3[20];
  st4[23] = st3[23];
  st4[24] = st3[24];
  st4[27] = st3[27];
  st4[28] = st3[28];
  st4[31] = st3[31];

  // stage 5
  __m128i st5[32];
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, st4[0], st4[1], st5[0], st5[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, st4[2], st4[3], st5[2], st5[3]);
  st5[4] = _mm_adds_epi16(st4[4], st4[5]);
  st5[5] = _mm_subs_epi16(st4[4], st4[5]);
  st5[6] = _mm_subs_epi16(st4[7], st4[6]);
  st5[7] = _mm_adds_epi16(st4[6], st4[7]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, st4[9], st4[14], st5[9], st5[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, st4[10], st4[13], st5[10], st5[13]);
  st5[8] = st4[8];
  st5[11] = st4[11];
  st5[12] = st4[12];
  st5[15] = st4[15];
  // 16..19 and 24..27: mirror.
  for (int b = 16; b < 32; b += 8) {
    for (int lo = b, hi = b + 3; lo < hi; ++lo, --hi) {
      st5[lo] = _mm_adds_epi16(st4[lo], st4[hi]);
      st5[hi] = _mm_subs_epi16(st4[lo], st4[hi]);
    }
  }
  // 20..23 and 28..31: flipped mirror.
  for (int b = 20; b < 32; b += 8) {
    for (int lo = b, hi = b + 3; lo < hi; ++lo, --hi) {
      st5[lo] = _mm_subs_epi16(st4[hi], st4[lo]);
      st5[hi] = _mm_adds_epi16(st4[lo], st4[hi]);
    }
  }

  // stage 6
  __m128i st6[32];
  for (int lo = 0, hi = 3; lo < hi; ++lo, --hi) {
    st6[lo] = _mm_adds_epi16(st5[lo], st5[hi]);
    st6[hi] = _mm_subs_epi16(st5[lo], st5[hi]);
  }
  st6[4] = st5[4];
  st6[7] = st5[7];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, st5[5], st5[6], st6[5], st6[6]);
  for (int lo = 8, hi = 11; lo < hi; ++lo, --hi) {
    st6[lo] = _mm_adds_epi16(st5[lo], st5[hi]);
    st6[hi] = _mm_subs_epi16(st5[lo], st5[hi]);
  }
  for (int lo = 12, hi = 15; lo < hi; ++lo, --hi) {
    st6[lo] = _mm_subs_epi16(st5[hi], st5[lo]);
    st6[hi] = _mm_adds_epi16(st5[lo], st5[hi]);
  }
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, st5[18], st5[29], st6[18], st6[29]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, st5[19], st5[28], st6[19], st6[28]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, st5[20], st5[27], st6[20], st6[27]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, st5[21], st5[26], st6[21], st6[26]);
  // Untouched elements of the upper half.
  st6[16] = st5[16];
  st6[17] = st5[17];
  for (int i = 22; i < 26; ++i) st6[i] = st5[i];
  st6[30] = st5[30];
  st6[31] = st5[31];

  // stage 7
  __m128i st7[32];
  for (int lo = 0, hi = 7; lo < hi; ++lo, --hi) {
    st7[lo] = _mm_adds_epi16(st6[lo], st6[hi]);
    st7[hi] = _mm_subs_epi16(st6[lo], st6[hi]);
  }
  st7[8] = st6[8];
  st7[9] = st6[9];
  st7[14] = st6[14];
  st7[15] = st6[15];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, st6[10], st6[13], st7[10], st7[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, st6[11], st6[12], st7[11], st7[12]);
  for (int lo = 16, hi = 23; lo < hi; ++lo, --hi) {
    st7[lo] = _mm_adds_epi16(st6[lo], st6[hi]);
    st7[hi] = _mm_subs_epi16(st6[lo], st6[hi]);
  }
  for (int lo = 24, hi = 31; lo < hi; ++lo, --hi) {
    st7[lo] = _mm_subs_epi16(st6[hi], st6[lo]);
    st7[hi] = _mm_adds_epi16(st6[lo], st6[hi]);
  }

  // stage 8
  __m128i st8[32];
  for (int lo = 0, hi = 15; lo < hi; ++lo, --hi) {
    st8[lo] = _mm_adds_epi16(st7[lo], st7[hi]);
    st8[hi] = _mm_subs_epi16(st7[lo], st7[hi]);
  }
  for (int i = 16; i < 20; ++i) st8[i] = st7[i];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, st7[20], st7[27], st8[20], st8[27]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, st7[21], st7[26], st8[21], st8[26]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, st7[22], st7[25], st8[22], st8[25]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, st7[23], st7[24], st8[23], st8[24]);
  for (int i = 28; i < 32; ++i) st8[i] = st7[i];

  // stage 9: mirrored saturating add/sub, (0,31) (1,30) ... (15,16).
  for (int lo = 0, hi = 31; lo < hi; ++lo, --hi) {
    output[lo] = _mm_adds_epi16(st8[lo], st8[hi]);
    output[hi] = _mm_subs_epi16(st8[lo], st8[hi]);
  }
}
void idct64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
const int32_t *cospi = cospi_arr(cos_bit);
const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
__m128i cospi_p63_m01 = pair_set_epi16(cospi[63], -cospi[1]);
__m128i cospi_p01_p63 = pair_set_epi16(cospi[1], cospi[63]);
__m128i cospi_p31_m33 = pair_set_epi16(cospi[31], -cospi[33]);
__m128i cospi_p33_p31 = pair_set_epi16(cospi[33], cospi[31]);
__m128i cospi_p47_m17 = pair_set_epi16(cospi[47], -cospi[17]);
__m128i cospi_p17_p47 = pair_set_epi16(cospi[17], cospi[47]);
__m128i cospi_p15_m49 = pair_set_epi16(cospi[15], -cospi[49]);
__m128i cospi_p49_p15 = pair_set_epi16(cospi[49], cospi[15]);
__m128i cospi_p55_m09 = pair_set_epi16(cospi[55], -cospi[9]);
__m128i cospi_p09_p55 = pair_set_epi16(cospi[9], cospi[55]);