Commit 1a796617 authored by Angie Chiang

Implement sse2 fwd 1d txfms

Change-Id: I8dcaa6882d47a097498c8f8af515b1185df4fdf3
parent 3d288156
......@@ -181,6 +181,7 @@ set(AOM_AV1_ENCODER_ASM_SSE2
"${AOM_ROOT}/av1/encoder/x86/temporal_filter_apply_sse2.asm")
set(AOM_AV1_ENCODER_INTRIN_SSE2
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c"
"${AOM_ROOT}/av1/encoder/x86/dct_intrin_sse2.c"
"${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
"${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c")
......
/*
* Copyright (c) 2018, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <emmintrin.h> // SSE2
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "av1/common/av1_txfm.h"
// Builds a vector of eight 16-bit lanes holding the repeating pair (a, b):
// even lanes (0, 2, 4, 6) carry `a`, odd lanes (1, 3, 5, 7) carry `b`.
// Both values are truncated to 16 bits. The pair is packed into one 32-bit
// word (a in the low half, b in the high half) and broadcast, so each
// argument is evaluated exactly once even if it has side effects.
#define pair_set_epi16(a, b)                          \
  _mm_set1_epi32((int32_t)((uint32_t)(uint16_t)(a) |  \
                           ((uint32_t)(uint16_t)(b) << 16)))
// Rotation ("butterfly") on two vectors of eight signed 16-bit lanes.
//
// in0/in1 are interleaved lane by lane and multiplied-and-accumulated against
// the weight vectors w0 and w1 (_mm_madd_epi16), so for weights laid out as
// (even lane, odd lane) pairs each result lane i is
//   out0[i] = (w0.even * in0[i] + w0.odd * in1[i] + __rounding) >> cos_bit
//   out1[i] = (w1.even * in0[i] + w1.odd * in1[i] + __rounding) >> cos_bit
// computed in 32 bits and packed back to 16 bits with signed saturation.
//
// The macro relies on two names from the calling scope: `__rounding`
// (a __m128i added to every 32-bit product sum) and `cos_bit` (the arithmetic
// right-shift count). out0/out1 must be assignable lvalues; they are written
// only after in0/in1 have been fully consumed.
//
// Wrapped in do { } while (0) so that it behaves as a single statement, e.g.
// inside an unbraced if/else.
#define btf_16_sse2(w0, w1, in0, in1, out0, out1)         \
  do {                                                    \
    const __m128i t0 = _mm_unpacklo_epi16((in0), (in1));  \
    const __m128i t1 = _mm_unpackhi_epi16((in0), (in1));  \
    const __m128i u0 = _mm_madd_epi16(t0, (w0));          \
    const __m128i u1 = _mm_madd_epi16(t1, (w0));          \
    const __m128i v0 = _mm_madd_epi16(t0, (w1));          \
    const __m128i v1 = _mm_madd_epi16(t1, (w1));          \
                                                          \
    const __m128i a0 = _mm_add_epi32(u0, __rounding);     \
    const __m128i a1 = _mm_add_epi32(u1, __rounding);     \
    const __m128i b0 = _mm_add_epi32(v0, __rounding);     \
    const __m128i b1 = _mm_add_epi32(v1, __rounding);     \
                                                          \
    const __m128i c0 = _mm_srai_epi32(a0, cos_bit);       \
    const __m128i c1 = _mm_srai_epi32(a1, cos_bit);       \
    const __m128i d0 = _mm_srai_epi32(b0, cos_bit);       \
    const __m128i d1 = _mm_srai_epi32(b1, cos_bit);       \
                                                          \
    (out0) = _mm_packs_epi32(c0, c1);                     \
    (out1) = _mm_packs_epi32(d0, d1);                     \
  } while (0)
// 4-point forward 1-D transform over vectors of eight signed 16-bit lanes.
//
// input   : four vectors, input[0..3]; every lane is transformed independently.
// output  : four vectors, output[0..3]; written only after all reads of
//           input have completed.
// cos_bit : selects the coefficient table via cospi_arr() and is the
//           rounding shift applied inside btf_16_sse2.
//
// Additions and subtractions use saturating 16-bit arithmetic.
void fdct4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  // btf_16_sse2 picks up `__rounding` and `cos_bit` from this scope.
  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));

  const int32_t w16 = cospi[16];
  const int32_t w32 = cospi[32];
  const int32_t w48 = cospi[48];

  const __m128i k_p32_p32 = pair_set_epi16(w32, w32);
  const __m128i k_p32_m32 = pair_set_epi16(w32, -w32);
  const __m128i k_p48_p16 = pair_set_epi16(w48, w16);
  const __m128i k_m16_p48 = pair_set_epi16(-w16, w48);

  // stage 1: mirrored sums and differences
  const __m128i sum03 = _mm_adds_epi16(input[0], input[3]);
  const __m128i sum12 = _mm_adds_epi16(input[1], input[2]);
  const __m128i dif12 = _mm_subs_epi16(input[1], input[2]);
  const __m128i dif03 = _mm_subs_epi16(input[0], input[3]);

  // stage 2: rotations
  __m128i even_first, even_second;
  __m128i odd_first, odd_second;
  btf_16_sse2(k_p32_p32, k_p32_m32, sum03, sum12, even_first, even_second);
  btf_16_sse2(k_p48_p16, k_m16_p48, dif12, dif03, odd_first, odd_second);

  // stage 3: interleave the even and odd halves into the output order
  output[0] = even_first;
  output[1] = odd_first;
  output[2] = even_second;
  output[3] = odd_second;
}
// 8-point forward 1-D transform over vectors of eight signed 16-bit lanes.
//
// input   : eight vectors, input[0..7]; every lane is transformed
//           independently.
// output  : eight vectors, output[0..7]; written only after all reads of
//           input have completed.
// cos_bit : selects the coefficient table via cospi_arr() and is the
//           rounding shift applied inside btf_16_sse2.
//
// Additions and subtractions use saturating 16-bit arithmetic.
void fdct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  // Position i of the result is taken from slot kOutOrder[i] of the last stage.
  static const int kOutOrder[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };

  const int32_t *cospi = cospi_arr(cos_bit);
  // btf_16_sse2 picks up `__rounding` and `cos_bit` from this scope.
  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));

  const __m128i k_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i k_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i k_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i k_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i k_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i k_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i k_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i k_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i k_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);

  __m128i s1[8], s2[8], s3[8], s4[8];
  int i;

  // stage 1: mirrored sum/difference across all eight inputs
  for (i = 0; i < 4; ++i) {
    s1[i] = _mm_adds_epi16(input[i], input[7 - i]);
    s1[7 - i] = _mm_subs_epi16(input[i], input[7 - i]);
  }

  // stage 2: mirrored sum/difference on the lower half, rotate the middle
  // pair of the upper half
  for (i = 0; i < 2; ++i) {
    s2[i] = _mm_adds_epi16(s1[i], s1[3 - i]);
    s2[3 - i] = _mm_subs_epi16(s1[i], s1[3 - i]);
  }
  s2[4] = s1[4];
  btf_16_sse2(k_m32_p32, k_p32_p32, s1[5], s1[6], s2[5], s2[6]);
  s2[7] = s1[7];

  // stage 3
  btf_16_sse2(k_p32_p32, k_p32_m32, s2[0], s2[1], s3[0], s3[1]);
  btf_16_sse2(k_p48_p16, k_m16_p48, s2[2], s2[3], s3[2], s3[3]);
  s3[4] = _mm_adds_epi16(s2[4], s2[5]);
  s3[5] = _mm_subs_epi16(s2[4], s2[5]);
  s3[6] = _mm_subs_epi16(s2[7], s2[6]);
  s3[7] = _mm_adds_epi16(s2[7], s2[6]);

  // stage 4: lower half passes through, upper half is rotated
  for (i = 0; i < 4; ++i) {
    s4[i] = s3[i];
  }
  btf_16_sse2(k_p56_p08, k_m08_p56, s3[4], s3[7], s4[4], s4[7]);
  btf_16_sse2(k_p24_p40, k_m40_p24, s3[5], s3[6], s4[5], s4[6]);

  // stage 5: reorder into the output layout
  for (i = 0; i < 8; ++i) {
    output[i] = s4[kOutOrder[i]];
  }
}
// 16-point forward 1-D transform over vectors of eight signed 16-bit lanes.
//
// input   : sixteen vectors, input[0..15]; every lane is transformed
//           independently.
// output  : sixteen vectors, output[0..15]; written only after all reads of
//           input have completed.
// cos_bit : selects the coefficient table via cospi_arr() and is the
//           rounding shift applied inside btf_16_sse2.
//
// Additions and subtractions use saturating 16-bit arithmetic.
void fdct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  // Position i of the result is taken from slot kOutOrder[i] of the last stage.
  static const int kOutOrder[16] = { 0, 8,  4, 12, 2, 10, 6, 14,
                                     1, 9,  5, 13, 3, 11, 7, 15 };

  const int32_t *cospi = cospi_arr(cos_bit);
  // btf_16_sse2 picks up `__rounding` and `cos_bit` from this scope.
  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));

  const __m128i k_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i k_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i k_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i k_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i k_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i k_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i k_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i k_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i k_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i k_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i k_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
  const __m128i k_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
  const __m128i k_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
  const __m128i k_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
  const __m128i k_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
  const __m128i k_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
  const __m128i k_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
  const __m128i k_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);

  __m128i s1[16], s2[16], s3[16], s4[16], s5[16], s6[16];
  int i;

  // stage 1: mirrored sum/difference across all sixteen inputs
  for (i = 0; i < 8; ++i) {
    s1[i] = _mm_adds_epi16(input[i], input[15 - i]);
    s1[15 - i] = _mm_subs_epi16(input[i], input[15 - i]);
  }

  // stage 2: mirrored sum/difference on slots 0..7, rotate the inner pairs
  // of slots 8..15
  for (i = 0; i < 4; ++i) {
    s2[i] = _mm_adds_epi16(s1[i], s1[7 - i]);
    s2[7 - i] = _mm_subs_epi16(s1[i], s1[7 - i]);
  }
  s2[8] = s1[8];
  s2[9] = s1[9];
  btf_16_sse2(k_m32_p32, k_p32_p32, s1[10], s1[13], s2[10], s2[13]);
  btf_16_sse2(k_m32_p32, k_p32_p32, s1[11], s1[12], s2[11], s2[12]);
  s2[14] = s1[14];
  s2[15] = s1[15];

  // stage 3
  for (i = 0; i < 2; ++i) {
    s3[i] = _mm_adds_epi16(s2[i], s2[3 - i]);
    s3[3 - i] = _mm_subs_epi16(s2[i], s2[3 - i]);
  }
  s3[4] = s2[4];
  btf_16_sse2(k_m32_p32, k_p32_p32, s2[5], s2[6], s3[5], s3[6]);
  s3[7] = s2[7];
  for (i = 0; i < 2; ++i) {
    // slots 8..11: outer element plus/minus its mirror within the group
    s3[8 + i] = _mm_adds_epi16(s2[8 + i], s2[11 - i]);
    s3[11 - i] = _mm_subs_epi16(s2[8 + i], s2[11 - i]);
    // slots 12..15: same shape, with the higher-indexed element as minuend
    s3[12 + i] = _mm_subs_epi16(s2[15 - i], s2[12 + i]);
    s3[15 - i] = _mm_adds_epi16(s2[15 - i], s2[12 + i]);
  }

  // stage 4
  btf_16_sse2(k_p32_p32, k_p32_m32, s3[0], s3[1], s4[0], s4[1]);
  btf_16_sse2(k_p48_p16, k_m16_p48, s3[2], s3[3], s4[2], s4[3]);
  s4[4] = _mm_adds_epi16(s3[4], s3[5]);
  s4[5] = _mm_subs_epi16(s3[4], s3[5]);
  s4[6] = _mm_subs_epi16(s3[7], s3[6]);
  s4[7] = _mm_adds_epi16(s3[7], s3[6]);
  s4[8] = s3[8];
  btf_16_sse2(k_m16_p48, k_p48_p16, s3[9], s3[14], s4[9], s4[14]);
  btf_16_sse2(k_m48_m16, k_m16_p48, s3[10], s3[13], s4[10], s4[13]);
  s4[11] = s3[11];
  s4[12] = s3[12];
  s4[15] = s3[15];

  // stage 5: slots 0..3 pass through
  for (i = 0; i < 4; ++i) {
    s5[i] = s4[i];
  }
  btf_16_sse2(k_p56_p08, k_m08_p56, s4[4], s4[7], s5[4], s5[7]);
  btf_16_sse2(k_p24_p40, k_m40_p24, s4[5], s4[6], s5[5], s5[6]);
  s5[8] = _mm_adds_epi16(s4[8], s4[9]);
  s5[9] = _mm_subs_epi16(s4[8], s4[9]);
  s5[10] = _mm_subs_epi16(s4[11], s4[10]);
  s5[11] = _mm_adds_epi16(s4[11], s4[10]);
  s5[12] = _mm_adds_epi16(s4[12], s4[13]);
  s5[13] = _mm_subs_epi16(s4[12], s4[13]);
  s5[14] = _mm_subs_epi16(s4[15], s4[14]);
  s5[15] = _mm_adds_epi16(s4[15], s4[14]);

  // stage 6: slots 0..7 pass through, slots 8..15 are rotated pairwise
  for (i = 0; i < 8; ++i) {
    s6[i] = s5[i];
  }
  btf_16_sse2(k_p60_p04, k_m04_p60, s5[8], s5[15], s6[8], s6[15]);
  btf_16_sse2(k_p28_p36, k_m36_p28, s5[9], s5[14], s6[9], s6[14]);
  btf_16_sse2(k_p44_p20, k_m20_p44, s5[10], s5[13], s6[10], s6[13]);
  btf_16_sse2(k_p12_p52, k_m52_p12, s5[11], s5[12], s6[11], s6[12]);

  // stage 7: reorder into the output layout
  for (i = 0; i < 16; ++i) {
    output[i] = s6[kOutOrder[i]];
  }
}
void fdct32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
const int32_t *cospi = cospi_arr(cos_bit);
const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
__m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
__m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
__m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
__m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
__m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
__m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
__m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
__m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
__m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
__m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
__m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
__m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
__m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
__m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
__m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
__m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
__m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
__m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
__m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
__m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
__m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]);
__m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]);
__m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]);
__m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]);
__m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]);
__m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]);
__m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]);
__m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]);
__m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]);
__m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]);
__m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]);
__m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]);
__m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]);
__m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]);
__m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]);
__m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]);
// stage 1
__m128i x1[32];
x1[0] = _mm_adds_epi16(input[0], input[31]);
x1[31] = _mm_subs_epi16(input[0], input[31]);
x1[1] = _mm_adds_epi16(input[1], input[30]);
x1[30] = _mm_subs_epi16(input[1], input[30]);
x1[2] = _mm_adds_epi16(input[2], input[29]);
x1[29] = _mm_subs_epi16(input[2], input[29]);
x1[3] = _mm_adds_epi16(input[3], input[28]);
x1[28] = _mm_subs_epi16(input[3], input[28]);
x1[4] = _mm_adds_epi16(input[4], input[27]);
x1[27] = _mm_subs_epi16(input[4], input[27]);
x1[5] = _mm_adds_epi16(input[5], input[26]);
x1[26] = _mm_subs_epi16(input[5], input[26]);
x1[6] = _mm_adds_epi16(input[6], input[25]);
x1[25] = _mm_subs_epi16(input[6], input[25]);
x1[7] = _mm_adds_epi16(input[7], input[24]);
x1[24] = _mm_subs_epi16(input[7], input[24]);
x1[8] = _mm_adds_epi16(input[8], input[23]);
x1[23] = _mm_subs_epi16(input[8], input[23]);
x1[9] = _mm_adds_epi16(input[9], input[22]);
x1[22] = _mm_subs_epi16(input[9], input[22]);
x1[10] = _mm_adds_epi16(input[10], input[21]);
x1[21] = _mm_subs_epi16(input[10], input[21]);
x1[11] = _mm_adds_epi16(input[11], input[20]);
x1[20] = _mm_subs_epi16(input[11], input[20]);
x1[12] = _mm_adds_epi16(input[12], input[19]);
x1[19] = _mm_subs_epi16(input[12], input[19]);
x1[13] = _mm_adds_epi16(input[13], input[18]);
x1[18] = _mm_subs_epi16(input[13], input[18]);
x1[14] = _mm_adds_epi16(input[14], input[17]);
x1[17] = _mm_subs_epi16(input[14], input[17]);
x1[15] = _mm_adds_epi16(input[15], input[16]);
x1[16] = _mm_subs_epi16(input[15], input[16]);
// stage 2
__m128i x2[32];
x2[0] = _mm_adds_epi16(x1[0], x1[15]);
x2[15] = _mm_subs_epi16(x1[0], x1[15]);
x2[1] = _mm_adds_epi16(x1[1], x1[14]);
x2[14] = _mm_subs_epi16(x1[1], x1[14]);
x2[2] = _mm_adds_epi16(x1[2], x1[13]);
x2[13] = _mm_subs_epi16(x1[2], x1[13]);
x2[3] = _mm_adds_epi16(x1[3], x1[12]);
x2[12] = _mm_subs_epi16(x1[3], x1[12]);
x2[4] = _mm_adds_epi16(x1[4], x1[11]);
x2[11] = _mm_subs_epi16(x1[4], x1[11]);
x2[5] = _mm_adds_epi16(x1[5], x1[10]);
x2[10] = _mm_subs_epi16(x1[5], x1[10]);
x2[6] = _mm_adds_epi16(x1[6], x1[9]);
x2[9] = _mm_subs_epi16(x1[6], x1[9]);
x2[7] = _mm_adds_epi16(x1[7], x1[8]);
x2[8] = _mm_subs_epi16(x1[7], x1[8]);
x2[16] = x1[16];
x2[17] = x1[17];
x2[18] = x1[18];
x2[19] = x1[19];
btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]);
btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]);
btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[22], x1[25], x2[22], x2[25]);
btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]);
x2[28] = x1[28];
x2[29] = x1[29];
x2[30] = x1[30];
x2[31] = x1[31];
// stage 3
__m128i x3[32];
x3[0] = _mm_adds_epi16(x2[0], x2[7]);
x3[7] = _mm_subs_epi16(x2[0], x2[7]);
x3[1] = _mm_adds_epi16(x2[1], x2[6]);
x3[6] = _mm_subs_epi16(x2[1], x2[6]);
x3[2] = _mm_adds_epi16(x2[2], x2[5]);
x3[5] = _mm_subs_epi16(x2[2], x2[5]);
x3[3] = _mm_adds_epi16(x2[3], x2[4]);
x3[4] = _mm_subs_epi16(x2[3], x2[4]);
x3[8] = x2[8];
x3[9] = x2[9];
btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]);
btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]);
x3[14] = x2[14];
x3[15] = x2[15];
x3[16] = _mm_adds_epi16(x2[16], x2[23]);
x3[23] = _mm_subs_epi16(x2[16], x2[23]);
x3[17] = _mm_adds_epi16(x2[17], x2[22]);
x3[22] = _mm_subs_epi16(x2[17], x2[22]);
x3[18] = _mm_adds_epi16(x2[18], x2[21]);
x3[21] = _mm_subs_epi16(x2[18], x2[21]);
x3[19] = _mm_adds_epi16(x2[19], x2[20]);
x3[20] = _mm_subs_epi16(x2[19], x2[20]);
x3[24] = _mm_subs_epi16(x2[31], x2[24]);
x3[31] = _mm_adds_epi16(x2[31], x2[24]);
x3[25] = _mm_subs_epi16(x2[30], x2[25]);
x3[30] = _mm_adds_epi16(x2[30], x2[25]);
x3[26] = _mm_subs_epi16(x2[29], x2[26]);
x3[29] = _mm_adds_epi16(x2[29], x2[26]);
x3[27] = _mm_subs_epi16(x2[28], x2[27]);
x3[28] = _mm_adds_epi16(x2[28], x2[27]);
// stage 4
__m128i x4[32];
x4[0] = _mm_adds_epi16(x3[0], x3[3]);
x4[3] = _mm_subs_epi16(x3[0], x3[3]);
x4[1] = _mm_adds_epi16(x3[1], x3[2]);
x4[2] = _mm_subs_epi16(x3[1], x3[2]);
x4[4] = x3[4];
btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]);
x4[7] = x3[7];
x4[8] = _mm_adds_epi16(x3[8], x3[11]);
x4[11] = _mm_subs_epi16(x3[8], x3[11]);
x4[9] = _mm_adds_epi16(x3[9], x3[10]);
x4[10] = _mm_subs_epi16(x3[9], x3[10]);
x4[12] = _mm_subs_epi16(x3[15], x3[12]);
x4[15] = _mm_adds_epi16(x3[15], x3[12]);
x4[13] = _mm_subs_epi16(x3[14], x3[13]);
x4[14] = _mm_adds_epi16(x3[14], x3[13]);
x4[16] = x3[16];
x4[17] = x3[17];
btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]);
btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]);
btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]);
btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]);
x4[22] = x3[22];
x4[23] = x3[23];
x4[24] = x3[24];
x4[25] = x3[25];
x4[30] = x3[30];
x4[31] = x3[31];
// stage 5
__m128i x5[32];
btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]);
btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]);
x5[4] = _mm_adds_epi16(x4[4], x4[5]);
x5[5] = _mm_subs_epi16(x4[4], x4[5]);
x5[6] = _mm_subs_epi16(x4[7], x4[6]);
x5[7] = _mm_adds_epi16(x4[7], x4[6]);
x5[8] = x4[8];
btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]);
btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]);
x5[11] = x4[11];
x5[12] = x4[12];
x5[15] = x4[15];
x5[16] = _mm_adds_epi16(x4[16], x4[19]);
x5[19] = _mm_subs_epi16(x4[16], x4[19]);
x5[17] = _mm_adds_epi16(x4[17], x4[18]);
x5[18] = _mm_subs_epi16(x4[17], x4[18]);
x5[20] = _mm_subs_epi16(x4[23], x4[20]);
x5[23] = _mm_adds_epi16(x4[23], x4[20]);
x5[21] = _mm_subs_epi16(x4[22], x4[21]);
x5[22] = _mm_adds_epi16(x4[22], x4[21]);
x5[24] = _mm_adds_epi16(x4[24], x4[27]);
x5[27] = _mm_subs_epi16(x4[24], x4[27]);
x5[25] = _mm_adds_epi16(x4[25], x4[26]);
x5[26] = _mm_subs_epi16(x4[25], x4[26]);
x5[28] = _mm_subs_epi16(x4[31], x4[28]);
x5[31] = _mm_adds_epi16(x4[31], x4[28]);
x5[29] = _mm_subs_epi16(x4[30], x4[29]);
x5[30] = _mm_adds_epi16(x4[30], x4[29]);
// stage 6
__m128i x6[32];
x6[0] = x5[0];
x6[1] = x5[1];
x6[2] = x5[2];
x6[3] = x5[3];
btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]);
btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]);
x6[8] = _mm_adds_epi16(x5[8], x5[9]);
x6[9] = _mm_subs_epi16(x5[8], x5[9]);
x6[10] = _mm_subs_epi16(x5[11], x5[10]);
x6[11] = _mm_adds_epi16(x5[11], x5[10]);
x6[12] = _mm_adds_epi16(x5[12], x5[13]);
x6[13] = _mm_subs_epi16(x5[12], x5[13]);
x6[14] = _mm_subs_epi16(x5[15], x5[14]);
x6[15] = _mm_adds_epi16(x5[15], x5[14]);
x6[16] = x5[16];
btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]);
btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]);
x6[19] = x5[19];
x6[20] = x5[20];
btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]);
btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]);
x6[23] = x5[23];
x6[24] = x5[24];
x6[27] = x5[27];
x6[28] = x5[28];
x6[31] = x5[31];
// stage 7
__m128i x7[32];
x7[0] = x6[0];
x7[1] = x6[1];
x7[2] = x6[2];
x7[3] = x6[3];
x7[4] = x6[4];
x7[5] = x6[5];
x7[6] = x6[6];
x7[7] = x6[7];
btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]);
btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]);
btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]);
btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]);
x7[16] = _mm_adds_epi16(x6[16], x6[17]);
x7[17] = _mm_subs_epi16(x6[16], x6[17]);
x7[18] = _mm_subs_epi16(x6[19], x6[18]);
x7[19] = _mm_adds_epi16(x6[19], x6[18]);
x7[20] = _mm_adds_epi16(x6[20], x6[21]);
x7[21] = _mm_subs_epi16(x6[20], x6[21]);
x7[22] = _mm_subs_epi16(x6[23], x6[22]);
x7[23] = _mm_adds_epi16(x6[23], x6[22]);
x7[24] = _mm_adds_epi16(x6[24], x6[25]);
x7[25] = _mm_subs_epi16(x6[24], x6[25]);
x7[26] = _mm_subs_epi16(x6[27], x6[26]);
x7[27] = _mm_adds_epi16(x6[27], x6[26]);
x7[28] = _mm_adds_epi16(x6[28], x6[29]);
x7[29] = _mm_subs_epi16(x6[28], x6[29]);
x7[30] = _mm_subs_epi16(x6[31], x6[30]);
x7[31] = _mm_adds_epi16(x6[31], x6[30]);
// stage 8
__m128i x8[32];
x8[0] = x7[0];
x8[1] = x7[1];
x8[2] = x7[2];
x8[3]