Commit 3d98205f authored by Christian Duvivier, committed by Jingning Han
Browse files

Move the fdct32x32 SSE2 implementation into a separate file.

This is in preparation for the SSE2 version of the high-precision
32x32 forward DCT which will share a lot of code with the existing
low precision version used for rate-distortion search.

Change-Id: I7084b6bdfb480b1fabb8493fb14e3f7fcc7888c0
parent a93b115c
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <emmintrin.h> // SSE2
#include "vp9/common/vp9_idct.h" // for cospi constants
#include "vpx_ports/mem.h"
void FDCT32x32_NAME(int16_t *input,
int16_t *output_org, int pitch) {
// Calculate pre-multiplied strides
const int str1 = pitch >> 1;
const int str2 = pitch;
const int str3 = pitch + str1;
// We need an intermediate buffer between passes.
DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
// Constants
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i kZero = _mm_set1_epi16(0);
const __m128i kOne = _mm_set1_epi16(1);
// Do the two transform/transpose passes
int pass;
for (pass = 0; pass < 2; ++pass) {
// We process eight columns (transposed rows in second pass) at a time.
int column_start;
for (column_start = 0; column_start < 32; column_start += 8) {
__m128i step1[32];
__m128i step2[32];
__m128i step3[32];
__m128i out[32];
// Stage 1
// Note: even though all the loads below are aligned, using the aligned
// intrinsic makes the code slightly slower.
if (0 == pass) {
int16_t *in = &input[column_start];
// step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
// Note: the next four blocks could be in a loop. That would help the
// instruction cache but is actually slower.
{
int16_t *ina = in + 0 * str1;
int16_t *inb = in + 31 * str1;
__m128i *step1a = &step1[ 0];
__m128i *step1b = &step1[31];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
step1a[ 0] = _mm_add_epi16(ina0, inb0);
step1a[ 1] = _mm_add_epi16(ina1, inb1);
step1a[ 2] = _mm_add_epi16(ina2, inb2);
step1a[ 3] = _mm_add_epi16(ina3, inb3);
step1b[-3] = _mm_sub_epi16(ina3, inb3);
step1b[-2] = _mm_sub_epi16(ina2, inb2);
step1b[-1] = _mm_sub_epi16(ina1, inb1);
step1b[-0] = _mm_sub_epi16(ina0, inb0);
step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
{
int16_t *ina = in + 4 * str1;
int16_t *inb = in + 27 * str1;
__m128i *step1a = &step1[ 4];
__m128i *step1b = &step1[27];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
step1a[ 0] = _mm_add_epi16(ina0, inb0);
step1a[ 1] = _mm_add_epi16(ina1, inb1);
step1a[ 2] = _mm_add_epi16(ina2, inb2);
step1a[ 3] = _mm_add_epi16(ina3, inb3);
step1b[-3] = _mm_sub_epi16(ina3, inb3);
step1b[-2] = _mm_sub_epi16(ina2, inb2);
step1b[-1] = _mm_sub_epi16(ina1, inb1);
step1b[-0] = _mm_sub_epi16(ina0, inb0);
step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
{
int16_t *ina = in + 8 * str1;
int16_t *inb = in + 23 * str1;
__m128i *step1a = &step1[ 8];
__m128i *step1b = &step1[23];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
step1a[ 0] = _mm_add_epi16(ina0, inb0);
step1a[ 1] = _mm_add_epi16(ina1, inb1);
step1a[ 2] = _mm_add_epi16(ina2, inb2);
step1a[ 3] = _mm_add_epi16(ina3, inb3);
step1b[-3] = _mm_sub_epi16(ina3, inb3);
step1b[-2] = _mm_sub_epi16(ina2, inb2);
step1b[-1] = _mm_sub_epi16(ina1, inb1);
step1b[-0] = _mm_sub_epi16(ina0, inb0);
step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
{
int16_t *ina = in + 12 * str1;
int16_t *inb = in + 19 * str1;
__m128i *step1a = &step1[12];
__m128i *step1b = &step1[19];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
step1a[ 0] = _mm_add_epi16(ina0, inb0);
step1a[ 1] = _mm_add_epi16(ina1, inb1);
step1a[ 2] = _mm_add_epi16(ina2, inb2);
step1a[ 3] = _mm_add_epi16(ina3, inb3);
step1b[-3] = _mm_sub_epi16(ina3, inb3);
step1b[-2] = _mm_sub_epi16(ina2, inb2);
step1b[-1] = _mm_sub_epi16(ina1, inb1);
step1b[-0] = _mm_sub_epi16(ina0, inb0);
step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
} else {
int16_t *in = &intermediate[column_start];
// step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
// Note: using the same approach as above to have common offset is
// counter-productive as all offsets can be calculated at compile
// time.
// Note: the next four blocks could be in a loop. That would help the
// instruction cache but is actually slower.
{
__m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
__m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
__m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
__m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
__m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
__m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
__m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
__m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
step1[ 0] = _mm_add_epi16(in00, in31);
step1[ 1] = _mm_add_epi16(in01, in30);
step1[ 2] = _mm_add_epi16(in02, in29);
step1[ 3] = _mm_add_epi16(in03, in28);
step1[28] = _mm_sub_epi16(in03, in28);
step1[29] = _mm_sub_epi16(in02, in29);
step1[30] = _mm_sub_epi16(in01, in30);
step1[31] = _mm_sub_epi16(in00, in31);
}
{
__m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
__m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
__m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
__m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
__m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
__m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
__m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
__m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
step1[ 4] = _mm_add_epi16(in04, in27);
step1[ 5] = _mm_add_epi16(in05, in26);
step1[ 6] = _mm_add_epi16(in06, in25);
step1[ 7] = _mm_add_epi16(in07, in24);
step1[24] = _mm_sub_epi16(in07, in24);
step1[25] = _mm_sub_epi16(in06, in25);
step1[26] = _mm_sub_epi16(in05, in26);
step1[27] = _mm_sub_epi16(in04, in27);
}
{
__m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
__m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
__m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
__m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
__m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
__m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
__m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
__m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
step1[ 8] = _mm_add_epi16(in08, in23);
step1[ 9] = _mm_add_epi16(in09, in22);
step1[10] = _mm_add_epi16(in10, in21);
step1[11] = _mm_add_epi16(in11, in20);
step1[20] = _mm_sub_epi16(in11, in20);
step1[21] = _mm_sub_epi16(in10, in21);
step1[22] = _mm_sub_epi16(in09, in22);
step1[23] = _mm_sub_epi16(in08, in23);
}
{
__m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
__m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
__m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
__m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
__m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
__m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
__m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
__m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
step1[12] = _mm_add_epi16(in12, in19);
step1[13] = _mm_add_epi16(in13, in18);
step1[14] = _mm_add_epi16(in14, in17);
step1[15] = _mm_add_epi16(in15, in16);
step1[16] = _mm_sub_epi16(in15, in16);
step1[17] = _mm_sub_epi16(in14, in17);
step1[18] = _mm_sub_epi16(in13, in18);
step1[19] = _mm_sub_epi16(in12, in19);
}
}
// Stage 2
{
step2[ 0] = _mm_add_epi16(step1[0], step1[15]);
step2[ 1] = _mm_add_epi16(step1[1], step1[14]);
step2[ 2] = _mm_add_epi16(step1[2], step1[13]);
step2[ 3] = _mm_add_epi16(step1[3], step1[12]);
step2[ 4] = _mm_add_epi16(step1[4], step1[11]);
step2[ 5] = _mm_add_epi16(step1[5], step1[10]);
step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]);
step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]);
step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]);
step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]);
step2[10] = _mm_sub_epi16(step1[5], step1[10]);
step2[11] = _mm_sub_epi16(step1[4], step1[11]);
step2[12] = _mm_sub_epi16(step1[3], step1[12]);
step2[13] = _mm_sub_epi16(step1[2], step1[13]);
step2[14] = _mm_sub_epi16(step1[1], step1[14]);
step2[15] = _mm_sub_epi16(step1[0], step1[15]);
}
{
const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
// dct_const_round_shift
const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
// Combine
step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
}
// Stage 3
{
step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
}
{
const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
// dct_const_round_shift
const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
// Combine
step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
}
{
step3[16] = _mm_add_epi16(step2[23], step1[16]);
step3[17] = _mm_add_epi16(step2[22], step1[17]);
step3[18] = _mm_add_epi16(step2[21], step1[18]);
step3[19] = _mm_add_epi16(step2[20], step1[19]);
step3[20] = _mm_sub_epi16(step1[19], step2[20]);
step3[21] = _mm_sub_epi16(step1[18], step2[21]);
step3[22] = _mm_sub_epi16(step1[17], step2[22]);
step3[23] = _mm_sub_epi16(step1[16], step2[23]);
step3[24] = _mm_sub_epi16(step1[31], step2[24]);
step3[25] = _mm_sub_epi16(step1[30], step2[25]);
step3[26] = _mm_sub_epi16(step1[29], step2[26]);
step3[27] = _mm_sub_epi16(step1[28], step2[27]);
step3[28] = _mm_add_epi16(step2[27], step1[28]);
step3[29] = _mm_add_epi16(step2[26], step1[29]);
step3[30] = _mm_add_epi16(step2[25], step1[30]);
step3[31] = _mm_add_epi16(step2[24], step1[31]);
}
#if FDCT32x32_LOW_PRECISION
// Damp the magnitude by 4, i.e. x = (x + 1 + (x < 0)) >> 2, so that the
// intermediate values stay within the range of 16 bits.
if (1 == pass) {
__m128i s3_00_0 = _mm_cmplt_epi16(step3[ 0], kZero);
__m128i s3_01_0 = _mm_cmplt_epi16(step3[ 1], kZero);
__m128i s3_02_0 = _mm_cmplt_epi16(step3[ 2], kZero);
__m128i s3_03_0 = _mm_cmplt_epi16(step3[ 3], kZero);
__m128i s3_04_0 = _mm_cmplt_epi16(step3[ 4], kZero);
__m128i s3_05_0 = _mm_cmplt_epi16(step3[ 5], kZero);
__m128i s3_06_0 = _mm_cmplt_epi16(step3[ 6], kZero);
__m128i s3_07_0 = _mm_cmplt_epi16(step3[ 7], kZero);
__m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
__m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
__m128i s3_10_0 = _mm_cmplt_epi16(step3[10], kZero);
__m128i s3_11_0 = _mm_cmplt_epi16(step3[11], kZero);
__m128i s3_12_0 = _mm_cmplt_epi16(step3[12], kZero);
__m128i s3_13_0 = _mm_cmplt_epi16(step3[13], kZero);
__m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
__m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
__m128i s3_16_0 = _mm_cmplt_epi16(step3[16], kZero);
__m128i s3_17_0 = _mm_cmplt_epi16(step3[17], kZero);
__m128i s3_18_0 = _mm_cmplt_epi16(step3[18], kZero);
__m128i s3_19_0 = _mm_cmplt_epi16(step3[19], kZero);
__m128i s3_20_0 = _mm_cmplt_epi16(step3[20], kZero);
__m128i s3_21_0 = _mm_cmplt_epi16(step3[21], kZero);
__m128i s3_22_0 = _mm_cmplt_epi16(step3[22], kZero);
__m128i s3_23_0 = _mm_cmplt_epi16(step3[23], kZero);
__m128i s3_24_0 = _mm_cmplt_epi16(step3[24], kZero);
__m128i s3_25_0 = _mm_cmplt_epi16(step3[25], kZero);
__m128i s3_26_0 = _mm_cmplt_epi16(step3[26], kZero);
__m128i s3_27_0 = _mm_cmplt_epi16(step3[27], kZero);
__m128i s3_28_0 = _mm_cmplt_epi16(step3[28], kZero);
__m128i s3_29_0 = _mm_cmplt_epi16(step3[29], kZero);
__m128i s3_30_0 = _mm_cmplt_epi16(step3[30], kZero);
__m128i s3_31_0 = _mm_cmplt_epi16(step3[31], kZero);
step3[ 0] = _mm_sub_epi16(step3[ 0], s3_00_0);
step3[ 1] = _mm_sub_epi16(step3[ 1], s3_01_0);
step3[ 2] = _mm_sub_epi16(step3[ 2], s3_02_0);
step3[ 3] = _mm_sub_epi16(step3[ 3], s3_03_0);
step3[ 4] = _mm_sub_epi16(step3[ 4], s3_04_0);
step3[ 5] = _mm_sub_epi16(step3[ 5], s3_05_0);
step3[ 6] = _mm_sub_epi16(step3[ 6], s3_06_0);
step3[ 7] = _mm_sub_epi16(step3[ 7], s3_07_0);
step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
step3[10] = _mm_sub_epi16(step3[10], s3_10_0);
step3[11] = _mm_sub_epi16(step3[11], s3_11_0);
step3[12] = _mm_sub_epi16(step3[12], s3_12_0);
step3[13] = _mm_sub_epi16(step3[13], s3_13_0);
step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
step3[16] = _mm_sub_epi16(step3[16], s3_16_0);
step3[17] = _mm_sub_epi16(step3[17], s3_17_0);
step3[18] = _mm_sub_epi16(step3[18], s3_18_0);
step3[19] = _mm_sub_epi16(step3[19], s3_19_0);
step3[20] = _mm_sub_epi16(step3[20], s3_20_0);
step3[21] = _mm_sub_epi16(step3[21], s3_21_0);
step3[22] = _mm_sub_epi16(step3[22], s3_22_0);
step3[23] = _mm_sub_epi16(step3[23], s3_23_0);
step3[24] = _mm_sub_epi16(step3[24], s3_24_0);
step3[25] = _mm_sub_epi16(step3[25], s3_25_0);
step3[26] = _mm_sub_epi16(step3[26], s3_26_0);
step3[27] = _mm_sub_epi16(step3[27], s3_27_0);
step3[28] = _mm_sub_epi16(step3[28], s3_28_0);
step3[29] = _mm_sub_epi16(step3[29], s3_29_0);
step3[30] = _mm_sub_epi16(step3[30], s3_30_0);
step3[31] = _mm_sub_epi16(step3[31], s3_31_0);
step3[ 0] = _mm_add_epi16(step3[ 0], kOne);
step3[ 1] = _mm_add_epi16(step3[ 1], kOne);
step3[ 2] = _mm_add_epi16(step3[ 2], kOne);
step3[ 3] = _mm_add_epi16(step3[ 3], kOne);
step3[ 4] = _mm_add_epi16(step3[ 4], kOne);
step3[ 5] = _mm_add_epi16(step3[ 5], kOne);
step3[ 6] = _mm_add_epi16(step3[ 6], kOne);
step3[ 7] = _mm_add_epi16(step3[ 7], kOne);
step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
step3[10] = _mm_add_epi16(step3[10], kOne);
step3[11] = _mm_add_epi16(step3[11], kOne);
step3[12] = _mm_add_epi16(step3[12], kOne);
step3[13] = _mm_add_epi16(step3[13], kOne);
step2[14] = _mm_add_epi16(step2[14], kOne);
step2[15] = _mm_add_epi16(step2[15], kOne);
step3[16] = _mm_add_epi16(step3[16], kOne);
step3[17] = _mm_add_epi16(step3[17], kOne);
step3[18] = _mm_add_epi16(step3[18], kOne);
step3[19] = _mm_add_epi16(step3[19], kOne);
step3[20] = _mm_add_epi16(step3[20], kOne);
step3[21] = _mm_add_epi16(step3[21], kOne);
step3[22] = _mm_add_epi16(step3[22], kOne);
step3[23] = _mm_add_epi16(step3[23], kOne);
step3[24] = _mm_add_epi16(step3[24], kOne);
step3[25] = _mm_add_epi16(step3[25], kOne);
step3[26] = _mm_add_epi16(step3[26], kOne);
step3[27] = _mm_add_epi16(step3[27], kOne);
step3[28] = _mm_add_epi16(step3[28], kOne);
step3[29] = _mm_add_epi16(step3[29], kOne);
step3[30] = _mm_add_epi16(step3[30], kOne);
step3[31] = _mm_add_epi16(step3[31], kOne);
step3[ 0] = _mm_srai_epi16(step3[ 0], 2);
step3[ 1] = _mm_srai_epi16(step3[ 1], 2);
step3[ 2] = _mm_srai_epi16(step3[ 2], 2);
step3[ 3] = _mm_srai_epi16(step3[ 3], 2);
step3[ 4] = _mm_srai_epi16(step3[ 4], 2);
step3[ 5] = _mm_srai_epi16(step3[ 5], 2);
step3[ 6] = _mm_srai_epi16(step3[ 6], 2);
step3[ 7] = _mm_srai_epi16(step3[ 7], 2);
step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
step3[10] = _mm_srai_epi16(step3[10], 2);
step3[11] = _mm_srai_epi16(step3[11], 2);
step3[12] = _mm_srai_epi16(step3[12], 2);
step3[13] = _mm_srai_epi16(step3[13], 2);