Commit 043f4964 authored by Linfeng Zhang

Implement fdct4x8_new_sse2 and fadst4x8_new_sse2

Change-Id: I9ab260c5ca31fe7e06bfc0f806893463c5255c45
parent 1fffc1f4
@@ -24,6 +24,22 @@
extern "C" {
#endif
// Butterfly on the four 16-bit lanes in the low half of *in0 and *in1:
// *out0 = round_shift(*in0 * w0.lo + *in1 * w0.hi) per lane, and likewise
// *out1 with w1, where w0/w1 hold cosine pairs built by pair_set_epi16().
static INLINE void btf_16_w4_sse2(
const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
__m128i *const out0, __m128i *const out1) {
const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
const __m128i u0 = _mm_madd_epi16(t0, *w0);
const __m128i v0 = _mm_madd_epi16(t0, *w1);
const __m128i a0 = _mm_add_epi32(u0, __rounding);
const __m128i b0 = _mm_add_epi32(v0, __rounding);
const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
const __m128i d0 = _mm_srai_epi32(b0, cos_bit);
*out0 = _mm_packs_epi32(c0, c0);
*out1 = _mm_packs_epi32(d0, d0);
}
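For reference, a minimal scalar model of the helper above (not part of this change; the function name scalar_btf_16_w4 is illustrative only): each of the four 16-bit lanes in the low halves of *in0/*in1 is combined with a cosine pair, rounded, shifted right by cos_bit, and saturated back to 16 bits, mirroring _mm_madd_epi16 followed by _mm_packs_epi32.

#include <stdint.h>

// Hypothetical scalar equivalent of btf_16_w4_sse2. w0_lo/w0_hi and
// w1_lo/w1_hi are the two arguments that pair_set_epi16() interleaves into
// the SSE2 weight vectors.
static void scalar_btf_16_w4(int16_t w0_lo, int16_t w0_hi, int16_t w1_lo,
                             int16_t w1_hi, int8_t cos_bit,
                             const int16_t in0[4], const int16_t in1[4],
                             int16_t out0[4], int16_t out1[4]) {
  const int32_t rounding = 1 << (cos_bit - 1);
  for (int i = 0; i < 4; ++i) {
    const int32_t u = in0[i] * w0_lo + in1[i] * w0_hi;  // _mm_madd_epi16 with *w0
    const int32_t v = in0[i] * w1_lo + in1[i] * w1_hi;  // _mm_madd_epi16 with *w1
    int32_t c = (u + rounding) >> cos_bit;              // round, then shift
    int32_t d = (v + rounding) >> cos_bit;
    // _mm_packs_epi32 saturates each result to the int16_t range.
    if (c > INT16_MAX) c = INT16_MAX;
    if (c < INT16_MIN) c = INT16_MIN;
    if (d > INT16_MAX) d = INT16_MAX;
    if (d < INT16_MIN) d = INT16_MIN;
    out0[i] = (int16_t)c;
    out1[i] = (int16_t)d;
  }
}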
#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
{ \
__m128i t0 = _mm_unpacklo_epi16(in0, in1); \
......
@@ -12,7 +12,7 @@
#include "av1/common/x86/av1_txfm_sse2.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
// TODO(linfengz): specialize fdct4x8 and fadst4x8 optimization.
// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible).
static void fdct4x4_new_sse2(const __m128i *input, __m128i *output,
int8_t cos_bit) {
@@ -78,6 +78,75 @@ void fdct4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
output[3] = x2[3];
}
void fdct4x8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
const int32_t *cospi = cospi_arr(cos_bit);
const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
__m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
__m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
__m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
__m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
__m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
__m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
__m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
__m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
__m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
// stage 1
__m128i x1[8];
x1[0] = _mm_adds_epi16(input[0], input[7]);
x1[7] = _mm_subs_epi16(input[0], input[7]);
x1[1] = _mm_adds_epi16(input[1], input[6]);
x1[6] = _mm_subs_epi16(input[1], input[6]);
x1[2] = _mm_adds_epi16(input[2], input[5]);
x1[5] = _mm_subs_epi16(input[2], input[5]);
x1[3] = _mm_adds_epi16(input[3], input[4]);
x1[4] = _mm_subs_epi16(input[3], input[4]);
// stage 2
__m128i x2[8];
x2[0] = _mm_adds_epi16(x1[0], x1[3]);
x2[3] = _mm_subs_epi16(x1[0], x1[3]);
x2[1] = _mm_adds_epi16(x1[1], x1[2]);
x2[2] = _mm_subs_epi16(x1[1], x1[2]);
x2[4] = x1[4];
btf_16_w4_sse2(&cospi_m32_p32, &cospi_p32_p32, __rounding, cos_bit, &x1[5],
&x1[6], &x2[5], &x2[6]);
x2[7] = x1[7];
// stage 3
__m128i x3[8];
btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x2[0],
&x2[1], &x3[0], &x3[1]);
btf_16_w4_sse2(&cospi_p48_p16, &cospi_m16_p48, __rounding, cos_bit, &x2[2],
&x2[3], &x3[2], &x3[3]);
x3[4] = _mm_adds_epi16(x2[4], x2[5]);
x3[5] = _mm_subs_epi16(x2[4], x2[5]);
x3[6] = _mm_subs_epi16(x2[7], x2[6]);
x3[7] = _mm_adds_epi16(x2[7], x2[6]);
// stage 4
__m128i x4[8];
x4[0] = x3[0];
x4[1] = x3[1];
x4[2] = x3[2];
x4[3] = x3[3];
btf_16_w4_sse2(&cospi_p56_p08, &cospi_m08_p56, __rounding, cos_bit, &x3[4],
&x3[7], &x4[4], &x4[7]);
btf_16_w4_sse2(&cospi_p24_p40, &cospi_m40_p24, __rounding, cos_bit, &x3[5],
&x3[6], &x4[5], &x4[6]);
// stage 5
output[0] = x4[0];
output[1] = x4[4];
output[2] = x4[2];
output[3] = x4[6];
output[4] = x4[1];
output[5] = x4[5];
output[6] = x4[3];
output[7] = x4[7];
}
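A hypothetical call site for the new 8-point column transform above, assuming the surrounding file's SSE2 includes; the helper name example_fdct4x8_cols, the load pattern, and the cos_bit value of 13 are illustrative only (the real cos_bit comes from the forward-transform configuration tables), and the input is assumed to be already scaled as the 2D transform driver expects.

// Hypothetical driver: run the 8-point DCT down the 4 columns of a 4x8
// block, with each row's 4 samples packed in the low 64 bits of a vector.
static void example_fdct4x8_cols(const int16_t *src, int stride,
                                 __m128i out[8]) {
  __m128i in[8];
  for (int r = 0; r < 8; ++r) {
    // Load 4 int16 samples of row r; the upper half of the vector is unused.
    in[r] = _mm_loadl_epi64((const __m128i *)(src + r * stride));
  }
  fdct4x8_new_sse2(in, out, /*cos_bit=*/13);
}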
void fdct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
const int32_t *cospi = cospi_arr(cos_bit);
const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
@@ -1392,6 +1461,102 @@ static void fadst4_new_sse2(const __m128i *input, __m128i *output,
output[3] = _mm_srli_si128(output[1], 8);
}
void fadst4x8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
const int32_t *cospi = cospi_arr(cos_bit);
const __m128i __zero = _mm_setzero_si128();
const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
__m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
__m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
__m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
__m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
__m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
__m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
__m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
__m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
__m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
__m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
__m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
__m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
__m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
// stage 1
__m128i x1[8];
x1[0] = input[0];
x1[1] = _mm_subs_epi16(__zero, input[7]);
x1[2] = _mm_subs_epi16(__zero, input[3]);
x1[3] = input[4];
x1[4] = _mm_subs_epi16(__zero, input[1]);
x1[5] = input[6];
x1[6] = input[2];
x1[7] = _mm_subs_epi16(__zero, input[5]);
// stage 2
__m128i x2[8];
x2[0] = x1[0];
x2[1] = x1[1];
btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[2],
&x1[3], &x2[2], &x2[3]);
x2[4] = x1[4];
x2[5] = x1[5];
btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[6],
&x1[7], &x2[6], &x2[7]);
// stage 3
__m128i x3[8];
x3[0] = _mm_adds_epi16(x2[0], x2[2]);
x3[2] = _mm_subs_epi16(x2[0], x2[2]);
x3[1] = _mm_adds_epi16(x2[1], x2[3]);
x3[3] = _mm_subs_epi16(x2[1], x2[3]);
x3[4] = _mm_adds_epi16(x2[4], x2[6]);
x3[6] = _mm_subs_epi16(x2[4], x2[6]);
x3[5] = _mm_adds_epi16(x2[5], x2[7]);
x3[7] = _mm_subs_epi16(x2[5], x2[7]);
// stage 4
__m128i x4[8];
x4[0] = x3[0];
x4[1] = x3[1];
x4[2] = x3[2];
x4[3] = x3[3];
btf_16_w4_sse2(&cospi_p16_p48, &cospi_p48_m16, __rounding, cos_bit, &x3[4],
&x3[5], &x4[4], &x4[5]);
btf_16_w4_sse2(&cospi_m48_p16, &cospi_p16_p48, __rounding, cos_bit, &x3[6],
&x3[7], &x4[6], &x4[7]);
// stage 5
__m128i x5[8];
x5[0] = _mm_adds_epi16(x4[0], x4[4]);
x5[4] = _mm_subs_epi16(x4[0], x4[4]);
x5[1] = _mm_adds_epi16(x4[1], x4[5]);
x5[5] = _mm_subs_epi16(x4[1], x4[5]);
x5[2] = _mm_adds_epi16(x4[2], x4[6]);
x5[6] = _mm_subs_epi16(x4[2], x4[6]);
x5[3] = _mm_adds_epi16(x4[3], x4[7]);
x5[7] = _mm_subs_epi16(x4[3], x4[7]);
// stage 6
__m128i x6[8];
btf_16_w4_sse2(&cospi_p04_p60, &cospi_p60_m04, __rounding, cos_bit, &x5[0],
&x5[1], &x6[0], &x6[1]);
btf_16_w4_sse2(&cospi_p20_p44, &cospi_p44_m20, __rounding, cos_bit, &x5[2],
&x5[3], &x6[2], &x6[3]);
btf_16_w4_sse2(&cospi_p36_p28, &cospi_p28_m36, __rounding, cos_bit, &x5[4],
&x5[5], &x6[4], &x6[5]);
btf_16_w4_sse2(&cospi_p52_p12, &cospi_p12_m52, __rounding, cos_bit, &x5[6],
&x5[7], &x6[6], &x6[7]);
// stage 7
output[0] = x6[1];
output[1] = x6[6];
output[2] = x6[3];
output[3] = x6[4];
output[4] = x6[5];
output[5] = x6[2];
output[6] = x6[7];
output[7] = x6[0];
}
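As a cross-check of stage 1 above, a small scalar sketch (illustrative only, not part of this change) of the permute-and-negate pattern applied to the eight input rows; the vector code performs the negations with saturating subtractions from zero (_mm_subs_epi16), which differs from plain negation only for INT16_MIN.

// Hypothetical scalar reference for stage 1 of the 8-point ADST.
static void example_fadst8_stage1(const int16_t in[8], int16_t x1[8]) {
  static const int kPerm[8] = { 0, 7, 3, 4, 1, 6, 2, 5 };
  static const int kSign[8] = { 1, -1, -1, 1, -1, 1, 1, -1 };
  for (int i = 0; i < 8; ++i) x1[i] = (int16_t)(kSign[i] * in[kPerm[i]]);
}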
static void fadst8x4_new_sse2(const __m128i *input, __m128i *output,
int8_t cos_bit) {
const int32_t *sinpi = sinpi_arr(cos_bit);
@@ -1826,41 +1991,41 @@ static const transform_2d_sse2 txfm4_arr[] = {
};
static const transform_2d_sse2 txfm4x8_arr[16] = {
{ fdct8_new_sse2, fdct4_new_sse2 }, // DCT_DCT
{ fadst8_new_sse2, fdct4_new_sse2 }, // ADST_DCT
{ fdct8_new_sse2, fadst8x4_new_sse2 }, // DCT_ADST
{ fadst8_new_sse2, fadst8x4_new_sse2 }, // ADST_ADST
{ fadst8_new_sse2, fdct4_new_sse2 }, // FLIPADST_DCT
{ fdct8_new_sse2, fadst8x4_new_sse2 }, // DCT_FLIPADST
{ fadst8_new_sse2, fadst8x4_new_sse2 }, // FLIPADST_FLIPADST
{ fadst8_new_sse2, fadst8x4_new_sse2 }, // ADST_FLIPADST
{ fadst8_new_sse2, fadst8x4_new_sse2 }, // FLIPADST_ADST
{ fdct4x8_new_sse2, fdct4_new_sse2 }, // DCT_DCT
{ fadst4x8_new_sse2, fdct4_new_sse2 }, // ADST_DCT
{ fdct4x8_new_sse2, fadst8x4_new_sse2 }, // DCT_ADST
{ fadst4x8_new_sse2, fadst8x4_new_sse2 }, // ADST_ADST
{ fadst4x8_new_sse2, fdct4_new_sse2 }, // FLIPADST_DCT
{ fdct4x8_new_sse2, fadst8x4_new_sse2 }, // DCT_FLIPADST
{ fadst4x8_new_sse2, fadst8x4_new_sse2 }, // FLIPADST_FLIPADST
{ fadst4x8_new_sse2, fadst8x4_new_sse2 }, // ADST_FLIPADST
{ fadst4x8_new_sse2, fadst8x4_new_sse2 }, // FLIPADST_ADST
{ fidentity8_new_sse2, fidentity8x4_new_sse2 }, // IDTX
{ fdct8_new_sse2, fidentity8x4_new_sse2 }, // V_DCT
{ fdct4x8_new_sse2, fidentity8x4_new_sse2 }, // V_DCT
{ fidentity8_new_sse2, fdct4_new_sse2 }, // H_DCT
{ fadst8_new_sse2, fidentity8x4_new_sse2 }, // V_ADST
{ fadst4x8_new_sse2, fidentity8x4_new_sse2 }, // V_ADST
{ fidentity8_new_sse2, fadst8x4_new_sse2 }, // H_ADST
{ fadst8_new_sse2, fidentity8x4_new_sse2 }, // V_FLIPADST
{ fadst4x8_new_sse2, fidentity8x4_new_sse2 }, // V_FLIPADST
{ fidentity8_new_sse2, fadst8x4_new_sse2 }, // H_FLIPADST
};
static const transform_2d_sse2 txfm8x4_arr[] = {
{ fdct4_new_sse2, fdct8_new_sse2 }, // DCT_DCT
{ fadst8x4_new_sse2, fdct8_new_sse2 }, // ADST_DCT
{ fdct4_new_sse2, fadst8_new_sse2 }, // DCT_ADST
{ fadst8x4_new_sse2, fadst8_new_sse2 }, // ADST_ADST
{ fadst8x4_new_sse2, fdct8_new_sse2 }, // FLIPADST_DCT
{ fdct4_new_sse2, fadst8_new_sse2 }, // DCT_FLIPADST
{ fadst8x4_new_sse2, fadst8_new_sse2 }, // FLIPADST_FLIPADST
{ fadst8x4_new_sse2, fadst8_new_sse2 }, // ADST_FLIPADST
{ fadst8x4_new_sse2, fadst8_new_sse2 }, // FLIPADST_ADST
{ fdct4_new_sse2, fdct4x8_new_sse2 }, // DCT_DCT
{ fadst8x4_new_sse2, fdct4x8_new_sse2 }, // ADST_DCT
{ fdct4_new_sse2, fadst4x8_new_sse2 }, // DCT_ADST
{ fadst8x4_new_sse2, fadst4x8_new_sse2 }, // ADST_ADST
{ fadst8x4_new_sse2, fdct4x8_new_sse2 }, // FLIPADST_DCT
{ fdct4_new_sse2, fadst4x8_new_sse2 }, // DCT_FLIPADST
{ fadst8x4_new_sse2, fadst4x8_new_sse2 }, // FLIPADST_FLIPADST
{ fadst8x4_new_sse2, fadst4x8_new_sse2 }, // ADST_FLIPADST
{ fadst8x4_new_sse2, fadst4x8_new_sse2 }, // FLIPADST_ADST
{ fidentity8x4_new_sse2, fidentity8_new_sse2 }, // IDTX
{ fdct4_new_sse2, fidentity8_new_sse2 }, // V_DCT
{ fidentity8x4_new_sse2, fdct8_new_sse2 }, // H_DCT
{ fidentity8x4_new_sse2, fdct4x8_new_sse2 }, // H_DCT
{ fadst8x4_new_sse2, fidentity8_new_sse2 }, // V_ADST
{ fidentity8x4_new_sse2, fadst8_new_sse2 }, // H_ADST
{ fidentity8x4_new_sse2, fadst4x8_new_sse2 }, // H_ADST
{ fadst8x4_new_sse2, fidentity8_new_sse2 }, // V_FLIPADST
{ fidentity8x4_new_sse2, fadst8_new_sse2 }, // H_FLIPADST
{ fidentity8x4_new_sse2, fadst4x8_new_sse2 }, // H_FLIPADST
};
static const transform_2d_sse2 txfm8_arr[] = {
......