Commit 1109b6b8 authored by Jingning Han

Enable SSE2 4x4 ADST/DCT transform

This commit enables the SSE2 4x4 forward hybrid transform. The runtime
goes from 249 cycles down to 74 cycles. Overall around a 2% speed-up
with no change in compression performance.

Change-Id: Iad4d526346e05c7be896466c05500711bb763660
parent 59070f6e
@@ -579,7 +579,7 @@ fi
# fdct functions
prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
-specialize vp9_short_fht4x4
+specialize vp9_short_fht4x4 sse2
prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
specialize vp9_short_fht8x8 sse2
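With the specialization above in place, the rtcd layer resolves calls made through the generic vp9_short_fht4x4 entry point (declared in the generated vp9_rtcd.h) to vp9_short_fht4x4_sse2 on SSE2-capable CPUs, and to the C implementation otherwise. A minimal caller sketch, with illustrative buffer names; the stride argument is assumed here to be in int16_t units, as used by the SSE2 path below:

  DECLARE_ALIGNED(16, int16_t, coeff[4 * 4]);  /* output must be 16-byte aligned for the stores */
  int16_t residual[4 * 4] = {0};               /* prediction residual, 4 rows with a stride of 4 */
  /* tx_type: 0 = DCT_DCT, 1 = ADST_DCT, 2 = DCT_ADST, 3 = ADST_ADST */
  vp9_short_fht4x4(residual, coeff, 4, 3 /* ADST_ADST */);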
@@ -117,6 +117,166 @@ void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) {
  vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch);
}
static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  __m128i mask;
  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
  in[0] = _mm_slli_epi16(in[0], 4);
  in[1] = _mm_slli_epi16(in[1], 4);
  in[2] = _mm_slli_epi16(in[2], 4);
  in[3] = _mm_slli_epi16(in[3], 4);
  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
  in[0] = _mm_add_epi16(in[0], mask);
  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
}
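/* Not part of this change: a plain-C sketch of what load_buffer_4x4() does to
 * the first input row, as read from the intrinsics above.  Every sample is
 * scaled up by 16 (<< 4); the cmpeq/add pair then adds 1 to the very first
 * sample whenever it is nonzero.  The helper name is hypothetical. */
static INLINE void load_row0_scalar_sketch(const int16_t *input, int16_t *row0) {
  int j;
  for (j = 0; j < 4; ++j)
    row0[j] = input[j] * 16;
  if (row0[0])
    row0[0] += 1;
}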
static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
  __m128i out01 = _mm_add_epi16(in01, kOne);
  __m128i out23 = _mm_add_epi16(in23, kOne);
  out01 = _mm_srai_epi16(out01, 2);
  out23 = _mm_srai_epi16(out23, 2);
  _mm_store_si128((__m128i *)(output + 0 * 8), out01);
  _mm_store_si128((__m128i *)(output + 1 * 8), out23);
}
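/* Not part of this change, just a note on the store path above: each packed
 * coefficient is rounded with the equivalent of (x + 1) >> 2 before being
 * written out as two 8-lane stores at output + 0 and output + 8, which
 * together cover the 16 coefficients of the 4x4 block. */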
static INLINE void transpose_4x4(__m128i *res) {
  // Combine and transpose
  // 00 01 02 03 20 21 22 23
  // 10 11 12 13 30 31 32 33
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 00 10 20 30 01 11 21 31
  // 02 12 22 32 03 13 23 33
  // only use the first 4 16-bit integers
  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}
void fdct4_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[4], v[4];
  u[0] = _mm_add_epi16(in[0], in[3]);
  u[1] = _mm_add_epi16(in[1], in[2]);
  u[2] = _mm_sub_epi16(in[1], in[2]);
  u[3] = _mm_sub_epi16(in[0], in[3]);
  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
  v[1] = _mm_unpacklo_epi16(u[2], u[3]);
  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
  u[2] = _mm_madd_epi16(v[1], k__cospi_p24_p08); // 1
  u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24); // 3
  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
  transpose_4x4(in);
}
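/* Not part of this change: a plain-C sketch of the 4-point DCT the intrinsics
 * above compute for one row/column x[0..3], derived from the madd constants
 * (the authoritative scalar version lives in the C encoder, vp9_dct.c).
 * The helper name is hypothetical. */
static INLINE void fdct4_1d_scalar_sketch(const int16_t *x, int16_t *out) {
  const int s0 = x[0] + x[3];
  const int s1 = x[1] + x[2];
  const int s2 = x[1] - x[2];
  const int s3 = x[0] - x[3];
  out[0] = (int16_t)(((s0 + s1) * cospi_16_64 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  out[1] = (int16_t)((s2 * cospi_24_64 + s3 * cospi_8_64 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  out[2] = (int16_t)(((s0 - s1) * cospi_16_64 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  out[3] = (int16_t)((-s2 * cospi_8_64 + s3 * cospi_24_64 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}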
void fadst4_1d_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];
  __m128i in7 = _mm_add_epi16(in[0], in[1]);
  in7 = _mm_sub_epi16(in7, in[3]);
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[2], kZero);
  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = v[2];
  u[2] = _mm_add_epi32(v[3], v[4]);
  u[3] = _mm_sub_epi32(u[2], u[0]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_sub_epi32(u[4], v[5]);
  u[6] = _mm_add_epi32(u[3], u[5]);
  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  transpose_4x4(in);
}
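/* Not part of this change: a plain-C sketch of the 4-point ADST above,
 * reconstructed from the madd constants and the s*-comments (the C encoder's
 * scalar transform in vp9_dct.c is the authoritative reference).  The helper
 * name is hypothetical. */
static INLINE void fadst4_1d_scalar_sketch(const int16_t *x, int16_t *out) {
  const int s0 = sinpi_1_9 * x[0], s1 = sinpi_4_9 * x[0];
  const int s2 = sinpi_2_9 * x[1], s3 = sinpi_1_9 * x[1];
  const int s4 = sinpi_3_9 * x[2];
  const int s5 = sinpi_4_9 * x[3], s6 = sinpi_2_9 * x[3];
  const int t0 = s0 + s2 + s5 + s4;                 // u[0] above
  const int t1 = sinpi_3_9 * (x[0] + x[1] - x[3]);  // u[1]
  const int t2 = s1 - s3 + s6 - s4;                 // u[2]
  const int t3 = t2 - t0 + 3 * s4;                  // u[6]
  out[0] = (int16_t)((t0 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  out[1] = (int16_t)((t1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  out[2] = (int16_t)((t2 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  out[3] = (int16_t)((t3 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}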
void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
                           int stride, int tx_type) {
  __m128i in[4];
  load_buffer_4x4(input, in, stride);
  switch (tx_type) {
    case 0: // DCT_DCT
      fdct4_1d_sse2(in);
      fdct4_1d_sse2(in);
      break;
    case 1: // ADST_DCT
      fadst4_1d_sse2(in);
      fdct4_1d_sse2(in);
      break;
    case 2: // DCT_ADST
      fdct4_1d_sse2(in);
      fadst4_1d_sse2(in);
      break;
    case 3: // ADST_ADST
      fadst4_1d_sse2(in);
      fadst4_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }
  write_buffer_4x4(output, in);
}
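/* Not part of this change: structural note.  Each tx_type case runs two 1-D
 * transforms; because fdct4_1d_sse2() and fadst4_1d_sse2() end with
 * transpose_4x4(), the second pass operates on the other dimension of the
 * block, and write_buffer_4x4() then applies the final rounding and stores
 * the 16 coefficients. */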
void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
  const int stride = pitch >> 1;
  int pass;
@@ -863,7 +1023,7 @@ void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
  switch (tx_type) {
    case 0: // DCT_DCT
      fdct8_1d_sse2(in);
-      fadst8_1d_sse2(in);
+      fdct8_1d_sse2(in);
      break;
    case 1: // ADST_DCT
      fadst8_1d_sse2(in);