Commit 1bb11781 authored by Jingning Han's avatar Jingning Han
Browse files

Rework idct8x8_10 SSE2 implementation

This commit optimizes the SSE2 implmentation of idct8x8_10. It exploits
the fact that only top-left 4x4 block contains non-zero coefficients,
and hence reduces the instructions needed.

The runtime of idct8x8_10_sse2 goes down from 216 to 198 CPU cycles,
estimated by averaging over 100000 runs. For pedestrian_area_1080p 300
frames coded at 4000kbps, the average decoding speed goes up from
79.3 fps to 79.7 fps.

Change-Id: I6d277bbaa3ec9e1562667906975bae06904cb180
parent 3bcece95
......@@ -361,24 +361,23 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
}
#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
\
#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
out0, out1, out2, out3) \
{ \
const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
\
const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
\
\
out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
out4 = out5 = out6 = out7 = zero; \
}
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
......@@ -394,6 +393,14 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
in3 = _mm_unpackhi_epi32(tr0_2, tr0_3); /* i7 i6 */ \
}
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
{ \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
}
// Define Macro for multiplying elements by constants and adding them together.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
......@@ -563,8 +570,8 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
// 2-D
for (i = 0; i < 2; i++) {
// 8x8 Transpose is copied from vp9_fdct8x8_sse2()
TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in4, in5, in6, in7);
TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
// 4-stage 1D idct8x8
IDCT8_1D
......@@ -1032,12 +1039,11 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
// 8x4 Transpose
TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)
TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
// Stage1
{ //NOLINT
const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);
const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
tmp0 = _mm_madd_epi16(lo_17, stg1_0);
tmp2 = _mm_madd_epi16(lo_17, stg1_1);
......@@ -1053,16 +1059,14 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
stp1_4 = _mm_packs_epi32(tmp0, zero);
stp1_7 = _mm_packs_epi32(tmp2, zero);
stp1_5 = _mm_packs_epi32(tmp4, zero);
stp1_6 = _mm_packs_epi32(tmp6, zero);
stp1_4 = _mm_packs_epi32(tmp0, tmp2);
stp1_5 = _mm_packs_epi32(tmp4, tmp6);
}
// Stage2
{ //NOLINT
const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);
const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
tmp0 = _mm_madd_epi16(lo_04, stg2_0);
tmp2 = _mm_madd_epi16(lo_04, stg2_1);
......@@ -1078,24 +1082,26 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
stp2_0 = _mm_packs_epi32(tmp0, zero);
stp2_1 = _mm_packs_epi32(tmp2, zero);
stp2_2 = _mm_packs_epi32(tmp4, zero);
stp2_3 = _mm_packs_epi32(tmp6, zero);
stp2_0 = _mm_packs_epi32(tmp0, tmp2);
stp2_2 = _mm_packs_epi32(tmp6, tmp4);
stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);
stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);
stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);
stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);
tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
stp2_4 = tmp0;
stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
}
// Stage3
{ //NOLINT
const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);
stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);
tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
tmp0 = _mm_madd_epi16(lo_56, stg3_0);
tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
......@@ -1105,27 +1111,19 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
stp1_5 = _mm_packs_epi32(tmp0, zero);
stp1_6 = _mm_packs_epi32(tmp2, zero);
stp1_5 = _mm_packs_epi32(tmp0, tmp2);
}
// Stage4
in0 = _mm_adds_epi16(stp1_0, stp2_7);
in1 = _mm_adds_epi16(stp1_1, stp1_6);
in2 = _mm_adds_epi16(stp1_2, stp1_5);
in3 = _mm_adds_epi16(stp1_3, stp2_4);
in4 = _mm_subs_epi16(stp1_3, stp2_4);
in5 = _mm_subs_epi16(stp1_2, stp1_5);
in6 = _mm_subs_epi16(stp1_1, stp1_6);
in7 = _mm_subs_epi16(stp1_0, stp2_7);
tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
// Columns. 4x8 Transpose
TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in4, in5, in6, in7)
TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
in4 = in5 = in6 = in7 = zero;
// 1D idct8x8
IDCT8_1D
// Final rounding and shift
in0 = _mm_adds_epi16(in0, final_rounding);
in1 = _mm_adds_epi16(in1, final_rounding);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment