Commit 41a350a8 authored by Jingning Han
Browse files

Change eob threshold for partial inverse 8x8 2D-DCT to 12

In the scanning order, the first 12 coefficients of the 8x8 2D-DCT all
lie in the top-left 4x4 block. Hence the partial inverse 8x8 2D-DCT
can handle every case with eob less than or equal to 12.

The overall runtime of the inverse 8x8 2D-DCT unit is reduced from
166 cycles (using SSE2) to 150 cycles (using SSSE3).

Change-Id: I4514f9748042809ac84df4c14382c00f313f1cd2
parent 9e7b09bc
...@@ -132,8 +132,8 @@ INSTANTIATE_TEST_CASE_P( ...@@ -132,8 +132,8 @@ INSTANTIATE_TEST_CASE_P(
&vp9_idct16x16_1_add_c, &vp9_idct16x16_1_add_c,
TX_16X16, 1), TX_16X16, 1),
make_tuple(&vp9_idct8x8_64_add_c, make_tuple(&vp9_idct8x8_64_add_c,
&vp9_idct8x8_10_add_c, &vp9_idct8x8_12_add_c,
TX_8X8, 10), TX_8X8, 12),
make_tuple(&vp9_idct8x8_64_add_c, make_tuple(&vp9_idct8x8_64_add_c,
&vp9_idct8x8_1_add_c, &vp9_idct8x8_1_add_c,
TX_8X8, 1), TX_8X8, 1),
...@@ -154,8 +154,8 @@ INSTANTIATE_TEST_CASE_P( ...@@ -154,8 +154,8 @@ INSTANTIATE_TEST_CASE_P(
&vp9_idct16x16_1_add_neon, &vp9_idct16x16_1_add_neon,
TX_16X16, 1), TX_16X16, 1),
make_tuple(&vp9_idct8x8_64_add_c, make_tuple(&vp9_idct8x8_64_add_c,
&vp9_idct8x8_10_add_neon, &vp9_idct8x8_12_add_neon,
TX_8X8, 10), TX_8X8, 12),
make_tuple(&vp9_idct8x8_64_add_c, make_tuple(&vp9_idct8x8_64_add_c,
&vp9_idct8x8_1_add_neon, &vp9_idct8x8_1_add_neon,
TX_8X8, 1), TX_8X8, 1),
...@@ -181,8 +181,8 @@ INSTANTIATE_TEST_CASE_P( ...@@ -181,8 +181,8 @@ INSTANTIATE_TEST_CASE_P(
&vp9_idct16x16_1_add_sse2, &vp9_idct16x16_1_add_sse2,
TX_16X16, 1), TX_16X16, 1),
make_tuple(&vp9_idct8x8_64_add_c, make_tuple(&vp9_idct8x8_64_add_c,
&vp9_idct8x8_10_add_sse2, &vp9_idct8x8_12_add_sse2,
TX_8X8, 10), TX_8X8, 12),
make_tuple(&vp9_idct8x8_64_add_c, make_tuple(&vp9_idct8x8_64_add_c,
&vp9_idct8x8_1_add_sse2, &vp9_idct8x8_1_add_sse2,
TX_8X8, 1), TX_8X8, 1),
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
; ;
EXPORT |vp9_idct8x8_64_add_neon| EXPORT |vp9_idct8x8_64_add_neon|
EXPORT |vp9_idct8x8_10_add_neon| EXPORT |vp9_idct8x8_12_add_neon|
ARM ARM
REQUIRE8 REQUIRE8
PRESERVE8 PRESERVE8
...@@ -310,13 +310,13 @@ ...@@ -310,13 +310,13 @@
bx lr bx lr
ENDP ; |vp9_idct8x8_64_add_neon| ENDP ; |vp9_idct8x8_64_add_neon|
;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ;void vp9_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
; ;
; r0 int16_t input ; r0 int16_t input
; r1 uint8_t *dest ; r1 uint8_t *dest
; r2 int dest_stride) ; r2 int dest_stride)
|vp9_idct8x8_10_add_neon| PROC |vp9_idct8x8_12_add_neon| PROC
push {r4-r9} push {r4-r9}
vpush {d8-d15} vpush {d8-d15}
vld1.s16 {q8,q9}, [r0]! vld1.s16 {q8,q9}, [r0]!
...@@ -514,6 +514,6 @@ ...@@ -514,6 +514,6 @@
vpop {d8-d15} vpop {d8-d15}
pop {r4-r9} pop {r4-r9}
bx lr bx lr
ENDP ; |vp9_idct8x8_10_add_neon| ENDP ; |vp9_idct8x8_12_add_neon|
END END
...@@ -617,7 +617,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, ...@@ -617,7 +617,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
} }
} }
void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
int dest_stride) { int dest_stride) {
DECLARE_ALIGNED(32, int16_t, out[8 * 8]); DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
int16_t *outptr = out; int16_t *outptr = out;
......
...@@ -421,7 +421,7 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, ...@@ -421,7 +421,7 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
} }
} }
void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) { void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[8 * 8] = { 0 }; int16_t out[8 * 8] = { 0 };
int16_t *outptr = out; int16_t *outptr = out;
int i, j; int i, j;
...@@ -1348,8 +1348,8 @@ void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) { ...@@ -1348,8 +1348,8 @@ void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
if (eob == 1) if (eob == 1)
// DC only DCT coefficient // DC only DCT coefficient
vp9_idct8x8_1_add(input, dest, stride); vp9_idct8x8_1_add(input, dest, stride);
else if (eob <= 10) else if (eob <= 12)
vp9_idct8x8_10_add(input, dest, stride); vp9_idct8x8_12_add(input, dest, stride);
else else
vp9_idct8x8_64_add(input, dest, stride); vp9_idct8x8_64_add(input, dest, stride);
} }
......
...@@ -312,8 +312,8 @@ specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/; ...@@ -312,8 +312,8 @@ specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/;
add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64"; specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64";
add_proto qw/void vp9_idct8x8_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/, "$ssse3_x86_64"; specialize qw/vp9_idct8x8_12_add sse2 neon dspr2/, "$ssse3_x86_64";
add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/; specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/;
......
...@@ -995,7 +995,7 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, ...@@ -995,7 +995,7 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
RECON_AND_STORE(dest, in[7]); RECON_AND_STORE(dest, in[7]);
} }
void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4); const __m128i final_rounding = _mm_set1_epi16(1<<4);
......
...@@ -185,7 +185,7 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride ...@@ -185,7 +185,7 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
RET RET
; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero ; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero
cglobal idct8x8_10_add, 3, 5, 13, input, output, stride cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
mova m8, [pd_8192] mova m8, [pd_8192]
mova m11, [pw_16] mova m11, [pw_16]
mova m12, [pw_11585x2] mova m12, [pw_11585x2]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment