Commit 6c17c9fa authored by Yunqing Wang

Optimize 16x16 dequant and idct

As suggested by Yaowu, simplified the 16x16 dequant and idct. In the decoder,
after the detokenization step, we know the number of non-zero DCT coefficients
(eobs) in a macroblock. The IDCT calculation can be skipped or simplified
based on eobs, which improves decoder performance.

Change-Id: I9ffa1cb134bcb5a7d64fcf90c81871a96d1b4018
parent 4626faf1
......@@ -32,6 +32,7 @@ void vp9_machine_specific_config(VP9_COMMON *ctx) {
rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c;
rtcd->idct.ihaar2 = vp9_short_ihaar2x2_c;
rtcd->idct.idct16x16 = vp9_short_idct16x16_c;
rtcd->idct.idct10_16x16 = vp9_short_idct10_16x16_c;
rtcd->subpix.eighttap16x16 = vp9_eighttap_predict16x16_c;
rtcd->subpix.eighttap8x8 = vp9_eighttap_predict8x8_c;
......
......@@ -50,6 +50,11 @@
#endif
extern prototype_idct(vp9_idct_idct16x16);
/* Default binding for vp9_idct_idct10_16x16: unless the name has already been
 * defined before this point, it resolves to the C implementation
 * vp9_short_idct10_16x16_c. */
#ifndef vp9_idct_idct10_16x16
#define vp9_idct_idct10_16x16 vp9_short_idct10_16x16_c
#endif
/* NOTE(review): prototype_idct is defined outside this view; presumably it
 * expands to the function signature shared by the idct entry points --
 * confirm against its definition. */
extern prototype_idct(vp9_idct_idct10_16x16);
#ifndef vp9_idct_idct8
#define vp9_idct_idct8 vp9_short_idct8x8_c
#endif
......@@ -133,6 +138,7 @@ typedef struct {
vp9_idct_fn_t ihaar2_1;
vp9_idct_fn_t idct16x16;
vp9_idct_fn_t idct10_16x16;
} vp9_idct_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT
......
......@@ -1502,6 +1502,161 @@ void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
output[j * 16 + i] = temp_out[j];
}
}
/* One-dimensional butterfly pass of the reduced 16x16 inverse DCT.
 *
 * The following function is called when we know the maximum number of non-zero
 * dct coefficients is less than or equal to 10.
 *
 * Only input[0] through input[3] are read; the other twelve input entries are
 * never referenced. All sixteen entries of output are written, and output is
 * also used as scratch storage between step 3 and step 4.
 *
 * input:           coefficients of one row or column (first four used).
 * output:          receives the sixteen results.
 * last_shift_bits: right shift applied to every result in step 5. When it is
 *                  greater than zero, 1 << (last_shift_bits - 1) is added
 *                  before shifting so the result is rounded; when it is zero
 *                  no rounding term is added. vp9_short_idct10_16x16_c passes
 *                  0 for its row pass and 3 for its column pass.
 *
 * INITIAL_ROUNDING, INITIAL_SHIFT, RIGHT_ROUNDING, RIGHT_SHIFT and the
 * multipliers C1..C15 are defined outside this block.
 */
static void butterfly_16x16_idct10_1d(int16_t input[16], int16_t output[16],
int last_shift_bits) {
// Zero-initialised: not every entry is assigned during steps 1 and 2.
int16_t step[16] = {0};
// Only entries 8 and 9 are used; they hold int-width values before they are
// narrowed into step[].
int intermediate[16] = {0};
int temp1, temp2;
int last_rounding = 0;
// Half of the final divisor, so that the step 5 shift rounds.
if (last_shift_bits > 0)
last_rounding = 1 << (last_shift_bits - 1);
// step 1 and 2
// step[0] and step[1] both derive from input[0] alone.
step[ 0] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
step[ 1] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
// step[4] and step[5] both derive from input[2] scaled by C8.
temp1 = (2 * (input[2] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
step[ 4] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
step[ 5] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
// for odd input
// input[3] is scaled by C12 (resp. -C4), rounded, then scaled again by C8.
temp1 = (input[3] * C12 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
temp1 *= C8;
intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
temp1 = (-input[3] * C4 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
temp1 *= C8;
intermediate[ 9] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
step[ 8] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
step[ 9] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
// step[10..13] derive from input[1]; step[10] uses the negated value.
step[10] = (-input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
step[11] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
step[12] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
step[13] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
// step[14] and step[15] reuse the same intermediates as step[8] and step[9].
step[14] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
step[15] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
// step 3
// output[] is used as scratch here; it is overwritten again in step 5.
output[0] = step[ 0];
output[1] = step[ 1];
output[2] = step[ 1];
output[3] = step[ 0];
temp1 = step[ 4] * C14;
output[4] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
temp1 = step[ 4] * C2;
output[7] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
temp1 = step[ 5] * C10;
output[5] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
temp1 = step[ 5] * C6;
output[6] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
// Sum/difference pairs over step[8..11] and step[12..15].
output[8] = step[ 8] + step[11];
output[9] = step[ 9] + step[10];
output[10] = step[ 9] - step[10];
output[11] = step[ 8] - step[11];
output[12] = step[12] + step[15];
output[13] = step[13] + step[14];
output[14] = step[13] - step[14];
output[15] = step[12] - step[15];
// step 4 (results are written back into step[])
step[ 0] = output[0] + output[7];
step[ 1] = output[1] + output[6];
step[ 2] = output[2] + output[5];
step[ 3] = output[3] + output[4];
step[ 4] = output[3] - output[4];
step[ 5] = output[2] - output[5];
step[ 6] = output[1] - output[6];
step[ 7] = output[0] - output[7];
// Each of step[8..15] combines a mirrored pair output[k] / output[23 - k]
// with two of the C constants, then rounds and shifts by RIGHT_SHIFT.
temp1 = output[8] * C7;
temp2 = output[15] * C9;
step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
temp1 = output[9] * C11;
temp2 = output[14] * C5;
step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
temp1 = output[10] * C3;
temp2 = output[13] * C13;
step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
temp1 = output[11] * C15;
temp2 = output[12] * C1;
step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
temp1 = output[11] * C1;
temp2 = output[12] * C15;
step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
temp1 = output[10] * C13;
temp2 = output[13] * C3;
step[13] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
temp1 = output[9] * C5;
temp2 = output[14] * C11;
step[14] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
temp1 = output[8] * C9;
temp2 = output[15] * C7;
step[15] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
// step 5
// Final butterfly: output[k] = step[k] + step[15 - k] for k = 0..7, and
// output[15 - k] is the matching difference; every result is rounded with
// last_rounding and shifted right by last_shift_bits.
output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits;
output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits;
output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits;
output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits;
output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits;
output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits;
output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits;
output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits;
output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits;
output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits;
output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits;
output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits;
output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits;
output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits;
output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits;
output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits;
}
/* 16x16 inverse DCT for blocks whose non-zero coefficients all lie in the
 * upper-left 4x4 area.
 *
 * input:  coefficient block; consecutive rows are (pitch >> 1) int16_t
 *         elements apart. Only the first four rows are read.
 * output: receives 16 rows of 16 results, stored contiguously.
 * pitch:  input row pitch; it is halved to step through the int16_t data.
 */
void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
  const int row_stride = pitch >> 1;
  int16_t row_pass[16 * 16];
  int16_t column_in[16], column_out[16];
  int row, col, k;

  /* Row pass: only the first four rows can contain non-zero coefficients,
   * so the remaining twelve rows of the intermediate block stay zero.
   */
  vpx_memset(row_pass, 0, sizeof(row_pass));
  for (row = 0; row < 4; ++row)
    butterfly_16x16_idct10_1d(input + row * row_stride, row_pass + row * 16, 0);

  /* Column pass: gather one column, transform it with the final shift of 3,
   * and scatter the results into the same column of the output block.
   */
  for (col = 0; col < 16; ++col) {
    for (k = 0; k < 16; ++k)
      column_in[k] = row_pass[k * 16 + col];

    butterfly_16x16_idct10_1d(column_in, column_out, 3);

    for (k = 0; k < 16; ++k)
      output[k * 16 + col] = column_out[k];
  }
}
#undef INITIAL_SHIFT
#undef INITIAL_ROUNDING
#undef RIGHT_SHIFT
......
......@@ -54,7 +54,7 @@ specialize vp9_dequant_idct_add_y_block_8x8
prototype void vp9_dequant_idct_add_uv_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, unsigned short *eobs, struct macroblockd *xd"
specialize vp9_dequant_idct_add_uv_block_8x8
prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, unsigned short eobs"
specialize vp9_dequant_idct_add_16x16
prototype void vp9_dequant_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
......
......@@ -401,7 +401,7 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
} else {
vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant,
xd->predictor, xd->dst.y_buffer,
16, xd->dst.y_stride);
16, xd->dst.y_stride, xd->eobs[0]);
}
} else if (tx_size == TX_8X8) {
#if CONFIG_SUPERBLOCKS
......
......@@ -19,6 +19,28 @@
extern int dec_debug;
#endif
/* Reconstruct a width x height block of pixels.
 *
 * Each residual in diff is added to the matching predictor pixel in pred, the
 * sum is clamped to the range [0, 255], and the result is stored in dest.
 *
 * diff:   residuals, stored contiguously with `width` entries per row.
 * pred:   predictor pixels; consecutive rows are `pitch` bytes apart.
 * dest:   destination pixels; consecutive rows are `stride` bytes apart.
 * width:  number of columns processed per row.
 * height: number of rows processed.
 */
static void recon(int16_t *diff, uint8_t *pred, int pitch, uint8_t *dest,
                  int stride, int width, int height) {
  int row;

  for (row = 0; row < height; ++row) {
    const int16_t *residual_row = diff + row * width;
    const uint8_t *pred_row = pred + row * pitch;
    uint8_t *dest_row = dest + row * stride;
    int col;

    for (col = 0; col < width; ++col) {
      const int sum = residual_row[col] + pred_row[col];

      dest_row[col] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
  }
}
void vp9_dequantize_b_c(BLOCKD *d) {
int i;
......@@ -37,7 +59,6 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
int pitch, int stride) {
short output[16];
short *diff_ptr = output;
int r, c;
int i;
for (i = 0; i < 16; i++) {
......@@ -48,23 +69,7 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
vpx_memset(input, 0, 32);
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int a = diff_ptr[c] + pred[c];
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dest[c] = (unsigned char) a;
}
dest += stride;
diff_ptr += 4;
pred += pitch;
}
recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
}
void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
......@@ -115,7 +120,6 @@ void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride) {
short output[16];
short *diff_ptr = output;
int r, c;
int i;
for (i = 0; i < 16; i++) {
......@@ -127,23 +131,7 @@ void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
vpx_memset(input, 0, 32);
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int a = diff_ptr[c] + pred[c];
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dest[c] = (unsigned char) a;
}
dest += stride;
diff_ptr += 4;
pred += pitch;
}
recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
}
void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
......@@ -152,7 +140,6 @@ void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
int i;
short output[16];
short *diff_ptr = output;
int r, c;
input[0] = (short)Dc;
......@@ -165,23 +152,7 @@ void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
vpx_memset(input, 0, 32);
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int a = diff_ptr[c] + pred[c];
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dest[c] = (unsigned char) a;
}
dest += stride;
diff_ptr += 4;
pred += pitch;
}
recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
}
#if CONFIG_LOSSLESS
......@@ -190,7 +161,6 @@ void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
int pitch, int stride) {
short output[16];
short *diff_ptr = output;
int r, c;
int i;
for (i = 0; i < 16; i++) {
......@@ -201,23 +171,7 @@ void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
vpx_memset(input, 0, 32);
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int a = diff_ptr[c] + pred[c];
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dest[c] = (unsigned char) a;
}
dest += stride;
diff_ptr += 4;
pred += pitch;
}
recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
}
void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
......@@ -227,7 +181,6 @@ void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
int i;
short output[16];
short *diff_ptr = output;
int r, c;
input[0] = (short)dc;
......@@ -238,23 +191,7 @@ void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
vpx_memset(input, 0, 32);
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int a = diff_ptr[c] + pred[c];
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dest[c] = (unsigned char) a;
}
dest += stride;
diff_ptr += 4;
pred += pitch;
}
recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
}
#endif
......@@ -461,7 +398,7 @@ void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
int pitch, int stride) {
short output[256];
short *diff_ptr = output;
int r, c, i;
int i;
input[0]= input[0] * dq[0];
......@@ -477,55 +414,80 @@ void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
vpx_memset(input, 0, 512);
for (r = 0; r < 16; r++) {
for (c = 0; c < 16; c++) {
int a = diff_ptr[c] + pred[c];
if (a < 0)
a = 0;
else if (a > 255)
a = 255;
dest[c] = (unsigned char) a;
}
dest += stride;
diff_ptr += 16;
pred += pitch;
}
recon(diff_ptr, pred, pitch, dest, stride, 16, 16);
}
void vp9_dequant_idct_add_16x16_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride) {
short output[256];
short *diff_ptr = output;
void vp9_dequant_idct_add_16x16_c(int16_t *input, int16_t *dq, uint8_t *pred,
uint8_t *dest, int pitch, int stride,
uint16_t eobs) {
int16_t output[256];
int16_t *diff_ptr = output;
int r, c, i;
input[0]= input[0] * dq[0];
/* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to separate different cases. */
if (eobs == 0) {
/* All 0 DCT coefficient */
vp9_copy_mem16x16(pred, pitch, dest, stride);
} else if (eobs == 1) {
/* DC only DCT coefficient. */
int16_t out;
// recover quantizer for 4 4x4 blocks
for (i = 1; i < 256; i++)
input[i] = input[i] * dq[1];
out = (input[0] * dq[0] + 2) >> 2;
out = (out + 2) >> 2;
out = (out + 4) >> 3;
// the idct halves ( >> 1) the pitch
vp9_short_idct16x16_c(input, output, 32);
input[0] = 0;
vpx_memset(input, 0, 512);
for (r = 0; r < 16; r++) {
for (c = 0; c < 16; c++) {
int a = out + pred[c];
for (r = 0; r < 16; r++) {
for (c = 0; c < 16; c++) {
int a = diff_ptr[c] + pred[c];
if (a < 0)
a = 0;
else if (a > 255)
a = 255;
if (a < 0)
a = 0;
else if (a > 255)
a = 255;
dest[c] = (uint8_t) a;
}
dest[c] = (unsigned char) a;
dest += stride;
pred += pitch;
}
dest += stride;
diff_ptr += 16;
pred += pitch;
} else if (eobs <= 10) {
input[0]= input[0] * dq[0];
input[1] = input[1] * dq[1];
input[2] = input[2] * dq[1];
input[3] = input[3] * dq[1];
input[16] = input[16] * dq[1];
input[17] = input[17] * dq[1];
input[18] = input[18] * dq[1];
input[32] = input[32] * dq[1];
input[33] = input[33] * dq[1];
input[48] = input[48] * dq[1];
// the idct halves ( >> 1) the pitch
vp9_short_idct10_16x16_c(input, output, 32);
input[0] = input[1] = input[2] = input[3] = 0;
input[16] = input[17] = input[18] = 0;
input[32] = input[33] = 0;
input[48] = 0;
recon(diff_ptr, pred, pitch, dest, stride, 16, 16);
} else {
input[0]= input[0] * dq[0];
// recover quantizer for 4 4x4 blocks
for (i = 1; i < 256; i++)
input[i] = input[i] * dq[1];
// the idct halves ( >> 1) the pitch
vp9_short_idct16x16_c(input, output, 32);
vpx_memset(input, 0, 512);
recon(diff_ptr, pred, pitch, dest, stride, 16, 16);
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment