Commit e60478d4 authored by Yunqing Wang
Browse files

Optimize 8x8 dequant and idct

Similar to 16x16 dequant and idct, based on the value of eobs, the
8x8 dequant and idct calculation was simplified to improve decoder
performance.

Combined vp9_dequant_idct_add_8x8 and vp9_dequant_dc_idct_add_8x8
to eliminate duplicate code.

Change-Id: Ia58e50ab27f7012b7379c495837c9c0b5ba9cf7f
parent 5d65614f
......@@ -29,6 +29,7 @@ void vp9_machine_specific_config(VP9_COMMON *ctx) {
rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_c;
rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_c;
rtcd->idct.idct8 = vp9_short_idct8x8_c;
rtcd->idct.idct10_8 = vp9_short_idct10_8x8_c;
rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c;
rtcd->idct.ihaar2 = vp9_short_ihaar2x2_c;
rtcd->idct.idct16x16 = vp9_short_idct16x16_c;
......
......@@ -60,6 +60,11 @@ extern prototype_idct(vp9_idct_idct10_16x16);
#endif
extern prototype_idct(vp9_idct_idct8);
#ifndef vp9_idct_idct10_8
#define vp9_idct_idct10_8 vp9_short_idct10_8x8_c
#endif
extern prototype_idct(vp9_idct_idct10_8);
#ifndef vp9_idct_idct8_1
#define vp9_idct_idct8_1 vp9_short_idct8x8_1_c
#endif
......@@ -132,6 +137,7 @@ typedef struct {
vp9_second_order_fn_t iwalsh16;
vp9_idct_fn_t idct8;
vp9_idct_fn_t idct10_8;
vp9_idct_fn_t idct8_1;
vp9_idct_scalar_add_fn_t idct1_scalar_add_8x8;
vp9_idct_fn_t ihaar2;
......
......@@ -967,6 +967,127 @@ void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) {
}
}
/* Row IDCT when only first 4 coefficients are non-zero.
 *
 * blk points at the 8 ints of one row; the row is transformed in place.
 * blk[4]..blk[7] are only consulted by the all-zero shortcut test. The
 * general path below never reads them, i.e. it treats them as zero.
 *
 * Scaling is written as a multiplication instead of "<<" because the
 * coefficients may be negative, and left-shifting a negative int is
 * undefined behaviour in C.
 */
static void idctrow10(int *blk) {
  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

  x3 = blk[2];
  x4 = blk[1];
  x7 = blk[3];

  /* shortcut: no AC term at all, so every output is the DC term times 8 */
  if (!(x3 | x4 | x7 | blk[4] | blk[5] | blk[6] | blk[7])) {
    const int dc = blk[0] * 8;

    blk[0] = blk[1] = blk[2] = blk[3] = blk[4]
           = blk[5] = blk[6] = blk[7] = dc;
    return;
  }

  /* DC scaled by 2^11; the +128 is for proper rounding in the fourth stage */
  x0 = blk[0] * 2048 + 128;

  /* first stage (odd part; statement order matters, x4 and x7 are reused) */
  x5 = W7 * x4;
  x4 = W1 * x4;
  x6 = W3 * x7;
  x7 = -W5 * x7;

  /* second stage */
  x2 = W6 * x3;
  x3 = W2 * x3;
  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;

  /* third stage */
  x7 = x0 + x3;
  x8 = x0 - x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = (181 * (x4 + x5) + 128) >> 8;
  x4 = (181 * (x4 - x5) + 128) >> 8;

  /* fourth stage */
  blk[0] = (x7 + x1) >> 8;
  blk[1] = (x3 + x2) >> 8;
  blk[2] = (x0 + x4) >> 8;
  blk[3] = (x8 + x6) >> 8;
  blk[4] = (x8 - x6) >> 8;
  blk[5] = (x0 - x4) >> 8;
  blk[6] = (x3 - x2) >> 8;
  blk[7] = (x7 - x1) >> 8;
}
/* Column (vertical) IDCT when only first 4 coefficients are non-zero.
 *
 * blk points at the top element of one column of an 8-wide int array, so
 * successive column entries are 8 ints apart; the column is transformed in
 * place. blk[8 * 4]..blk[8 * 7] are only consulted by the all-zero shortcut
 * test. The general path below never reads them, i.e. it treats them as zero.
 *
 * Scaling is written as a multiplication instead of "<<" because the inputs
 * may be negative, and left-shifting a negative int is undefined behaviour
 * in C.
 */
static void idctcol10(int *blk) {
  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

  x3 = blk[8 * 2];
  x4 = blk[8 * 1];
  x7 = blk[8 * 3];

  /* shortcut: no AC term at all, so every output is the rounded DC term */
  if (!(x3 | x4 | x7 | blk[8 * 4] | blk[8 * 5] | blk[8 * 6] | blk[8 * 7])) {
    const int dc = (blk[8 * 0] + 32) >> 6;

    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
               = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
               = blk[8 * 7] = dc;
    return;
  }

  /* DC scaled by 2^8 plus the bias added ahead of the final ">> 14" */
  x0 = blk[8 * 0] * 256 + 16384;

  /* first stage (odd part; statement order matters, x4 and x7 are reused) */
  x5 = (W7 * x4 + 4) >> 3;
  x4 = (W1 * x4 + 4) >> 3;
  x6 = (W3 * x7 + 4) >> 3;
  x7 = (-W5 * x7 + 4) >> 3;

  /* second stage */
  x2 = (W6 * x3 + 4) >> 3;
  x3 = (W2 * x3 + 4) >> 3;
  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;

  /* third stage */
  x7 = x0 + x3;
  x8 = x0 - x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = (181 * (x4 + x5) + 128) >> 8;
  x4 = (181 * (x4 - x5) + 128) >> 8;

  /* fourth stage */
  blk[8 * 0] = (x7 + x1) >> 14;
  blk[8 * 1] = (x3 + x2) >> 14;
  blk[8 * 2] = (x0 + x4) >> 14;
  blk[8 * 3] = (x8 + x6) >> 14;
  blk[8 * 4] = (x8 - x6) >> 14;
  blk[8 * 5] = (x0 - x4) >> 14;
  blk[8 * 6] = (x3 - x2) >> 14;
  blk[8 * 7] = (x7 - x1) >> 14;
}
/* 8x8 inverse DCT for blocks whose non-zero coefficients all lie in the
 * upper-left 4x4 area.
 *
 * coefs: TX_DIM * TX_DIM input coefficients, indexed as row * TX_DIM + col.
 * block: output samples; consecutive rows are (pitch >> 1) shorts apart.
 * pitch: output pitch, halved before it is used to index block.
 */
void vp9_short_idct10_8x8_c(short *coefs, short *block, int pitch) {
  int X[TX_DIM * TX_DIM];
  const int out_stride = pitch >> 1;
  int k, row, col;

  /* Pre-scale every coefficient: (c + 1 + (c < 0)) >> 2. */
  for (k = 0; k < TX_DIM * TX_DIM; k++) {
    const short c = coefs[k];

    X[k] = (int)(c + 1 + (c < 0)) >> 2;
  }

  /* Only the first 4 rows need the row pass: rows 4..7 hold no non-zero
   * coefficients when everything sits in the upper-left 4x4 area. */
  for (row = 0; row < 4; row++)
    idctrow10(X + 8 * row);

  /* Every column needs the column pass. */
  for (col = 0; col < 8; col++)
    idctcol10(X + col);

  /* Drop one more bit while storing the result at the caller's pitch. */
  for (row = 0; row < TX_DIM; row++) {
    const int *in = X + row * TX_DIM;
    short *out = block + row * out_stride;

    for (col = 0; col < TX_DIM; col++)
      out[col] = in[col] >> 1;
  }
}
void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) {
int i;
......
......@@ -57,12 +57,9 @@ specialize vp9_dequant_idct_add_uv_block_8x8
prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, unsigned short eobs"
specialize vp9_dequant_idct_add_16x16
prototype void vp9_dequant_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
prototype void vp9_dequant_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int dc, unsigned short eobs"
specialize vp9_dequant_idct_add_8x8
prototype void vp9_dequant_dc_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc"
specialize vp9_dequant_dc_idct_add_8x8
prototype void vp9_dequant_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
specialize vp9_dequant_idct_add
......
......@@ -442,7 +442,8 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
vp9_ht_dequant_idct_add_8x8_c(tx_type,
q, dq, pre, dst, 16, stride);
} else {
vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0,
xd->eobs[idx]);
}
q += 64;
} else {
......
......@@ -19,8 +19,8 @@
extern int dec_debug;
#endif
static void recon(int16_t *diff, uint8_t *pred, int pitch, uint8_t *dest,
int stride, int width, int height) {
static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
uint8_t *dest, int stride, int width, int height) {
int r, c;
for (r = 0; r < height; r++) {
......@@ -41,12 +41,34 @@ static void recon(int16_t *diff, uint8_t *pred, int pitch, uint8_t *dest,
}
}
/* Adds the same residual value to every predicted pixel of a
 * width x height area and stores the sum, clamped to [0, 255], in dest.
 *
 * diff:   residual added to each pixel
 * pred:   prediction pixels, rows are pitch bytes apart
 * dest:   output pixels, rows are stride bytes apart
 */
static void add_constant_residual(const int16_t diff, const uint8_t *pred,
                                  int pitch, uint8_t *dest, int stride,
                                  int width, int height) {
  int rows_left, col;

  for (rows_left = height; rows_left > 0;
       --rows_left, dest += stride, pred += pitch) {
    for (col = 0; col < width; ++col) {
      const int sum = diff + pred[col];

      dest[col] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
  }
}
void vp9_dequantize_b_c(BLOCKD *d) {
int i;
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;
short *DQC = d->dequant;
int16_t *DQ = d->dqcoeff;
int16_t *Q = d->qcoeff;
int16_t *DQC = d->dequant;
for (i = 0; i < 16; i++) {
DQ[i] = Q[i] * DQC[i];
......@@ -54,11 +76,11 @@ void vp9_dequantize_b_c(BLOCKD *d) {
}
void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
unsigned char *pred, unsigned char *dest,
void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, int16_t *dq,
uint8_t *pred, uint8_t *dest,
int pitch, int stride) {
short output[16];
short *diff_ptr = output;
int16_t output[16];
int16_t *diff_ptr = output;
int i;
for (i = 0; i < 16; i++) {
......@@ -69,18 +91,15 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
vpx_memset(input, 0, 32);
recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
}
void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
unsigned char *pred, unsigned char *dest,
void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input, int16_t *dq,
uint8_t *pred, uint8_t *dest,
int pitch, int stride) {
short output[64];
short *diff_ptr = output;
int b, r, c;
int16_t output[64];
int16_t *diff_ptr = output;
int i;
unsigned char *origdest = dest;
unsigned char *origpred = pred;
input[0] = dq[0] * input[0];
for (i = 1; i < 64; i++) {
......@@ -91,35 +110,13 @@ void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
vpx_memset(input, 0, 128);
for (b = 0; b < 4; b++) {
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int a = diff_ptr[c] + pred[c];
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dest[c] = (unsigned char) a;
}
dest += stride;
diff_ptr += 8;
pred += pitch;
}
// shift buffer pointers to next 4x4 block in the submacroblock
diff_ptr = output + (b + 1) / 2 * 4 * 8 + ((b + 1) % 2) * 4;
dest = origdest + (b + 1) / 2 * 4 * stride + ((b + 1) % 2) * 4;
pred = origpred + (b + 1) / 2 * 4 * pitch + ((b + 1) % 2) * 4;
}
add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
}
void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride) {
short output[16];
short *diff_ptr = output;
void vp9_dequant_idct_add_c(int16_t *input, int16_t *dq, uint8_t *pred,
uint8_t *dest, int pitch, int stride) {
int16_t output[16];
int16_t *diff_ptr = output;
int i;
for (i = 0; i < 16; i++) {
......@@ -131,17 +128,17 @@ void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
vpx_memset(input, 0, 32);
recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
}
void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride,
void vp9_dequant_dc_idct_add_c(int16_t *input, int16_t *dq, uint8_t *pred,
uint8_t *dest, int pitch, int stride,
int Dc) {
int i;
short output[16];
short *diff_ptr = output;
int16_t output[16];
int16_t *diff_ptr = output;
input[0] = (short)Dc;
input[0] = (int16_t)Dc;
for (i = 1; i < 16; i++) {
input[i] = dq[i] * input[i];
......@@ -152,15 +149,15 @@ void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
vpx_memset(input, 0, 32);
recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
}
#if CONFIG_LOSSLESS
void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
unsigned char *pred, unsigned char *dest,
void vp9_dequant_idct_add_lossless_c(int16_t *input, int16_t *dq,
uint8_t *pred, uint8_t *dest,
int pitch, int stride) {
short output[16];
short *diff_ptr = output;
int16_t output[16];
int16_t *diff_ptr = output;
int i;
for (i = 0; i < 16; i++) {
......@@ -171,18 +168,18 @@ void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
vpx_memset(input, 0, 32);
recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
}
void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
unsigned char *pred,
unsigned char *dest,
void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, int16_t *dq,
uint8_t *pred,
uint8_t *dest,
int pitch, int stride, int dc) {
int i;
short output[16];
short *diff_ptr = output;
int16_t output[16];
int16_t *diff_ptr = output;
input[0] = (short)dc;
input[0] = (int16_t)dc;
for (i = 1; i < 16; i++) {
input[i] = dq[i] * input[i];
......@@ -191,18 +188,18 @@ void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
vpx_memset(input, 0, 32);
recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
}
#endif
void vp9_dequantize_b_2x2_c(BLOCKD *d) {
int i;
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;
short *DQC = d->dequant;
int16_t *DQ = d->dqcoeff;
int16_t *Q = d->qcoeff;
int16_t *DQC = d->dequant;
for (i = 0; i < 16; i++) {
DQ[i] = (short)((Q[i] * DQC[i]));
DQ[i] = (int16_t)((Q[i] * DQC[i]));
}
#ifdef DEC_DEBUG
if (dec_debug) {
......@@ -216,14 +213,12 @@ void vp9_dequantize_b_2x2_c(BLOCKD *d) {
#endif
}
void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride) {
short output[64];
short *diff_ptr = output;
int r, c, b;
void vp9_dequant_idct_add_8x8_c(int16_t *input, int16_t *dq, uint8_t *pred,
uint8_t *dest, int pitch, int stride,
int dc, uint16_t eobs) {
int16_t output[64];
int16_t *diff_ptr = output;
int i;
unsigned char *origdest = dest;
unsigned char *origpred = pred;
#ifdef DEC_DEBUG
if (dec_debug) {
......@@ -236,101 +231,57 @@ void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
}
#endif
input[0] = input[0] * dq[0];
// recover quantizer for 4 4x4 blocks
for (i = 1; i < 64; i++) {
input[i] = input[i] * dq[1];
}
#ifdef DEC_DEBUG
if (dec_debug) {
int j;
printf("Input DQ 8x8\n");
for (j = 0; j < 64; j++) {
printf("%d ", input[j]);
if (j % 8 == 7) printf("\n");
}
}
#endif
// the idct halves ( >> 1) the pitch
vp9_short_idct8x8_c(input, output, 16);
#ifdef DEC_DEBUG
if (dec_debug) {
int j;
printf("Output 8x8\n");
for (j = 0; j < 64; j++) {
printf("%d ", output[j]);
if (j % 8 == 7) printf("\n");
}
}
#endif
vpx_memset(input, 0, 128);// test what should i put here
/* If dc is 1, then input[0] is the reconstructed value, do not need
* dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
*/
if (!dc)
input[0] *= dq[0];
for (b = 0; b < 4; b++) {
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int a = diff_ptr[c] + pred[c];
/* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to decide what to do.
* TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
* Combine that with code here.
*/
if (eobs == 0) {
/* All 0 DCT coefficient */
vp9_copy_mem8x8(pred, pitch, dest, stride);
} else if (eobs == 1) {
/* DC only DCT coefficient. */
int16_t out;
if (a < 0)
a = 0;
/* Note: the idct1 will need to be modified accordingly whenever
* vp9_short_idct8x8_c() is modified. */
out = (input[0] + 1 + (input[0] < 0)) >> 2;
out = out << 3;
out = (out + 32) >> 7;
if (a > 255)
a = 255;
input[0] = 0;
dest[c] = (unsigned char) a;
}
add_constant_residual(out, pred, pitch, dest, stride, 8, 8);
} else if (eobs <= 10) {
input[1] = input[1] * dq[1];
input[2] = input[2] * dq[1];
input[3] = input[3] * dq[1];
input[8] = input[8] * dq[1];
input[9] = input[9] * dq[1];
input[10] = input[10] * dq[1];
input[16] = input[16] * dq[1];
input[17] = input[17] * dq[1];
input[24] = input[24] * dq[1];
dest += stride;
diff_ptr += 8;
pred += pitch;
}
diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
}
#ifdef DEC_DEBUG
if (dec_debug) {
int k, j;
printf("Final 8x8\n");
for (j = 0; j < 8; j++) {
for (k = 0; k < 8; k++) {
printf("%d ", origdest[k]);
}
printf("\n");
origdest += stride;
}
}
#endif
}
vp9_short_idct10_8x8_c(input, output, 16);
void vp9_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride,
int Dc) { // Dc for 1st order T in some rear case
short output[64];
short *diff_ptr = output;
int r, c, b;
int i;
unsigned char *origdest = dest;
unsigned char *origpred = pred;
input[0] = input[1] = input[2] = input[3] = 0;
input[8] = input[9] = input[10] = 0;
input[16] = input[17] = 0;
input[24] = 0;
input[0] = (short)Dc;// Dc is the reconstructed value, do not need dequantization
// dc value is recovered after dequantization, since dc need not quantization
#ifdef DEC_DEBUG
if (dec_debug) {
int j;
printf("Input 8x8\n");
for (j = 0; j < 64; j++) {
printf("%d ", input[j]);
if (j % 8 == 7) printf("\n");
}
}
#endif
add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
} else {
// recover quantizer for 4 4x4 blocks
for (i = 1; i < 64; i++) {
input[i] = input[i] * dq[1];
}
#ifdef DEC_DEBUG
if (dec_debug) {
int j;
......@@ -354,30 +305,11 @@ void vp9_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
}
}
#endif
vpx_memset(input, 0, 128);
for (b = 0; b < 4; b++) {
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int a = diff_ptr[c] + pred[c];
if (a < 0)
a = 0;
vpx_memset(input, 0, 128);
if (a > 255)
a = 255;
add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
dest[c] = (unsigned char) a;
}
dest += stride;
diff_ptr += 8;
pred += pitch;
}
diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;