Commit 4c7c15ee authored by Yunqing Wang, committed by Gerrit Code Review
Browse files

Merge "Optimize 8x8 dequant and idct" into experimental

parents 7bb2afa1 e60478d4
......@@ -29,10 +29,11 @@ void vp9_machine_specific_config(VP9_COMMON *ctx) {
rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_c;
rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_c;
rtcd->idct.idct8 = vp9_short_idct8x8_c;
rtcd->idct.idct10_8 = vp9_short_idct10_8x8_c;
rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c;
rtcd->idct.ihaar2 = vp9_short_ihaar2x2_c;
rtcd->idct.idct16x16 = vp9_short_idct16x16_c;
rtcd->idct.idct10_16x16 = vp9_short_idct10_16x16_c;
rtcd->idct.idct10_16x16 = vp9_short_idct10_16x16_c;
rtcd->subpix.eighttap16x16 = vp9_eighttap_predict16x16_c;
rtcd->subpix.eighttap8x8 = vp9_eighttap_predict8x8_c;
......
......@@ -60,6 +60,11 @@ extern prototype_idct(vp9_idct_idct10_16x16);
#endif
extern prototype_idct(vp9_idct_idct8);
#ifndef vp9_idct_idct10_8
#define vp9_idct_idct10_8 vp9_short_idct10_8x8_c
#endif
extern prototype_idct(vp9_idct_idct10_8);
#ifndef vp9_idct_idct8_1
#define vp9_idct_idct8_1 vp9_short_idct8x8_1_c
#endif
......@@ -132,6 +137,7 @@ typedef struct {
vp9_second_order_fn_t iwalsh16;
vp9_idct_fn_t idct8;
vp9_idct_fn_t idct10_8;
vp9_idct_fn_t idct8_1;
vp9_idct_scalar_add_fn_t idct1_scalar_add_8x8;
vp9_idct_fn_t ihaar2;
......
......@@ -967,6 +967,127 @@ void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) {
}
}
/* Row (horizontal) pass of the reduced 8x8 inverse DCT.
 *
 * Transforms the eight values blk[0..7] in place. Only blk[0..3] feed the
 * butterfly; blk[4..7] are consulted solely by the all-zero shortcut test and
 * are otherwise ignored, so this routine is only valid for rows in which the
 * last four coefficients are zero.
 *
 * Power-of-two scaling is written as a multiplication instead of "<<":
 * left-shifting a negative int is undefined behaviour in C, and the
 * coefficients handled here are signed.
 */
static void idctrow10(int *blk) {
  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

  x3 = blk[2];
  x4 = blk[1];
  x7 = blk[3];

  /* shortcut: every term other than blk[0] is zero, so all eight outputs
   * equal blk[0] * 8 */
  if (!(x3 | x4 | x7 | blk[4] | blk[5] | blk[6] | blk[7])) {
    const int dc = blk[0] * 8;
    int i;

    for (i = 0; i < 8; i++)
      blk[i] = dc;
    return;
  }

  x0 = blk[0] * 2048 + 128; /* for proper rounding in the fourth stage */

  /* first stage: odd inputs blk[1] and blk[3] (blk[5], blk[7] taken as 0) */
  x5 = W7 * x4;
  x4 = W1 * x4;
  x6 = W3 * x7;
  x7 = -W5 * x7;

  /* second stage: even input blk[2] (blk[6] taken as 0) */
  x2 = W6 * x3;
  x3 = W2 * x3;
  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;

  /* third stage */
  x7 = x0 + x3;
  x8 = x0 - x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = (181 * (x4 + x5) + 128) >> 8;
  x4 = (181 * (x4 - x5) + 128) >> 8;

  /* fourth stage: outputs are mirrored sums/differences, scaled down by 256 */
  blk[0] = (x7 + x1) >> 8;
  blk[1] = (x3 + x2) >> 8;
  blk[2] = (x0 + x4) >> 8;
  blk[3] = (x8 + x6) >> 8;
  blk[4] = (x8 - x6) >> 8;
  blk[5] = (x0 - x4) >> 8;
  blk[6] = (x3 - x2) >> 8;
  blk[7] = (x7 - x1) >> 8;
}
/* Column (vertical) pass of the reduced 8x8 inverse DCT.
 *
 * Transforms, in place, the eight values blk[0], blk[8], ..., blk[56] (one
 * column of a row-major buffer with 8 ints per row). Only the first four
 * entries of the column feed the butterfly; entries 4..7 are consulted solely
 * by the all-zero shortcut test and are otherwise ignored, so this routine is
 * only valid for columns in which the last four coefficients are zero.
 *
 * Power-of-two scaling is written as a multiplication instead of "<<":
 * left-shifting a negative int is undefined behaviour in C, and the
 * coefficients handled here are signed.
 */
static void idctcol10(int *blk) {
  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

  x3 = blk[8 * 2];
  x4 = blk[8 * 1];
  x7 = blk[8 * 3];

  /* shortcut: every term other than blk[0] is zero, so all eight outputs
   * equal the rounded value (blk[0] + 32) >> 6 */
  if (!(x3 | x4 | x7 | blk[8 * 4] | blk[8 * 5] | blk[8 * 6] | blk[8 * 7])) {
    const int dc = (blk[8 * 0] + 32) >> 6;
    int i;

    for (i = 0; i < 8; i++)
      blk[8 * i] = dc;
    return;
  }

  /* bias carried through to the fourth-stage ">> 14" */
  x0 = blk[8 * 0] * 256 + 16384;

  /* first stage: odd inputs, rows 1 and 3 (rows 5 and 7 taken as 0) */
  x5 = (W7 * x4 + 4) >> 3;
  x4 = (W1 * x4 + 4) >> 3;
  x6 = (W3 * x7 + 4) >> 3;
  x7 = (-W5 * x7 + 4) >> 3;

  /* second stage: even input, row 2 (row 6 taken as 0) */
  x2 = (W6 * x3 + 4) >> 3;
  x3 = (W2 * x3 + 4) >> 3;
  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;

  /* third stage */
  x7 = x0 + x3;
  x8 = x0 - x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = (181 * (x4 + x5) + 128) >> 8;
  x4 = (181 * (x4 - x5) + 128) >> 8;

  /* fourth stage: outputs are mirrored sums/differences */
  blk[8 * 0] = (x7 + x1) >> 14;
  blk[8 * 1] = (x3 + x2) >> 14;
  blk[8 * 2] = (x0 + x4) >> 14;
  blk[8 * 3] = (x8 + x6) >> 14;
  blk[8 * 4] = (x8 - x6) >> 14;
  blk[8 * 5] = (x0 - x4) >> 14;
  blk[8 * 6] = (x3 - x2) >> 14;
  blk[8 * 7] = (x7 - x1) >> 14;
}
/* 8x8 inverse DCT for blocks whose non-zero coefficients all lie in the
 * upper-left 4x4 corner.
 *
 * coefs : TX_DIM x TX_DIM input coefficients, stored row-major.
 * block : output buffer; consecutive output rows are (pitch >> 1) shorts
 *         apart.
 * pitch : output row pitch; it is halved before being used as a short index.
 */
void vp9_short_idct10_8x8_c(short *coefs, short *block, int pitch) {
  int work[TX_DIM * TX_DIM];
  const int out_stride = pitch >> 1;
  int n, row, col;

  /* Pre-scale every coefficient: add 1 (2 for negative values), then shift
   * right by two, widening to int for the transform passes. */
  for (n = 0; n < TX_DIM * TX_DIM; n++) {
    const int c = coefs[n];

    work[n] = (c + 1 + (c < 0)) >> 2;
  }

  /* Only the first four rows get a row pass, since the non-zero
   * coefficients are confined to the upper-left 4x4 area. */
  row = 0;
  while (row < 4) {
    idctrow10(work + 8 * row);
    row++;
  }

  /* Every column still needs its vertical pass. */
  for (col = 0; col < 8; col++)
    idctcol10(work + col);

  /* Halve the result and store it at the caller's output stride. */
  for (row = 0; row < TX_DIM; row++) {
    const int *src = work + row * TX_DIM;
    short *dst = block + row * out_stride;

    for (col = 0; col < TX_DIM; col++)
      dst[col] = src[col] >> 1;
  }
}
void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) {
int i;
......
......@@ -57,12 +57,9 @@ specialize vp9_dequant_idct_add_uv_block_8x8
prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, unsigned short eobs"
specialize vp9_dequant_idct_add_16x16
prototype void vp9_dequant_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
prototype void vp9_dequant_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int dc, unsigned short eobs"
specialize vp9_dequant_idct_add_8x8
prototype void vp9_dequant_dc_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc"
specialize vp9_dequant_dc_idct_add_8x8
prototype void vp9_dequant_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
specialize vp9_dequant_idct_add
......
......@@ -461,7 +461,8 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
vp9_ht_dequant_idct_add_8x8_c(tx_type,
q, dq, pre, dst, 16, stride);
} else {
vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0,
xd->eobs[idx]);
}
q += 64;
} else {
......
......@@ -19,8 +19,8 @@
extern int dec_debug;
#endif
static void recon(int16_t *diff, uint8_t *pred, int pitch, uint8_t *dest,
int stride, int width, int height) {
static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
uint8_t *dest, int stride, int width, int height) {
int r, c;
for (r = 0; r < height; r++) {
......@@ -41,12 +41,34 @@ static void recon(int16_t *diff, uint8_t *pred, int pitch, uint8_t *dest,
}
}
/* Add one constant residual to every pixel of a width x height prediction
 * block and store the result, clamped to [0, 255], in the destination.
 *
 * diff   : residual value added to each predicted pixel.
 * pred   : prediction pixels; rows are `pitch` bytes apart.
 * dest   : output pixels; rows are `stride` bytes apart.
 * width  : number of pixels processed per row.
 * height : number of rows processed.
 */
static void add_constant_residual(const int16_t diff, const uint8_t *pred,
                                  int pitch, uint8_t *dest, int stride,
                                  int width, int height) {
  int row;

  for (row = 0; row < height; row++, dest += stride, pred += pitch) {
    int col;

    for (col = 0; col < width; col++) {
      const int sum = diff + pred[col];

      /* saturate to the 8-bit pixel range */
      dest[col] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
  }
}
void vp9_dequantize_b_c(BLOCKD *d) {
int i;
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;
short *DQC = d->dequant;
int16_t *DQ = d->dqcoeff;
int16_t *Q = d->qcoeff;
int16_t *DQC = d->dequant;
for (i = 0; i < 16; i++) {
DQ[i] = Q[i] * DQC[i];
......@@ -54,11 +76,11 @@ void vp9_dequantize_b_c(BLOCKD *d) {
}
void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
unsigned char *pred, unsigned char *dest,
void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, int16_t *dq,
uint8_t *pred, uint8_t *dest,
int pitch, int stride) {
short output[16];
short *diff_ptr = output;
int16_t output[16];
int16_t *diff_ptr = output;
int i;
for (i = 0; i < 16; i++) {
......@@ -69,18 +91,15 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
vpx_memset(input, 0, 32);
recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
}
void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
unsigned char *pred, unsigned char *dest,
void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input, int16_t *dq,
uint8_t *pred, uint8_t *dest,
int pitch, int stride) {
short output[64];
short *diff_ptr = output;
int b, r, c;
int16_t output[64];
int16_t *diff_ptr = output;
int i;
unsigned char *origdest = dest;
unsigned char *origpred = pred;
input[0] = dq[0] * input[0];
for (i = 1; i < 64; i++) {
......@@ -91,35 +110,13 @@ void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
vpx_memset(input, 0, 128);
for (b = 0; b < 4; b++) {
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int a = diff_ptr[c] + pred[c];
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dest[c] = (unsigned char) a;
}
dest += stride;
diff_ptr += 8;
pred += pitch;
}
// shift buffer pointers to next 4x4 block in the submacroblock
diff_ptr = output + (b + 1) / 2 * 4 * 8 + ((b + 1) % 2) * 4;
dest = origdest + (b + 1) / 2 * 4 * stride + ((b + 1) % 2) * 4;
pred = origpred + (b + 1) / 2 * 4 * pitch + ((b + 1) % 2) * 4;
}
add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
}
void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride) {
short output[16];
short *diff_ptr = output;
void vp9_dequant_idct_add_c(int16_t *input, int16_t *dq, uint8_t *pred,
uint8_t *dest, int pitch, int stride) {
int16_t output[16];
int16_t *diff_ptr = output;
int i;
for (i = 0; i < 16; i++) {
......@@ -131,17 +128,17 @@ void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
vpx_memset(input, 0, 32);
recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
}
void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride,
void vp9_dequant_dc_idct_add_c(int16_t *input, int16_t *dq, uint8_t *pred,
uint8_t *dest, int pitch, int stride,
int Dc) {
int i;
short output[16];
short *diff_ptr = output;
int16_t output[16];
int16_t *diff_ptr = output;
input[0] = (short)Dc;
input[0] = (int16_t)Dc;
for (i = 1; i < 16; i++) {
input[i] = dq[i] * input[i];
......@@ -152,15 +149,15 @@ void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
vpx_memset(input, 0, 32);
recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
}
#if CONFIG_LOSSLESS
void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
unsigned char *pred, unsigned char *dest,
void vp9_dequant_idct_add_lossless_c(int16_t *input, int16_t *dq,
uint8_t *pred, uint8_t *dest,
int pitch, int stride) {
short output[16];
short *diff_ptr = output;
int16_t output[16];
int16_t *diff_ptr = output;
int i;
for (i = 0; i < 16; i++) {
......@@ -171,18 +168,18 @@ void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
vpx_memset(input, 0, 32);
recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
}
void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
unsigned char *pred,
unsigned char *dest,
void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, int16_t *dq,
uint8_t *pred,
uint8_t *dest,
int pitch, int stride, int dc) {
int i;
short output[16];
short *diff_ptr = output;
int16_t output[16];
int16_t *diff_ptr = output;
input[0] = (short)dc;
input[0] = (int16_t)dc;
for (i = 1; i < 16; i++) {
input[i] = dq[i] * input[i];
......@@ -191,18 +188,18 @@ void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
vpx_memset(input, 0, 32);
recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
}
#endif
void vp9_dequantize_b_2x2_c(BLOCKD *d) {
int i;
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;
short *DQC = d->dequant;
int16_t *DQ = d->dqcoeff;
int16_t *Q = d->qcoeff;
int16_t *DQC = d->dequant;
for (i = 0; i < 16; i++) {
DQ[i] = (short)((Q[i] * DQC[i]));
DQ[i] = (int16_t)((Q[i] * DQC[i]));
}
#ifdef DEC_DEBUG
if (dec_debug) {
......@@ -216,14 +213,12 @@ void vp9_dequantize_b_2x2_c(BLOCKD *d) {
#endif
}
void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride) {
short output[64];
short *diff_ptr = output;
int r, c, b;
void vp9_dequant_idct_add_8x8_c(int16_t *input, int16_t *dq, uint8_t *pred,
uint8_t *dest, int pitch, int stride,
int dc, uint16_t eobs) {
int16_t output[64];
int16_t *diff_ptr = output;
int i;
unsigned char *origdest = dest;
unsigned char *origpred = pred;
#ifdef DEC_DEBUG
if (dec_debug) {
......@@ -236,101 +231,57 @@ void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
}
#endif
input[0] = input[0] * dq[0];
/* If dc is 1, then input[0] is the reconstructed value, do not need
* dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
*/
if (!dc)
input[0] *= dq[0];
// recover quantizer for 4 4x4 blocks
for (i = 1; i < 64; i++) {
input[i] = input[i] * dq[1];
}
#ifdef DEC_DEBUG
if (dec_debug) {
int j;
printf("Input DQ 8x8\n");
for (j = 0; j < 64; j++) {
printf("%d ", input[j]);
if (j % 8 == 7) printf("\n");
}
}
#endif
// the idct halves ( >> 1) the pitch
vp9_short_idct8x8_c(input, output, 16);
#ifdef DEC_DEBUG
if (dec_debug) {
int j;
printf("Output 8x8\n");
for (j = 0; j < 64; j++) {
printf("%d ", output[j]);
if (j % 8 == 7) printf("\n");
}
}
#endif
vpx_memset(input, 0, 128);// test what should i put here
for (b = 0; b < 4; b++) {
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int a = diff_ptr[c] + pred[c];
/* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to decide what to do.
* TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
* Combine that with code here.
*/
if (eobs == 0) {
/* All 0 DCT coefficient */
vp9_copy_mem8x8(pred, pitch, dest, stride);
} else if (eobs == 1) {
/* DC only DCT coefficient. */
int16_t out;
if (a < 0)
a = 0;
/* Note: the idct1 will need to be modified accordingly whenever
* vp9_short_idct8x8_c() is modified. */
out = (input[0] + 1 + (input[0] < 0)) >> 2;
out = out << 3;
out = (out + 32) >> 7;
if (a > 255)
a = 255;
input[0] = 0;
dest[c] = (unsigned char) a;
}
add_constant_residual(out, pred, pitch, dest, stride, 8, 8);
} else if (eobs <= 10) {
input[1] = input[1] * dq[1];
input[2] = input[2] * dq[1];
input[3] = input[3] * dq[1];
input[8] = input[8] * dq[1];
input[9] = input[9] * dq[1];
input[10] = input[10] * dq[1];
input[16] = input[16] * dq[1];
input[17] = input[17] * dq[1];
input[24] = input[24] * dq[1];
dest += stride;
diff_ptr += 8;
pred += pitch;
}
diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
}
#ifdef DEC_DEBUG
if (dec_debug) {
int k, j;
printf("Final 8x8\n");
for (j = 0; j < 8; j++) {
for (k = 0; k < 8; k++) {
printf("%d ", origdest[k]);
}
printf("\n");
origdest += stride;
}
}
#endif
}
vp9_short_idct10_8x8_c(input, output, 16);
void vp9_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride,
int Dc) { // Dc for 1st order T in some rear case
short output[64];
short *diff_ptr = output;
int r, c, b;
int i;
unsigned char *origdest = dest;
unsigned char *origpred = pred;
input[0] = input[1] = input[2] = input[3] = 0;
input[8] = input[9] = input[10] = 0;
input[16] = input[17] = 0;
input[24] = 0;
input[0] = (short)Dc;// Dc is the reconstructed value, do not need dequantization
// dc value is recovered after dequantization, since dc need not quantization
#ifdef DEC_DEBUG
if (dec_debug) {
int j;
printf("Input 8x8\n");
for (j = 0; j < 64; j++) {
printf("%d ", input[j]);
if (j % 8 == 7) printf("\n");
add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
} else {
// recover quantizer for 4 4x4 blocks
for (i = 1; i < 64; i++) {
input[i] = input[i] * dq[1];
}
}
#endif
for (i = 1; i < 64; i++) {
input[i] = input[i] * dq[1];
}
#ifdef DEC_DEBUG
if (dec_debug) {
int j;
......@@ -342,8 +293,8 @@ void vp9_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
}
#endif
// the idct halves ( >> 1) the pitch
vp9_short_idct8x8_c(input, output, 16);
// the idct halves ( >> 1) the pitch
vp9_short_idct8x8_c(input, output, 16);
#ifdef DEC_DEBUG
if (dec_debug) {
int j;
......@@ -354,30 +305,11 @@ void vp9_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
}
}
#endif
vpx_memset(input, 0, 128);
for (b = 0; b < 4; b++) {
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int a = diff_ptr[c] + pred[c];
vpx_memset(input, 0, 128);
if (a < 0)