Commit b2fa74ac authored by Jeff Muizelaar, committed by Fritz Koenig
Browse files

Combine idct and reconstruction steps

This moves the prediction step before the idct and combines the idct and
reconstruction steps into a single step. Combining them seems to give an
overall decoder performance improvement of about 1%.

Change-Id: I90d8b167ec70d79c7ba2ee484106a78b3d16e318
parent 0ce39012
......@@ -15,7 +15,6 @@
#if HAVE_ARMV6
extern prototype_idct(vp8_short_idct4x4llm_1_v6);
extern prototype_idct(vp8_short_idct4x4llm_v6_dual);
extern prototype_idct_scalar(vp8_dc_only_idct_armv6);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_armv6);
extern prototype_second_order(vp8_short_inv_walsh4x4_armv6);
......@@ -25,9 +24,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_armv6);
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual
#undef vp8_idct_idct1_scalar
#define vp8_idct_idct1_scalar vp8_dc_only_idct_armv6
#undef vp8_idct_iwalsh1
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_armv6
......@@ -38,7 +34,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_armv6);
#if HAVE_ARMV7
extern prototype_idct(vp8_short_idct4x4llm_1_neon);
extern prototype_idct(vp8_short_idct4x4llm_neon);
extern prototype_idct_scalar(vp8_dc_only_idct_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
......@@ -48,9 +43,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_neon
#undef vp8_idct_idct1_scalar
#define vp8_idct_idct1_scalar vp8_dc_only_idct_neon
#undef vp8_idct_iwalsh1
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon
......
......@@ -43,7 +43,6 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_neon;
rtcd->idct.idct16 = vp8_short_idct4x4llm_neon;
rtcd->idct.idct1_scalar = vp8_dc_only_idct_neon;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon;
......@@ -75,7 +74,6 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_v6;
rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual;
rtcd->idct.idct1_scalar = vp8_dc_only_idct_armv6;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_armv6;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_armv6;
......
......@@ -32,7 +32,7 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_c;
rtcd->idct.idct16 = vp8_short_idct4x4llm_c;
rtcd->idct.idct1_scalar = vp8_dc_only_idct_c;
rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_c;
......
......@@ -18,8 +18,10 @@
/*
 * Declarator macros for the inverse-transform entry points.  Each one
 * expands to a complete `void` function declarator for the name passed in,
 * so the same macro serves both `extern` declarations and, when handed
 * `(*name)`, function-pointer typedefs.
 */

/* (short *input, short *output, int pitch) */
#define prototype_idct(fn) void fn(short *input, short *output, int pitch)

/* Same as above, but the single input value is passed by value. */
#define prototype_idct_scalar(fn) void fn(short input, short *output, int pitch)

/*
 * By-value input plus separate byte buffers for the prediction and the
 * output, each with its own row step (`pitch` and `stride`).
 */
#define prototype_idct_scalar_add(fn)                   \
    void fn(short input, unsigned char *pred,           \
            unsigned char *output, int pitch, int stride)
#if ARCH_X86 || ARCH_X86_64
#include "x86/idct_x86.h"
......@@ -39,10 +41,10 @@ extern prototype_idct(vp8_idct_idct1);
#endif
extern prototype_idct(vp8_idct_idct16);
#ifndef vp8_idct_idct1_scalar
#define vp8_idct_idct1_scalar vp8_dc_only_idct_c
#ifndef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_c
#endif
extern prototype_idct_scalar(vp8_idct_idct1_scalar);
extern prototype_idct_scalar_add(vp8_idct_idct1_scalar_add);
#ifndef vp8_idct_iwalsh1
......@@ -56,14 +58,14 @@ extern prototype_second_order(vp8_idct_iwalsh1);
extern prototype_second_order(vp8_idct_iwalsh16);
typedef prototype_idct((*vp8_idct_fn_t));
typedef prototype_idct_scalar((*vp8_idct_scalar_fn_t));
typedef prototype_idct_scalar_add((*vp8_idct_scalar_add_fn_t));
typedef prototype_second_order((*vp8_second_order_fn_t));
typedef struct
{
vp8_idct_fn_t idct1;
vp8_idct_fn_t idct16;
vp8_idct_scalar_fn_t idct1_scalar;
vp8_idct_fn_t idct1;
vp8_idct_fn_t idct16;
vp8_idct_scalar_add_fn_t idct1_scalar_add;
vp8_second_order_fn_t iwalsh1;
vp8_second_order_fn_t iwalsh16;
......
......@@ -104,23 +104,30 @@ void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch)
}
}
/*
 * DC-only inverse transform that writes a 4x4 block of residual values.
 *
 * NOTE(review): this span of the rendered diff interleaved two versions of
 * the function; this appears to be the pre-change one -- confirm against
 * the repository before relying on it.
 *
 * input_dc  the single coefficient value
 * output    destination for the 4x4 block of shorts
 * pitch     row step of `output`; it is halved before being applied to the
 *           short pointer, so it appears to be expressed in bytes
 */
void vp8_dc_only_idct_c(short input_dc, short *output, int pitch)
{
    int i;
    int a1;
    short *op = output;
    int shortpitch = pitch >> 1;

    /* Add 4, then shift right by 3, to turn the coefficient into the
     * per-pixel value. */
    a1 = ((input_dc + 4) >> 3);

    for (i = 0; i < 4; i++)
    {
        op[0] = a1;
        op[1] = a1;
        op[2] = a1;
        op[3] = a1;
        op += shortpitch;
    }
}

/*
 * DC-only inverse transform combined with reconstruction: the same
 * constant value is added to every pixel of a 4x4 prediction block and the
 * sum, clamped to [0, 255], is stored to the destination.
 *
 * input_dc  the single coefficient value
 * pred_ptr  top-left of the 4x4 prediction block (read only)
 * dst_ptr   top-left of the 4x4 destination block
 * pitch     row step, in bytes, of the prediction buffer
 * stride    row step, in bytes, of the destination buffer
 */
void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
{
    int a1 = ((input_dc + 4) >> 3);
    int r, c;

    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
        {
            int a = a1 + pred_ptr[c];

            /* Saturate to the range of an 8-bit pixel. */
            if (a < 0)
                a = 0;

            if (a > 255)
                a = 255;

            dst_ptr[c] = (unsigned char) a;
        }

        /* The two buffers advance by their own row steps. */
        dst_ptr += stride;
        pred_ptr += pitch;
    }
}
void vp8_short_inv_walsh4x4_c(short *input, short *output)
......
......@@ -22,7 +22,6 @@
#if HAVE_MMX
extern prototype_idct(vp8_short_idct4x4llm_1_mmx);
extern prototype_idct(vp8_short_idct4x4llm_mmx);
extern prototype_idct_scalar(vp8_dc_only_idct_mmx);
extern prototype_second_order(vp8_short_inv_walsh4x4_mmx);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
......@@ -34,9 +33,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_mmx
#undef vp8_idct_idct1_scalar
#define vp8_idct_idct1_scalar vp8_dc_only_idct_mmx
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_mmx
......
......@@ -42,7 +42,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
{
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_mmx;
rtcd->idct.idct16 = vp8_short_idct4x4llm_mmx;
rtcd->idct.idct1_scalar = vp8_dc_only_idct_mmx;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_mmx;
......
......@@ -14,32 +14,14 @@
#if HAVE_ARMV6
extern prototype_dequant_block(vp8_dequantize_b_v6);
extern prototype_dequant_idct(vp8_dequant_idct_v6);
extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_v6);
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_v6
#undef vp8_dequant_idct
#define vp8_dequant_idct vp8_dequant_idct_v6
#undef vp8_dequant_idct_dc
#define vp8_dequant_idct_dc vp8_dequant_dc_idct_v6
#endif
#if HAVE_ARMV7
extern prototype_dequant_block(vp8_dequantize_b_neon);
extern prototype_dequant_idct(vp8_dequant_idct_neon);
extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_neon);
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_neon
#undef vp8_dequant_idct
#define vp8_dequant_idct vp8_dequant_idct_neon
#undef vp8_dequant_idct_dc
#define vp8_dequant_idct_dc vp8_dequant_dc_idct_neon
#endif
#endif
......@@ -23,8 +23,6 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
pbi->mb.rtcd = &pbi->common.rtcd;
#if HAVE_ARMV7
pbi->dequant.block = vp8_dequantize_b_neon;
pbi->dequant.idct = vp8_dequant_idct_neon;
pbi->dequant.idct_dc = vp8_dequant_dc_idct_neon;
pbi->dboolhuff.start = vp8dx_start_decode_c;
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
pbi->dboolhuff.debool = vp8dx_decode_bool_c;
......@@ -32,8 +30,6 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
#elif HAVE_ARMV6
pbi->dequant.block = vp8_dequantize_b_v6;
pbi->dequant.idct = vp8_dequant_idct_v6;
pbi->dequant.idct_dc = vp8_dequant_dc_idct_v6;
pbi->dboolhuff.start = vp8dx_start_decode_c;
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
pbi->dboolhuff.debool = vp8dx_decode_bool_c;
......
......@@ -126,7 +126,6 @@ static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
}
}
static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
{
/* If the MV points so far into the UMV border that no visible pixels
......@@ -182,8 +181,48 @@ static void clamp_mvs(MACROBLOCKD *xd)
}
static void reconstruct_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
{
int eobtotal = 0;
MV orig_mvs[24];
int i, do_clamp = xd->mbmi.need_to_clamp_mvs;
if (xd->mbmi.mb_skip_coeff)
{
vp8_reset_mb_tokens_context(xd);
}
else
{
eobtotal = vp8_decode_mb_tokens(pbi, xd);
}
/* Perform temporary clamping of the MV to be used for prediction */
if (do_clamp)
{
if (xd->mbmi.mode == SPLITMV)
for (i=0; i<24; i++)
orig_mvs[i] = xd->block[i].bmi.mv.as_mv;
else
{
orig_mvs[0] = xd->mbmi.mv.as_mv;
orig_mvs[1] = xd->block[16].bmi.mv.as_mv;
}
clamp_mvs(xd);
}
xd->mode_info_context->mbmi.dc_diff = 1;
if (xd->mbmi.mode != B_PRED && xd->mbmi.mode != SPLITMV && eobtotal == 0)
{
xd->mode_info_context->mbmi.dc_diff = 0;
skip_recon_mb(pbi, xd);
return;
}
if (xd->segmentation_enabled)
mb_init_dequantizer(pbi, xd);
// do prediction
if (xd->frame_type == KEY_FRAME || xd->mbmi.ref_frame == INTRA_FRAME)
{
vp8_build_intra_predictors_mbuv(xd);
......@@ -191,29 +230,19 @@ static void reconstruct_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
if (xd->mbmi.mode != B_PRED)
{
vp8_build_intra_predictors_mby_ptr(xd);
vp8_recon16x16mb(RTCD_VTABLE(recon), xd);
}
else
{
vp8_recon_intra4x4mb(RTCD_VTABLE(recon), xd);
} else {
vp8_intra_prediction_down_copy(xd);
}
}
else
{
vp8_build_inter_predictors_mb(xd);
vp8_recon16x16mb(RTCD_VTABLE(recon), xd);
}
}
static void de_quantand_idct(VP8D_COMP *pbi, MACROBLOCKD *xd)
{
int i;
BLOCKD *b = &xd->block[24];
// dequantization and idct
if (xd->mbmi.mode != B_PRED && xd->mbmi.mode != SPLITMV)
{
BLOCKD *b = &xd->block[24];
DEQUANT_INVOKE(&pbi->dequant, block)(b);
// do 2nd order transform on the dc block
......@@ -243,105 +272,66 @@ static void de_quantand_idct(VP8D_COMP *pbi, MACROBLOCKD *xd)
if (b->eob > 1)
{
DEQUANT_INVOKE(&pbi->dequant, idct_dc)(b->qcoeff, &b->dequant[0][0], b->diff, 32, xd->block[24].diff[i]);
DEQUANT_INVOKE(&pbi->dequant, idct_dc_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride,
xd->block[24].diff[i]);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar)(xd->block[24].diff[i], b->diff, 32);
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(xd->block[24].diff[i], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
}
}
for (i = 16; i < 24; i++)
}
else if ((xd->frame_type == KEY_FRAME || xd->mbmi.ref_frame == INTRA_FRAME) && xd->mbmi.mode == B_PRED)
{
for (i = 0; i < 16; i++)
{
b = &xd->block[i];
BLOCKD *b = &xd->block[i];
vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
if (b->eob > 1)
{
DEQUANT_INVOKE(&pbi->dequant, idct)(b->qcoeff, &b->dequant[0][0], b->diff, 16);
DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar)(b->qcoeff[0] * b->dequant[0][0], b->diff, 16);
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(b->qcoeff[0] * b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
((int *)b->qcoeff)[0] = 0;
}
}
}
else
{
for (i = 0; i < 24; i++)
for (i = 0; i < 16; i++)
{
b = &xd->block[i];
BLOCKD *b = &xd->block[i];
if (b->eob > 1)
{
DEQUANT_INVOKE(&pbi->dequant, idct)(b->qcoeff, &b->dequant[0][0], b->diff, (32 - (i & 16)));
DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar)(b->qcoeff[0] * b->dequant[0][0], b->diff, (32 - (i & 16)));
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(b->qcoeff[0] * b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
((int *)b->qcoeff)[0] = 0;
}
}
}
}
void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
{
int eobtotal = 0;
MV orig_mvs[24];
int i, do_clamp = xd->mbmi.need_to_clamp_mvs;
if (xd->mbmi.mb_skip_coeff)
{
vp8_reset_mb_tokens_context(xd);
}
else
for (i = 16; i < 24; i++)
{
eobtotal = vp8_decode_mb_tokens(pbi, xd);
}
/* Perform temporary clamping of the MV to be used for prediction */
if (do_clamp)
{
if (xd->mbmi.mode == SPLITMV)
for (i=0; i<24; i++)
orig_mvs[i] = xd->block[i].bmi.mv.as_mv;
else
BLOCKD *b = &xd->block[i];
if (b->eob > 1)
{
orig_mvs[0] = xd->mbmi.mv.as_mv;
orig_mvs[1] = xd->block[16].bmi.mv.as_mv;
DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 8, b->dst_stride);
}
clamp_mvs(xd);
}
xd->mode_info_context->mbmi.dc_diff = 1;
if (xd->mbmi.mode != B_PRED && xd->mbmi.mode != SPLITMV && eobtotal == 0)
{
xd->mode_info_context->mbmi.dc_diff = 0;
skip_recon_mb(pbi, xd);
}
else
{
if (xd->segmentation_enabled)
mb_init_dequantizer(pbi, xd);
de_quantand_idct(pbi, xd);
reconstruct_mb(pbi, xd);
}
/* Restore the original MV so as not to affect the entropy context. */
if (do_clamp)
{
if (xd->mbmi.mode == SPLITMV)
for (i=0; i<24; i++)
xd->block[i].bmi.mv.as_mv = orig_mvs[i];
else
{
xd->mbmi.mv.as_mv = orig_mvs[0];
xd->block[16].bmi.mv.as_mv = orig_mvs[1];
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(b->qcoeff[0] * b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 8, b->dst_stride);
((int *)b->qcoeff)[0] = 0;
}
}
}
......
......@@ -32,8 +32,12 @@ void vp8_dequantize_b_c(BLOCKD *d)
}
}
void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch)
void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
{
// output needs to be at least pitch * 4 for vp8_short_idct4x4llm_c to work properly
short output[16*4];
short *diff_ptr = output;
int r, c;
int i;
for (i = 0; i < 16; i++)
......@@ -41,13 +45,38 @@ void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch)
input[i] = dq[i] * input[i];
}
vp8_short_idct4x4llm_c(input, output, pitch);
vp8_short_idct4x4llm_c(input, output, pitch*2);
vpx_memset(input, 0, 32);
for (r = 0; r < 4; r++)
{
for (c = 0; c < 4; c++)
{
int a = diff_ptr[c] + pred[c];
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dest[c] = (unsigned char) a;
}
dest += stride;
diff_ptr += pitch;
pred += pitch;
}
}
void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, int Dc)
void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
{
int i;
// output needs to be at least pitch * 4 for vp8_short_idct4x4llm_c to work properly
short output[16*4];
short *diff_ptr = output;
int r, c;
input[0] = (short)Dc;
......@@ -56,6 +85,27 @@ void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, in
input[i] = dq[i] * input[i];
}
vp8_short_idct4x4llm_c(input, output, pitch);
vp8_short_idct4x4llm_c(input, output, pitch*2);
vpx_memset(input, 0, 32);
for (r = 0; r < 4; r++)
{
for (c = 0; c < 4; c++)
{
int a = diff_ptr[c] + pred[c];
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dest[c] = (unsigned char) a;
}
dest += stride;
diff_ptr += pitch;
pred += pitch;
}
}
......@@ -16,11 +16,16 @@
/*
 * Declarator macros for the dequantizer entry points.  Each expands to a
 * complete `void` function declarator for the supplied name, usable for
 * `extern` declarations and, given `(*name)`, for function-pointer typedefs.
 */

/* Operates on a single BLOCKD. */
#define prototype_dequant_block(fn) void fn(BLOCKD *x)

/* Input and dequantization-factor arrays in, shorts out. */
#define prototype_dequant_idct(fn) \
    void fn(short *input, short *dq, short *output, int pitch)

/*
 * As above, but with separate byte buffers for the prediction and the
 * output, each with its own row step.
 */
#define prototype_dequant_idct_add(fn)                          \
    void fn(short *input, short *dq, unsigned char *pred,       \
            unsigned char *output, int pitch, int stride)

/* Short-output form taking an extra `dc` value. */
#define prototype_dequant_idct_dc(fn) \
    void fn(short *input, short *dq, short *output, int pitch, int dc)

/* Byte-output form taking an extra `dc` value. */
#define prototype_dequant_idct_dc_add(fn)                       \
    void fn(short *input, short *dq, unsigned char *pred,       \
            unsigned char *output, int pitch, int stride, int dc)
#if ARCH_X86 || ARCH_X86_64
#include "x86/dequantize_x86.h"
......@@ -35,25 +40,26 @@
#endif
extern prototype_dequant_block(vp8_dequant_block);
#ifndef vp8_dequant_idct
#define vp8_dequant_idct vp8_dequant_idct_c
#ifndef vp8_dequant_idct_add
#define vp8_dequant_idct_add vp8_dequant_idct_add_c
#endif
extern prototype_dequant_idct(vp8_dequant_idct);
extern prototype_dequant_idct_add(vp8_dequant_idct_add);
#ifndef vp8_dequant_idct_dc
#define vp8_dequant_idct_dc vp8_dequant_dc_idct_c
#ifndef vp8_dequant_idct_dc_add
#define vp8_dequant_idct_dc_add vp8_dequant_dc_idct_add_c
#endif
extern prototype_dequant_idct_dc(vp8_dequant_idct_dc);
extern prototype_dequant_idct_dc_add(vp8_dequant_idct_dc_add);
/*
 * Function-pointer types for the dequantizer entry points, built by handing
 * `(*name)` to the prototype_dequant_* declarator macros.
 */
typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
typedef prototype_dequant_idct((*vp8_dequant_idct_fn_t));
typedef prototype_dequant_idct_dc((*vp8_dequant_idct_dc_fn_t));
typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));
typedef prototype_dequant_idct_dc_add((*vp8_dequant_idct_dc_add_fn_t));
/*
 * Table of dequantizer function pointers.
 *
 * NOTE(review): `block` is listed twice below, which is not valid C.  This
 * looks like the pre-change member list (block, idct, idct_dc) and the
 * post-change one (block, idct_add, idct_dc_add) rendered together from a
 * diff -- confirm against the repository which set applies.
 */
typedef struct
{
vp8_dequant_block_fn_t block;
vp8_dequant_idct_fn_t idct;
vp8_dequant_idct_dc_fn_t idct_dc;
vp8_dequant_block_fn_t block;
vp8_dequant_idct_add_fn_t idct_add;
vp8_dequant_idct_dc_add_fn_t idct_dc_add;
} vp8_dequant_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT
......
......@@ -21,8 +21,8 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
#if CONFIG_RUNTIME_CPU_DETECT
pbi->mb.rtcd = &pbi->common.rtcd;
pbi->dequant.block = vp8_dequantize_b_c;
pbi->dequant.idct = vp8_dequant_idct_c;
pbi->dequant.idct_dc = vp8_dequant_dc_idct_c;
pbi->dequant.idct_add = vp8_dequant_idct_add_c;
pbi->dequant.idct_dc_add = vp8_dequant_dc_idct_add_c;
pbi->dboolhuff.start = vp8dx_start_decode_c;