Commit b6b91432 authored by Jingning Han
Browse files

Dual buffer encoding for intra modes

The overall change (using the dual buffer scheme for superblocks in both
inter and intra modes) reduces the encoding runtime at speed 2:
bluesky_1080p at 6000kbps:   263553ms -> 257441ms
riverbed_1080p at 8000kbps:  233230ms -> 225308ms.

Change-Id: Idf8d70f768a4b0d97b2a8506372c57b7b4022119
parent 8ce0967d
......@@ -27,16 +27,16 @@ typedef struct {
typedef struct {
MODE_INFO mic;
uint8_t *zcoeff_blk;
int16_t *coeff[MAX_MB_PLANE][2];
int16_t *qcoeff[MAX_MB_PLANE][2];
int16_t *dqcoeff[MAX_MB_PLANE][2];
uint16_t *eobs[MAX_MB_PLANE][2];
int16_t *coeff[MAX_MB_PLANE][3];
int16_t *qcoeff[MAX_MB_PLANE][3];
int16_t *dqcoeff[MAX_MB_PLANE][3];
uint16_t *eobs[MAX_MB_PLANE][3];
// dual buffer pointers, 0: in use, 1: best in store
int16_t *coeff_pbuf[MAX_MB_PLANE][2];
int16_t *qcoeff_pbuf[MAX_MB_PLANE][2];
int16_t *dqcoeff_pbuf[MAX_MB_PLANE][2];
uint16_t *eobs_pbuf[MAX_MB_PLANE][2];
int16_t *coeff_pbuf[MAX_MB_PLANE][3];
int16_t *qcoeff_pbuf[MAX_MB_PLANE][3];
int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
uint16_t *eobs_pbuf[MAX_MB_PLANE][3];
int is_coded;
int num_4x4_blk;
......@@ -94,6 +94,7 @@ struct macroblock {
MACROBLOCKD e_mbd;
int skip_block;
int select_txfm_size;
int skip_recode;
int skip_optimize;
int q_index;
......
......@@ -377,6 +377,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
const int mis = cm->mode_info_stride;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
const int mi_height = num_8x8_blocks_high_lookup[bsize];
int max_plane;
assert(mi->mbmi.mode < MB_MODE_COUNT);
assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES);
......@@ -385,13 +386,21 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
*mi_addr = *mi;
for (i = 0; i < MAX_MB_PLANE; ++i) {
max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
for (i = 0; i < max_plane; ++i) {
p[i].coeff = ctx->coeff_pbuf[i][1];
pd[i].qcoeff = ctx->qcoeff_pbuf[i][1];
pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
pd[i].eobs = ctx->eobs_pbuf[i][1];
}
for (i = max_plane; i < MAX_MB_PLANE; ++i) {
p[i].coeff = ctx->coeff_pbuf[i][2];
pd[i].qcoeff = ctx->qcoeff_pbuf[i][2];
pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
pd[i].eobs = ctx->eobs_pbuf[i][2];
}
// Restore the coding context of the MB to that that was in place
// when the mode was picked for it
for (y = 0; y < mi_height; y++)
......@@ -619,6 +628,7 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
pd[i].eobs = ctx->eobs_pbuf[i][0];
}
ctx->is_coded = 0;
x->skip_recode = 0;
// Set to zero to make sure we do not use the previous encoded frame stats
xd->mi_8x8[0]->mbmi.skip_coeff = 0;
......@@ -2406,6 +2416,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
const int mis = cm->mode_info_stride;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
const int mi_height = num_8x8_blocks_high_lookup[bsize];
x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8;
x->skip_optimize = ctx->is_coded;
ctx->is_coded = 1;
x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
......
......@@ -432,19 +432,18 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
// TODO(jingning): per transformed block zero forcing only enabled for
// luma component. will integrate chroma components as well.
if (x->zcoeff_blk[tx_size][block] && plane == 0) {
int i, k;
int i, j;
pd->eobs[block] = 0;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &k);
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
ctx->ta[plane][i] = 0;
ctx->tl[plane][k] = 0;
ctx->tl[plane][j] = 0;
return;
}
if (x->select_txfm_size || xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8)
if (!x->skip_recode)
vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
if (x->optimize && (x->select_txfm_size ||
xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8|| !x->skip_optimize)) {
if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
} else {
int i, k;
......@@ -515,10 +514,10 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
struct optimize_ctx ctx;
struct encode_b_args arg = {x, &ctx};
if (x->select_txfm_size || xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8)
if (!x->skip_recode)
vp9_subtract_sb(x, bsize);
if (x->optimize) {
if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
int i;
for (i = 0; i < MAX_MB_PLANE; ++i)
optimize_init_b(i, bsize, &arg);
......@@ -563,10 +562,12 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 32 * (block & twmask);
yoff = 32 * (block >> twl);
dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
src = p->src.buf + yoff * p->src.stride + xoff;
src_diff = p->src_diff + 4 * bw * yoff + xoff;
vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode,
dst, pd->dst.stride, dst, pd->dst.stride);
if (!x->skip_recode) {
src = p->src.buf + yoff * p->src.stride + xoff;
src_diff = p->src_diff + 4 * bw * yoff + xoff;
vp9_subtract_block(32, 32, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
if (x->use_lp32x32fdct)
......@@ -576,6 +577,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
}
if (!x->skip_encode && *eob)
vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob);
break;
......@@ -588,16 +590,18 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 16 * (block & twmask);
yoff = 16 * (block >> twl);
dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
src = p->src.buf + yoff * p->src.stride + xoff;
src_diff = p->src_diff + 4 * bw * yoff + xoff;
vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode,
dst, pd->dst.stride, dst, pd->dst.stride);
if (!x->skip_recode) {
src = p->src.buf + yoff * p->src.stride + xoff;
src_diff = p->src_diff + 4 * bw * yoff + xoff;
vp9_subtract_block(16, 16, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
vp9_fht16x16(tx_type, src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
}
if (!x->skip_encode && *eob)
vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
break;
......@@ -610,16 +614,18 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 8 * (block & twmask);
yoff = 8 * (block >> twl);
dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
src = p->src.buf + yoff * p->src.stride + xoff;
src_diff = p->src_diff + 4 * bw * yoff + xoff;
vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode,
dst, pd->dst.stride, dst, pd->dst.stride);
if (!x->skip_recode) {
src = p->src.buf + yoff * p->src.stride + xoff;
src_diff = p->src_diff + 4 * bw * yoff + xoff;
vp9_subtract_block(8, 8, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
vp9_fht8x8(tx_type, src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
}
if (!x->skip_encode && *eob)
vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
break;
......@@ -635,10 +641,12 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 4 * (block & twmask);
yoff = 4 * (block >> twl);
dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
src = p->src.buf + yoff * p->src.stride + xoff;
src_diff = p->src_diff + 4 * bw * yoff + xoff;
vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
dst, pd->dst.stride, dst, pd->dst.stride);
if (!x->skip_recode) {
src = p->src.buf + yoff * p->src.stride + xoff;
src_diff = p->src_diff + 4 * bw * yoff + xoff;
vp9_subtract_block(4, 4, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
if (tx_type != DCT_DCT)
......@@ -648,6 +656,8 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
}
if (!x->skip_encode && *eob) {
if (tx_type == DCT_DCT)
// this is like vp9_short_idct4x4 but has a special case around eob<=1
......
......@@ -535,6 +535,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
pd[i].eobs = ctx->eobs_pbuf[i][1];
}
x->skip_recode = 0;
// Initialise the MV cost table to the defaults
......
......@@ -1452,7 +1452,7 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
for (i = 0; i < MAX_MB_PLANE; ++i) {
for (k = 0; k < 2; ++k) {
for (k = 0; k < 3; ++k) {
CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
vpx_memalign(16, num_pix * sizeof(int16_t)));
CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
......@@ -1474,7 +1474,7 @@ static void free_mode_context(PICK_MODE_CONTEXT *ctx) {
vpx_free(ctx->zcoeff_blk);
ctx->zcoeff_blk = 0;
for (i = 0; i < MAX_MB_PLANE; ++i) {
for (k = 0; k < 2; ++k) {
for (k = 0; k < 3; ++k) {
vpx_free(ctx->coeff[i][k]);
ctx->coeff[i][k] = 0;
vpx_free(ctx->qcoeff[i][k]);
......
......@@ -246,7 +246,8 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
vp9_set_speed_features(cpi);
cpi->mb.select_txfm_size = cpi->sf.tx_size_search_method == USE_LARGESTALL ?
cpi->mb.select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
cm->frame_type != KEY_FRAME) ?
0 : 1;
set_block_thresholds(cpi);
......@@ -1329,6 +1330,7 @@ static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
}
static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
PICK_MODE_CONTEXT *ctx,
int *rate, int *rate_tokenonly,
int64_t *distortion, int *skippable,
BLOCK_SIZE bsize) {
......@@ -1364,6 +1366,27 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
*rate_tokenonly = this_rate_tokenonly;
*distortion = this_distortion;
*skippable = s;
if (!x->select_txfm_size) {
int i;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = x->e_mbd.plane;
for (i = 1; i < MAX_MB_PLANE; ++i) {
p[i].coeff = ctx->coeff_pbuf[i][2];
pd[i].qcoeff = ctx->qcoeff_pbuf[i][2];
pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
pd[i].eobs = ctx->eobs_pbuf[i][2];
ctx->coeff_pbuf[i][2] = ctx->coeff_pbuf[i][0];
ctx->qcoeff_pbuf[i][2] = ctx->qcoeff_pbuf[i][0];
ctx->dqcoeff_pbuf[i][2] = ctx->dqcoeff_pbuf[i][0];
ctx->eobs_pbuf[i][2] = ctx->eobs_pbuf[i][0];
ctx->coeff_pbuf[i][0] = p[i].coeff;
ctx->qcoeff_pbuf[i][0] = pd[i].qcoeff;
ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
ctx->eobs_pbuf[i][0] = pd[i].eobs;
}
}
}
}
......@@ -1389,8 +1412,9 @@ static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
return this_rd;
}
static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize,
int *rate_uv, int *rate_uv_tokenonly,
static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
BLOCK_SIZE bsize, int *rate_uv,
int *rate_uv_tokenonly,
int64_t *dist_uv, int *skip_uv,
MB_PREDICTION_MODE *mode_uv) {
MACROBLOCK *const x = &cpi->mb;
......@@ -1403,7 +1427,7 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize,
// Else do a proper rd search for each possible transform size that may
// be considered in the main rd loop.
} else {
rd_pick_intra_sbuv_mode(cpi, x,
rd_pick_intra_sbuv_mode(cpi, x, ctx,
rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
}
......@@ -3033,12 +3057,13 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
return this_rd; // if 0, this will be re-calculated by caller
}
static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
int i;
static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int max_plane) {
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = x->e_mbd.plane;
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
for (i = 0; i < max_plane; ++i) {
p[i].coeff = ctx->coeff_pbuf[i][1];
pd[i].qcoeff = ctx->qcoeff_pbuf[i][1];
pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
......@@ -3075,7 +3100,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
*returnrate = INT_MAX;
return;
}
rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
&dist_uv, &uv_skip, bsize);
} else {
y_skip = 0;
......@@ -3084,7 +3109,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
*returnrate = INT_MAX;
return;
}
rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
&dist_uv, &uv_skip, BLOCK_8X8);
}
......@@ -3450,7 +3475,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
uv_tx = MIN(mbmi->tx_size, max_uv_txsize_lookup[bsize]);
if (rate_uv_intra[uv_tx] == INT_MAX) {
choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx],
choose_intra_uv_mode(cpi, ctx, bsize, &rate_uv_intra[uv_tx],
&rate_uv_tokenonly[uv_tx],
&dist_uv[uv_tx], &skip_uv[uv_tx],
&mode_uv[uv_tx]);
......@@ -3584,6 +3609,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// Did this mode help.. i.e. is it the new best mode
if (this_rd < best_rd || x->skip) {
int max_plane = MAX_MB_PLANE;
if (!mode_excluded) {
// Note index of best mode so far
best_mode_index = mode_index;
......@@ -3591,6 +3617,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (ref_frame == INTRA_FRAME) {
/* required for left and above block mv */
mbmi->mv[0].as_int = 0;
max_plane = 1;
}
*returnrate = rate2;
......@@ -3599,7 +3626,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
best_mbmode = *mbmi;
best_skip2 = this_skip2;
if (!x->select_txfm_size)
swap_block_ptr(x, ctx);
swap_block_ptr(x, ctx, max_plane);
vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
sizeof(uint8_t) * ctx->num_4x4_blk);
......@@ -3706,7 +3733,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// Do Intra UV best rd mode selection if best mode choice above was intra.
if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
&rate_uv_tokenonly[uv_tx_size],
&dist_uv[uv_tx_size],
&skip_uv[uv_tx_size],
......@@ -4075,7 +4102,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
distortion2 += distortion_y;
if (rate_uv_intra[TX_4X4] == INT_MAX) {
choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4],
choose_intra_uv_mode(cpi, ctx, bsize, &rate_uv_intra[TX_4X4],
&rate_uv_tokenonly[TX_4X4],
&dist_uv[TX_4X4], &skip_uv[TX_4X4],
&mode_uv[TX_4X4]);
......@@ -4329,12 +4356,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
// Did this mode help.. i.e. is it the new best mode
if (this_rd < best_rd || x->skip) {
if (!mode_excluded) {
int max_plane = MAX_MB_PLANE;
// Note index of best mode so far
best_mode_index = mode_index;
if (ref_frame == INTRA_FRAME) {
/* required for left and above block mv */
mbmi->mv[0].as_int = 0;
max_plane = 1;
}
*returnrate = rate2;
......@@ -4345,7 +4374,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
best_mbmode = *mbmi;
best_skip2 = this_skip2;
if (!x->select_txfm_size)
swap_block_ptr(x, ctx);
swap_block_ptr(x, ctx, max_plane);
vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
sizeof(uint8_t) * ctx->num_4x4_blk);
......@@ -4452,7 +4481,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
// Do Intra UV best rd mode selection if best mode choice above was intra.
if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
&rate_uv_tokenonly[uv_tx_size],
&dist_uv[uv_tx_size],
&skip_uv[uv_tx_size],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment