Commit b5f8b70c authored by Yue Chen

Speed up rd selection in OBMC experiment

Instead of testing every combination of interpolation filter and
BMC/OBMC prediction, we choose the best interpolation filter based on
regular (non-OBMC) inter prediction and then make the OBMC on/off
decision with that filter.

Reduction in encoding time: ~10%
Drop in performance gain: 0.08% lowres, 0.04% midres

Change-Id: Ifc19097a918ac76b529db9af4c60e2c70e93f7ad
parent 68d9a14e
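For reference, below is a minimal standalone sketch (not libvpx code) of the
two-stage decision this commit moves to: the interpolation filter is chosen
from regular (non-OBMC) prediction only, and OBMC is then tested once, on/off,
with that filter. The helpers model_rd_regular() and model_rd_obmc() are
hypothetical stand-ins for the encoder's model_rd_for_sb()/RDCOST() machinery.

/*
 * Sketch of the two-stage RD decision; numbers and helpers are made up.
 */
#include <stdint.h>
#include <stdio.h>

#define NUM_FILTERS 3

/* Hypothetical RD cost of regular inter prediction with a given filter. */
static int64_t model_rd_regular(int filter) {
  static const int64_t rd[NUM_FILTERS] = { 1200, 1100, 1150 };
  return rd[filter];
}

/* Hypothetical RD cost with OBMC blending applied on top of that filter. */
static int64_t model_rd_obmc(int filter) {
  return model_rd_regular(filter) - 40; /* pretend OBMC helps slightly */
}

int main(void) {
  /* Stage 1: filter search without OBMC
   * (previously every filter was evaluated both with and without OBMC). */
  int best_filter = 0;
  int64_t best_rd = INT64_MAX;
  for (int f = 0; f < NUM_FILTERS; ++f) {
    const int64_t rd = model_rd_regular(f);
    if (rd < best_rd) {
      best_rd = rd;
      best_filter = f;
    }
  }

  /* Stage 2: a single OBMC on/off decision using the filter chosen above. */
  const int use_obmc = model_rd_obmc(best_filter) < best_rd;

  printf("best filter: %d, obmc: %d\n", best_filter, use_obmc);
  return 0;
}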
@@ -5854,28 +5854,6 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
       !is_comp_interintra_pred &&
 #endif  // CONFIG_EXT_INTER
       is_obmc_allowed(mbmi);
-  int best_obmc_flag = 0;
-#if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, tmp_buf1_16[MAX_MB_PLANE * CU_SIZE * CU_SIZE]);
-  DECLARE_ALIGNED(16, uint16_t, tmp_buf2_16[MAX_MB_PLANE * CU_SIZE * CU_SIZE]);
-  uint8_t *tmp_buf1, *tmp_buf2;
-  uint8_t *obmc_tmp_buf1[3];
-  uint8_t *obmc_tmp_buf2[3];
-#else
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * CU_SIZE * CU_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * CU_SIZE * CU_SIZE]);
-  uint8_t *obmc_tmp_buf1[3] = {tmp_buf1, tmp_buf1 + CU_SIZE * CU_SIZE,
-                               tmp_buf1 + CU_SIZE * CU_SIZE * 2};
-  uint8_t *obmc_tmp_buf2[3] = {tmp_buf2, tmp_buf2 + CU_SIZE * CU_SIZE,
-                               tmp_buf2 + CU_SIZE * CU_SIZE * 2};
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-  int obmc_tmp_stride[3] = {CU_SIZE, CU_SIZE, CU_SIZE};
-  uint8_t skip_txfm_bestfilter[2][MAX_MB_PLANE << 2] = {{0}, {0}};
-  int64_t bsse_bestfilter[2][MAX_MB_PLANE << 2] = {{0}, {0}};
-  int skip_txfm_sb_bestfilter[2] = {0};
-  int64_t skip_sse_sb_bestfilter[2] = {INT64_MAX};
   int rate2_nocoeff, best_rate2 = INT_MAX,
       best_skippable, best_xskip, best_disable_skip = 0;
 #if CONFIG_SUPERTX
@@ -5933,25 +5911,9 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
-#if CONFIG_OBMC
-    tmp_buf1 = CONVERT_TO_BYTEPTR(tmp_buf1_16);
-    tmp_buf2 = CONVERT_TO_BYTEPTR(tmp_buf2_16);
-#endif  // CONFIG_OBMC
   } else {
     tmp_buf = (uint8_t *)tmp_buf16;
-#if CONFIG_OBMC
-    tmp_buf1 = (uint8_t *)tmp_buf1_16;
-    tmp_buf2 = (uint8_t *)tmp_buf2_16;
-#endif  // CONFIG_OBMC
   }
-#if CONFIG_OBMC
-  obmc_tmp_buf1[0] = tmp_buf1;
-  obmc_tmp_buf1[1] = tmp_buf1 + 4096;
-  obmc_tmp_buf1[2] = tmp_buf1 + 8192;
-  obmc_tmp_buf2[0] = tmp_buf2;
-  obmc_tmp_buf2[1] = tmp_buf2 + 4096;
-  obmc_tmp_buf2[2] = tmp_buf2 + 8192;
-#endif  // CONFIG_OBMC
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   if (is_comp_pred) {
@@ -6215,15 +6177,6 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
       int64_t rs_rd;
       int tmp_skip_sb = 0;
       int64_t tmp_skip_sse = INT64_MAX;
-#if CONFIG_OBMC
-      int obmc_flag = 0;
-      int tmp_skip_sb_obmc = 0;
-      int64_t tmp_skip_sse_obmc = INT64_MAX;
-      int64_t rdobmc = INT64_MAX;
-      uint8_t *obmc_tmp_buf[3];
-      uint8_t tmp_skip_txfm[MAX_MB_PLANE << 2] = {0};
-      int64_t tmp_bsse[MAX_MB_PLANE << 2] = {0};
-#endif  // CONFIG_OBMC
       mbmi->interp_filter = i;
       rs = vp10_get_switchable_rate(cpi, xd);
@@ -6236,21 +6189,11 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
             VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
         if (cm->interp_filter == SWITCHABLE)
           rd += rs_rd;
-#if CONFIG_OBMC
-        if (allow_obmc) {
-          obmc_flag = best_obmc_flag;
-          rd += RDCOST(x->rdmult, x->rddiv,
-                       cpi->obmc_cost[bsize][obmc_flag], 0);
-        }
-#endif  // CONFIG_OBMC
         *mask_filter = VPXMAX(*mask_filter, rd);
       } else {
         int rate_sum = 0;
         int64_t dist_sum = 0;
-#if CONFIG_OBMC
-        int rate_sum_obmc = 0;
-        int64_t dist_sum_obmc = 0;
-#endif  // CONFIG_OBMC
         if (i > 0 && cpi->sf.adaptive_interp_filter_search &&
             (cpi->sf.interp_filter_search_mask & (1 << i))) {
           rate_sum = INT_MAX;
@@ -6267,18 +6210,10 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
              (cm->interp_filter == mbmi->interp_filter ||
               (i == 0 && intpel_mv && IsInterpolatingFilter(i))))) {
           restore_dst_buf(xd, orig_dst, orig_dst_stride);
-#if CONFIG_OBMC
-          for (j = 0; j < MAX_MB_PLANE; j++) {
-            obmc_tmp_buf[j] = obmc_tmp_buf1[j];
-          }
-#endif  // CONFIG_OBMC
         } else {
           for (j = 0; j < MAX_MB_PLANE; j++) {
             xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
             xd->plane[j].dst.stride = 64;
-#if CONFIG_OBMC
-            obmc_tmp_buf[j] = obmc_tmp_buf2[j];
-#endif  // CONFIG_OBMC
           }
         }
         vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
@@ -6286,34 +6221,6 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
                         &tmp_skip_sb, &tmp_skip_sse);
         rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
-#if CONFIG_OBMC
-        if (allow_obmc) {
-          rd += RDCOST(x->rdmult, x->rddiv, cpi->obmc_cost[bsize][0], 0);
-          memcpy(tmp_skip_txfm, x->skip_txfm, sizeof(tmp_skip_txfm));
-          memcpy(tmp_bsse, x->bsse, sizeof(tmp_bsse));
-          vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, 1,
-                                           obmc_tmp_buf, obmc_tmp_stride,
-                                           dst_buf1, dst_stride1,
-                                           dst_buf2, dst_stride2);
-          for (j = 0; j < MAX_MB_PLANE; ++j) {
-            xd->plane[j].dst.buf = obmc_tmp_buf[j];
-            xd->plane[j].dst.stride = obmc_tmp_stride[j];
-          }
-          model_rd_for_sb(cpi, bsize, x, xd, &rate_sum_obmc, &dist_sum_obmc,
-                          &tmp_skip_sb_obmc, &tmp_skip_sse_obmc);
-          rdobmc = RDCOST(x->rdmult, x->rddiv,
-                          rate_sum_obmc + cpi->obmc_cost[bsize][1],
-                          dist_sum_obmc);
-          if ((double)rdobmc <= 0.99 * (double)rd) {
-            obmc_flag = 1;
-            rd = rdobmc;
-            rate_sum = rate_sum_obmc;
-            dist_sum = dist_sum_obmc;
-          }
-        }
-#endif  // CONFIG_OBMC
         filter_cache[i] = rd;
         filter_cache[SWITCHABLE_FILTERS] =
             VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
@@ -6336,10 +6243,6 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
       newbest = i == 0 || rd < best_rd;
       if (newbest) {
-#if CONFIG_OBMC
-        if (allow_obmc)
-          best_obmc_flag = obmc_flag;
-#endif  // CONFIG_OBMC
         best_rd = rd;
         best_filter = mbmi->interp_filter;
         if (cm->interp_filter == SWITCHABLE && i &&
@@ -6353,31 +6256,6 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
         pred_exists = 1;
         tmp_rd = best_rd;
-#if CONFIG_OBMC
-        if (allow_obmc) {
-          skip_txfm_sb_bestfilter[0] = tmp_skip_sb;
-          skip_sse_sb_bestfilter[0] = tmp_skip_sse;
-          memcpy(skip_txfm_bestfilter[0], tmp_skip_txfm, sizeof(skip_txfm));
-          memcpy(bsse_bestfilter[0], tmp_bsse, sizeof(bsse));
-          skip_txfm_sb_bestfilter[1] = tmp_skip_sb_obmc;
-          skip_sse_sb_bestfilter[1] = tmp_skip_sse_obmc;
-          memcpy(skip_txfm_bestfilter[1], x->skip_txfm, sizeof(skip_txfm));
-          memcpy(bsse_bestfilter[1], x->bsse, sizeof(bsse));
-          if (best_obmc_flag) {
-            tmp_skip_sb = tmp_skip_sb_obmc;
-            tmp_skip_sse = tmp_skip_sse_obmc;
-          } else {
-            memcpy(x->skip_txfm, tmp_skip_txfm, sizeof(tmp_skip_txfm));
-            memcpy(x->bsse, tmp_bsse, sizeof(tmp_bsse));
-          }
-        } else {
-          skip_txfm_sb_bestfilter[0] = tmp_skip_sb;
-          skip_sse_sb_bestfilter[0] = tmp_skip_sse;
-          memcpy(skip_txfm_bestfilter[0], x->skip_txfm, sizeof(skip_txfm));
-          memcpy(bsse_bestfilter[0], x->bsse, sizeof(bsse));
-        }
-#endif  // CONFIG_OBMC
         skip_txfm_sb = tmp_skip_sb;
         skip_sse_sb = tmp_skip_sse;
         memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
@@ -6702,15 +6580,7 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
 #endif  // CONFIG_EXT_INTERP
 #endif  // CONFIG_EXT_INTER
-#if CONFIG_OBMC
-  if (allow_obmc)
-    mbmi->obmc = best_obmc_flag;
-  else
-    mbmi->obmc = 0;
-#endif  // CONFIG_OBMC
   if (pred_exists) {
-#if !CONFIG_OBMC
     if (best_needs_copy) {
       // again temporarily set the buffers to local memory to prevent a memcpy
       for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -6718,20 +6588,11 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
         xd->plane[i].dst.stride = 64;
       }
     }
-#endif  // !CONFIG_OBMC
-    rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0);
-#if CONFIG_OBMC
-    if (allow_obmc)
-      rd += RDCOST(x->rdmult, x->rddiv,
-                   cpi->obmc_cost[bsize][mbmi->obmc], 0);
-#endif  // CONFIG_OBMC
+    rd = tmp_rd;
   } else {
     int tmp_rate;
     int64_t tmp_dist;
-#if CONFIG_OBMC
-    int64_t rdobmc = INT64_MAX;
-    restore_dst_buf(xd, orig_dst, orig_dst_stride);
-#endif  // CONFIG_OBMC
     // Handles the special case when a filter that is not in the
     // switchable list (ex. bilinear) is indicated at the frame level, or
     // skip condition holds.
@@ -6739,34 +6600,6 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
     model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
                     &skip_txfm_sb, &skip_sse_sb);
     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
-#if CONFIG_OBMC
-    skip_txfm_sb_bestfilter[0] = skip_txfm_sb;
-    skip_sse_sb_bestfilter[0] = skip_sse_sb;
-    memcpy(skip_txfm_bestfilter[0], x->skip_txfm, sizeof(skip_txfm));
-    memcpy(bsse_bestfilter[0], x->bsse, sizeof(bsse));
-    if (allow_obmc) {
-      rd += RDCOST(x->rdmult, x->rddiv, cpi->obmc_cost[bsize][0], 0);
-      vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, 1,
-                                       obmc_tmp_buf1, obmc_tmp_stride,
-                                       dst_buf1, dst_stride1,
-                                       dst_buf2, dst_stride2);
-      for (i = 0; i < MAX_MB_PLANE; ++i) {
-        xd->plane[i].dst.buf = obmc_tmp_buf1[i];
-        xd->plane[i].dst.stride = obmc_tmp_stride[i];
-      }
-      model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
-                      &skip_txfm_sb, &skip_sse_sb);
-      rdobmc = RDCOST(x->rdmult, x->rddiv,
-                      rs + tmp_rate + cpi->obmc_cost[bsize][1], tmp_dist);
-      skip_txfm_sb_bestfilter[1] = skip_txfm_sb;
-      skip_sse_sb_bestfilter[1] = skip_sse_sb;
-      memcpy(skip_txfm_bestfilter[1], x->skip_txfm, sizeof(skip_txfm));
-      memcpy(bsse_bestfilter[1], x->bsse, sizeof(bsse));
-      if ((double)rdobmc <= 0.99 * (double)rd)
-        rd = rdobmc;
-    }
-#endif  // CONFIG_OBMC
     memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
     memcpy(bsse, x->bsse, sizeof(bsse));
   }
@@ -6857,40 +6690,16 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
 #if CONFIG_OBMC
   best_rd = INT64_MAX;
   for (mbmi->obmc = 0; mbmi->obmc <= allow_obmc; mbmi->obmc++) {
-    int64_t tmp_rd;
-    if (pred_exists) {
-      if (best_needs_copy) {
-        if (mbmi->obmc) {
-          for (i = 0; i < MAX_MB_PLANE; i++) {
-            xd->plane[i].dst.buf = obmc_tmp_buf2[i];
-            xd->plane[i].dst.stride = 64;
-          }
-        } else {
-          for (i = 0; i < MAX_MB_PLANE; i++) {
-            xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
-            xd->plane[i].dst.stride = 64;
-          }
-        }
-      } else {
-        if (mbmi->obmc) {
-          for (i = 0; i < MAX_MB_PLANE; i++) {
-            xd->plane[i].dst.buf = obmc_tmp_buf1[i];
-            xd->plane[i].dst.stride = 64;
-          }
-        } else {
-          restore_dst_buf(xd, orig_dst, orig_dst_stride);
-        }
-      }
-    } else {
-      if (mbmi->obmc) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = obmc_tmp_buf1[i];
-          xd->plane[i].dst.stride = 64;
-        }
-      } else {
-        restore_dst_buf(xd, orig_dst, orig_dst_stride);
-      }
-    }
+    int64_t tmp_rd, tmp_dist;
+    int tmp_rate;
+    if (mbmi->obmc) {
+      vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, 0,
+                                       NULL, NULL,
+                                       dst_buf1, dst_stride1,
+                                       dst_buf2, dst_stride2);
+      model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
+                      &skip_txfm_sb, &skip_sse_sb);
     }
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -6905,11 +6714,6 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
       vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    skip_txfm_sb = skip_txfm_sb_bestfilter[mbmi->obmc];
-    skip_sse_sb = skip_sse_sb_bestfilter[mbmi->obmc];
-    memcpy(x->skip_txfm, skip_txfm_bestfilter[mbmi->obmc],
-           sizeof(skip_txfm));
-    memcpy(x->bsse, bsse_bestfilter[mbmi->obmc], sizeof(bsse));
     x->skip = 0;
     *rate2 = rate2_nocoeff;
@@ -7010,10 +6814,7 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
       // The cost of skip bit needs to be added.
 #if CONFIG_OBMC
-      mbmi->skip = xd->lossless[mbmi->segment_id] ? 0 : 1;
-      if (xd->lossless[mbmi->segment_id])
-        *rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
-      else
+      mbmi->skip = 0;
 #endif  // CONFIG_OBMC
       *rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);