Commit 2f2302f8 authored by Yunqing Wang's avatar Yunqing Wang
Browse files

Preload reference area in sub-pixel motion search (real-time mode)

This change implemented same idea in change "Preload reference area
to an intermediate buffer in sub-pixel motion search." The changes
were made to vp8_find_best_sub_pixel_step() and vp8_find_best_half
_pixel_step() functions which are called when speed >= 5. Test
result (using tulip clip):

1. On Core2 Quad machine(Linux)
rt mode, speed (-5 ~ -8), encoding speed gain: 2% ~ 3%
rt mode, speed (-9 ~ -11), encoding speed gain: 1% ~ 2%
rt mode, speed (-12 ~ -14), no noticeable encoding speed gain

2. On Xeon machine(Linux)
Test on speed (-5 ~ -14) didn't show noticeable speed change.

Change-Id: I21bec2d6e7fbe541fcc0f4c0366bbdf3e2076aa2
parent f11613b6
......@@ -343,12 +343,26 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
int bestmse = INT_MAX;
int_mv startmv;
int_mv this_mv;
unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
unsigned char *z = (*(b->base_src) + b->src);
int left, right, up, down, diag;
unsigned int sse;
int whichdir ;
int thismse;
int y_stride;
#if ARCH_X86 || ARCH_X86_64
MACROBLOCKD *xd = &x->e_mbd;
unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
unsigned char *y;
y_stride = 32;
/* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
y = xd->y_buf + y_stride + 1;
#else
unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
y_stride = d->pre_stride;
#endif
// central mv
bestmv->as_mv.row <<= 3;
......@@ -356,14 +370,14 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
startmv = *bestmv;
// calculate central point error
bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1);
bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
*distortion = bestmse;
bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
// go left then right and check error
this_mv.as_mv.row = startmv.as_mv.row;
this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
thismse = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (left < bestmse)
......@@ -375,7 +389,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
}
this_mv.as_mv.col += 8;
thismse = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (right < bestmse)
......@@ -389,7 +403,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
// go up then down and check error
this_mv.as_mv.col = startmv.as_mv.col;
this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
thismse = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (up < bestmse)
......@@ -401,7 +415,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
}
this_mv.as_mv.row += 8;
thismse = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (down < bestmse)
......@@ -424,23 +438,23 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
case 0:
this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
thismse = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
break;
case 1:
this_mv.as_mv.col += 4;
this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
thismse = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
break;
case 2:
this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
this_mv.as_mv.row += 4;
thismse = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
break;
case 3:
default:
this_mv.as_mv.col += 4;
this_mv.as_mv.row += 4;
thismse = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
break;
}
......@@ -459,7 +473,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
// time to check quarter pels.
if (bestmv->as_mv.row < startmv.as_mv.row)
y -= d->pre_stride;
y -= y_stride;
if (bestmv->as_mv.col < startmv.as_mv.col)
y--;
......@@ -474,12 +488,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
if (startmv.as_mv.col & 7)
{
this_mv.as_mv.col = startmv.as_mv.col - 2;
thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
}
else
{
this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
thismse = vfp->svf(y - 1, d->pre_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
}
left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
......@@ -493,7 +507,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
}
this_mv.as_mv.col += 4;
thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (right < bestmse)
......@@ -510,12 +524,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
if (startmv.as_mv.row & 7)
{
this_mv.as_mv.row = startmv.as_mv.row - 2;
thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
}
else
{
this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
thismse = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse);
thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse);
}
up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
......@@ -529,7 +543,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
}
this_mv.as_mv.row += 4;
thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (down < bestmse)
......@@ -559,12 +573,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
if (startmv.as_mv.col & 7)
{
this_mv.as_mv.col -= 2;
thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
}
else
{
this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
thismse = vfp->svf(y - 1, d->pre_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);;
thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);;
}
}
else
......@@ -574,12 +588,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
if (startmv.as_mv.col & 7)
{
this_mv.as_mv.col -= 2;
thismse = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse);
thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse);
}
else
{
this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
thismse = vfp->svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse);
thismse = vfp->svf(y - y_stride - 1, y_stride, 6, 6, z, b->src_stride, &sse);
}
}
......@@ -590,12 +604,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
if (startmv.as_mv.row & 7)
{
this_mv.as_mv.row -= 2;
thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
}
else
{
this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
thismse = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse);
thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse);
}
break;
......@@ -605,19 +619,19 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
if (startmv.as_mv.col & 7)
{
this_mv.as_mv.col -= 2;
thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
}
else
{
this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
thismse = vfp->svf(y - 1, d->pre_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
}
break;
case 3:
this_mv.as_mv.col += 2;
this_mv.as_mv.row += 2;
thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
break;
}
......@@ -634,7 +648,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
return bestmse;
}
int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
const vp8_variance_fn_ptr_t *vfp,
......@@ -644,11 +658,25 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
int bestmse = INT_MAX;
int_mv startmv;
int_mv this_mv;
unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
unsigned char *z = (*(b->base_src) + b->src);
int left, right, up, down, diag;
unsigned int sse;
int thismse;
int y_stride;
#if ARCH_X86 || ARCH_X86_64
MACROBLOCKD *xd = &x->e_mbd;
unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
unsigned char *y;
y_stride = 32;
/* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
y = xd->y_buf + y_stride + 1;
#else
unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
y_stride = d->pre_stride;
#endif
// central mv
bestmv->as_mv.row <<= 3;
......@@ -656,14 +684,14 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
startmv = *bestmv;
// calculate central point error
bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1);
bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
*distortion = bestmse;
bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
// go left then right and check error
this_mv.as_mv.row = startmv.as_mv.row;
this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
thismse = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (left < bestmse)
......@@ -675,7 +703,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
}
this_mv.as_mv.col += 8;
thismse = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (right < bestmse)
......@@ -689,7 +717,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
// go up then down and check error
this_mv.as_mv.col = startmv.as_mv.col;
this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
thismse = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (up < bestmse)
......@@ -701,7 +729,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
}
this_mv.as_mv.row += 8;
thismse = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (down < bestmse)
......@@ -723,22 +751,22 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
case 0:
this_mv.col = (this_mv.col - 8) | 4;
this_mv.row = (this_mv.row - 8) | 4;
diag = vfp->svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
diag = vfp->svf(y - 1 - y_stride, y_stride, 4, 4, z, b->src_stride, &sse);
break;
case 1:
this_mv.col += 4;
this_mv.row = (this_mv.row - 8) | 4;
diag = vfp->svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
diag = vfp->svf(y - y_stride, y_stride, 4, 4, z, b->src_stride, &sse);
break;
case 2:
this_mv.col = (this_mv.col - 8) | 4;
this_mv.row += 4;
diag = vfp->svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
diag = vfp->svf(y - 1, y_stride, 4, 4, z, b->src_stride, &sse);
break;
case 3:
this_mv.col += 4;
this_mv.row += 4;
diag = vfp->svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
diag = vfp->svf(y, y_stride, 4, 4, z, b->src_stride, &sse);
break;
}
......@@ -753,7 +781,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
#else
this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
thismse = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (diag < bestmse)
......@@ -765,7 +793,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
}
this_mv.as_mv.col += 8;
thismse = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (diag < bestmse)
......@@ -778,7 +806,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
this_mv.as_mv.row = startmv.as_mv.row + 4;
thismse = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (diag < bestmse)
......@@ -790,7 +818,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
}
this_mv.as_mv.col += 8;
thismse = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (diag < bestmse)
......
......@@ -1181,6 +1181,9 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
// Should we do a full search (best quality only)
if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000)
{
/* Check if mvp_full is within the range. */
vp8_clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
thissme = cpi->full_search_sad(x, c, e, &mvp_full,
sadpb, 16, v_fn_ptr,
x->mvcost, bsi->ref_mv);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment