Commit 60cb39da authored by Ronald S. Bultje's avatar Ronald S. Bultje
Browse files

Dual 16x16 inter prediction.

This patch introduces the concept of dual inter16x16 prediction. A
16x16 inter-predicted macroblock can use 2 references instead of 1,
where both references use the same mvmode (new, near/est, zero). In the
case of newmv, this means that two MVs are coded instead of one. The
frame can be encoded in 3 ways: all MBs single-prediction, all MBs dual
prediction, or per-MB single/dual prediction selection ("hybrid"), in
which case a single bit is coded per-MB to indicate whether the MB uses
single or dual inter prediction.

In the future, we can (maybe?) get further gains by mixing this with
Adrian's 32x32 work, per-segment dual prediction settings, or adding
support for dual splitmv/8x8mv inter prediction.

Gain (on derf-set, CQ mode) is ~2.8% (SSIM) or ~3.6% (glb PSNR). Most
gain is at medium/high bitrates, but there are minor gains at low bitrates
also. Output was confirmed to match between encoder and decoder.

Note for optimization people: this patch introduces a 2nd version of
16x16/8x8 sixtap/bilin functions, which does an avg instead of a
store. They may want to look and make sure this is implemented to
their satisfaction so we can optimize it best in the future.

Change-ID: I59dc84b07cbb3ccf073ac0f756d03d294cb19281
parent b4ad9b5d
......@@ -217,6 +217,7 @@ HAVE_LIST="
unistd_h
"
EXPERIMENT_LIST="
dualpred
extend_qrange
segmentation
segfeatures
......
......@@ -193,6 +193,9 @@ void vp8_create_common(VP8_COMMON *oci)
vp8_default_bmode_probs(oci->fc.bmode_prob);
oci->mb_no_coeff_skip = 1;
#if CONFIG_DUALPRED
oci->dual_pred_mode = HYBRID_PREDICTION;
#endif /* CONFIG_DUALPRED */
oci->no_lpf = 0;
oci->filter_type = NORMAL_LOOPFILTER;
oci->use_bilinear_mc_filter = 0;
......
......@@ -184,6 +184,10 @@ typedef struct
TX_SIZE txfm_size;
#endif
int_mv mv;
#if CONFIG_DUALPRED
MV_REFERENCE_FRAME second_ref_frame;
int_mv second_mv;
#endif
unsigned char partitioning;
unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
unsigned char need_to_clamp_mvs;
......@@ -236,6 +240,11 @@ typedef struct MacroBlockD
int fullpixel_mask;
YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
#if CONFIG_DUALPRED
struct {
uint8_t *y_buffer, *u_buffer, *v_buffer;
} second_pre;
#endif /* CONFIG_DUALPRED */
YV12_BUFFER_CONFIG dst;
#if CONFIG_NEWNEAR
......@@ -305,6 +314,10 @@ typedef struct MacroBlockD
vp8_subpix_fn_t subpixel_predict8x4;
vp8_subpix_fn_t subpixel_predict8x8;
vp8_subpix_fn_t subpixel_predict16x16;
#if CONFIG_DUALPRED
vp8_subpix_fn_t subpixel_predict_avg8x8;
vp8_subpix_fn_t subpixel_predict_avg16x16;
#endif /* CONFIG_DUALPRED */
void *current_bc;
......
......@@ -128,6 +128,61 @@ static void filter_block2d_second_pass
}
}
#if CONFIG_DUALPRED
/*
 * Second pass of the 2-D sixtap filter, averaging variant.
 *
 * Every output sample is the six-tap filtered, rounded and [0, 255]-clamped
 * value of the intermediate data in src_ptr, merged with the byte already
 * stored in output_ptr as (filtered + existing + 1) >> 1. Without that
 * final merge this is what filter_block2d_second_pass() does, which simply
 * stores the filtered value.
 *
 * src_ptr              intermediate samples; taps are read at -2..+3
 *                      multiples of pixel_step around the current sample
 * output_ptr           destination, read and then overwritten in place
 * output_pitch         distance between destination rows
 * src_pixels_per_line  distance between source rows
 * pixel_step           distance between consecutive filter taps
 * output_height/width  dimensions of the block to produce
 * vp8_filter           the six filter coefficients
 */
static void filter_block2d_second_pass_avg
(
    int *src_ptr,
    unsigned char *output_ptr,
    int output_pitch,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    const short *vp8_filter
)
{
    const int tap_step = (int)pixel_step;
    unsigned int row, col;

    for (row = 0; row < output_height; row++)
    {
        for (col = 0; col < output_width; col++, src_ptr++)
        {
            int acc = 0;
            int tap;

            /* Accumulate the six taps, from two steps before the current
             * sample to three steps after it. */
            for (tap = 0; tap < 6; tap++)
            {
                acc += src_ptr[(tap - 2) * tap_step] * vp8_filter[tap];
            }

            /* Round, then scale back down to pixel range. */
            acc += (VP8_FILTER_WEIGHT >> 1);
            acc >>= VP8_FILTER_SHIFT;

            /* Saturate to 0-255 */
            acc = (acc < 0) ? 0 : ((acc > 255) ? 255 : acc);

            /* Rounded average with what is already in the destination. */
            output_ptr[col] = (unsigned char)((output_ptr[col] + acc + 1) >> 1);
        }

        /* Move both pointers to the start of the next row. */
        src_ptr += src_pixels_per_line - output_width;
        output_ptr += output_pitch;
    }
}
#endif /* CONFIG_DUALPRED */
static void filter_block2d
(
......@@ -193,6 +248,32 @@ void vp8_sixtap_predict8x8_c
}
#if CONFIG_DUALPRED
/*
 * Sixtap-filter an 8x8 block read from src_ptr at the sub-pixel position
 * (xoffset, yoffset) and average the result into dst_ptr rather than
 * overwriting it.
 */
void vp8_sixtap_predict_avg8x8_c
(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pitch
)
{
    /* Scratch area holding the output of the horizontal pass */
    int intermediate[13*16];
    const short *const horiz_taps = vp8_sub_pel_filters[xoffset]; /* 6 tap */
    const short *const vert_taps  = vp8_sub_pel_filters[yoffset]; /* 6 tap */

    /* Horizontal pass, starting two source rows above the block.
     * NOTE(review): 13 and 8 are presumably the row count and width,
     * mirroring the second pass's argument order - confirm against
     * filter_block2d_first_pass(). */
    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line),
                              intermediate, src_pixels_per_line,
                              1, 13, 8, horiz_taps);

    /* Vertical pass, entered 16 samples into the scratch data; the result
     * is averaged with the bytes already present in dst_ptr. */
    filter_block2d_second_pass_avg(intermediate + 16, dst_ptr, dst_pitch,
                                   8, 8, 8, 8, vert_taps);
}
#endif /* CONFIG_DUALPRED */
void vp8_sixtap_predict8x4_c
(
unsigned char *src_ptr,
......@@ -245,6 +326,33 @@ void vp8_sixtap_predict16x16_c
}
#if CONFIG_DUALPRED
/*
 * Sixtap-filter a 16x16 block read from src_ptr at the sub-pixel position
 * (xoffset, yoffset) and average the result into dst_ptr rather than
 * overwriting it.
 */
void vp8_sixtap_predict_avg16x16_c
(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pitch
)
{
    /* Scratch area holding the output of the horizontal pass */
    int intermediate[21*24];
    const short *const horiz_taps = vp8_sub_pel_filters[xoffset]; /* 6 tap */
    const short *const vert_taps  = vp8_sub_pel_filters[yoffset]; /* 6 tap */

    /* Horizontal pass, starting two source rows above the block.
     * NOTE(review): 21 and 16 are presumably the row count and width,
     * mirroring the second pass's argument order - confirm against
     * filter_block2d_first_pass(). */
    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line),
                              intermediate, src_pixels_per_line,
                              1, 21, 16, horiz_taps);

    /* Vertical pass, entered 32 samples into the scratch data; the result
     * is averaged with the bytes already present in dst_ptr. */
    filter_block2d_second_pass_avg(intermediate + 32, dst_ptr, dst_pitch,
                                   16, 16, 16, 16, vert_taps);
}
#endif /* CONFIG_DUALPRED */
/****************************************************************************
*
......@@ -349,6 +457,46 @@ static void filter_block2d_bil_second_pass
}
}
#if CONFIG_DUALPRED
/*
 * Second pass of the 2-D bilinear filter, averaging variant.
 *
 * filter_block2d_bil_second_pass() filters its input and stores the result
 * in the output. This function applies the same two-tap filter but then
 * averages the filtered value with the byte already present in the output,
 * storing ((filter_result + dest + 1) >> 1) back into it.
 *
 * src_ptr     intermediate samples, laid out as consecutive rows of
 *             `width` entries; each output reads src_ptr[0] and
 *             src_ptr[width]
 * dst_ptr     destination, read and then overwritten in place
 * dst_pitch   distance between destination rows
 * height      number of output rows
 * width       number of output columns
 * vp8_filter  the two filter coefficients
 */
static void filter_block2d_bil_second_pass_avg
(
    unsigned short *src_ptr,
    unsigned char *dst_ptr,
    int dst_pitch,
    unsigned int height,
    unsigned int width,
    const short *vp8_filter
)
{
    unsigned int i, j;
    int Temp;

    for (i = 0; i < height; i++)
    {
        for (j = 0; j < width; j++)
        {
            /* Apply filter: current sample and the one `width` entries
             * further on, plus half the filter weight for rounding. */
            Temp = ((int)src_ptr[0] * vp8_filter[0]) +
                   ((int)src_ptr[width] * vp8_filter[1]) +
                   (VP8_FILTER_WEIGHT / 2);

            /* Average with the existing destination byte. The cast names
             * the type actually stored, matching the sixtap averaging
             * pass. */
            dst_ptr[j] = (unsigned char)(((Temp >> VP8_FILTER_SHIFT) + dst_ptr[j] + 1) >> 1);
            src_ptr++;
        }

        /* Next row... (src_ptr has already reached it, since source rows
         * are contiguous) */
        dst_ptr += dst_pitch;
    }
}
#endif /* CONFIG_DUALPRED */
/****************************************************************************
*
......@@ -395,6 +543,28 @@ static void filter_block2d_bil
filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
}
#if CONFIG_DUALPRED
/*
 * Two-pass bilinear prediction of a Width x Height block whose result is
 * averaged into dst_ptr: a horizontal pass into a scratch buffer, then a
 * vertical pass that blends with the bytes already in the destination.
 */
static void filter_block2d_bil_avg
(
    unsigned char *src_ptr,
    unsigned char *dst_ptr,
    unsigned int src_pitch,
    unsigned int dst_pitch,
    const short *HFilter,
    const short *VFilter,
    int Width,
    int Height
)
{
    /* Scratch area for the horizontal pass output */
    unsigned short horiz_out[17*16];

    /* Horizontal pass: one row more than the block height is produced,
     * because the vertical filter also reads the following row. */
    filter_block2d_bil_first_pass(src_ptr, horiz_out, src_pitch,
                                  Height + 1, Width, HFilter);

    /* Vertical pass, averaging into the destination. */
    filter_block2d_bil_second_pass_avg(horiz_out, dst_ptr, dst_pitch,
                                       Height, Width, VFilter);
}
#endif /* CONFIG_DUALPRED */
void vp8_bilinear_predict4x4_c
(
......@@ -454,6 +624,28 @@ void vp8_bilinear_predict8x8_c
}
#if CONFIG_DUALPRED
/*
 * Bilinear-predict an 8x8 block from src_ptr at sub-pixel position
 * (xoffset, yoffset) and average it into dst_ptr.
 */
void vp8_bilinear_predict_avg8x8_c
(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pitch
)
{
    /* The offsets index the bilinear coefficient table directly: xoffset
     * picks the horizontal taps, yoffset the vertical ones. */
    filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
                           vp8_bilinear_filters[xoffset],
                           vp8_bilinear_filters[yoffset],
                           8, 8);
}
#endif /* CONFIG_DUALPRED */
void vp8_bilinear_predict8x4_c
(
unsigned char *src_ptr,
......@@ -492,3 +684,25 @@ void vp8_bilinear_predict16x16_c
filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
}
#if CONFIG_DUALPRED
/*
 * Bilinear-predict a 16x16 block from src_ptr at sub-pixel position
 * (xoffset, yoffset) and average it into dst_ptr.
 */
void vp8_bilinear_predict_avg16x16_c
(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pitch
)
{
    /* The offsets index the bilinear coefficient table directly: xoffset
     * picks the horizontal taps, yoffset the vertical ones. */
    filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
                           vp8_bilinear_filters[xoffset],
                           vp8_bilinear_filters[yoffset],
                           16, 16);
}
#endif /* CONFIG_DUALPRED */
......@@ -84,6 +84,10 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
#endif
rtcd->recon.copy16x16 = vp8_copy_mem16x16_c;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_c;
#if CONFIG_DUALPRED
rtcd->recon.avg16x16 = vp8_avg_mem16x16_c;
rtcd->recon.avg8x8 = vp8_avg_mem8x8_c;
#endif /* CONFIG_DUALPRED */
rtcd->recon.copy8x4 = vp8_copy_mem8x4_c;
rtcd->recon.recon = vp8_recon_b_c;
#if CONFIG_I8X8
......@@ -112,14 +116,22 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
#endif
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c;
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c;
rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_c;
rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_c;
rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_c;
rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_c;
rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_c;
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_c;
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c;
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c;
#if CONFIG_DUALPRED
rtcd->subpix.sixtap_avg16x16 = vp8_sixtap_predict_avg16x16_c;
rtcd->subpix.sixtap_avg8x8 = vp8_sixtap_predict_avg8x8_c;
#endif /* CONFIG_DUALPRED */
rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_c;
rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_c;
rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_c;
rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_c;
#if CONFIG_DUALPRED
rtcd->subpix.bilinear_avg16x16 = vp8_bilinear_predict_avg16x16_c;
rtcd->subpix.bilinear_avg8x8 = vp8_bilinear_predict_avg8x8_c;
#endif /* CONFIG_DUALPRED */
rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_c;
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_c;
rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_c;
rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_c;
......
......@@ -73,6 +73,16 @@ typedef enum
BILINEAR = 1
} INTERPOLATIONFILTERTYPE;
#if CONFIG_DUALPRED
/*
 * Frame-level choice of how many references an inter16x16 macroblock uses.
 * The numeric values are spelled out explicitly; NB_PREDICTION_TYPES is the
 * count of real modes and is not itself a mode.
 */
typedef enum
{
    SINGLE_PREDICTION_ONLY = 0, /* every MB uses a single reference */
    DUAL_PREDICTION_ONLY   = 1, /* every MB uses two references */
    HYBRID_PREDICTION      = 2, /* single/dual is selected per MB */
    NB_PREDICTION_TYPES    = 3  /* number of modes above (no trailing comma:
                                 * C89 does not allow one here) */
} DUALPREDMODE_TYPE;
#endif /* CONFIG_DUALPRED */
typedef struct VP8_COMMON_RTCD
{
#if CONFIG_RUNTIME_CPU_DETECT
......@@ -130,6 +140,9 @@ typedef struct VP8Common
/* profile settings */
int experimental;
int mb_no_coeff_skip;
#if CONFIG_DUALPRED
DUALPREDMODE_TYPE dual_pred_mode;
#endif /* CONFIG_DUALPRED */
int no_lpf;
int use_bilinear_mc_filter;
int full_pixel;
......
......@@ -49,6 +49,18 @@ extern prototype_copy_block(vp8_recon_copy16x16);
#endif
extern prototype_copy_block(vp8_recon_copy8x8);
#if CONFIG_DUALPRED
/* Averaging counterpart of the 16x16 copy: falls back to the C
 * implementation unless vp8_recon_avg16x16 has already been defined
 * (presumably by a platform-specific header - confirm). */
#ifndef vp8_recon_avg16x16
#define vp8_recon_avg16x16 vp8_avg_mem16x16_c
#endif
extern prototype_copy_block(vp8_recon_avg16x16);
/* Same arrangement for the 8x8 averaging routine. */
#ifndef vp8_recon_avg8x8
#define vp8_recon_avg8x8 vp8_avg_mem8x8_c
#endif
extern prototype_copy_block(vp8_recon_avg8x8);
#endif /* CONFIG_DUALPRED */
#ifndef vp8_recon_copy8x4
#define vp8_recon_copy8x4 vp8_copy_mem8x4_c
#endif
......@@ -157,6 +169,10 @@ typedef struct vp8_recon_rtcd_vtable
{
vp8_copy_block_fn_t copy16x16;
vp8_copy_block_fn_t copy8x8;
#if CONFIG_DUALPRED
vp8_copy_block_fn_t avg16x16;
vp8_copy_block_fn_t avg8x8;
#endif /* CONFIG_DUALPRED */
vp8_copy_block_fn_t copy8x4;
vp8_recon_fn_t recon;
#if CONFIG_I8X8
......
......@@ -62,6 +62,30 @@ void vp8_copy_mem16x16_c(
}
#if CONFIG_DUALPRED
/*
 * Average a 16x16 block of bytes from src into dst, in place:
 * dst = (dst + src + 1) >> 1 for every sample. Rows are src_stride and
 * dst_stride bytes apart respectively; bytes outside the 16 columns of
 * each row are left untouched.
 */
void vp8_avg_mem16x16_c(
    unsigned char *src,
    int src_stride,
    unsigned char *dst,
    int dst_stride)
{
    int rows_left;

    for (rows_left = 16; rows_left > 0; rows_left--)
    {
        const unsigned char *in = src;
        unsigned char *out = dst;
        const unsigned char *const row_end = dst + 16;

        /* Walk one row, rounding each average upwards. */
        while (out != row_end)
        {
            *out = (unsigned char)((*out + *in + 1) >> 1);
            out++;
            in++;
        }

        src += src_stride;
        dst += dst_stride;
    }
}
#endif /* CONFIG_DUALPRED */
void vp8_copy_mem8x8_c(
unsigned char *src,
int src_stride,
......@@ -92,6 +116,30 @@ void vp8_copy_mem8x8_c(
}
#if CONFIG_DUALPRED
/*
 * Average an 8x8 block of bytes from src into dst, in place:
 * dst = (dst + src + 1) >> 1 for every sample. Rows are src_stride and
 * dst_stride bytes apart respectively; bytes outside the 8 columns of
 * each row are left untouched.
 */
void vp8_avg_mem8x8_c(
    unsigned char *src,
    int src_stride,
    unsigned char *dst,
    int dst_stride)
{
    int rows_left;

    for (rows_left = 8; rows_left > 0; rows_left--)
    {
        const unsigned char *in = src;
        unsigned char *out = dst;
        const unsigned char *const row_end = dst + 8;

        /* Walk one row, rounding each average upwards. */
        while (out != row_end)
        {
            *out = (unsigned char)((*out + *in + 1) >> 1);
            out++;
            in++;
        }

        src += src_stride;
        dst += dst_stride;
    }
}
#endif /* CONFIG_DUALPRED */
void vp8_copy_mem8x4_c(
unsigned char *src,
int src_stride,
......@@ -388,6 +436,74 @@ void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
}
#if CONFIG_DUALPRED
/*
 * Blend a second 16x16 inter prediction into an existing one.
 *
 * Call this after vp8_build_inter16x16_predictors_mb() (or the
 * _mby()/_mbuv() pair) has written the first prediction to dst_y, dst_u
 * and dst_v. The second reference planes are taken from x->second_pre
 * (the corresponding reference frame index lives in
 * x->mode_info_context->mbmi.second_ref_frame) and the motion vector from
 * x->mode_info_context->mbmi.second_mv. Each plane is predicted from that
 * reference and averaged with what the destination already holds.
 *
 * Blending predictions from two reference frames sometimes gives a better
 * prediction than a single reference frame does.
 */
void vp8_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *x,
                                            unsigned char *dst_y,
                                            unsigned char *dst_u,
                                            unsigned char *dst_v,
                                            int dst_ystride,
                                            int dst_uvstride)
{
    int stride = x->block[0].pre_stride;
    int row = x->mode_info_context->mbmi.second_mv.as_mv.row;
    int col = x->mode_info_context->mbmi.second_mv.as_mv.col;
    unsigned char *y_ref;
    unsigned char *u_ref;
    unsigned char *v_ref;
    int uv_offset;

    /* Luma: the low three bits of each component are the sub-pixel
     * position, the remaining bits the whole-sample displacement. */
    y_ref = x->second_pre.y_buffer + (row >> 3) * stride + (col >> 3);

    if (((row | col) & 7) == 0)
    {
        /* No fractional part: plain average with the reference block. */
        RECON_INVOKE(&x->rtcd->recon, avg16x16)(y_ref, stride, dst_y, dst_ystride);
    }
    else
    {
        x->subpixel_predict_avg16x16(y_ref, stride, col & 7, row & 7, dst_y, dst_ystride);
    }

    /* Chroma motion vector: add one to positive components, halve, then
     * apply the full-pixel mask. */
    row = ((row + (row > 0)) >> 1) & x->fullpixel_mask;
    col = ((col + (col > 0)) >> 1) & x->fullpixel_mask;

    /* The chroma planes are addressed with half the luma stride. */
    stride >>= 1;
    uv_offset = (row >> 3) * stride + (col >> 3);
    u_ref = x->second_pre.u_buffer + uv_offset;
    v_ref = x->second_pre.v_buffer + uv_offset;

    if (((row | col) & 7) == 0)
    {
        RECON_INVOKE(&x->rtcd->recon, avg8x8)(u_ref, stride, dst_u, dst_uvstride);
        RECON_INVOKE(&x->rtcd->recon, avg8x8)(v_ref, stride, dst_v, dst_uvstride);
    }
    else
    {
        x->subpixel_predict_avg8x8(u_ref, stride, col & 7, row & 7, dst_u, dst_uvstride);
        x->subpixel_predict_avg8x8(v_ref, stride, col & 7, row & 7, dst_v, dst_uvstride);
    }
}
#endif /* CONFIG_DUALPRED */
static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
{
int i;
......@@ -490,6 +606,17 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
{
vp8_build_inter16x16_predictors_mb(x, x->predictor, &x->predictor[256],
&x->predictor[320], 16, 8);
#if CONFIG_DUALPRED
if (x->mode_info_context->mbmi.second_ref_frame)
{
/* 256 = offset of U plane in Y+U+V buffer;
* 320 = offset of V plane in Y+U+V buffer.
* (256=16x16, 320=16x16+8x8). */
vp8_build_2nd_inter16x16_predictors_mb(x, x->predictor,
&x->predictor[256],
&x->predictor[320], 16, 8);
}
#endif /* CONFIG_DUALPRED */
}
else
{
......
......@@ -19,6 +19,12 @@ extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
unsigned char *dst_v,
int dst_ystride,
int dst_uvstride);
/* Averages a prediction from the macroblock's second reference into
 * dst_y/dst_u/dst_v, which must already hold the first prediction.
 * Takes the same arguments as vp8_build_inter16x16_predictors_mb(). */
extern void vp8_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *x,
unsigned char *dst_y,
unsigned char *dst_u,
unsigned char *dst_v,
int dst_ystride,
int dst_uvstride);
extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x);
......
......@@ -34,6 +34,18 @@ extern prototype_subpixel_predict(vp8_subpix_sixtap16x16);
#endif
extern prototype_subpixel_predict(vp8_subpix_sixtap8x8);
#if CONFIG_DUALPRED
/* Averaging sixtap predictors: each name falls back to the C
 * implementation unless it has already been defined (presumably by a
 * platform-specific header - confirm). */
#ifndef vp8_subpix_sixtap_avg16x16
#define vp8_subpix_sixtap_avg16x16 vp8_sixtap_predict_avg16x16_c
#endif
extern prototype_subpixel_predict(vp8_subpix_sixtap_avg16x16);
#ifndef vp8_subpix_sixtap_avg8x8
#define vp8_subpix_sixtap_avg8x8 vp8_sixtap_predict_avg8x8_c
#endif
extern prototype_subpixel_predict(vp8_subpix_sixtap_avg8x8);
#endif /* CONFIG_DUALPRED */
#ifndef vp8_subpix_sixtap8x4
#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_c
#endif
......@@ -54,6 +66,18 @@ extern prototype_subpixel_predict(vp8_subpix_bilinear16x16);
#endif
extern prototype_subpixel_predict(vp8_subpix_bilinear8x8);
#if CONFIG_DUALPRED
/* Averaging bilinear predictors: each name falls back to the C
 * implementation unless it has already been defined (presumably by a
 * platform-specific header - confirm). */
#ifndef vp8_subpix_bilinear_avg16x16
#define vp8_subpix_bilinear_avg16x16 vp8_bilinear_predict_avg16x16_c
#endif
extern prototype_subpixel_predict(vp8_subpix_bilinear_avg16x16);
#ifndef vp8_subpix_bilinear_avg8x8
#define vp8_subpix_bilinear_avg8x8 vp8_bilinear_predict_avg8x8_c
#endif
extern prototype_subpixel_predict(vp8_subpix_bilinear_avg8x8);
#endif /* CONFIG_DUALPRED */
#ifndef vp8_subpix_bilinear8x4
#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_c
#endif
......@@ -69,10 +93,18 @@ typedef struct
{
vp8_subpix_fn_t sixtap16x16;
vp8_subpix_fn_t sixtap8x8;
#if CONFIG_DUALPRED
vp8_subpix_fn_t sixtap_avg16x16;
vp8_subpix_fn_t sixtap_avg8x8