Commit 342a368f authored by Yunqing Wang

Do sub-pixel motion search in up-sampled reference frames

Up-sample the reference frames to 8 times their size in each
dimension using the 8-tap interpolation filter. In sub-pixel motion
search, use the up-sampled reference frames to find the best matching
blocks. This largely improves the motion search precision and, thus,
the compression quality. There is no change on the decoder side.

Borg test and speed test results:
1. On derflr set,
Overall PSNR gain: 1.306%, and SSIM gain: 1.512%.
Average speed loss on derf set was 6.0%.
2. On stdhd set,
Overall PSNR gain: 0.754%, and SSIM gain: 0.814%.
On hevchd set,
Overall PSNR gain: 0.465%, and SSIM gain: 0.527%.
Speed loss on HD clips was 3.5%.

Change-Id: I300ebaafff57e88914f3dedc8784cb21d316b04f
parent db084506
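
For illustration, a minimal sketch (not part of this commit; the helper
name and arguments are hypothetical) of the coordinate mapping the change
relies on: once a reference plane has been up-sampled by 8 in each
dimension, a motion vector expressed in 1/8-pel units addresses integer
samples of the up-sampled plane directly, so sub-pixel matching reduces
to plain pixel fetches.

#include <stdint.h>

// Sketch only: fetch the up-sampled sample corresponding to full-pel
// position (x, y) plus a (dx, dy) offset given in 1/8-pel units.
static uint8_t upsampled_sample(const uint8_t *up_buf, int up_stride,
                                int x, int y, int dx, int dy) {
  return up_buf[(8 * y + dy) * up_stride + (8 * x + dx)];
}

// With dx == dy == 0 this is exactly the original full-pel sample, which
// is why the vpx_upsampled_pred_c routine added below walks the
// up-sampled plane with a step of 8 in both directions.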
......@@ -283,6 +283,7 @@ EXPERIMENT_LIST="
loop_restoration
ext_partition
obmc
affine_motion
"
CONFIG_LIST="
dependency_tracking
......
......@@ -286,6 +286,13 @@ typedef struct IMAGE_STAT {
double worst;
} ImageStat;
#if CONFIG_AFFINE_MOTION
typedef struct {
int ref_count;
YV12_BUFFER_CONFIG buf;
} EncRefCntBuffer;
#endif
typedef struct VP10_COMP {
QUANTS quants;
ThreadData td;
......@@ -304,6 +311,12 @@ typedef struct VP10_COMP {
YV12_BUFFER_CONFIG *unscaled_last_source;
YV12_BUFFER_CONFIG scaled_last_source;
#if CONFIG_AFFINE_MOTION
// Up-sampled reference buffers
EncRefCntBuffer upsampled_ref_bufs[MAX_REF_FRAMES];
int upsampled_ref_idx[MAX_REF_FRAMES];
#endif
TileDataEnc *tile_data;
int allocated_tiles; // Keep track of memory allocated for tiles.
......@@ -692,4 +705,18 @@ void vp10_new_framerate(VP10_COMP *cpi, double framerate);
} // extern "C"
#endif
#if CONFIG_AFFINE_MOTION
// Update up-sampled reference frame index.
static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx,
int new_uidx) {
const int ref_index = *uidx;
if (ref_index >= 0 && ubufs[ref_index].ref_count > 0)
ubufs[ref_index].ref_count--;
*uidx = new_uidx;
ubufs[new_uidx].ref_count++;
}
#endif
#endif // VP10_ENCODER_ENCODER_H_
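
A hypothetical usage sketch for uref_cnt_fb (the actual call site is not
shown in this excerpt, so the index names below are assumptions): after
the encoder has up-sampled a newly coded frame into buffer slot new_uidx,
it drops the reference count held by the LAST_FRAME slot and takes one on
the new buffer.

// Hypothetical call site, assuming cpi->lst_fb_idx indexes
// upsampled_ref_idx the same way it indexes the regular reference buffers.
uref_cnt_fb(cpi->upsampled_ref_bufs,
            &cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx);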
......@@ -64,7 +64,11 @@ static unsigned int do_16x16_motion_iteration(VP10_COMP *cpi,
&v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
cond_cost_list(cpi, cost_list),
NULL, NULL,
#if CONFIG_AFFINE_MOTION
&distortion, &sse, NULL, 0, 0, 0);
#else
&distortion, &sse, NULL, 0, 0);
#endif
}
#if CONFIG_EXT_INTER
......
......@@ -116,7 +116,11 @@ typedef int (fractional_mv_step_fp) (
int *mvjcost, int *mvcost[2],
int *distortion, unsigned int *sse1,
const uint8_t *second_pred,
#if CONFIG_AFFINE_MOTION
int w, int h, int use_upsampled_ref);
#else
int w, int h);
#endif
extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree;
extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree_pruned;
......
......@@ -3929,7 +3929,8 @@ static void joint_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
int_mv* ref_mv_sub8x8[2],
#endif
int_mv single_newmv[MAX_REF_FRAMES],
int *rate_mv) {
int *rate_mv,
const int block) {
const VP10_COMMON *const cm = &cpi->common;
const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
......@@ -4076,6 +4077,40 @@ static void joint_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
if (bestsme < INT_MAX) {
int dis; /* TODO: use dis in distortion calculation later. */
unsigned int sse;
#if CONFIG_AFFINE_MOTION
// Use up-sampled reference frames.
struct macroblockd_plane *const pd = &xd->plane[0];
struct buf_2d backup_pred = pd->pre[0];
const YV12_BUFFER_CONFIG *upsampled_ref =
get_upsampled_ref(cpi, refs[id]);
// Set pred for Y plane
setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
NULL, pd->subsampling_x, pd->subsampling_y);
// If bsize < BLOCK_8X8, adjust pred pointer for this block
if (bsize < BLOCK_8X8)
pd->pre[0].buf =
&pd->pre[0].buf[(vp10_raster_block_offset(BLOCK_8X8, block,
pd->pre[0].stride)) << 3];
bestsme = cpi->find_fractional_mv_step(
x, &tmp_mv,
&ref_mv[id].as_mv,
cpi->common.allow_high_precision_mv,
x->errorperbit,
&cpi->fn_ptr[bsize],
0, cpi->sf.mv.subpel_iters_per_step,
NULL,
x->nmvjointcost, x->mvcost,
&dis, &sse, second_pred,
pw, ph, 1);
// Restore the reference frames.
pd->pre[0] = backup_pred;
#else
(void) block;
bestsme = cpi->find_fractional_mv_step(
x, &tmp_mv,
&ref_mv[id].as_mv,
......@@ -4087,6 +4122,7 @@ static void joint_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
x->nmvjointcost, x->mvcost,
&dis, &sse, second_pred,
pw, ph);
#endif
}
// Restore the pointer to the first (possibly scaled) prediction buffer.
......@@ -4367,6 +4403,43 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x,
if (bestsme < INT_MAX) {
int distortion;
#if CONFIG_AFFINE_MOTION
const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
// Use up-sampled reference frames.
struct macroblockd_plane *const pd = &xd->plane[0];
struct buf_2d backup_pred = pd->pre[0];
const YV12_BUFFER_CONFIG *upsampled_ref =
get_upsampled_ref(cpi, mbmi->ref_frame[0]);
// Set pred for Y plane
setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
upsampled_ref->y_stride,
(mi_row << 3), (mi_col << 3),
NULL, pd->subsampling_x, pd->subsampling_y);
// adjust pred pointer for this block
pd->pre[0].buf =
&pd->pre[0].buf[(vp10_raster_block_offset(BLOCK_8X8, i,
pd->pre[0].stride)) << 3];
cpi->find_fractional_mv_step(
x,
new_mv,
&bsi->ref_mv[0]->as_mv,
cm->allow_high_precision_mv,
x->errorperbit, &cpi->fn_ptr[bsize],
cpi->sf.mv.subpel_force_stop,
cpi->sf.mv.subpel_iters_per_step,
cond_cost_list(cpi, cost_list),
x->nmvjointcost, x->mvcost,
&distortion,
&x->pred_sse[mbmi->ref_frame[0]],
NULL, pw, ph, 1);
// Restore the reference frames.
pd->pre[0] = backup_pred;
#else
cpi->find_fractional_mv_step(
x,
new_mv,
......@@ -4380,6 +4453,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x,
&distortion,
&x->pred_sse[mbmi->ref_frame[0]],
NULL, 0, 0);
#endif
// save motion search result for use in compound prediction
#if CONFIG_EXT_INTER
......@@ -4426,7 +4500,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x,
#else
seg_mvs[i],
#endif // CONFIG_EXT_INTER
&rate_mv);
&rate_mv, i);
#if CONFIG_EXT_INTER
compound_seg_newmvs[i][0].as_int =
frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
......@@ -4975,6 +5049,33 @@ static void single_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
if (bestsme < INT_MAX) {
int dis; /* TODO: use dis in distortion calculation later. */
#if CONFIG_AFFINE_MOTION
const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
// Use up-sampled reference frames.
struct macroblockd_plane *const pd = &xd->plane[0];
struct buf_2d backup_pred = pd->pre[0];
const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
// Set pred for Y plane
setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
NULL, pd->subsampling_x, pd->subsampling_y);
bestsme = cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
cm->allow_high_precision_mv,
x->errorperbit,
&cpi->fn_ptr[bsize],
cpi->sf.mv.subpel_force_stop,
cpi->sf.mv.subpel_iters_per_step,
cond_cost_list(cpi, cost_list),
x->nmvjointcost, x->mvcost,
&dis, &x->pred_sse[ref], NULL,
pw, ph, 1);
// Restore the reference frames.
pd->pre[0] = backup_pred;
#else
cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
cm->allow_high_precision_mv,
x->errorperbit,
......@@ -4984,6 +5085,7 @@ static void single_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
cond_cost_list(cpi, cost_list),
x->nmvjointcost, x->mvcost,
&dis, &x->pred_sse[ref], NULL, 0, 0);
#endif
}
*rate_mv = vp10_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
......@@ -5328,7 +5430,7 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
joint_motion_search(cpi, x, bsize, frame_mv,
mi_row, mi_col, NULL, single_newmv, &rate_mv);
mi_row, mi_col, NULL, single_newmv, &rate_mv, 0);
} else {
rate_mv = vp10_mv_bit_cost(&frame_mv[refs[0]].as_mv,
&x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
......@@ -5358,7 +5460,7 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
joint_motion_search(cpi, x, bsize, frame_mv,
mi_row, mi_col,
single_newmv, &rate_mv);
single_newmv, &rate_mv, 0);
} else {
rate_mv = vp10_mv_bit_cost(&frame_mv[refs[0]].as_mv,
&x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
......
......@@ -106,4 +106,20 @@ void vp10_build_prediction_by_left_preds(VP10_COMP *cpi,
} // extern "C"
#endif
#if CONFIG_AFFINE_MOTION
static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(VP10_COMP *cpi,
const int ref) {
// Use up-sampled reference frames.
int ref_idx = 0;
if (ref == LAST_FRAME)
ref_idx = cpi->lst_fb_idx;
else if (ref == GOLDEN_FRAME)
ref_idx = cpi->gld_fb_idx;
else if (ref == ALTREF_FRAME)
ref_idx = cpi->alt_fb_idx;
return &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[ref_idx]].buf;
}
#endif
#endif // VP10_ENCODER_RDOPT_H_
......@@ -320,7 +320,11 @@ static int temporal_filter_find_matching_mb_c(VP10_COMP *cpi,
0, mv_sf->subpel_iters_per_step,
cond_cost_list(cpi, cost_list),
NULL, NULL,
#if CONFIG_AFFINE_MOTION
&distortion, &sse, NULL, 0, 0, 0);
#else
&distortion, &sse, NULL, 0, 0);
#endif
// Restore input state
x->plane[0].src = src;
......
......@@ -272,6 +272,41 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
}
}
#if CONFIG_AFFINE_MOTION
// Get pred block from up-sampled reference.
void vpx_upsampled_pred_c(uint8_t *comp_pred,
int width, int height,
const uint8_t *ref, int ref_stride) {
int i, j, k;
int stride = ref_stride << 3;
for (i = 0; i < height; i++) {
for (j = 0, k = 0; j < width; j++, k += 8) {
comp_pred[j] = ref[k];
}
comp_pred += width;
ref += stride;
}
}
void vpx_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
int width, int height,
const uint8_t *ref, int ref_stride) {
int i, j;
int stride = ref_stride << 3;
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
const int tmp = ref[(j << 3)] + pred[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
comp_pred += width;
pred += width;
ref += stride;
}
}
#endif
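
For reference, a hypothetical scalar restatement of vpx_upsampled_pred_c
above: ref already points at the chosen 1/8-pel position of the up-sampled
plane and ref_stride is that plane's stride, so moving one full pel means
stepping 8 samples horizontally and 8 rows (ref_stride << 3) vertically.

#include <stdint.h>

// Sketch only (hypothetical helper, equivalent to vpx_upsampled_pred_c):
static void upsampled_pred_scalar(uint8_t *comp_pred, int width, int height,
                                  const uint8_t *ref, int ref_stride) {
  int r, c;
  for (r = 0; r < height; r++)
    for (c = 0; c < width; c++)
      comp_pred[r * width + c] = ref[r * 8 * ref_stride + c * 8];
}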
#if CONFIG_VP9_HIGHBITDEPTH
static void highbd_variance64(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
......
......@@ -1464,6 +1464,13 @@ add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int
add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
if (vpx_config("CONFIG_AFFINE_MOTION") eq "yes") {
add_proto qw/void vpx_upsampled_pred/, "uint8_t *comp_pred, int width, int height, const uint8_t *ref, int ref_stride";
specialize qw/vpx_upsampled_pred sse2/;
add_proto qw/void vpx_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
specialize qw/vpx_comp_avg_upsampled_pred sse2/;
}
#
# Subpixel Variance
#
......
......@@ -475,3 +475,232 @@ FNS(ssse3, ssse3);
#undef FNS
#undef FN
#endif // CONFIG_USE_X86INC
#if CONFIG_AFFINE_MOTION
void vpx_upsampled_pred_sse2(uint8_t *comp_pred,
int width, int height,
const uint8_t *ref, int ref_stride) {
int i, j;
int stride = ref_stride << 3;
if (width >= 16) {
// read 16 points at one time
for (i = 0; i < height; i++) {
for (j = 0; j < width; j+= 16) {
__m128i s0 = _mm_loadu_si128((const __m128i *)ref);
__m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
__m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
__m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
__m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64));
__m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80));
__m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96));
__m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112));
__m128i t0, t1, t2, t3;
t0 = _mm_unpacklo_epi8(s0, s1);
s1 = _mm_unpackhi_epi8(s0, s1);
t1 = _mm_unpacklo_epi8(s2, s3);
s3 = _mm_unpackhi_epi8(s2, s3);
t2 = _mm_unpacklo_epi8(s4, s5);
s5 = _mm_unpackhi_epi8(s4, s5);
t3 = _mm_unpacklo_epi8(s6, s7);
s7 = _mm_unpackhi_epi8(s6, s7);
s0 = _mm_unpacklo_epi8(t0, s1);
s2 = _mm_unpacklo_epi8(t1, s3);
s4 = _mm_unpacklo_epi8(t2, s5);
s6 = _mm_unpacklo_epi8(t3, s7);
*(int *)comp_pred = _mm_cvtsi128_si32(s0);
*(int *)(comp_pred + 4) = _mm_cvtsi128_si32(s2);
*(int *)(comp_pred + 8) = _mm_cvtsi128_si32(s4);
*(int *)(comp_pred + 12) = _mm_cvtsi128_si32(s6);
comp_pred += 16;
ref += 16 * 8;
}
ref += stride - (width << 3);
}
} else if (width >= 8) {
// read 8 points at one time
for (i = 0; i < height; i++) {
for (j = 0; j < width; j+= 8) {
__m128i s0 = _mm_loadu_si128((const __m128i *)ref);
__m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
__m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
__m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
__m128i t0, t1;
t0 = _mm_unpacklo_epi8(s0, s1);
s1 = _mm_unpackhi_epi8(s0, s1);
t1 = _mm_unpacklo_epi8(s2, s3);
s3 = _mm_unpackhi_epi8(s2, s3);
s0 = _mm_unpacklo_epi8(t0, s1);
s2 = _mm_unpacklo_epi8(t1, s3);
*(int *)comp_pred = _mm_cvtsi128_si32(s0);
*(int *)(comp_pred + 4) = _mm_cvtsi128_si32(s2);
comp_pred += 8;
ref += 8 * 8;
}
ref += stride - (width << 3);
}
} else {
// read 4 points at one time
for (i = 0; i < height; i++) {
for (j = 0; j < width; j+= 4) {
__m128i s0 = _mm_loadu_si128((const __m128i *)ref);
__m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
__m128i t0;
t0 = _mm_unpacklo_epi8(s0, s1);
s1 = _mm_unpackhi_epi8(s0, s1);
s0 = _mm_unpacklo_epi8(t0, s1);
*(int *)comp_pred = _mm_cvtsi128_si32(s0);
comp_pred += 4;
ref += 4 * 8;
}
ref += stride - (width << 3);
}
}
}
void vpx_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred,
int width, int height,
const uint8_t *ref, int ref_stride) {
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi16(1);
int i, j;
int stride = ref_stride << 3;
if (width >= 16) {
// read 16 points at one time
for (i = 0; i < height; i++) {
for (j = 0; j < width; j+= 16) {
__m128i s0 = _mm_loadu_si128((const __m128i *)ref);
__m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
__m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
__m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
__m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64));
__m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80));
__m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96));
__m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112));
__m128i p0 = _mm_loadu_si128((const __m128i *)pred);
__m128i p1;
__m128i t0, t1, t2, t3;
t0 = _mm_unpacklo_epi8(s0, s1);
s1 = _mm_unpackhi_epi8(s0, s1);
t1 = _mm_unpacklo_epi8(s2, s3);
s3 = _mm_unpackhi_epi8(s2, s3);
t2 = _mm_unpacklo_epi8(s4, s5);
s5 = _mm_unpackhi_epi8(s4, s5);
t3 = _mm_unpacklo_epi8(s6, s7);
s7 = _mm_unpackhi_epi8(s6, s7);
s0 = _mm_unpacklo_epi8(t0, s1);
s2 = _mm_unpacklo_epi8(t1, s3);
s4 = _mm_unpacklo_epi8(t2, s5);
s6 = _mm_unpacklo_epi8(t3, s7);
s0 = _mm_unpacklo_epi32(s0, s2);
s4 = _mm_unpacklo_epi32(s4, s6);
s0 = _mm_unpacklo_epi8(s0, zero);
s4 = _mm_unpacklo_epi8(s4, zero);
p1 = _mm_unpackhi_epi8(p0, zero);
p0 = _mm_unpacklo_epi8(p0, zero);
p0 = _mm_adds_epu16(s0, p0);
p1 = _mm_adds_epu16(s4, p1);
p0 = _mm_adds_epu16(p0, one);
p1 = _mm_adds_epu16(p1, one);
p0 = _mm_srli_epi16(p0, 1);
p1 = _mm_srli_epi16(p1, 1);
p0 = _mm_packus_epi16(p0, p1);
*(int *)comp_pred = _mm_cvtsi128_si32(p0);
p0 = _mm_srli_si128(p0, 4);
*(int *)(comp_pred + 4) = _mm_cvtsi128_si32(p0);
p0 = _mm_srli_si128(p0, 4);
*(int *)(comp_pred + 8) = _mm_cvtsi128_si32(p0);
p0 = _mm_srli_si128(p0, 4);
*(int *)(comp_pred + 12) = _mm_cvtsi128_si32(p0);
comp_pred += 16;
pred += 16;
ref += 16 * 8;
}
ref += stride - (width << 3);
}
} else if (width >= 8) {
// read 8 points at one time
for (i = 0; i < height; i++) {
for (j = 0; j < width; j+= 8) {
__m128i s0 = _mm_loadu_si128((const __m128i *)ref);
__m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
__m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
__m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
__m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
__m128i t0, t1;
t0 = _mm_unpacklo_epi8(s0, s1);
s1 = _mm_unpackhi_epi8(s0, s1);
t1 = _mm_unpacklo_epi8(s2, s3);
s3 = _mm_unpackhi_epi8(s2, s3);
s0 = _mm_unpacklo_epi8(t0, s1);
s2 = _mm_unpacklo_epi8(t1, s3);
s0 = _mm_unpacklo_epi32(s0, s2);
s0 = _mm_unpacklo_epi8(s0, zero);
p0 = _mm_unpacklo_epi8(p0, zero);
p0 = _mm_adds_epu16(s0, p0);
p0 = _mm_adds_epu16(p0, one);
p0 = _mm_srli_epi16(p0, 1);
p0 = _mm_packus_epi16(p0, zero);
*(int *)comp_pred = _mm_cvtsi128_si32(p0);
p0 = _mm_srli_si128(p0, 4);
*(int *)(comp_pred + 4) = _mm_cvtsi128_si32(p0);
comp_pred += 8;
pred += 8;
ref += 8 * 8;
}
ref += stride - (width << 3);
}
} else {
// read 4 points at one time
for (i = 0; i < height; i++) {
for (j = 0; j < width; j+= 4) {
__m128i s0 = _mm_loadu_si128((const __m128i *)ref);
__m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
__m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)pred);
__m128i t0;
t0 = _mm_unpacklo_epi8(s0, s1);
s1 = _mm_unpackhi_epi8(s0, s1);
s0 = _mm_unpacklo_epi8(t0, s1);
s0 = _mm_unpacklo_epi8(s0, zero);
p0 = _mm_unpacklo_epi8(p0, zero);
p0 = _mm_adds_epu16(s0, p0);
p0 = _mm_adds_epu16(p0, one);
p0 = _mm_srli_epi16(p0, 1);
p0 = _mm_packus_epi16(p0, zero);
*(int *)comp_pred = _mm_cvtsi128_si32(p0);
comp_pred += 4;
pred += 4;
ref += 4 * 8;
}
ref += stride - (width << 3);
}
}
}
#endif
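
A hypothetical consistency check (not part of the commit; the harness,
buffer sizes, and local prototypes below are assumptions) that compares
vpx_upsampled_pred_c with vpx_upsampled_pred_sse2 on random data:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void vpx_upsampled_pred_c(uint8_t *comp_pred, int width, int height,
                          const uint8_t *ref, int ref_stride);
void vpx_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
                             const uint8_t *ref, int ref_stride);

int main(void) {
  enum { W = 16, H = 16, STRIDE = W * 8 };
  static uint8_t ref[H * 8 * STRIDE];  // 8x up-sampled source area
  uint8_t out_c[W * H], out_sse2[W * H];
  int i;
  for (i = 0; i < (int)sizeof(ref); i++) ref[i] = rand() & 0xff;
  vpx_upsampled_pred_c(out_c, W, H, ref, STRIDE);
  vpx_upsampled_pred_sse2(out_sse2, W, H, ref, STRIDE);
  printf("%s\n",
         memcmp(out_c, out_sse2, sizeof(out_c)) ? "mismatch" : "match");
  return 0;
}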
......@@ -210,6 +210,30 @@ void vpx_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) {
extend_frame(ybf, inner_bw);
}
void vpx_extend_frame_borders_y_c(YV12_BUFFER_CONFIG *ybf) {
int ext_size = ybf->border;
assert(ybf->y_height - ybf->y_crop_height < 16);
assert(ybf->y_width - ybf->y_crop_width < 16);
assert(ybf->y_height - ybf->y_crop_height >= 0);
assert(ybf->y_width - ybf->y_crop_width >= 0);
#if CONFIG_VP9_HIGHBITDEPTH
if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
extend_plane_high(ybf->y_buffer, ybf->y_stride,
ybf->y_crop_width, ybf->y_crop_height,
ext_size, ext_size,
ext_size + ybf->y_height - ybf->y_crop_height,
ext_size + ybf->y_width - ybf->y_crop_width);
return;
}
#endif
extend_plane(ybf->y_buffer, ybf->y_stride,
ybf->y_crop_width, ybf->y_crop_height,
ext_size, ext_size,
ext_size + ybf->y_height - ybf->y_crop_height,
ext_size + ybf->y_width - ybf->y_crop_width);
}
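
A hypothetical call site for the new vpx_extend_frame_borders_y (the
encoder-side caller is not shown in this excerpt): after a reference frame
has been filtered into its 8x up-sampled buffer, only the luma borders are
extended, presumably because sub-pixel motion search reads luma only.

// Hypothetical usage, assuming new_uidx names the slot just filled with
// the up-sampled reference frame.
vpx_extend_frame_borders_y(&cpi->upsampled_ref_bufs[new_uidx].buf);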
#if CONFIG_VP9_HIGHBITDEPTH
void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
......
......@@ -28,5 +28,8 @@ if ((vpx_config("CONFIG_VP9") eq "yes") || (vpx_config("CONFIG_VP10") eq "yes"))
add_proto qw/void vpx_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf";
specialize qw/vpx_extend_frame_inner_borders dspr2/;
add_proto qw/void vpx_extend_frame_borders_y/, "struct yv12_buffer_config *ybf";
specialize qw/vpx_extend_frame_borders_y/;
}
1;