Commit 8d568312 authored by Yunqing Wang's avatar Yunqing Wang
Browse files

vp9_sub_pixel_variance16x2 SSE2 optimization

About 5% decoder speedup.

Change-Id: Ib6687d337af758a536a0e7e289f400990f1f9794
parent c14439c3
......@@ -185,33 +185,33 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
offset = ref_y_stride * row_offset + col_offset;
score = 0;
if (xd->up_available) {
vp9_sub_pixel_variance16x2_c(above_ref + offset, ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
above_src, xd->dst.y_stride, &sse);
vp9_sub_pixel_variance16x2(above_ref + offset, ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
above_src, xd->dst.y_stride, &sse);
score += sse;
#if CONFIG_SUPERBLOCKS
if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
vp9_sub_pixel_variance16x2_c(above_ref + offset + 16,
ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
above_src + 16, xd->dst.y_stride, &sse);
vp9_sub_pixel_variance16x2(above_ref + offset + 16,
ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
above_src + 16, xd->dst.y_stride, &sse);
score += sse;
}
#if CONFIG_SUPERBLOCKS64
if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
vp9_sub_pixel_variance16x2_c(above_ref + offset + 32,
ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
above_src + 32, xd->dst.y_stride, &sse);
vp9_sub_pixel_variance16x2(above_ref + offset + 32,
ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
above_src + 32, xd->dst.y_stride, &sse);
score += sse;
vp9_sub_pixel_variance16x2_c(above_ref + offset + 48,
ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
above_src + 48, xd->dst.y_stride, &sse);
vp9_sub_pixel_variance16x2(above_ref + offset + 48,
ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
above_src + 48, xd->dst.y_stride, &sse);
score += sse;
}
#endif
......
......@@ -252,6 +252,11 @@ specialize vp9_sad16x3 sse2
prototype unsigned int vp9_sad3x16 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"
specialize vp9_sad3x16 sse2
if [ "$CONFIG_SUBPELREFMV" = "yes" ]; then
prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x2 sse2
fi
#
# Sub Pixel Filters
#
......
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#define HALFNDX 8
void vp9_half_horiz_variance16x_h_sse2(const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared);
void vp9_half_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared);
void vp9_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared);
void vp9_filter_block2d_bil_var_sse2(const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int xoffset,
int yoffset,
int *sum,
unsigned int *sumsquared);
unsigned int vp9_sub_pixel_variance16x2_sse2(const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
if (xoffset == HALFNDX && yoffset == 0) {
vp9_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 2,
&xsum0, &xxsum0);
} else if (xoffset == 0 && yoffset == HALFNDX) {
vp9_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 2,
&xsum0, &xxsum0);
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
vp9_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 2,
&xsum0, &xxsum0);
} else {
vp9_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 2,
xoffset, yoffset,
&xsum0, &xxsum0);
vp9_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 2,
xoffset, yoffset,
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
}
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 5));
}
......@@ -95,6 +95,9 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
ifeq ($(CONFIG_SUBPELREFMV),yes)
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c
endif
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_ssse3.asm
ifeq ($(CONFIG_POSTPROC),yes)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment